mirror of
https://sourceware.org/git/glibc.git
synced 2025-11-02 09:33:31 +03:00
math: Consolidate erf/erfc definitions
The common code definitions are consolidated in s_erf_common.h and s_erf_common.c. Checked on x86_64-linux-gnu, aarch64-linux-gnu, and powerpc64le-linux-gnu. Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
@@ -363,6 +363,7 @@ type-double-routines := \
|
|||||||
math_err \
|
math_err \
|
||||||
s_asincosh_data \
|
s_asincosh_data \
|
||||||
s_atanh_data \
|
s_atanh_data \
|
||||||
|
s_erf_common \
|
||||||
s_erf_data \
|
s_erf_data \
|
||||||
s_erfc_data \
|
s_erfc_data \
|
||||||
sincostab \
|
sincostab \
|
||||||
|
|||||||
@@ -30,191 +30,12 @@ SOFTWARE.
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <libm-alias-double.h>
|
#include <libm-alias-double.h>
|
||||||
#include "math_config.h"
|
#include "math_config.h"
|
||||||
#include "s_erf_data.h"
|
#include "s_erf_common.h"
|
||||||
|
|
||||||
/* CH+CL is a double-double approximation of 2/sqrt(pi) to nearest */
|
/* CH+CL is a double-double approximation of 2/sqrt(pi) to nearest */
|
||||||
static const double CH = 0x1.20dd750429b6dp+0;
|
static const double CH = 0x1.20dd750429b6dp+0;
|
||||||
static const double CL = 0x1.1ae3a914fed8p-56;
|
static const double CL = 0x1.1ae3a914fed8p-56;
|
||||||
|
|
||||||
/* Add a + b, such that *hi + *lo approximates a + b.
|
|
||||||
Assumes |a| >= |b|.
|
|
||||||
For rounding to nearest we have hi + lo = a + b exactly.
|
|
||||||
For directed rounding, we have
|
|
||||||
(a) hi + lo = a + b exactly when the exponent difference between a and b
|
|
||||||
is at most 53 (the binary64 precision)
|
|
||||||
(b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|)
|
|
||||||
(see https://hal.inria.fr/hal-03798376)
|
|
||||||
We also have |lo| < ulp(hi). */
|
|
||||||
static inline void
|
|
||||||
fast_two_sum (double *hi, double *lo, double a, double b)
|
|
||||||
{
|
|
||||||
double e;
|
|
||||||
|
|
||||||
*hi = a + b;
|
|
||||||
e = *hi - a; /* exact */
|
|
||||||
*lo = b - e; /* exact */
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Reference: https://hal.science/hal-01351529v3/document */
|
|
||||||
static inline void
|
|
||||||
two_sum (double *hi, double *lo, double a, double b)
|
|
||||||
{
|
|
||||||
*hi = a + b;
|
|
||||||
double aa = *hi - b;
|
|
||||||
double bb = *hi - aa;
|
|
||||||
double da = a - aa;
|
|
||||||
double db = b - bb;
|
|
||||||
*lo = da + db;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Multiply exactly a and b, such that *hi + *lo = a * b.
|
|
||||||
static inline void
|
|
||||||
a_mul (double *hi, double *lo, double a, double b)
|
|
||||||
{
|
|
||||||
*hi = a * b;
|
|
||||||
*lo = fma (a, b, -*hi);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
|
|
||||||
of erf(z). Return err the maximal relative error:
|
|
||||||
|(h + l)/erf(z) - 1| < err*|h+l| */
|
|
||||||
static double
|
|
||||||
cr_erf_fast (double *h, double *l, double z)
|
|
||||||
{
|
|
||||||
double th, tl;
|
|
||||||
/* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16,
|
|
||||||
and for each interval, we use a minimax polynomial:
|
|
||||||
* for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero,
|
|
||||||
since if we evaluate in the middle 1/32, we will get bad accuracy
|
|
||||||
for tiny z, and moreover z-1/32 might not be exact
|
|
||||||
* for 1 <= i <= 94, we use a polynomial evaluated in the middle of
|
|
||||||
the interval, namely i/16+1/32
|
|
||||||
*/
|
|
||||||
if (z < 0.0625) /* z < 1/16 */
|
|
||||||
{
|
|
||||||
double z2h, z2l, z4;
|
|
||||||
a_mul (&z2h, &z2l, z, z);
|
|
||||||
z4 = z2h * z2h;
|
|
||||||
double c9 = fma (C0[7], z2h, C0[6]);
|
|
||||||
double c5 = fma (C0[5], z2h, C0[4]);
|
|
||||||
c5 = fma (c9, z4, c5);
|
|
||||||
/* compute C0[2] + C0[3] + z2h*c5 */
|
|
||||||
a_mul (&th, &tl, z2h, c5);
|
|
||||||
fast_two_sum (h, l, C0[2], th);
|
|
||||||
*l += tl + C0[3];
|
|
||||||
/* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */
|
|
||||||
double h_copy = *h;
|
|
||||||
a_mul (&th, &tl, z2h, *h);
|
|
||||||
tl += fma (z2h, *l, C0[1]);
|
|
||||||
fast_two_sum (h, l, C0[0], th);
|
|
||||||
*l += fma (z2l, h_copy, tl);
|
|
||||||
/* multiply (h,l) by z */
|
|
||||||
a_mul (h, &tl, *h, z);
|
|
||||||
*l = fma (*l, z, tl);
|
|
||||||
return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */
|
|
||||||
}
|
|
||||||
double v = floor (16.0 * z);
|
|
||||||
uint32_t i = 16.0 * z;
|
|
||||||
/* i/16 <= z < (i+1)/16 */
|
|
||||||
/* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact:
|
|
||||||
(1) either z - 0.03125 is in the same binade as z, then 0.03125 is
|
|
||||||
an integer multiple of ulp(z), so is z - 0.03125
|
|
||||||
(2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are
|
|
||||||
integer multiple of the ulp() of that smaller binade.
|
|
||||||
Also, subtracting 0.0625 * v is exact. */
|
|
||||||
z = (z - 0.03125) - 0.0625 * v;
|
|
||||||
/* now |z| <= 1/32 */
|
|
||||||
const double *c = C[i - 1];
|
|
||||||
double z2 = z * z, z4 = z2 * z2;
|
|
||||||
/* the degree-10 coefficient is c[12] */
|
|
||||||
double c9 = fma (c[12], z, c[11]);
|
|
||||||
double c7 = fma (c[10], z, c[9]);
|
|
||||||
double c5 = fma (c[8], z, c[7]);
|
|
||||||
double c3h, c3l;
|
|
||||||
/* c3h, c3l <- c[5] + z*c[6] */
|
|
||||||
fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
|
|
||||||
c7 = fma (c9, z2, c7);
|
|
||||||
/* c3h, c3l <- c3h, c3l + c5*z2 */
|
|
||||||
fast_two_sum (&c3h, &tl, c3h, c5 * z2);
|
|
||||||
c3l += tl;
|
|
||||||
/* c3h, c3l <- c3h, c3l + c7*z4 */
|
|
||||||
fast_two_sum (&c3h, &tl, c3h, c7 * z4);
|
|
||||||
c3l += tl;
|
|
||||||
/* c2h, c2l <- c[4] + z*(c3h + c3l) */
|
|
||||||
double c2h, c2l;
|
|
||||||
a_mul (&th, &tl, z, c3h);
|
|
||||||
fast_two_sum (&c2h, &c2l, c[4], th);
|
|
||||||
c2l += fma (z, c3l, tl);
|
|
||||||
/* compute c[2] + c[3] + z*(c2h + c2l) */
|
|
||||||
a_mul (&th, &tl, z, c2h);
|
|
||||||
fast_two_sum (h, l, c[2], th);
|
|
||||||
*l += tl + fma (z, c2l, c[3]);
|
|
||||||
/* compute c[0] + c[1] + z*(h + l) */
|
|
||||||
a_mul (&th, &tl, z, *h);
|
|
||||||
tl = fma (z, *l, tl); /* tl += z*l */
|
|
||||||
fast_two_sum (h, l, c[0], th);
|
|
||||||
*l += tl + c[1];
|
|
||||||
return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1)
|
|
||||||
(larger values of i yield smaller error bounds) */
|
|
||||||
}
|
|
||||||
|
|
||||||
/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
|
|
||||||
static void
|
|
||||||
cr_erf_accurate_tiny (double *h, double *l, double z)
|
|
||||||
{
|
|
||||||
uint64_t i, j, k;
|
|
||||||
/* use dichotomy */
|
|
||||||
for (i = 0, j = EXCEPTIONS_LEN; i + 1 < j;)
|
|
||||||
{
|
|
||||||
k = (i + j) / 2;
|
|
||||||
if (EXCEPTIONS_TINY[k][0] <= z)
|
|
||||||
i = k;
|
|
||||||
else
|
|
||||||
j = k;
|
|
||||||
}
|
|
||||||
/* Either i=0 and z < exceptions[i+1][0],
|
|
||||||
or i=n-1 and exceptions[i][0] <= z
|
|
||||||
or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0].
|
|
||||||
In all cases z can only equal exceptions[i][0]. */
|
|
||||||
if (z == EXCEPTIONS_TINY[i][0])
|
|
||||||
{
|
|
||||||
*h = EXCEPTIONS_TINY[i][1];
|
|
||||||
*l = EXCEPTIONS_TINY[i][2];
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
double z2 = z * z, th, tl;
|
|
||||||
*h = P[21 / 2 + 4]; /* degree 21 */
|
|
||||||
for (int a = 19; a > 11; a -= 2)
|
|
||||||
*h = fma (*h, z2, P[a / 2 + 4]); /* degree j */
|
|
||||||
*l = 0;
|
|
||||||
for (int a = 11; a > 7; a -= 2)
|
|
||||||
{
|
|
||||||
/* multiply h+l by z^2 */
|
|
||||||
a_mul (&th, &tl, *h, z);
|
|
||||||
tl = fma (*l, z, tl);
|
|
||||||
a_mul (h, l, th, z);
|
|
||||||
*l = fma (tl, z, *l);
|
|
||||||
/* add p[j] to h + l */
|
|
||||||
fast_two_sum (h, &tl, P[a / 2 + 4], *h);
|
|
||||||
*l += tl;
|
|
||||||
}
|
|
||||||
for (int a = 7; a >= 1; a -= 2)
|
|
||||||
{
|
|
||||||
/* multiply h+l by z^2 */
|
|
||||||
a_mul (&th, &tl, *h, z);
|
|
||||||
tl = fma (*l, z, tl);
|
|
||||||
a_mul (h, l, th, z);
|
|
||||||
*l = fma (tl, z, *l);
|
|
||||||
fast_two_sum (h, &tl, P[a - 1], *h);
|
|
||||||
*l += P[a] + tl;
|
|
||||||
}
|
|
||||||
/* multiply by z */
|
|
||||||
a_mul (h, &tl, *h, z);
|
|
||||||
*l = fma (*l, z, tl);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
|
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
|
||||||
approximation of erf(z).
|
approximation of erf(z).
|
||||||
Assumes z >= 2^-61, thus no underflow can occur. */
|
Assumes z >= 2^-61, thus no underflow can occur. */
|
||||||
@@ -238,10 +59,7 @@ cr_erf_accurate (double *h, double *l, double z)
|
|||||||
the interval, namely i/8+1/16
|
the interval, namely i/8+1/16
|
||||||
*/
|
*/
|
||||||
if (z < 0.125) /* z < 1/8 */
|
if (z < 0.125) /* z < 1/8 */
|
||||||
{
|
return __cr_erf_accurate_tiny (h, l, z, true);
|
||||||
cr_erf_accurate_tiny (h, l, z);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
double v = floor (8.0 * z);
|
double v = floor (8.0 * z);
|
||||||
uint32_t i = 8.0 * z;
|
uint32_t i = 8.0 * z;
|
||||||
z = (z - 0.0625) - 0.125 * v;
|
z = (z - 0.0625) - 0.125 * v;
|
||||||
@@ -313,7 +131,7 @@ __erf (double x)
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
/* now z >= 2^-61 */
|
/* now z >= 2^-61 */
|
||||||
err = cr_erf_fast (&h, &l, z);
|
err = __cr_erf_fast (&h, &l, z);
|
||||||
uint64_t u = asuint64 (h), v = asuint64 (l);
|
uint64_t u = asuint64 (h), v = asuint64 (l);
|
||||||
t = asuint64 (x);
|
t = asuint64 (x);
|
||||||
u ^= t & SIGN_MASK;
|
u ^= t & SIGN_MASK;
|
||||||
|
|||||||
170
sysdeps/ieee754/dbl-64/s_erf_common.c
Normal file
170
sysdeps/ieee754/dbl-64/s_erf_common.c
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
/* Common definitions for erf/erc implementation.
|
||||||
|
|
||||||
|
Copyright (c) 2023-2025 Paul Zimmermann
|
||||||
|
|
||||||
|
The original version of this file was copied from the CORE-MATH
|
||||||
|
project (file src/binary64/erf/erf.c, revision 384ed01d).
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include "s_erf_common.h"
|
||||||
|
|
||||||
|
double
|
||||||
|
__cr_erf_fast (double *h, double *l, double z)
|
||||||
|
{
|
||||||
|
double th, tl;
|
||||||
|
/* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16,
|
||||||
|
and for each interval, we use a minimax polynomial:
|
||||||
|
* for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero,
|
||||||
|
since if we evaluate in the middle 1/32, we will get bad accuracy
|
||||||
|
for tiny z, and moreover z-1/32 might not be exact
|
||||||
|
* for 1 <= i <= 94, we use a polynomial evaluated in the middle of
|
||||||
|
the interval, namely i/16+1/32
|
||||||
|
*/
|
||||||
|
if (z < 0.0625) /* z < 1/16 */
|
||||||
|
{
|
||||||
|
double z2h, z2l, z4;
|
||||||
|
a_mul (&z2h, &z2l, z, z);
|
||||||
|
z4 = z2h * z2h;
|
||||||
|
double c9 = fma (C0[7], z2h, C0[6]);
|
||||||
|
double c5 = fma (C0[5], z2h, C0[4]);
|
||||||
|
c5 = fma (c9, z4, c5);
|
||||||
|
/* compute C0[2] + C0[3] + z2h*c5 */
|
||||||
|
a_mul (&th, &tl, z2h, c5);
|
||||||
|
fast_two_sum (h, l, C0[2], th);
|
||||||
|
*l += tl + C0[3];
|
||||||
|
/* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */
|
||||||
|
double h_copy = *h;
|
||||||
|
a_mul (&th, &tl, z2h, *h);
|
||||||
|
tl += fma (z2h, *l, C0[1]);
|
||||||
|
fast_two_sum (h, l, C0[0], th);
|
||||||
|
*l += fma (z2l, h_copy, tl);
|
||||||
|
/* multiply (h,l) by z */
|
||||||
|
a_mul (h, &tl, *h, z);
|
||||||
|
*l = fma (*l, z, tl);
|
||||||
|
return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */
|
||||||
|
}
|
||||||
|
double v = floor (16.0 * z);
|
||||||
|
uint32_t i = 16.0 * z;
|
||||||
|
/* i/16 <= z < (i+1)/16 */
|
||||||
|
/* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact:
|
||||||
|
(1) either z - 0.03125 is in the same binade as z, then 0.03125 is
|
||||||
|
an integer multiple of ulp(z), so is z - 0.03125
|
||||||
|
(2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are
|
||||||
|
integer multiple of the ulp() of that smaller binade.
|
||||||
|
Also, subtracting 0.0625 * v is exact. */
|
||||||
|
z = (z - 0.03125) - 0.0625 * v;
|
||||||
|
/* now |z| <= 1/32 */
|
||||||
|
const double *c = C[i - 1];
|
||||||
|
double z2 = z * z, z4 = z2 * z2;
|
||||||
|
/* the degree-10 coefficient is c[12] */
|
||||||
|
double c9 = fma (c[12], z, c[11]);
|
||||||
|
double c7 = fma (c[10], z, c[9]);
|
||||||
|
double c5 = fma (c[8], z, c[7]);
|
||||||
|
double c3h, c3l;
|
||||||
|
/* c3h, c3l <- c[5] + z*c[6] */
|
||||||
|
fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
|
||||||
|
c7 = fma (c9, z2, c7);
|
||||||
|
/* c3h, c3l <- c3h, c3l + c5*z2 */
|
||||||
|
fast_two_sum (&c3h, &tl, c3h, c5 * z2);
|
||||||
|
c3l += tl;
|
||||||
|
/* c3h, c3l <- c3h, c3l + c7*z4 */
|
||||||
|
fast_two_sum (&c3h, &tl, c3h, c7 * z4);
|
||||||
|
c3l += tl;
|
||||||
|
/* c2h, c2l <- c[4] + z*(c3h + c3l) */
|
||||||
|
double c2h, c2l;
|
||||||
|
a_mul (&th, &tl, z, c3h);
|
||||||
|
fast_two_sum (&c2h, &c2l, c[4], th);
|
||||||
|
c2l += fma (z, c3l, tl);
|
||||||
|
/* compute c[2] + c[3] + z*(c2h + c2l) */
|
||||||
|
a_mul (&th, &tl, z, c2h);
|
||||||
|
fast_two_sum (h, l, c[2], th);
|
||||||
|
*l += tl + fma (z, c2l, c[3]);
|
||||||
|
/* compute c[0] + c[1] + z*(h + l) */
|
||||||
|
a_mul (&th, &tl, z, *h);
|
||||||
|
tl = fma (z, *l, tl); /* tl += z*l */
|
||||||
|
fast_two_sum (h, l, c[0], th);
|
||||||
|
*l += tl + c[1];
|
||||||
|
return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1)
|
||||||
|
(larger values of i yield smaller error bounds) */
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
__cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions)
|
||||||
|
{
|
||||||
|
if (exceptions)
|
||||||
|
{
|
||||||
|
uint64_t i, j, k;
|
||||||
|
/* use dichotomy */
|
||||||
|
for (i = 0, j = EXCEPTIONS_TINY_LEN; i + 1 < j;)
|
||||||
|
{
|
||||||
|
k = (i + j) / 2;
|
||||||
|
if (EXCEPTIONS_TINY[k][0] <= z)
|
||||||
|
i = k;
|
||||||
|
else
|
||||||
|
j = k;
|
||||||
|
}
|
||||||
|
/* Either i=0 and z < exceptions[i+1][0],
|
||||||
|
or i=n-1 and exceptions[i][0] <= z
|
||||||
|
or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0].
|
||||||
|
In all cases z can only equal exceptions[i][0]. */
|
||||||
|
if (z == EXCEPTIONS_TINY[i][0])
|
||||||
|
{
|
||||||
|
*h = EXCEPTIONS_TINY[i][1];
|
||||||
|
*l = EXCEPTIONS_TINY[i][2];
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double z2 = z * z, th, tl;
|
||||||
|
*h = P[21 / 2 + 4]; /* degree 21 */
|
||||||
|
for (int a = 19; a > 11; a -= 2)
|
||||||
|
*h = fma (*h, z2, P[a / 2 + 4]); /* degree j */
|
||||||
|
*l = 0;
|
||||||
|
for (int a = 11; a > 7; a -= 2)
|
||||||
|
{
|
||||||
|
/* multiply h+l by z^2 */
|
||||||
|
a_mul (&th, &tl, *h, z);
|
||||||
|
tl = fma (*l, z, tl);
|
||||||
|
a_mul (h, l, th, z);
|
||||||
|
*l = fma (tl, z, *l);
|
||||||
|
/* add p[j] to h + l */
|
||||||
|
fast_two_sum (h, &tl, P[a / 2 + 4], *h);
|
||||||
|
*l += tl;
|
||||||
|
}
|
||||||
|
for (int a = 7; a >= 1; a -= 2)
|
||||||
|
{
|
||||||
|
/* multiply h+l by z^2 */
|
||||||
|
a_mul (&th, &tl, *h, z);
|
||||||
|
tl = fma (*l, z, tl);
|
||||||
|
a_mul (h, l, th, z);
|
||||||
|
*l = fma (tl, z, *l);
|
||||||
|
fast_two_sum (h, &tl, P[a - 1], *h);
|
||||||
|
*l += P[a] + tl;
|
||||||
|
}
|
||||||
|
/* multiply by z */
|
||||||
|
a_mul (h, &tl, *h, z);
|
||||||
|
*l = fma (*l, z, tl);
|
||||||
|
return;
|
||||||
|
}
|
||||||
80
sysdeps/ieee754/dbl-64/s_erf_common.h
Normal file
80
sysdeps/ieee754/dbl-64/s_erf_common.h
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
/* Common definitions for erf/erc implementation.
|
||||||
|
|
||||||
|
Copyright (c) 2023-2025 Paul Zimmermann
|
||||||
|
|
||||||
|
This file is part of the CORE-MATH project
|
||||||
|
(https://core-math.gitlabpages.inria.fr/).
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _S_ERF_COMMON_H
|
||||||
|
#define _S_ERF_COMMON_H
|
||||||
|
|
||||||
|
#include "s_erf_data.h"
|
||||||
|
|
||||||
|
/* Add a + b, such that *hi + *lo approximates a + b.
|
||||||
|
Assumes |a| >= |b|.
|
||||||
|
For rounding to nearest we have hi + lo = a + b exactly.
|
||||||
|
For directed rounding, we have
|
||||||
|
(a) hi + lo = a + b exactly when the exponent difference between a and b
|
||||||
|
is at most 53 (the binary64 precision)
|
||||||
|
(b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|)
|
||||||
|
(see https://hal.inria.fr/hal-03798376)
|
||||||
|
We also have |lo| < ulp(hi). */
|
||||||
|
static inline void
|
||||||
|
fast_two_sum (double *hi, double *lo, double a, double b)
|
||||||
|
{
|
||||||
|
double e;
|
||||||
|
|
||||||
|
*hi = a + b;
|
||||||
|
e = *hi - a; /* exact */
|
||||||
|
*lo = b - e; /* exact */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reference: https://hal.science/hal-01351529v3/document */
|
||||||
|
static inline void
|
||||||
|
two_sum (double *hi, double *lo, double a, double b)
|
||||||
|
{
|
||||||
|
*hi = a + b;
|
||||||
|
double aa = *hi - b;
|
||||||
|
double bb = *hi - aa;
|
||||||
|
double da = a - aa;
|
||||||
|
double db = b - bb;
|
||||||
|
*lo = da + db;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Multiply exactly a and b, such that *hi + *lo = a * b.
|
||||||
|
static inline void
|
||||||
|
a_mul (double *hi, double *lo, double a, double b)
|
||||||
|
{
|
||||||
|
*hi = a * b;
|
||||||
|
*lo = fma (a, b, -*hi);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
|
||||||
|
of erf(z). Return err the maximal relative error:
|
||||||
|
|(h + l)/erf(z) - 1| < err*|h+l| */
|
||||||
|
double __cr_erf_fast (double *h, double *l, double z) attribute_hidden;
|
||||||
|
|
||||||
|
/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
|
||||||
|
void __cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions)
|
||||||
|
attribute_hidden;
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -48,129 +48,9 @@ SOFTWARE.
|
|||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <libm-alias-double.h>
|
#include <libm-alias-double.h>
|
||||||
#include "math_config.h"
|
#include "math_config.h"
|
||||||
#include "s_erf_data.h"
|
#include "s_erf_common.h"
|
||||||
#include "s_erfc_data.h"
|
#include "s_erfc_data.h"
|
||||||
|
|
||||||
static inline void
|
|
||||||
fast_two_sum (double *hi, double *lo, double a, double b)
|
|
||||||
{
|
|
||||||
double e;
|
|
||||||
|
|
||||||
*hi = a + b;
|
|
||||||
e = *hi - a; /* exact */
|
|
||||||
*lo = b - e; /* exact */
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
two_sum (double *hi, double *lo, double a, double b)
|
|
||||||
{
|
|
||||||
*hi = a + b;
|
|
||||||
double aa = *hi - b;
|
|
||||||
double bb = *hi - aa;
|
|
||||||
double da = a - aa;
|
|
||||||
double db = b - bb;
|
|
||||||
*lo = da + db;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
|
||||||
a_mul (double *hi, double *lo, double a, double b)
|
|
||||||
{
|
|
||||||
*hi = a * b;
|
|
||||||
*lo = fma (a, b, -*hi);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
|
|
||||||
of erf(z). Return err the maximal relative error:
|
|
||||||
|(h + l)/erf(z) - 1| < err*|h+l| */
|
|
||||||
static double
|
|
||||||
cr_erf_fast (double *h, double *l, double z)
|
|
||||||
{
|
|
||||||
double th, tl;
|
|
||||||
if (z < 0.0625) /* z < 1/16 */
|
|
||||||
{
|
|
||||||
double z2h, z2l, z4;
|
|
||||||
a_mul (&z2h, &z2l, z, z);
|
|
||||||
z4 = z2h * z2h;
|
|
||||||
double c9 = fma (C0[7], z2h, C0[6]);
|
|
||||||
double c5 = fma (C0[5], z2h, C0[4]);
|
|
||||||
c5 = fma (c9, z4, c5);
|
|
||||||
a_mul (&th, &tl, z2h, c5);
|
|
||||||
fast_two_sum (h, l, C0[2], th);
|
|
||||||
*l += tl + C0[3];
|
|
||||||
double h_copy = *h;
|
|
||||||
a_mul (&th, &tl, z2h, *h);
|
|
||||||
tl += fma (z2h, *l, C0[1]);
|
|
||||||
fast_two_sum (h, l, C0[0], th);
|
|
||||||
*l += fma (z2l, h_copy, tl);
|
|
||||||
a_mul (h, &tl, *h, z);
|
|
||||||
*l = fma (*l, z, tl);
|
|
||||||
return 0x1.78p-69;
|
|
||||||
}
|
|
||||||
double v = floor (16.0 * z);
|
|
||||||
uint32_t i = 16.0 * z;
|
|
||||||
z = (z - 0.03125) - 0.0625 * v;
|
|
||||||
const double *c = C[i - 1];
|
|
||||||
double z2 = z * z, z4 = z2 * z2;
|
|
||||||
double c9 = fma (c[12], z, c[11]);
|
|
||||||
double c7 = fma (c[10], z, c[9]);
|
|
||||||
double c5 = fma (c[8], z, c[7]);
|
|
||||||
double c3h, c3l;
|
|
||||||
fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
|
|
||||||
c7 = fma (c9, z2, c7);
|
|
||||||
fast_two_sum (&c3h, &tl, c3h, c5 * z2);
|
|
||||||
c3l += tl;
|
|
||||||
fast_two_sum (&c3h, &tl, c3h, c7 * z4);
|
|
||||||
c3l += tl;
|
|
||||||
double c2h, c2l;
|
|
||||||
a_mul (&th, &tl, z, c3h);
|
|
||||||
fast_two_sum (&c2h, &c2l, c[4], th);
|
|
||||||
c2l += fma (z, c3l, tl);
|
|
||||||
a_mul (&th, &tl, z, c2h);
|
|
||||||
fast_two_sum (h, l, c[2], th);
|
|
||||||
*l += tl + fma (z, c2l, c[3]);
|
|
||||||
a_mul (&th, &tl, z, *h);
|
|
||||||
tl = fma (z, *l, tl); /* tl += z*l */
|
|
||||||
fast_two_sum (h, l, c[0], th);
|
|
||||||
*l += tl + c[1];
|
|
||||||
return 0x1.11p-69;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
|
|
||||||
static void
|
|
||||||
cr_erf_accurate_tiny (double *h, double *l, double z)
|
|
||||||
{
|
|
||||||
double z2 = z * z, th, tl;
|
|
||||||
*h = P[21 / 2 + 4]; /* degree 21 */
|
|
||||||
for (int j = 19; j > 11; j -= 2)
|
|
||||||
*h = fma (*h, z2, P[j / 2 + 4]); /* degree j */
|
|
||||||
*l = 0;
|
|
||||||
for (int j = 11; j > 7; j -= 2)
|
|
||||||
{
|
|
||||||
/* multiply h+l by z^2 */
|
|
||||||
a_mul (&th, &tl, *h, z);
|
|
||||||
tl = fma (*l, z, tl);
|
|
||||||
a_mul (h, l, th, z);
|
|
||||||
*l = fma (tl, z, *l);
|
|
||||||
/* add P[j] to h + l */
|
|
||||||
fast_two_sum (h, &tl, P[j / 2 + 4], *h);
|
|
||||||
*l += tl;
|
|
||||||
}
|
|
||||||
for (int j = 7; j >= 1; j -= 2)
|
|
||||||
{
|
|
||||||
/* multiply h+l by z^2 */
|
|
||||||
a_mul (&th, &tl, *h, z);
|
|
||||||
tl = fma (*l, z, tl);
|
|
||||||
a_mul (h, l, th, z);
|
|
||||||
*l = fma (tl, z, *l);
|
|
||||||
fast_two_sum (h, &tl, P[j - 1], *h);
|
|
||||||
*l += P[j] + tl;
|
|
||||||
}
|
|
||||||
/* multiply by z */
|
|
||||||
a_mul (h, &tl, *h, z);
|
|
||||||
*l = fma (*l, z, tl);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
|
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
|
||||||
approximation of erf(z).
|
approximation of erf(z).
|
||||||
Assumes z >= 2^-61, thus no underflow can occur. */
|
Assumes z >= 2^-61, thus no underflow can occur. */
|
||||||
@@ -179,7 +59,7 @@ cr_erf_accurate (double *h, double *l, double z)
|
|||||||
{
|
{
|
||||||
double th, tl;
|
double th, tl;
|
||||||
if (z < 0.125) /* z < 1/8 */
|
if (z < 0.125) /* z < 1/8 */
|
||||||
return cr_erf_accurate_tiny (h, l, z);
|
return __cr_erf_accurate_tiny (h, l, z, false);
|
||||||
double v = floor (8.0 * z);
|
double v = floor (8.0 * z);
|
||||||
uint32_t i = 8.0 * z;
|
uint32_t i = 8.0 * z;
|
||||||
z = (z - 0.0625) - 0.125 * v;
|
z = (z - 0.0625) - 0.125 * v;
|
||||||
@@ -507,7 +387,7 @@ cr_erfc_fast (double *h, double *l, double x)
|
|||||||
throughput is about 44 cycles */
|
throughput is about 44 cycles */
|
||||||
if (x < 0) // erfc(x) = 1 - erf(x) = 1 + erf(-x)
|
if (x < 0) // erfc(x) = 1 - erf(x) = 1 + erf(-x)
|
||||||
{
|
{
|
||||||
double err = cr_erf_fast (h, l, -x);
|
double err = __cr_erf_fast (h, l, -x);
|
||||||
/* h+l approximates erf(-x), with relative error bounded by err,
|
/* h+l approximates erf(-x), with relative error bounded by err,
|
||||||
where err <= 0x1.78p-69 */
|
where err <= 0x1.78p-69 */
|
||||||
err = err * *h; /* convert into absolute error */
|
err = err * *h; /* convert into absolute error */
|
||||||
@@ -533,7 +413,7 @@ cr_erfc_fast (double *h, double *l, double x)
|
|||||||
the average reciprocal throughput is about 59 cycles */
|
the average reciprocal throughput is about 59 cycles */
|
||||||
else if (x <= THRESHOLD1)
|
else if (x <= THRESHOLD1)
|
||||||
{
|
{
|
||||||
double err = cr_erf_fast (h, l, x);
|
double err = __cr_erf_fast (h, l, x);
|
||||||
/* h+l approximates erf(x), with relative error bounded by err,
|
/* h+l approximates erf(x), with relative error bounded by err,
|
||||||
where err <= 0x1.78p-69 */
|
where err <= 0x1.78p-69 */
|
||||||
err = err * *h; /* convert into absolute error */
|
err = err * *h; /* convert into absolute error */
|
||||||
|
|||||||
Reference in New Issue
Block a user