mirror of
https://sourceware.org/git/glibc.git
synced 2025-10-31 22:10:34 +03:00
math: Consolidate erf/erfc definitions
The common code definitions are consolidated in s_erf_common.h and s_erf_common.c. Checked on x86_64-linux-gnu, aarch64-linux-gnu, and powerpc64le-linux-gnu. Reviewed-by: DJ Delorie <dj@redhat.com>
This commit is contained in:
@@ -363,6 +363,7 @@ type-double-routines := \
|
||||
math_err \
|
||||
s_asincosh_data \
|
||||
s_atanh_data \
|
||||
s_erf_common \
|
||||
s_erf_data \
|
||||
s_erfc_data \
|
||||
sincostab \
|
||||
|
||||
@@ -30,191 +30,12 @@ SOFTWARE.
|
||||
#include <errno.h>
|
||||
#include <libm-alias-double.h>
|
||||
#include "math_config.h"
|
||||
#include "s_erf_data.h"
|
||||
#include "s_erf_common.h"
|
||||
|
||||
/* CH+CL is a double-double approximation of 2/sqrt(pi) to nearest */
|
||||
static const double CH = 0x1.20dd750429b6dp+0;
|
||||
static const double CL = 0x1.1ae3a914fed8p-56;
|
||||
|
||||
/* Add a + b, such that *hi + *lo approximates a + b.
|
||||
Assumes |a| >= |b|.
|
||||
For rounding to nearest we have hi + lo = a + b exactly.
|
||||
For directed rounding, we have
|
||||
(a) hi + lo = a + b exactly when the exponent difference between a and b
|
||||
is at most 53 (the binary64 precision)
|
||||
(b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|)
|
||||
(see https://hal.inria.fr/hal-03798376)
|
||||
We also have |lo| < ulp(hi). */
|
||||
static inline void
|
||||
fast_two_sum (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
double e;
|
||||
|
||||
*hi = a + b;
|
||||
e = *hi - a; /* exact */
|
||||
*lo = b - e; /* exact */
|
||||
}
|
||||
|
||||
/* Reference: https://hal.science/hal-01351529v3/document */
|
||||
static inline void
|
||||
two_sum (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
*hi = a + b;
|
||||
double aa = *hi - b;
|
||||
double bb = *hi - aa;
|
||||
double da = a - aa;
|
||||
double db = b - bb;
|
||||
*lo = da + db;
|
||||
}
|
||||
|
||||
// Multiply exactly a and b, such that *hi + *lo = a * b.
|
||||
static inline void
|
||||
a_mul (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
*hi = a * b;
|
||||
*lo = fma (a, b, -*hi);
|
||||
}
|
||||
|
||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
|
||||
of erf(z). Return err the maximal relative error:
|
||||
|(h + l)/erf(z) - 1| < err*|h+l| */
|
||||
static double
|
||||
cr_erf_fast (double *h, double *l, double z)
|
||||
{
|
||||
double th, tl;
|
||||
/* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16,
|
||||
and for each interval, we use a minimax polynomial:
|
||||
* for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero,
|
||||
since if we evaluate in the middle 1/32, we will get bad accuracy
|
||||
for tiny z, and moreover z-1/32 might not be exact
|
||||
* for 1 <= i <= 94, we use a polynomial evaluated in the middle of
|
||||
the interval, namely i/16+1/32
|
||||
*/
|
||||
if (z < 0.0625) /* z < 1/16 */
|
||||
{
|
||||
double z2h, z2l, z4;
|
||||
a_mul (&z2h, &z2l, z, z);
|
||||
z4 = z2h * z2h;
|
||||
double c9 = fma (C0[7], z2h, C0[6]);
|
||||
double c5 = fma (C0[5], z2h, C0[4]);
|
||||
c5 = fma (c9, z4, c5);
|
||||
/* compute C0[2] + C0[3] + z2h*c5 */
|
||||
a_mul (&th, &tl, z2h, c5);
|
||||
fast_two_sum (h, l, C0[2], th);
|
||||
*l += tl + C0[3];
|
||||
/* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */
|
||||
double h_copy = *h;
|
||||
a_mul (&th, &tl, z2h, *h);
|
||||
tl += fma (z2h, *l, C0[1]);
|
||||
fast_two_sum (h, l, C0[0], th);
|
||||
*l += fma (z2l, h_copy, tl);
|
||||
/* multiply (h,l) by z */
|
||||
a_mul (h, &tl, *h, z);
|
||||
*l = fma (*l, z, tl);
|
||||
return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */
|
||||
}
|
||||
double v = floor (16.0 * z);
|
||||
uint32_t i = 16.0 * z;
|
||||
/* i/16 <= z < (i+1)/16 */
|
||||
/* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact:
|
||||
(1) either z - 0.03125 is in the same binade as z, then 0.03125 is
|
||||
an integer multiple of ulp(z), so is z - 0.03125
|
||||
(2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are
|
||||
integer multiple of the ulp() of that smaller binade.
|
||||
Also, subtracting 0.0625 * v is exact. */
|
||||
z = (z - 0.03125) - 0.0625 * v;
|
||||
/* now |z| <= 1/32 */
|
||||
const double *c = C[i - 1];
|
||||
double z2 = z * z, z4 = z2 * z2;
|
||||
/* the degree-10 coefficient is c[12] */
|
||||
double c9 = fma (c[12], z, c[11]);
|
||||
double c7 = fma (c[10], z, c[9]);
|
||||
double c5 = fma (c[8], z, c[7]);
|
||||
double c3h, c3l;
|
||||
/* c3h, c3l <- c[5] + z*c[6] */
|
||||
fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
|
||||
c7 = fma (c9, z2, c7);
|
||||
/* c3h, c3l <- c3h, c3l + c5*z2 */
|
||||
fast_two_sum (&c3h, &tl, c3h, c5 * z2);
|
||||
c3l += tl;
|
||||
/* c3h, c3l <- c3h, c3l + c7*z4 */
|
||||
fast_two_sum (&c3h, &tl, c3h, c7 * z4);
|
||||
c3l += tl;
|
||||
/* c2h, c2l <- c[4] + z*(c3h + c3l) */
|
||||
double c2h, c2l;
|
||||
a_mul (&th, &tl, z, c3h);
|
||||
fast_two_sum (&c2h, &c2l, c[4], th);
|
||||
c2l += fma (z, c3l, tl);
|
||||
/* compute c[2] + c[3] + z*(c2h + c2l) */
|
||||
a_mul (&th, &tl, z, c2h);
|
||||
fast_two_sum (h, l, c[2], th);
|
||||
*l += tl + fma (z, c2l, c[3]);
|
||||
/* compute c[0] + c[1] + z*(h + l) */
|
||||
a_mul (&th, &tl, z, *h);
|
||||
tl = fma (z, *l, tl); /* tl += z*l */
|
||||
fast_two_sum (h, l, c[0], th);
|
||||
*l += tl + c[1];
|
||||
return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1)
|
||||
(larger values of i yield smaller error bounds) */
|
||||
}
|
||||
|
||||
/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
|
||||
static void
|
||||
cr_erf_accurate_tiny (double *h, double *l, double z)
|
||||
{
|
||||
uint64_t i, j, k;
|
||||
/* use dichotomy */
|
||||
for (i = 0, j = EXCEPTIONS_LEN; i + 1 < j;)
|
||||
{
|
||||
k = (i + j) / 2;
|
||||
if (EXCEPTIONS_TINY[k][0] <= z)
|
||||
i = k;
|
||||
else
|
||||
j = k;
|
||||
}
|
||||
/* Either i=0 and z < exceptions[i+1][0],
|
||||
or i=n-1 and exceptions[i][0] <= z
|
||||
or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0].
|
||||
In all cases z can only equal exceptions[i][0]. */
|
||||
if (z == EXCEPTIONS_TINY[i][0])
|
||||
{
|
||||
*h = EXCEPTIONS_TINY[i][1];
|
||||
*l = EXCEPTIONS_TINY[i][2];
|
||||
return;
|
||||
}
|
||||
|
||||
double z2 = z * z, th, tl;
|
||||
*h = P[21 / 2 + 4]; /* degree 21 */
|
||||
for (int a = 19; a > 11; a -= 2)
|
||||
*h = fma (*h, z2, P[a / 2 + 4]); /* degree j */
|
||||
*l = 0;
|
||||
for (int a = 11; a > 7; a -= 2)
|
||||
{
|
||||
/* multiply h+l by z^2 */
|
||||
a_mul (&th, &tl, *h, z);
|
||||
tl = fma (*l, z, tl);
|
||||
a_mul (h, l, th, z);
|
||||
*l = fma (tl, z, *l);
|
||||
/* add p[j] to h + l */
|
||||
fast_two_sum (h, &tl, P[a / 2 + 4], *h);
|
||||
*l += tl;
|
||||
}
|
||||
for (int a = 7; a >= 1; a -= 2)
|
||||
{
|
||||
/* multiply h+l by z^2 */
|
||||
a_mul (&th, &tl, *h, z);
|
||||
tl = fma (*l, z, tl);
|
||||
a_mul (h, l, th, z);
|
||||
*l = fma (tl, z, *l);
|
||||
fast_two_sum (h, &tl, P[a - 1], *h);
|
||||
*l += P[a] + tl;
|
||||
}
|
||||
/* multiply by z */
|
||||
a_mul (h, &tl, *h, z);
|
||||
*l = fma (*l, z, tl);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
|
||||
approximation of erf(z).
|
||||
Assumes z >= 2^-61, thus no underflow can occur. */
|
||||
@@ -238,10 +59,7 @@ cr_erf_accurate (double *h, double *l, double z)
|
||||
the interval, namely i/8+1/16
|
||||
*/
|
||||
if (z < 0.125) /* z < 1/8 */
|
||||
{
|
||||
cr_erf_accurate_tiny (h, l, z);
|
||||
return;
|
||||
}
|
||||
return __cr_erf_accurate_tiny (h, l, z, true);
|
||||
double v = floor (8.0 * z);
|
||||
uint32_t i = 8.0 * z;
|
||||
z = (z - 0.0625) - 0.125 * v;
|
||||
@@ -313,7 +131,7 @@ __erf (double x)
|
||||
return res;
|
||||
}
|
||||
/* now z >= 2^-61 */
|
||||
err = cr_erf_fast (&h, &l, z);
|
||||
err = __cr_erf_fast (&h, &l, z);
|
||||
uint64_t u = asuint64 (h), v = asuint64 (l);
|
||||
t = asuint64 (x);
|
||||
u ^= t & SIGN_MASK;
|
||||
|
||||
170
sysdeps/ieee754/dbl-64/s_erf_common.c
Normal file
170
sysdeps/ieee754/dbl-64/s_erf_common.c
Normal file
@@ -0,0 +1,170 @@
|
||||
/* Common definitions for erf/erc implementation.
|
||||
|
||||
Copyright (c) 2023-2025 Paul Zimmermann
|
||||
|
||||
The original version of this file was copied from the CORE-MATH
|
||||
project (file src/binary64/erf/erf.c, revision 384ed01d).
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include "s_erf_common.h"
|
||||
|
||||
double
|
||||
__cr_erf_fast (double *h, double *l, double z)
|
||||
{
|
||||
double th, tl;
|
||||
/* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16,
|
||||
and for each interval, we use a minimax polynomial:
|
||||
* for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero,
|
||||
since if we evaluate in the middle 1/32, we will get bad accuracy
|
||||
for tiny z, and moreover z-1/32 might not be exact
|
||||
* for 1 <= i <= 94, we use a polynomial evaluated in the middle of
|
||||
the interval, namely i/16+1/32
|
||||
*/
|
||||
if (z < 0.0625) /* z < 1/16 */
|
||||
{
|
||||
double z2h, z2l, z4;
|
||||
a_mul (&z2h, &z2l, z, z);
|
||||
z4 = z2h * z2h;
|
||||
double c9 = fma (C0[7], z2h, C0[6]);
|
||||
double c5 = fma (C0[5], z2h, C0[4]);
|
||||
c5 = fma (c9, z4, c5);
|
||||
/* compute C0[2] + C0[3] + z2h*c5 */
|
||||
a_mul (&th, &tl, z2h, c5);
|
||||
fast_two_sum (h, l, C0[2], th);
|
||||
*l += tl + C0[3];
|
||||
/* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */
|
||||
double h_copy = *h;
|
||||
a_mul (&th, &tl, z2h, *h);
|
||||
tl += fma (z2h, *l, C0[1]);
|
||||
fast_two_sum (h, l, C0[0], th);
|
||||
*l += fma (z2l, h_copy, tl);
|
||||
/* multiply (h,l) by z */
|
||||
a_mul (h, &tl, *h, z);
|
||||
*l = fma (*l, z, tl);
|
||||
return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */
|
||||
}
|
||||
double v = floor (16.0 * z);
|
||||
uint32_t i = 16.0 * z;
|
||||
/* i/16 <= z < (i+1)/16 */
|
||||
/* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact:
|
||||
(1) either z - 0.03125 is in the same binade as z, then 0.03125 is
|
||||
an integer multiple of ulp(z), so is z - 0.03125
|
||||
(2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are
|
||||
integer multiple of the ulp() of that smaller binade.
|
||||
Also, subtracting 0.0625 * v is exact. */
|
||||
z = (z - 0.03125) - 0.0625 * v;
|
||||
/* now |z| <= 1/32 */
|
||||
const double *c = C[i - 1];
|
||||
double z2 = z * z, z4 = z2 * z2;
|
||||
/* the degree-10 coefficient is c[12] */
|
||||
double c9 = fma (c[12], z, c[11]);
|
||||
double c7 = fma (c[10], z, c[9]);
|
||||
double c5 = fma (c[8], z, c[7]);
|
||||
double c3h, c3l;
|
||||
/* c3h, c3l <- c[5] + z*c[6] */
|
||||
fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
|
||||
c7 = fma (c9, z2, c7);
|
||||
/* c3h, c3l <- c3h, c3l + c5*z2 */
|
||||
fast_two_sum (&c3h, &tl, c3h, c5 * z2);
|
||||
c3l += tl;
|
||||
/* c3h, c3l <- c3h, c3l + c7*z4 */
|
||||
fast_two_sum (&c3h, &tl, c3h, c7 * z4);
|
||||
c3l += tl;
|
||||
/* c2h, c2l <- c[4] + z*(c3h + c3l) */
|
||||
double c2h, c2l;
|
||||
a_mul (&th, &tl, z, c3h);
|
||||
fast_two_sum (&c2h, &c2l, c[4], th);
|
||||
c2l += fma (z, c3l, tl);
|
||||
/* compute c[2] + c[3] + z*(c2h + c2l) */
|
||||
a_mul (&th, &tl, z, c2h);
|
||||
fast_two_sum (h, l, c[2], th);
|
||||
*l += tl + fma (z, c2l, c[3]);
|
||||
/* compute c[0] + c[1] + z*(h + l) */
|
||||
a_mul (&th, &tl, z, *h);
|
||||
tl = fma (z, *l, tl); /* tl += z*l */
|
||||
fast_two_sum (h, l, c[0], th);
|
||||
*l += tl + c[1];
|
||||
return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1)
|
||||
(larger values of i yield smaller error bounds) */
|
||||
}
|
||||
|
||||
void
|
||||
__cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions)
|
||||
{
|
||||
if (exceptions)
|
||||
{
|
||||
uint64_t i, j, k;
|
||||
/* use dichotomy */
|
||||
for (i = 0, j = EXCEPTIONS_TINY_LEN; i + 1 < j;)
|
||||
{
|
||||
k = (i + j) / 2;
|
||||
if (EXCEPTIONS_TINY[k][0] <= z)
|
||||
i = k;
|
||||
else
|
||||
j = k;
|
||||
}
|
||||
/* Either i=0 and z < exceptions[i+1][0],
|
||||
or i=n-1 and exceptions[i][0] <= z
|
||||
or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0].
|
||||
In all cases z can only equal exceptions[i][0]. */
|
||||
if (z == EXCEPTIONS_TINY[i][0])
|
||||
{
|
||||
*h = EXCEPTIONS_TINY[i][1];
|
||||
*l = EXCEPTIONS_TINY[i][2];
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
double z2 = z * z, th, tl;
|
||||
*h = P[21 / 2 + 4]; /* degree 21 */
|
||||
for (int a = 19; a > 11; a -= 2)
|
||||
*h = fma (*h, z2, P[a / 2 + 4]); /* degree j */
|
||||
*l = 0;
|
||||
for (int a = 11; a > 7; a -= 2)
|
||||
{
|
||||
/* multiply h+l by z^2 */
|
||||
a_mul (&th, &tl, *h, z);
|
||||
tl = fma (*l, z, tl);
|
||||
a_mul (h, l, th, z);
|
||||
*l = fma (tl, z, *l);
|
||||
/* add p[j] to h + l */
|
||||
fast_two_sum (h, &tl, P[a / 2 + 4], *h);
|
||||
*l += tl;
|
||||
}
|
||||
for (int a = 7; a >= 1; a -= 2)
|
||||
{
|
||||
/* multiply h+l by z^2 */
|
||||
a_mul (&th, &tl, *h, z);
|
||||
tl = fma (*l, z, tl);
|
||||
a_mul (h, l, th, z);
|
||||
*l = fma (tl, z, *l);
|
||||
fast_two_sum (h, &tl, P[a - 1], *h);
|
||||
*l += P[a] + tl;
|
||||
}
|
||||
/* multiply by z */
|
||||
a_mul (h, &tl, *h, z);
|
||||
*l = fma (*l, z, tl);
|
||||
return;
|
||||
}
|
||||
80
sysdeps/ieee754/dbl-64/s_erf_common.h
Normal file
80
sysdeps/ieee754/dbl-64/s_erf_common.h
Normal file
@@ -0,0 +1,80 @@
|
||||
/* Common definitions for erf/erc implementation.
|
||||
|
||||
Copyright (c) 2023-2025 Paul Zimmermann
|
||||
|
||||
This file is part of the CORE-MATH project
|
||||
(https://core-math.gitlabpages.inria.fr/).
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef _S_ERF_COMMON_H
|
||||
#define _S_ERF_COMMON_H
|
||||
|
||||
#include "s_erf_data.h"
|
||||
|
||||
/* Add a + b, such that *hi + *lo approximates a + b.
|
||||
Assumes |a| >= |b|.
|
||||
For rounding to nearest we have hi + lo = a + b exactly.
|
||||
For directed rounding, we have
|
||||
(a) hi + lo = a + b exactly when the exponent difference between a and b
|
||||
is at most 53 (the binary64 precision)
|
||||
(b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|)
|
||||
(see https://hal.inria.fr/hal-03798376)
|
||||
We also have |lo| < ulp(hi). */
|
||||
static inline void
|
||||
fast_two_sum (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
double e;
|
||||
|
||||
*hi = a + b;
|
||||
e = *hi - a; /* exact */
|
||||
*lo = b - e; /* exact */
|
||||
}
|
||||
|
||||
/* Reference: https://hal.science/hal-01351529v3/document */
|
||||
static inline void
|
||||
two_sum (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
*hi = a + b;
|
||||
double aa = *hi - b;
|
||||
double bb = *hi - aa;
|
||||
double da = a - aa;
|
||||
double db = b - bb;
|
||||
*lo = da + db;
|
||||
}
|
||||
|
||||
// Multiply exactly a and b, such that *hi + *lo = a * b.
|
||||
static inline void
|
||||
a_mul (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
*hi = a * b;
|
||||
*lo = fma (a, b, -*hi);
|
||||
}
|
||||
|
||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
|
||||
of erf(z). Return err the maximal relative error:
|
||||
|(h + l)/erf(z) - 1| < err*|h+l| */
|
||||
double __cr_erf_fast (double *h, double *l, double z) attribute_hidden;
|
||||
|
||||
/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
|
||||
void __cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions)
|
||||
attribute_hidden;
|
||||
|
||||
#endif
|
||||
@@ -48,129 +48,9 @@ SOFTWARE.
|
||||
#include <errno.h>
|
||||
#include <libm-alias-double.h>
|
||||
#include "math_config.h"
|
||||
#include "s_erf_data.h"
|
||||
#include "s_erf_common.h"
|
||||
#include "s_erfc_data.h"
|
||||
|
||||
static inline void
|
||||
fast_two_sum (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
double e;
|
||||
|
||||
*hi = a + b;
|
||||
e = *hi - a; /* exact */
|
||||
*lo = b - e; /* exact */
|
||||
}
|
||||
|
||||
static inline void
|
||||
two_sum (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
*hi = a + b;
|
||||
double aa = *hi - b;
|
||||
double bb = *hi - aa;
|
||||
double da = a - aa;
|
||||
double db = b - bb;
|
||||
*lo = da + db;
|
||||
}
|
||||
|
||||
static inline void
|
||||
a_mul (double *hi, double *lo, double a, double b)
|
||||
{
|
||||
*hi = a * b;
|
||||
*lo = fma (a, b, -*hi);
|
||||
}
|
||||
|
||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
|
||||
of erf(z). Return err the maximal relative error:
|
||||
|(h + l)/erf(z) - 1| < err*|h+l| */
|
||||
static double
|
||||
cr_erf_fast (double *h, double *l, double z)
|
||||
{
|
||||
double th, tl;
|
||||
if (z < 0.0625) /* z < 1/16 */
|
||||
{
|
||||
double z2h, z2l, z4;
|
||||
a_mul (&z2h, &z2l, z, z);
|
||||
z4 = z2h * z2h;
|
||||
double c9 = fma (C0[7], z2h, C0[6]);
|
||||
double c5 = fma (C0[5], z2h, C0[4]);
|
||||
c5 = fma (c9, z4, c5);
|
||||
a_mul (&th, &tl, z2h, c5);
|
||||
fast_two_sum (h, l, C0[2], th);
|
||||
*l += tl + C0[3];
|
||||
double h_copy = *h;
|
||||
a_mul (&th, &tl, z2h, *h);
|
||||
tl += fma (z2h, *l, C0[1]);
|
||||
fast_two_sum (h, l, C0[0], th);
|
||||
*l += fma (z2l, h_copy, tl);
|
||||
a_mul (h, &tl, *h, z);
|
||||
*l = fma (*l, z, tl);
|
||||
return 0x1.78p-69;
|
||||
}
|
||||
double v = floor (16.0 * z);
|
||||
uint32_t i = 16.0 * z;
|
||||
z = (z - 0.03125) - 0.0625 * v;
|
||||
const double *c = C[i - 1];
|
||||
double z2 = z * z, z4 = z2 * z2;
|
||||
double c9 = fma (c[12], z, c[11]);
|
||||
double c7 = fma (c[10], z, c[9]);
|
||||
double c5 = fma (c[8], z, c[7]);
|
||||
double c3h, c3l;
|
||||
fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
|
||||
c7 = fma (c9, z2, c7);
|
||||
fast_two_sum (&c3h, &tl, c3h, c5 * z2);
|
||||
c3l += tl;
|
||||
fast_two_sum (&c3h, &tl, c3h, c7 * z4);
|
||||
c3l += tl;
|
||||
double c2h, c2l;
|
||||
a_mul (&th, &tl, z, c3h);
|
||||
fast_two_sum (&c2h, &c2l, c[4], th);
|
||||
c2l += fma (z, c3l, tl);
|
||||
a_mul (&th, &tl, z, c2h);
|
||||
fast_two_sum (h, l, c[2], th);
|
||||
*l += tl + fma (z, c2l, c[3]);
|
||||
a_mul (&th, &tl, z, *h);
|
||||
tl = fma (z, *l, tl); /* tl += z*l */
|
||||
fast_two_sum (h, l, c[0], th);
|
||||
*l += tl + c[1];
|
||||
return 0x1.11p-69;
|
||||
}
|
||||
|
||||
/* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
|
||||
static void
|
||||
cr_erf_accurate_tiny (double *h, double *l, double z)
|
||||
{
|
||||
double z2 = z * z, th, tl;
|
||||
*h = P[21 / 2 + 4]; /* degree 21 */
|
||||
for (int j = 19; j > 11; j -= 2)
|
||||
*h = fma (*h, z2, P[j / 2 + 4]); /* degree j */
|
||||
*l = 0;
|
||||
for (int j = 11; j > 7; j -= 2)
|
||||
{
|
||||
/* multiply h+l by z^2 */
|
||||
a_mul (&th, &tl, *h, z);
|
||||
tl = fma (*l, z, tl);
|
||||
a_mul (h, l, th, z);
|
||||
*l = fma (tl, z, *l);
|
||||
/* add P[j] to h + l */
|
||||
fast_two_sum (h, &tl, P[j / 2 + 4], *h);
|
||||
*l += tl;
|
||||
}
|
||||
for (int j = 7; j >= 1; j -= 2)
|
||||
{
|
||||
/* multiply h+l by z^2 */
|
||||
a_mul (&th, &tl, *h, z);
|
||||
tl = fma (*l, z, tl);
|
||||
a_mul (h, l, th, z);
|
||||
*l = fma (tl, z, *l);
|
||||
fast_two_sum (h, &tl, P[j - 1], *h);
|
||||
*l += P[j] + tl;
|
||||
}
|
||||
/* multiply by z */
|
||||
a_mul (h, &tl, *h, z);
|
||||
*l = fma (*l, z, tl);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
|
||||
approximation of erf(z).
|
||||
Assumes z >= 2^-61, thus no underflow can occur. */
|
||||
@@ -179,7 +59,7 @@ cr_erf_accurate (double *h, double *l, double z)
|
||||
{
|
||||
double th, tl;
|
||||
if (z < 0.125) /* z < 1/8 */
|
||||
return cr_erf_accurate_tiny (h, l, z);
|
||||
return __cr_erf_accurate_tiny (h, l, z, false);
|
||||
double v = floor (8.0 * z);
|
||||
uint32_t i = 8.0 * z;
|
||||
z = (z - 0.0625) - 0.125 * v;
|
||||
@@ -507,7 +387,7 @@ cr_erfc_fast (double *h, double *l, double x)
|
||||
throughput is about 44 cycles */
|
||||
if (x < 0) // erfc(x) = 1 - erf(x) = 1 + erf(-x)
|
||||
{
|
||||
double err = cr_erf_fast (h, l, -x);
|
||||
double err = __cr_erf_fast (h, l, -x);
|
||||
/* h+l approximates erf(-x), with relative error bounded by err,
|
||||
where err <= 0x1.78p-69 */
|
||||
err = err * *h; /* convert into absolute error */
|
||||
@@ -533,7 +413,7 @@ cr_erfc_fast (double *h, double *l, double x)
|
||||
the average reciprocal throughput is about 59 cycles */
|
||||
else if (x <= THRESHOLD1)
|
||||
{
|
||||
double err = cr_erf_fast (h, l, x);
|
||||
double err = __cr_erf_fast (h, l, x);
|
||||
/* h+l approximates erf(x), with relative error bounded by err,
|
||||
where err <= 0x1.78p-69 */
|
||||
err = err * *h; /* convert into absolute error */
|
||||
|
||||
Reference in New Issue
Block a user