math: Consolidate erf/erfc definitions

The common code definitions are consolidated in s_erf_common.h and s_erf_common.c. Checked on x86_64-linux-gnu, aarch64-linux-gnu, and powerpc64le-linux-gnu. Reviewed-by: DJ Delorie <dj@redhat.com>
2025-11-02 09:33:31 +03:00 · 2025-10-10 15:15:32 -03:00
parent fc419290f9
commit e4d812c980
5 changed files with 258 additions and 309 deletions
--- a/math/Makefile
+++ b/math/Makefile
@@ -363,6 +363,7 @@ type-double-routines := \
  math_err \
  s_asincosh_data \
  s_atanh_data \
  s_erf_common \
  s_erf_data \
  s_erfc_data \
  sincostab \
--- a/sysdeps/ieee754/dbl-64/s_erf.c
+++ b/sysdeps/ieee754/dbl-64/s_erf.c
@@ -30,191 +30,12 @@ SOFTWARE.
 #include <errno.h>
 #include <libm-alias-double.h>
 #include "math_config.h"
-#include "s_erf_data.h"
+#include "s_erf_common.h"
 /* CH+CL is a double-double approximation of 2/sqrt(pi) to nearest */
 static const double CH = 0x1.20dd750429b6dp+0;
 static const double CL = 0x1.1ae3a914fed8p-56;
 /* Add a + b, such that *hi + *lo approximates a + b.
   Assumes |a| >= |b|.
   For rounding to nearest we have hi + lo = a + b exactly.
   For directed rounding, we have
   (a) hi + lo = a + b exactly when the exponent difference between a and b
       is at most 53 (the binary64 precision)
   (b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|)
       (see https://hal.inria.fr/hal-03798376)
   We also have |lo| < ulp(hi). */
 static inline void
 fast_two_sum (double *hi, double *lo, double a, double b)
 {
  double e;
  *hi = a + b;
  e = *hi - a; /* exact */
  *lo = b - e; /* exact */
 }
 /* Reference: https://hal.science/hal-01351529v3/document */
 static inline void
 two_sum (double *hi, double *lo, double a, double b)
 {
  *hi = a + b;
  double aa = *hi - b;
  double bb = *hi - aa;
  double da = a - aa;
  double db = b - bb;
  *lo = da + db;
 }
 // Multiply exactly a and b, such that *hi + *lo = a * b.
 static inline void
 a_mul (double *hi, double *lo, double a, double b)
 {
  *hi = a * b;
  *lo = fma (a, b, -*hi);
 }
 /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
   of erf(z). Return err the maximal relative error:
   |(h + l)/erf(z) - 1| < err*|h+l| */
 static double
 cr_erf_fast (double *h, double *l, double z)
 {
  double th, tl;
  /* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16,
     and for each interval, we use a minimax polynomial:
     * for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero,
       since if we evaluate in the middle 1/32, we will get bad accuracy
       for tiny z, and moreover z-1/32 might not be exact
     * for 1 <= i <= 94, we use a polynomial evaluated in the middle of
       the interval, namely i/16+1/32
  */
  if (z < 0.0625) /* z < 1/16 */
    {
      double z2h, z2l, z4;
      a_mul (&z2h, &z2l, z, z);
      z4 = z2h * z2h;
      double c9 = fma (C0[7], z2h, C0[6]);
      double c5 = fma (C0[5], z2h, C0[4]);
      c5 = fma (c9, z4, c5);
      /* compute C0[2] + C0[3] + z2h*c5 */
      a_mul (&th, &tl, z2h, c5);
      fast_two_sum (h, l, C0[2], th);
      *l += tl + C0[3];
      /* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */
      double h_copy = *h;
      a_mul (&th, &tl, z2h, *h);
      tl += fma (z2h, *l, C0[1]);
      fast_two_sum (h, l, C0[0], th);
      *l += fma (z2l, h_copy, tl);
      /* multiply (h,l) by z */
      a_mul (h, &tl, *h, z);
      *l = fma (*l, z, tl);
      return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */
    }
  double v = floor (16.0 * z);
  uint32_t i = 16.0 * z;
  /* i/16 <= z < (i+1)/16 */
  /* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact:
     (1) either z - 0.03125 is in the same binade as z, then 0.03125 is
 	 an integer multiple of ulp(z), so is z - 0.03125
     (2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are
 	 integer multiple of the ulp() of that smaller binade.
     Also, subtracting 0.0625 * v is exact. */
  z = (z - 0.03125) - 0.0625 * v;
  /* now |z| <= 1/32 */
  const double *c = C[i - 1];
  double z2 = z * z, z4 = z2 * z2;
  /* the degree-10 coefficient is c[12] */
  double c9 = fma (c[12], z, c[11]);
  double c7 = fma (c[10], z, c[9]);
  double c5 = fma (c[8], z, c[7]);
  double c3h, c3l;
  /* c3h, c3l <- c[5] + z*c[6] */
  fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
  c7 = fma (c9, z2, c7);
  /* c3h, c3l <- c3h, c3l + c5*z2 */
  fast_two_sum (&c3h, &tl, c3h, c5 * z2);
  c3l += tl;
  /* c3h, c3l <- c3h, c3l + c7*z4 */
  fast_two_sum (&c3h, &tl, c3h, c7 * z4);
  c3l += tl;
  /* c2h, c2l <- c[4] + z*(c3h + c3l) */
  double c2h, c2l;
  a_mul (&th, &tl, z, c3h);
  fast_two_sum (&c2h, &c2l, c[4], th);
  c2l += fma (z, c3l, tl);
  /* compute c[2] + c[3] + z*(c2h + c2l) */
  a_mul (&th, &tl, z, c2h);
  fast_two_sum (h, l, c[2], th);
  *l += tl + fma (z, c2l, c[3]);
  /* compute c[0] + c[1] + z*(h + l) */
  a_mul (&th, &tl, z, *h);
  tl = fma (z, *l, tl); /* tl += z*l */
  fast_two_sum (h, l, c[0], th);
  *l += tl + c[1];
  return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1)
 			(larger values of i yield smaller error bounds) */
 }
 /* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
 static void
 cr_erf_accurate_tiny (double *h, double *l, double z)
 {
  uint64_t i, j, k;
  /* use dichotomy */
  for (i = 0, j = EXCEPTIONS_LEN; i + 1 < j;)
    {
      k = (i + j) / 2;
      if (EXCEPTIONS_TINY[k][0] <= z)
 	i = k;
      else
 	j = k;
    }
  /* Either i=0 and z < exceptions[i+1][0],
     or i=n-1 and exceptions[i][0] <= z
     or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0].
     In all cases z can only equal exceptions[i][0]. */
  if (z == EXCEPTIONS_TINY[i][0])
    {
      *h = EXCEPTIONS_TINY[i][1];
      *l = EXCEPTIONS_TINY[i][2];
      return;
    }
  double z2 = z * z, th, tl;
  *h = P[21 / 2 + 4]; /* degree 21 */
  for (int a = 19; a > 11; a -= 2)
    *h = fma (*h, z2, P[a / 2 + 4]); /* degree j */
  *l = 0;
  for (int a = 11; a > 7; a -= 2)
    {
      /* multiply h+l by z^2 */
      a_mul (&th, &tl, *h, z);
      tl = fma (*l, z, tl);
      a_mul (h, l, th, z);
      *l = fma (tl, z, *l);
      /* add p[j] to h + l */
      fast_two_sum (h, &tl, P[a / 2 + 4], *h);
      *l += tl;
    }
  for (int a = 7; a >= 1; a -= 2)
    {
      /* multiply h+l by z^2 */
      a_mul (&th, &tl, *h, z);
      tl = fma (*l, z, tl);
      a_mul (h, l, th, z);
      *l = fma (tl, z, *l);
      fast_two_sum (h, &tl, P[a - 1], *h);
      *l += P[a] + tl;
    }
  /* multiply by z */
  a_mul (h, &tl, *h, z);
  *l = fma (*l, z, tl);
  return;
 }
 /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
   approximation of erf(z).
   Assumes z >= 2^-61, thus no underflow can occur. */
@@ -238,10 +59,7 @@ cr_erf_accurate (double *h, double *l, double z)
       the interval, namely i/8+1/16
  */
  if (z < 0.125) /* z < 1/8 */
-    {
+    return __cr_erf_accurate_tiny (h, l, z, true);
      cr_erf_accurate_tiny (h, l, z);
      return;
    }
  double v = floor (8.0 * z);
  uint32_t i = 8.0 * z;
  z = (z - 0.0625) - 0.125 * v;
@@ -313,7 +131,7 @@ __erf (double x)
      return res;
    }
  /* now z >= 2^-61 */
-  err = cr_erf_fast (&h, &l, z);
+  err = __cr_erf_fast (&h, &l, z);
  uint64_t u = asuint64 (h), v = asuint64 (l);
  t = asuint64 (x);
  u ^= t & SIGN_MASK;
--- a/sysdeps/ieee754/dbl-64/s_erf_common.c
+++ b/sysdeps/ieee754/dbl-64/s_erf_common.c
@@ -0,0 +1,170 @@
 /* Common definitions for erf/erc implementation.
 Copyright (c) 2023-2025 Paul Zimmermann
 The original version of this file was copied from the CORE-MATH
 project (file src/binary64/erf/erf.c, revision 384ed01d).
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 #include <stdint.h>
 #include <math.h>
 #include <stddef.h>
 #include <stdbool.h>
 #include "s_erf_common.h"
 double
 __cr_erf_fast (double *h, double *l, double z)
 {
  double th, tl;
  /* we split [0,0x1.7afb48dc96626p+2] into intervals i/16 <= z < (i+1)/16,
     and for each interval, we use a minimax polynomial:
     * for i=0 (0 <= z < 1/16) we use a polynomial evaluated at zero,
       since if we evaluate in the middle 1/32, we will get bad accuracy
       for tiny z, and moreover z-1/32 might not be exact
     * for 1 <= i <= 94, we use a polynomial evaluated in the middle of
       the interval, namely i/16+1/32
  */
  if (z < 0.0625) /* z < 1/16 */
    {
      double z2h, z2l, z4;
      a_mul (&z2h, &z2l, z, z);
      z4 = z2h * z2h;
      double c9 = fma (C0[7], z2h, C0[6]);
      double c5 = fma (C0[5], z2h, C0[4]);
      c5 = fma (c9, z4, c5);
      /* compute C0[2] + C0[3] + z2h*c5 */
      a_mul (&th, &tl, z2h, c5);
      fast_two_sum (h, l, C0[2], th);
      *l += tl + C0[3];
      /* compute C0[0] + C0[1] + (z2h + z2l)*(h + l) */
      double h_copy = *h;
      a_mul (&th, &tl, z2h, *h);
      tl += fma (z2h, *l, C0[1]);
      fast_two_sum (h, l, C0[0], th);
      *l += fma (z2l, h_copy, tl);
      /* multiply (h,l) by z */
      a_mul (h, &tl, *h, z);
      *l = fma (*l, z, tl);
      return 0x1.78p-69; /* err < 2.48658249618372e-21, cf Analyze0() */
    }
  double v = floor (16.0 * z);
  uint32_t i = 16.0 * z;
  /* i/16 <= z < (i+1)/16 */
  /* For 0.0625 0 <= z <= 0x1.7afb48dc96626p+2, z - 0.03125 is exact:
     (1) either z - 0.03125 is in the same binade as z, then 0.03125 is
 	 an integer multiple of ulp(z), so is z - 0.03125
     (2) if z - 0.03125 is in a smaller binade, both z and 0.03125 are
 	 integer multiple of the ulp() of that smaller binade.
     Also, subtracting 0.0625 * v is exact. */
  z = (z - 0.03125) - 0.0625 * v;
  /* now |z| <= 1/32 */
  const double *c = C[i - 1];
  double z2 = z * z, z4 = z2 * z2;
  /* the degree-10 coefficient is c[12] */
  double c9 = fma (c[12], z, c[11]);
  double c7 = fma (c[10], z, c[9]);
  double c5 = fma (c[8], z, c[7]);
  double c3h, c3l;
  /* c3h, c3l <- c[5] + z*c[6] */
  fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
  c7 = fma (c9, z2, c7);
  /* c3h, c3l <- c3h, c3l + c5*z2 */
  fast_two_sum (&c3h, &tl, c3h, c5 * z2);
  c3l += tl;
  /* c3h, c3l <- c3h, c3l + c7*z4 */
  fast_two_sum (&c3h, &tl, c3h, c7 * z4);
  c3l += tl;
  /* c2h, c2l <- c[4] + z*(c3h + c3l) */
  double c2h, c2l;
  a_mul (&th, &tl, z, c3h);
  fast_two_sum (&c2h, &c2l, c[4], th);
  c2l += fma (z, c3l, tl);
  /* compute c[2] + c[3] + z*(c2h + c2l) */
  a_mul (&th, &tl, z, c2h);
  fast_two_sum (h, l, c[2], th);
  *l += tl + fma (z, c2l, c[3]);
  /* compute c[0] + c[1] + z*(h + l) */
  a_mul (&th, &tl, z, *h);
  tl = fma (z, *l, tl); /* tl += z*l */
  fast_two_sum (h, l, c[0], th);
  *l += tl + c[1];
  return 0x1.11p-69; /* err < 1.80414390200020e-21, cf analyze_p(1)
 			(larger values of i yield smaller error bounds) */
 }
 void
 __cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions)
 {
  if (exceptions)
    {
      uint64_t i, j, k;
      /* use dichotomy */
      for (i = 0, j = EXCEPTIONS_TINY_LEN; i + 1 < j;)
 	{
 	  k = (i + j) / 2;
 	  if (EXCEPTIONS_TINY[k][0] <= z)
 	    i = k;
 	  else
 	    j = k;
 	}
      /* Either i=0 and z < exceptions[i+1][0],
 	 or i=n-1 and exceptions[i][0] <= z
 	 or 0 < i < n-1 and exceptions[i][0] <= z < exceptions[i+1][0].
 	 In all cases z can only equal exceptions[i][0]. */
      if (z == EXCEPTIONS_TINY[i][0])
 	{
 	  *h = EXCEPTIONS_TINY[i][1];
 	  *l = EXCEPTIONS_TINY[i][2];
 	  return;
 	}
    }
  double z2 = z * z, th, tl;
  *h = P[21 / 2 + 4]; /* degree 21 */
  for (int a = 19; a > 11; a -= 2)
    *h = fma (*h, z2, P[a / 2 + 4]); /* degree j */
  *l = 0;
  for (int a = 11; a > 7; a -= 2)
    {
      /* multiply h+l by z^2 */
      a_mul (&th, &tl, *h, z);
      tl = fma (*l, z, tl);
      a_mul (h, l, th, z);
      *l = fma (tl, z, *l);
      /* add p[j] to h + l */
      fast_two_sum (h, &tl, P[a / 2 + 4], *h);
      *l += tl;
    }
  for (int a = 7; a >= 1; a -= 2)
    {
      /* multiply h+l by z^2 */
      a_mul (&th, &tl, *h, z);
      tl = fma (*l, z, tl);
      a_mul (h, l, th, z);
      *l = fma (tl, z, *l);
      fast_two_sum (h, &tl, P[a - 1], *h);
      *l += P[a] + tl;
    }
  /* multiply by z */
  a_mul (h, &tl, *h, z);
  *l = fma (*l, z, tl);
  return;
 }
--- a/sysdeps/ieee754/dbl-64/s_erf_common.h
+++ b/sysdeps/ieee754/dbl-64/s_erf_common.h
@@ -0,0 +1,80 @@
 /* Common definitions for erf/erc implementation.
 Copyright (c) 2023-2025 Paul Zimmermann
 This file is part of the CORE-MATH project
 (https://core-math.gitlabpages.inria.fr/).
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 #ifndef _S_ERF_COMMON_H
 #define _S_ERF_COMMON_H
 #include "s_erf_data.h"
 /* Add a + b, such that *hi + *lo approximates a + b.
   Assumes |a| >= |b|.
   For rounding to nearest we have hi + lo = a + b exactly.
   For directed rounding, we have
   (a) hi + lo = a + b exactly when the exponent difference between a and b
       is at most 53 (the binary64 precision)
   (b) otherwise |(a+b)-(hi+lo)| <= 2^-105 min(|a+b|,|hi|)
       (see https://hal.inria.fr/hal-03798376)
   We also have |lo| < ulp(hi). */
 static inline void
 fast_two_sum (double *hi, double *lo, double a, double b)
 {
  double e;
  *hi = a + b;
  e = *hi - a; /* exact */
  *lo = b - e; /* exact */
 }
 /* Reference: https://hal.science/hal-01351529v3/document */
 static inline void
 two_sum (double *hi, double *lo, double a, double b)
 {
  *hi = a + b;
  double aa = *hi - b;
  double bb = *hi - aa;
  double da = a - aa;
  double db = b - bb;
  *lo = da + db;
 }
 // Multiply exactly a and b, such that *hi + *lo = a * b.
 static inline void
 a_mul (double *hi, double *lo, double a, double b)
 {
  *hi = a * b;
  *lo = fma (a, b, -*hi);
 }
 /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
   of erf(z). Return err the maximal relative error:
   |(h + l)/erf(z) - 1| < err*|h+l| */
 double __cr_erf_fast (double *h, double *l, double z) attribute_hidden;
 /* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
 void __cr_erf_accurate_tiny (double *h, double *l, double z, bool exceptions)
     attribute_hidden;
 #endif
--- a/sysdeps/ieee754/dbl-64/s_erfc.c
+++ b/sysdeps/ieee754/dbl-64/s_erfc.c
@@ -48,129 +48,9 @@ SOFTWARE.
 #include <errno.h>
 #include <libm-alias-double.h>
 #include "math_config.h"
-#include "s_erf_data.h"
+#include "s_erf_common.h"
 #include "s_erfc_data.h"
 static inline void
 fast_two_sum (double *hi, double *lo, double a, double b)
 {
  double e;
  *hi = a + b;
  e = *hi - a; /* exact */
  *lo = b - e; /* exact */
 }
 static inline void
 two_sum (double *hi, double *lo, double a, double b)
 {
  *hi = a + b;
  double aa = *hi - b;
  double bb = *hi - aa;
  double da = a - aa;
  double db = b - bb;
  *lo = da + db;
 }
 static inline void
 a_mul (double *hi, double *lo, double a, double b)
 {
  *hi = a * b;
  *lo = fma (a, b, -*hi);
 }
 /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an approximation
   of erf(z). Return err the maximal relative error:
   |(h + l)/erf(z) - 1| < err*|h+l| */
 static double
 cr_erf_fast (double *h, double *l, double z)
 {
  double th, tl;
  if (z < 0.0625) /* z < 1/16 */
    {
      double z2h, z2l, z4;
      a_mul (&z2h, &z2l, z, z);
      z4 = z2h * z2h;
      double c9 = fma (C0[7], z2h, C0[6]);
      double c5 = fma (C0[5], z2h, C0[4]);
      c5 = fma (c9, z4, c5);
      a_mul (&th, &tl, z2h, c5);
      fast_two_sum (h, l, C0[2], th);
      *l += tl + C0[3];
      double h_copy = *h;
      a_mul (&th, &tl, z2h, *h);
      tl += fma (z2h, *l, C0[1]);
      fast_two_sum (h, l, C0[0], th);
      *l += fma (z2l, h_copy, tl);
      a_mul (h, &tl, *h, z);
      *l = fma (*l, z, tl);
      return 0x1.78p-69;
    }
  double v = floor (16.0 * z);
  uint32_t i = 16.0 * z;
  z = (z - 0.03125) - 0.0625 * v;
  const double *c = C[i - 1];
  double z2 = z * z, z4 = z2 * z2;
  double c9 = fma (c[12], z, c[11]);
  double c7 = fma (c[10], z, c[9]);
  double c5 = fma (c[8], z, c[7]);
  double c3h, c3l;
  fast_two_sum (&c3h, &c3l, c[5], z * c[6]);
  c7 = fma (c9, z2, c7);
  fast_two_sum (&c3h, &tl, c3h, c5 * z2);
  c3l += tl;
  fast_two_sum (&c3h, &tl, c3h, c7 * z4);
  c3l += tl;
  double c2h, c2l;
  a_mul (&th, &tl, z, c3h);
  fast_two_sum (&c2h, &c2l, c[4], th);
  c2l += fma (z, c3l, tl);
  a_mul (&th, &tl, z, c2h);
  fast_two_sum (h, l, c[2], th);
  *l += tl + fma (z, c2l, c[3]);
  a_mul (&th, &tl, z, *h);
  tl = fma (z, *l, tl); /* tl += z*l */
  fast_two_sum (h, l, c[0], th);
  *l += tl + c[1];
  return 0x1.11p-69;
 }
 /* for |z| < 1/8, assuming z >= 2^-61, thus no underflow can occur */
 static void
 cr_erf_accurate_tiny (double *h, double *l, double z)
 {
  double z2 = z * z, th, tl;
  *h = P[21 / 2 + 4]; /* degree 21 */
  for (int j = 19; j > 11; j -= 2)
    *h = fma (*h, z2, P[j / 2 + 4]); /* degree j */
  *l = 0;
  for (int j = 11; j > 7; j -= 2)
    {
      /* multiply h+l by z^2 */
      a_mul (&th, &tl, *h, z);
      tl = fma (*l, z, tl);
      a_mul (h, l, th, z);
      *l = fma (tl, z, *l);
      /* add P[j] to h + l */
      fast_two_sum (h, &tl, P[j / 2 + 4], *h);
      *l += tl;
    }
  for (int j = 7; j >= 1; j -= 2)
    {
      /* multiply h+l by z^2 */
      a_mul (&th, &tl, *h, z);
      tl = fma (*l, z, tl);
      a_mul (h, l, th, z);
      *l = fma (tl, z, *l);
      fast_two_sum (h, &tl, P[j - 1], *h);
      *l += P[j] + tl;
    }
  /* multiply by z */
  a_mul (h, &tl, *h, z);
  *l = fma (*l, z, tl);
  return;
 }
 /* Assuming 0 <= z <= 0x1.7afb48dc96626p+2, put in h+l an accurate
   approximation of erf(z).
   Assumes z >= 2^-61, thus no underflow can occur. */
@@ -179,7 +59,7 @@ cr_erf_accurate (double *h, double *l, double z)
 {
  double th, tl;
  if (z < 0.125) /* z < 1/8 */
-    return cr_erf_accurate_tiny (h, l, z);
+    return __cr_erf_accurate_tiny (h, l, z, false);
  double v = floor (8.0 * z);
  uint32_t i = 8.0 * z;
  z = (z - 0.0625) - 0.125 * v;
@@ -507,7 +387,7 @@ cr_erfc_fast (double *h, double *l, double x)
     throughput is about 44 cycles */
  if (x < 0) // erfc(x) = 1 - erf(x) = 1 + erf(-x)
    {
-      double err = cr_erf_fast (h, l, -x);
+      double err = __cr_erf_fast (h, l, -x);
      /* h+l approximates erf(-x), with relative error bounded by err,
 	 where err <= 0x1.78p-69 */
      err = err * *h; /* convert into absolute error */
@@ -533,7 +413,7 @@ cr_erfc_fast (double *h, double *l, double x)
     the average reciprocal throughput is about 59 cycles */
  else if (x <= THRESHOLD1)
    {
-      double err = cr_erf_fast (h, l, x);
+      double err = __cr_erf_fast (h, l, x);
      /* h+l approximates erf(x), with relative error bounded by err,
 	 where err <= 0x1.78p-69 */
      err = err * *h; /* convert into absolute error */