rationalize pico_float/pico_double libraries (#2208)

* on RP2350 _dcp variant now enables -msoft-float, since if you're using this at all it is likely because you don't want to use the VFP unit at all (to save stack space) * implement all float_ and double_ conversion functions in all pico_float_pico_ variants and pico_double_pico on RP2040 and RP2350 (many were missing in some combinations) * provide better granularity of what functions are wrapped in each case also marked custom_xxx_funcs_test.c as not in bazel build yet
2025-08-06 06:02:39 +03:00 · 2025-02-04 16:19:17 -06:00
parent 7d450bf097
commit e85c3e5515
17 changed files with 2012 additions and 142 deletions
--- a/src/rp2_common/hardware_dma/include/hardware/dma.h
+++ b/src/rp2_common/hardware_dma/include/hardware/dma.h
@@ -535,7 +535,7 @@ static inline void dma_channel_start(uint channel) {
 *\endcode
 *
 * \if rp2350_specific
- * RP2350 only: Due to errata RP12350-E5 (see the RP2350 datasheet for further detail), it is necessary to clear the enable bit of
+ * RP2350 only: Due to errata RP2350-E5 (see the RP2350 datasheet for further detail), it is necessary to clear the enable bit of
 * the aborted channel and any chained channels prior to the abort to prevent re-triggering.
 * \endif
 *
--- a/src/rp2_common/pico_double/double_aeabi_dcp.S
+++ b/src/rp2_common/pico_double/double_aeabi_dcp.S
@@ -7,7 +7,7 @@
 #include "pico/asm_helper.S"
 #if !HAS_DOUBLE_COPROCESSOR
-#error attempt to compile double_aeabi_rp2350 when there is no DCP
+#error attempt to compile double_aeabi_dcp when there is no DCP
 #else
 #include "hardware/dcp_instr.inc.S"
@@ -29,7 +29,7 @@ double_section WRAPPER_FUNC_NAME(\func)
 // ============== STATE SAVE AND RESTORE ===============
-.macro saving_func type func
+.macro saving_func type func, opt_label1='-', opt_label2='-'
  // Note we are usually 32-bit aligned already at this point, as most of the
  // function bodies contain exactly two 16-bit instructions: bmi and bx lr.
  // We want the PCMP word-aligned.
@@ -41,6 +41,12 @@ double_section WRAPPER_FUNC_NAME(\func)
  push {lr}              // 16-bit instruction
  bl generic_save_state  // 32-bit instruction
  b 1f                   // 16-bit instruction
 .ifnc \opt_label1,'-'
 regular_func \opt_label1
 .endif
 .ifnc \opt_label2,'-'
 regular_func \opt_label2
 .endif
  // This is the actual entry point:
 \type\()_func \func
  PCMP apsr_nzcv
@@ -128,53 +134,124 @@ saving_func wrapper sqrt
  dcp_dsqrt_m r0,r1,r0,r1,r0,r1,r2,r3,r12
  saving_func_return
-// todo not a real thing
+double_section dclassify
-double_wrapper_section __aeabi_dclassify
+saving_func regular dclassify
 saving_func wrapper __aeabi_dclassify
@ with correct rounding
  dcp_dclassify_m apsr_nzcv,r0,r1
  saving_func_return
 // ============== CONVERSION FUNCTIONS ===============
 double_wrapper_section __aeabi_d2f
-saving_func wrapper __aeabi_d2f
+saving_func wrapper __aeabi_d2f double2float
@ with rounding
  dcp_double2float_m r0,r0,r1
  saving_func_return
 double_wrapper_section __aeabi_i2d
-saving_func wrapper __aeabi_i2d
+saving_func wrapper __aeabi_i2d int2double
  dcp_int2double_m r0,r1,r0
  saving_func_return
 double_wrapper_section __aeabi_ui2d
-saving_func wrapper __aeabi_ui2d
+saving_func wrapper __aeabi_ui2d uint2double
  dcp_uint2double_m r0,r1,r0
  saving_func_return
 double_section double2fix_z
 saving_func regular double2fix_z
  ubfx r3, r1, #20, #11
  adds r3, r2
  beq 1f // very small; we don't care that we might make a denormal
  asrs ip, r3, #11
  beq 1f
  ite pl
  movpl r3, #0x7ff
  movsmi r3, #0
 1:
  bfi r1, r3, #20, #11
  b double2int_z_entry
 double_section double2ufix
 saving_func regular double2ufix_z double2ufix
 double2ufix_z_entry:
  ubfx r3, r1, #20, #11
  adds r3, r2
  beq 1f // very small; we don't care that we might make a denormal
  asrs ip, r3, #11
  beq 1f
  ite pl
  lsrspl r3, r1, #20 // 0x7ff
  movsmi r3, #0
 1:
  bfi r1, r3, #20, #11
  b double2uint_z_entry
 double_section double2fix
 saving_func regular double2fix
  ubfx r3, r1, #20, #11
  cbz r3, 2f // 0 or denormal
  adds r3, r2
  beq 1f // very small; we don't care that we might make a denormal
  asrs ip, r3, #11
  beq 1f
  ite pl
  movpl r3, #0x7ff
  movsmi r3, #0
 1:
  bfi r1, r3, #20, #11
  b double2int_entry
 2:
  movs r0, #0
 saving_func_return
 double_section double2int
 // double2int: convert the double in r0:r1 (r1 = high word) to a 32-bit integer in r0,
 // rounding towards -infinity. Positive inputs, negative zero/denormals and negative
 // integer-valued inputs are handed to the truncating conversion at double2int_z_entry;
 // every other negative input is truncated by the DCP and then decremented by one.
 saving_func regular double2int
 double2int_entry:
   lsls r2, r1, #1 // shift the sign bit out into carry
   bcc double2int_z_entry // positive is ok for int_z
   lsrs r3, r2, #21 // r3 = biased exponent
   beq double2int_z_entry // 0 or -0 or denormal is ok for int_z
   lsrs r2, #21
   adds r2, #1
   subs r2, r2, #0x400 // r2 = biased exponent - 0x3ff
   bcc 1f // <1 means subtract 1
   cmp r2, #31
   bge double2int_z_entry // must be an integer or maxed out
   lsls r3, r1, #12
   adds r3, r3, r0, lsr #20 // r3 now has highest 32 mantissa bits
   lsls r3, r2 // discard the mantissa bits that belong to the integer part
   orrs r3, r3, r0, lsl #12 // these bits are all guaranteed to be in the fraction
   beq double2int_z_entry // integer
 1:
   // negative and not an integer: truncate towards zero, then step down by one
   dcp_double2int_m r0,r0,r1
   subs r0, #1
 saving_func_return
 double_wrapper_section __aeabi_d2iz
-saving_func wrapper __aeabi_d2iz
+saving_func wrapper __aeabi_d2iz double2int_z
 double2int_z_entry:
@ with truncation towards 0
  dcp_double2int_m r0,r0,r1
  // note: this works with either saved or not saved call as it is just a `bx lr`
  saving_func_return
 double_wrapper_section __aeabi_d2uiz
-saving_func wrapper __aeabi_d2uiz
+saving_func wrapper __aeabi_d2uiz double2uint double2uint_z
 double2uint_z_entry:
@ with truncation towards 0
  dcp_double2uint_m r0,r0,r1
  saving_func_return
-// todo not a real thing
+double_section double2int_r
-double_wrapper_section __aeabi_d2i_r
+saving_func regular double2int_r
 saving_func wrapper __aeabi_d2i_r
@ with rounding
  dcp_double2int_r_m r0,r0,r1
  saving_func_return
-// todo not a real thing
+double_section double2uint_r
-double_wrapper_section __aeabi_d2ui_r
+saving_func regular double2uint_r
 saving_func wrapper __aeabi_d2ui_r
@ with rounding
  dcp_double2uint_r_m r0,r0,r1
  saving_func_return
@@ -189,7 +266,6 @@ saving_func wrapper __aeabi_dcmpun
  saving_func_return
 double_wrapper_section __aeabi_dcmp
 saving_func wrapper __aeabi_cdrcmple
  dcp_dcmp_m apsr_nzcv,r2,r3,r0,r1 // with arguments reversed
  bvs cmp_nan
--- a/src/rp2_common/pico_double/double_aeabi_rp2040.S
+++ b/src/rp2_common/pico_double/double_aeabi_rp2040.S
@@ -425,6 +425,7 @@ double_wrapper_section __aeabi_ui2d
 double_wrapper_section __aeabi_i2d
 wrapper_func __aeabi_ui2d
 regular_func uint2double
    movs r1, #0
    cmp r0, #0
    bne 2f
@@ -432,6 +433,7 @@ wrapper_func __aeabi_ui2d
    bx lr
 // double FUNC_NAME(__aeabi_i2d)(int)                     integer to double (double precision) conversion
 wrapper_func __aeabi_i2d
 regular_func int2double
    asrs r1, r0, #31
    eors r0, r1
    subs r0, r1
@@ -506,6 +508,7 @@ regular_func double2int
 // unsigned FUNC_NAME(__aeabi_d2uiz)(double)             double (double precision) to unsigned C-style conversion [3]
 double_wrapper_section __aeabi_d2uiz
 wrapper_func __aeabi_d2uiz
 regular_func double2uint_z
 regular_func double2uint
    shimmable_table_tail_call SF_TABLE_FLOAT2UINT double2uint_shim
@@ -528,11 +531,13 @@ regular_func ufix642double
 // double FUNC_NAME(__aeabi_l2d)(long long)             long long to double (double precision) conversion
 double_wrapper_section __aeabi_l2d
 wrapper_func __aeabi_l2d
 regular_func int642double
    shimmable_table_tail_call SF_TABLE_INT642FLOAT int642double_shim
 // double FUNC_NAME(__aeabi_ul2d)(unsigned long long)   unsigned long long to double (double precision) conversion
 double_wrapper_section __aeabi_ul2d
 wrapper_func __aeabi_ul2d
 regular_func uint642double
    shimmable_table_tail_call SF_TABLE_UINT642FLOAT uint642double_shim
 // long long FUNC_NAME(__aeabi_d2lz)(double)             double (double precision) to long long C-style conversion [3]
@@ -566,22 +571,106 @@ regular_func double2int64
 // unsigned long long FUNC_NAME(__aeabi_d2ulz)(double)     double to unsigned long long C-style conversion [3]
 double_wrapper_section __aeabi_d2ulz
 wrapper_func __aeabi_d2ulz
 regular_func double2uint64
 regular_func double2uint64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 double2uint64_shim
 double_section double2fix64_z
 // double2fix64_z: r0:r1 = double (r1 = high word), r2 = e; 64-bit result in r0:r1.
 // Built on top of double2fix64: positive inputs and negative inputs whose scaled value
 // is an integer are tail-called straight into double2fix64; for other negative inputs
 // double2fix64 is called and 1 is added to its 64-bit result.
 regular_func double2fix64_z
   lsls r3, r1, #1  // shift the sign bit out into carry
   bcc double2fix64 // input positive is ok for fix64
   mov ip, r2       // keep e in ip; r2 is used as scratch below
   asrs r2, r3, #21 // r2 = sign-extended exponent field (0 -> zero/denormal, -1 -> all ones)
   beq 3f           // input zero or denormal, so just return zero
   adds r2, #1
   beq double2fix64 // input infinite/nan is ok for fix64 (note: r2 is 0 here, not e)
   lsrs r3, #21     // r3 = biased exponent
   add r3, ip       // apply the fixed point offset e
   movs r2, #1
   negs r2, r2
   lsrs r2, #22     // r2 = 0x3ff
   subs r3, r2 // r3 = modified e - 0x3ff
   // NOTE(review): bcc tests an unsigned borrow, so it only catches 0 <= exponent + e < 0x3ff;
   // if exponent + e is negative execution continues below - confirm such e values are out of contract
   bcc 3f // modified input < 1.0 means result is zero
   cmp r3, #52
   bge 2f // modified input must be an integer or infinite
   adds r3, #12
   mov r2, r1
   lsls r2, r2, r3    // r2 has remaining fractional mantissa bits of r1
   bne 1f             // not integer as non zero fractional bits remain
   subs r3, #32
   asrs r2, r3, #31
   bics r3, r3, r2    // map negative shift to zero
   movs r2, r0
   lsls r2, r2, r3
   bne 1f             // remaining fractional bits are non-zero, so argument was not an integer
 2:
   // integer
   mov r2, ip         // restore e for double2fix64
   b double2fix64
 3: // result is zero
   movs r0, #0
   movs r1, #0
   bx lr
 1:
   // negative non-integer: convert with double2fix64, then add 1 to the 64-bit result
   push {lr}
   mov r2, ip         // restore e for double2fix64
   bl double2fix64
   movs r2, #0
   adds r0, #1
   adcs r1, r2        // propagate the carry into the high word
   pop {pc}
 double_section double2fix64
 regular_func double2fix64
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 double2fix64_shim
 double_section double2ufix64
 regular_func double2ufix64
 regular_func double2ufix64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 double2ufix64_shim
 double_section double2fix
 regular_func double2fix
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX double2fix_shim
 double_section double2fix_z
 // double2fix_z: r0:r1 = double (r1 = high word), r2 = e; 32-bit result in r0.
 // Adds e to the exponent field of the input and tail-calls double2int_z; returns 0 when
 // the input or adjusted value is too small, and a saturated value on overflow/inf/nan.
 regular_func double2fix_z
   lsls r3, r1, #1
   asrs r3, #21 // r3 = sign-extended exponent field
   beq 2f // input is zero or denormal
   adds r3, #1
   beq 3f // input is infinite or nan
   // extract exponent again
   lsls r3, r1, #1
   lsrs r3, #21
   // adjust
   adds r3, r2
   ble 2f // adjusted exponent is <= 0, so the result is zero
   lsrs r3, r3, #11
   bne 3f // adjusted exponent no longer fits in 11 bits, so saturate
   lsls r2, r2, #20 // align exponent adjustment offset
   adds r1, r1, r2  // we know adjustment is safe
   b double2int_z
 2:
   // result is zero
   movs r0, #0
   bx lr
 3:
   // saturate: 0x7fffffff when the sign bit of r1 is clear, 0x80000000 when it is set
   movs r0, #0
   subs r0, #1
   lsrs r0, #1
   asrs r1, #31
   eors r0, r1
   bx lr
 double_section double2ufix
 regular_func double2ufix
 regular_func double2ufix_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX double2ufix_shim
 double_wrapper_section __aeabi_d2f
--- a/src/rp2_common/pico_double/double_conv_m33.S
+++ b/src/rp2_common/pico_double/double_conv_m33.S
@@ -249,7 +249,69 @@ regular_func ufix2double
 movs r1,#0
 bx r14
-double_wrapper_section conv_dtoi64
+double_section conv_dtoi64
 // double2int64: r0:r1 = double (r1 = high word); 64-bit result in r0:r1.
 // Built on top of double2int64_z: positive, infinite/NaN and integer-valued inputs are
 // tail-called straight into double2int64_z; for other negative inputs double2int64_z is
 // called and 1 is subtracted from its 64-bit result. Negative zero/denormals return 0.
 regular_func double2int64
   lsls r3, r1, #1 // shift the sign bit out into carry
   bcc double2int64_z // input positive is ok for int64_z
   cmp r3, #0xffe00000
   bcs double2int64_z // exponent field is all ones: input is infinite (or NaN)
   lsrs r3, #21 // r3 = biased exponent
   beq 2f // input zero or denormal, means answer remains zero
   sub r3, #0x3ff
   cmp r3, #0
   blt 1f // input is less than 1.0
   cmp r3, #52
   bge double2int64_z // modified input must be an integer or infinite
   adds r3, #12
   lsls r2, r1, r3    // r2 has remaining fractional mantissa bits of r1
   bne 1f             // not integer as non zero fractional bits remain
   subs r3, #32
   bics r3, r3, r3, asr #31 // map negative shift to zero
   lsls r3, r0, r3
   beq double2int64_z   // remaining fractional bits are 0, so argument was an integer
 1:
   // negative non-integer: truncate, then subtract 1 from the 64-bit result
   push {lr}
   bl double2int64_z
   subs r0, #1
   sbcs r1, r1, #0 // propagate the borrow into the high word
   pop {pc}
 2:
   movs r0, #0
   movs r1, #0
   bx lr
 double_section conv_dtofix64
 // double2fix64: r0:r1 = double (r1 = high word), r2 = e; 64-bit result in r0:r1.
 // Built on top of double2fix64_z: positive, infinite/NaN and integer-valued (after
 // applying e) inputs are tail-called straight into double2fix64_z; for other negative
 // inputs double2fix64_z is called and 1 is subtracted from its 64-bit result.
 // r2 (e) is left untouched throughout so that it is still valid for double2fix64_z.
 regular_func double2fix64
   lsls r3, r1, #1 // shift the sign bit out into carry
   bcc double2fix64_z // input positive is ok for fix64_z
   cmp r3, #0xffe00000
   bcs double2fix64_z // exponent field is all ones: input is infinite (or NaN)
   lsrs r3, #21 // r3 = biased exponent
   beq 2f // input zero or denormal, means answer remains zero
   sub r3, #0x3ff
   adds r3, r2 // apply the fixed point offset e
   blt 1f // modified input zero or denormal, or less than 1.0
   cmp r3, #52
   bge double2fix64_z // modified input must be an integer or infinite
   adds r3, #12
   lsls ip, r1, r3    // ip has remaining fractional mantissa bits of r1
   bne 1f             // not integer as non zero fractional bits remain
   subs r3, #32
   bics r3, r3, r3, asr #31 // map negative shift to zero
   lsls r3, r0, r3
   beq double2fix64_z   // remaining fractional bits are 0, so argument was an integer
 1:
   // negative non-integer: truncate, then subtract 1 from the 64-bit result
   push {lr}
   bl double2fix64_z
   subs r0, #1
   sbcs r1, r1, #0 // propagate the borrow into the high word
   pop {pc}
 2:
   movs r0, #0
   movs r1, #0
   bx lr
 double_wrapper_section conv_dtoi64_z
@ convert double to signed int64, rounding towards 0, clamping
 wrapper_func __aeabi_d2lz
--- a/src/rp2_common/pico_double/double_fma_dcp.S
+++ b/src/rp2_common/pico_double/double_fma_dcp.S
@@ -582,7 +582,7 @@ wrapper_func fma
 saving_func_return
-double_wrapper_section __dmla
+double_section fma_fast
@ cf saving_func macro: but here we need to record the SP before the state save possibly changes it
 1:
 push {lr}              // 16-bit instruction
@@ -592,6 +592,7 @@ double_wrapper_section __dmla
@ r0:r1 m
@ r2:r3 n
@ [r13,#0] a
 regular_func fma_fast
 regular_func mla
 mov r12,sp                  @ save the SP
 PCMP apsr_nzcv              @ test the engaged flag
--- a/src/rp2_common/pico_double/include/pico/double.h
+++ b/src/rp2_common/pico_double/include/pico/double.h
@@ -16,50 +16,153 @@ extern "C" {
 #endif
 /** \file double.h
-*  \defgroup pico_double pico_double
+* \defgroup pico_double pico_double
 *
 * \brief Optimized double-precision floating point functions
 *
-* (Replacement) optimized implementations are provided of the following compiler built-ins
+* An application can take control of the floating point routines used in the application over and above what is provided by the compiler,
-* and math library functions:
+* by depending on the pico_double library. A user might want to do this:
 *
-* - __aeabi_dadd, __aeabi_ddiv, __aeabi_dmul, __aeabi_drsub, __aeabi_dsub, __aeabi_cdcmpeq, __aeabi_cdrcmple, __aeabi_cdcmple, __aeabi_dcmpeq, __aeabi_dcmplt, __aeabi_dcmple, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmpun, __aeabi_i2d, __aeabi_l2d, __aeabi_ui2d, __aeabi_ul2d, __aeabi_d2iz, __aeabi_d2lz, __aeabi_d2uiz, __aeabi_d2ulz, __aeabi_d2f
+* 1. To use optimized software implementations provided by the RP2-series device's bootrom or the SDK
-* - sqrt, cos, sin, tan, atan2, exp, log, ldexp, copysign, trunc, floor, ceil, round, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh, exp2, log2, exp10, log10, pow,, hypot, cbrt, fmod, drem, remainder, remquo, expm1, log1p, fma
+* 2. To use optimized combined software/hardware implementations utilizing custom RP2-series hardware for acceleration
-* - powint, sincos (GNU extensions)
+* 3. To control the amount of C compiler/library code bloat
 * 4. To make sure no floating point is called at all
 *
-* The following additional optimized functions are also provided:
+* The pico_double library comes in three main flavors:
 *
-* - int2double, uint2double, int642double, uint642double, fix2double, ufix2double, fix642double, ufix642double
+* 1. `pico_double_none` - all floating point operations cause a \ref panic - no double-precision floating point code is included
-* - double2fix, double2ufix, double2fix64, double2ufix64, double2int, double2uint, double2int64, double2uint64, double2int_z, double2int64_z,
+* 2. `pico_double_compiler` - no custom functions are provided; all double-precision floating point is handled by the C compiler/library
-* - exp10, sincos, powint
+* 3. `pico_double_pico` - the smallest and fastest available for the platform, along with additional functionality (e.g. fixed point conversions) which are detailed below
 *
-* On RP2350 the following additional functions are available; the _fast methods are faster but do not round correctly"
+* The user can control which version they want (e.g. **pico_double_xxx**) by either setting the CMake global variable
 * `PICO_DEFAULT_DOUBLE_IMPL=xxx`, or by using the CMake function `pico_set_double_implementation(<TARGET> xxx)`. Note that in the absence
 * of either, pico_double_pico is used by default.
 *
-* - ddiv_fast, sqrt_fast
+* \if rp2040_specific
 * On RP2040, `pico_double_pico` uses optimized hand coded implementations from the bootrom and the SDK for both
 * basic double-precision floating point operations and floating point math library functions. These implementations
 * are generally faster and smaller than those provided by the C compiler/library, though they don't support all the features of a fully compliant
 * floating point implementation; they are however usually fine for the majority of cases
 * \endif
 *
 * \if rp2350_specific
 * On RP2350, `pico_double_pico` uses RP2350 DCP instructions (double co-processor) to implement fast versions of the basic
 * arithmetic functions, and provides optimized M33 implementations of trigonometric and scientific functions.
 * These implementations are generally faster and smaller than those provided by the C compiler/library, though they don't support all the features of a fully compliant
 * floating point implementation; they are however usually fine for the majority of cases
 * \endif
 *
 * On Arm, (replacement) optimized implementations are provided for the following compiler built-ins
 * and math library functions when using `pico_double_pico`:
 *
 * - basic arithmetic:
 *
 *   __aeabi_dadd, __aeabi_ddiv, __aeabi_dmul, __aeabi_drsub, __aeabi_dsub
 *
 * - comparison:
 *
 *   __aeabi_cdcmpeq, __aeabi_cdrcmple, __aeabi_cdcmple, __aeabi_dcmpeq, __aeabi_dcmplt, __aeabi_dcmple, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmpun
 *
 * - (u)int32 <-> double:
 *
 *    __aeabi_i2d, __aeabi_ui2d, __aeabi_d2iz, __aeabi_d2uiz
 *
 * - (u)int64 <-> double:
 *
 *   __aeabi_l2d, __aeabi_ul2d, __aeabi_d2lz, __aeabi_d2ulz
 *
 * - double -> float:
 *
 *   __aeabi_d2f
 *
 * - basic trigonometric:
 *
 *   sqrt, cos, sin, tan, atan2, exp, log
 *
 * - trigonometric and scientific
 *
 *   ldexp, copysign, trunc, floor, ceil, round, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh, exp2, log2, exp10, log10, pow, hypot, cbrt, fmod, drem, remainder, remquo, expm1, log1p, fma
 *
 * - GNU extensions:
 *
 *   powint, sincos
 *
 * On Arm, the following additional optimized functions are also provided when using `pico_double_pico`:
 *
 * - Conversions to/from integer types:
 *
 *   - (u)int -> double (round to nearest):
 *
 *     int2double, uint2double, int642double, uint642double
 *
 *   - double -> (u)int (round towards zero):
 *
 *     double2int_z, double2uint_z, double2int64_z, double2uint64_z
 *
 *   - double -> (u)int (round towards -infinity):
 *
 *     double2int, double2uint, double2int64, double2uint64
 *
 * - Conversions to/from fixed point integers:
 *
 *   - (u)fix -> double (round to nearest):
 *
 *       fix2double, ufix2double, fix642double, ufix642double
 *
 *   - double -> (u)fix (round towards zero):
 *
 *       double2fix_z, double2ufix_z, double2fix64_z, double2ufix64_z
 *
 *   - double -> (u)fix (round towards -infinity):
 *
 *       double2fix, double2ufix, double2fix64, double2ufix64
 *
 * - Even faster versions of divide and square-root functions that do not round correctly:
 *
 *   ddiv_fast, sqrt_fast (these do not round correctly)
 *
 * - Faster unfused multiply and accumulate:
 *
 *   fma_fast, mla (mla is another name for fma_fast)
 *
 * \if rp2350_specific
 * On RISC-V there is no custom double-precision floating point support, so `pico_double_pico` is equivalent to `pico_double_compiler`
 * \endif
 */
 #if !defined(__riscv) || PICO_COMBINED_DOCS
 #if PICO_COMBINED_DOCS || !LIB_PICO_DOUBLE_COMPILER
 double int2double(int32_t i);
-double uint2double(uint32_t u);
+double uint2double(uint32_t i);
 double int642double(int64_t i);
-double uint642double(uint64_t u);
+double uint642double(uint64_t i);
 double fix2double(int32_t m, int e);
 double ufix2double(uint32_t m, int e);
 double fix642double(int64_t m, int e);
 double ufix642double(uint64_t m, int e);
-// These methods round towards -Infinity.
+// These methods round towards 0, which IS the C way
-int32_t double2fix(double d, int e);
+int32_t double2int_z(double f);
-uint32_t double2ufix(double d, int e);
+int64_t double2int64_z(double f);
-int64_t double2fix64(double d, int e);
+uint32_t double2uint_z(double f); // unsigned result, like double2uint
-uint64_t double2ufix64(double d, int e);
+uint64_t double2uint64_z(double f); // unsigned result, like double2uint64
-int32_t double2int(double d);
+int32_t double2fix_z(double f, int e);
-uint32_t double2uint(double d);
+uint32_t double2ufix_z(double f, int e);
-int64_t double2int64(double d);
+int64_t double2fix64_z(double f, int e);
-uint64_t double2uint64(double d);
+uint64_t double2ufix64_z(double f, int e);
-// These methods round towards 0.
+// These methods round towards -Infinity - which IS NOT the C way for negative numbers;
-int32_t double2int_z(double d);
+// as such the naming is not ideal, however is kept for backwards compatibility
-int64_t double2int64_z(double d);
+int32_t double2int(double f);
 uint32_t double2uint(double f);
 int64_t double2int64(double f);
 uint64_t double2uint64(double f);
 int32_t double2fix(double f, int e);
 uint32_t double2ufix(double f, int e);
 int64_t double2fix64(double f, int e);
 uint64_t double2ufix64(double f, int e);
 #endif
 double exp10(double x);
 void sincos(double x, double *sinx, double *cosx);
@@ -67,8 +170,24 @@ double powint(double x, int y);
 #if !PICO_RP2040 || PICO_COMBINED_DOCS
 double ddiv_fast(double n, double d);
-double sqrt_fast(double d);
+double sqrt_fast(double f);
-double mla(double x, double y, double z); // note this is not fused
+double fma_fast(double x, double y, double z); // this is not fused
 double mla(double x, double y, double z); // another name for fma_fast
 #endif
 #endif
 #if LIB_PICO_DOUBLE_COMPILER || defined(__riscv)
 // when using the compiler; we provide as many functions as we trivially can, though in the double case they are not optimal
 static inline double int2double(int32_t i) { return (double)i; }
 static inline double uint2double(uint32_t i) { return (double)i; }
 static inline double int642double(int64_t i) { return (double)i; }
 static inline double uint642double(uint64_t i) { return (double)i; }
 static inline int32_t double2int_z(double d) { return (int32_t)d; }
 static inline int64_t double2int64_z(double d) { return (int64_t)d; }
 // double -> uint32_t using the compiler's cast (rounds towards zero); the result is unsigned,
 // so values >= 2^31 are not turned negative by the return type
 static inline uint32_t double2uint_z(double d) { return (uint32_t)d; }
 // double -> uint64_t using the compiler's cast (rounds towards zero); the result is unsigned,
 // so values >= 2^63 are not turned negative by the return type
 static inline uint64_t double2uint64_z(double d) { return (uint64_t)d; }
 #endif
 #ifdef __cplusplus
@@ -76,4 +195,3 @@ double mla(double x, double y, double z); // note this is not fused
 #endif
 #endif
--- a/src/rp2_common/pico_float/BUILD.bazel
+++ b/src/rp2_common/pico_float/BUILD.bazel
@@ -2,13 +2,16 @@ load("//bazel:defs.bzl", "compatible_with_rp2", "incompatible_with_config")
 package(default_visibility = ["//visibility:public"])
-_WRAP_FLOAT_AEABI_FLAGS = [
+_WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS = [
    "-Wl,--wrap=__aeabi_fadd",
    "-Wl,--wrap=__aeabi_fdiv",
    "-Wl,--wrap=__aeabi_fmul",
    "-Wl,--wrap=__aeabi_frsub",
    "-Wl,--wrap=__aeabi_fsub",
    "-Wl,--wrap=__aeabi_cfcmpeq",
 ]
 _WRAP_FLOAT_AEABI_CMP_FLAGS = [
    "-Wl,--wrap=__aeabi_cfrcmple",
    "-Wl,--wrap=__aeabi_cfcmple",
    "-Wl,--wrap=__aeabi_fcmpeq",
@@ -17,15 +20,27 @@ _WRAP_FLOAT_AEABI_FLAGS = [
    "-Wl,--wrap=__aeabi_fcmpge",
    "-Wl,--wrap=__aeabi_fcmpgt",
    "-Wl,--wrap=__aeabi_fcmpun",
 ]
 # Conversions between float and 32-bit integers.
 _WRAP_FLOAT_AEABI_CONV_32_FLAGS = [
    "-Wl,--wrap=__aeabi_i2f",
    "-Wl,--wrap=__aeabi_ui2f",
    "-Wl,--wrap=__aeabi_f2iz",
    "-Wl,--wrap=__aeabi_f2uiz",
 ]
 # Conversions between float and 64-bit integers.
 _WRAP_FLOAT_AEABI_CONV_64_FLAGS = [
    "-Wl,--wrap=__aeabi_l2f",
    "-Wl,--wrap=__aeabi_ul2f",
    "-Wl,--wrap=__aeabi_f2lz",
    "-Wl,--wrap=__aeabi_f2ulz",
 ]
 _WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS = [
    "-Wl,--wrap=__aeabi_f2d",
 ]
 _WRAP_FLOAT_SQRTF_FLAGS = [
    "-Wl,--wrap=sqrtf",
 ]
@@ -36,13 +51,16 @@ _WRAP_FLOAT_SCI_FLAGS = [
    "-Wl,--wrap=atan2f",
    "-Wl,--wrap=expf",
    "-Wl,--wrap=logf",
    "-Wl,--wrap=sincosf",  # gnu
 ]
 _WRAP_FLOAT_SCI_EXTRA_FLAGS = [
    "-Wl,--wrap=ldexpf",
    "-Wl,--wrap=copysignf",
    "-Wl,--wrap=truncf",
    "-Wl,--wrap=floorf",
    "-Wl,--wrap=ceilf",
    "-Wl,--wrap=roundf",
    "-Wl,--wrap=sincosf",  # gnu
    "-Wl,--wrap=asinf",
    "-Wl,--wrap=acosf",
    "-Wl,--wrap=atanf",
@@ -114,30 +132,31 @@ _PICO_FLOAT_IMPLS = [
        ],
        "compatibility": incompatible_with_config("@platforms//cpu:riscv32") + ["//bazel/constraint:rp2040"],
        "extra_deps": [],
-        "linkopts": _WRAP_FLOAT_AEABI_FLAGS + _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS + _WRAP_FLOAT_AEABI_CMP_FLAGS + _WRAP_FLOAT_AEABI_CONV_32_FLAGS + _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS + _WRAP_FLOAT_SQRTF_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
    {
        "name": "dcp",
        "srcs": [
            "float_aeabi_dcp.S",
-            "float_conv_m33.S",
+            "float_common_m33.S",
            "float_math.c",
            "float_sci_m33.S",
        ],
        "compatibility": compatible_with_rp2() + incompatible_with_config("@platforms//cpu:riscv32") + incompatible_with_config("//bazel/constraint:rp2040"),
        "extra_deps": ["//src/rp2_common/hardware_dcp"],
-        "linkopts": _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS + _WRAP_FLOAT_AEABI_CMP_FLAGS + _WRAP_FLOAT_AEABI_CONV_32_FLAGS + _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS + _WRAP_FLOAT_SQRTF_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
    {
        "name": "vfp",
        "srcs": [
            "float_conv32_vfp.S",
            "float_sci_m33_vfp.S",
-            "float_conv_m33.S",
+            "float_common_m33.S",
            "float_math.c",
        ],
        "compatibility": compatible_with_rp2() + incompatible_with_config("@platforms//cpu:riscv32") + incompatible_with_config("//bazel/constraint:rp2040"),
        "extra_deps": ["//src/rp2_common/hardware_dcp"],
-        "linkopts": _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
    {
        "name": "single_hazard3",
@@ -146,7 +165,7 @@ _PICO_FLOAT_IMPLS = [
        ],
        "compatibility": compatible_with_rp2() + ["@platforms//cpu:riscv32"],
        "extra_deps": ["//src/rp2_common/hardware_hazard3"],
-        "linkopts": _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
 ]
@@ -184,7 +203,7 @@ cc_library(
    hdrs = ["include/pico/float.h"],
    defines = ["LIB_PICO_FLOAT_PICO=0"],
    includes = ["include"],
-    linkopts = _WRAP_FLOAT_AEABI_FLAGS + _WRAP_FLOAT_SCI_FLAGS,
+    linkopts = _WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS + _WRAP_FLOAT_AEABI_CMP_FLAGS + _WRAP_FLOAT_AEABI_CONV_32_FLAGS + _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS + _WRAP_FLOAT_SQRTF_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    target_compatible_with = compatible_with_rp2(),
    visibility = ["//visibility:private"],
    deps = [
--- a/src/rp2_common/pico_float/CMakeLists.txt
+++ b/src/rp2_common/pico_float/CMakeLists.txt
@@ -18,13 +18,15 @@
            $<IF:$<BOOL:$<TARGET_PROPERTY:PICO_TARGET_FLOAT_IMPL>>,$<TARGET_PROPERTY:PICO_TARGET_FLOAT_IMPL>,${PICO_DEFAULT_FLOAT_IMPL}>)
    function(wrap_float_functions TARGET)
-        cmake_parse_arguments(WRAP_FLOAT "NO_WRAP_AEABI;NO_WRAP_SCI" "" "" ${ARGN} )
+        cmake_parse_arguments(WRAP_FLOAT "NO_AEABI_ARITHMETIC;NO_AEABI_CMP;NO_AEABI_CONV_32;NO_AEABI_CONV_64;NO_AEABI_CONV_DOUBLE;NO_SQRTF;NO_SCI;NO_SCI_EXTRA" "" "" ${ARGN} )
-        if (NOT WRAP_FLOAT_NO_WRAP_AEABI)
+        if (NOT WRAP_FLOAT_NO_AEABI_ARITHMETIC)
            pico_wrap_function(${TARGET} __aeabi_fadd)
            pico_wrap_function(${TARGET} __aeabi_fdiv)
            pico_wrap_function(${TARGET} __aeabi_fmul)
            pico_wrap_function(${TARGET} __aeabi_frsub)
            pico_wrap_function(${TARGET} __aeabi_fsub)
        endif()
        if (NOT WRAP_FLOAT_NO_AEABI_CMP)
            pico_wrap_function(${TARGET} __aeabi_cfcmpeq)
            pico_wrap_function(${TARGET} __aeabi_cfrcmple)
            pico_wrap_function(${TARGET} __aeabi_cfcmple)
@@ -34,32 +36,42 @@
            pico_wrap_function(${TARGET} __aeabi_fcmpge)
            pico_wrap_function(${TARGET} __aeabi_fcmpgt)
            pico_wrap_function(${TARGET} __aeabi_fcmpun)
        endif()
        if (NOT WRAP_FLOAT_NO_AEABI_CONV_32)
            pico_wrap_function(${TARGET} __aeabi_i2f)
            pico_wrap_function(${TARGET} __aeabi_l2f)
            pico_wrap_function(${TARGET} __aeabi_ui2f)
            pico_wrap_function(${TARGET} __aeabi_ul2f)
            pico_wrap_function(${TARGET} __aeabi_f2iz)
            pico_wrap_function(${TARGET} __aeabi_f2lz)
            pico_wrap_function(${TARGET} __aeabi_f2uiz)
        endif()
        if (NOT WRAP_FLOAT_NO_AEABI_CONV_64)
            pico_wrap_function(${TARGET} __aeabi_l2f)
            pico_wrap_function(${TARGET} __aeabi_ul2f)
            pico_wrap_function(${TARGET} __aeabi_f2lz)
            pico_wrap_function(${TARGET} __aeabi_f2ulz)
        endif()
        if (NOT WRAP_FLOAT_NO_AEABI_CONV_DOUBLE)
            pico_wrap_function(${TARGET} __aeabi_f2d)
        endif()
        # separate as we have a direct DCP version
        if (NOT WRAP_FLOAT_NO_SQRTF)
            pico_wrap_function(${TARGET} sqrtf)
        endif()
-        if (NOT WRAP_FLOAT_NO_WRAP_SCI)
+        if (NOT WRAP_FLOAT_NO_SCI)
            pico_wrap_function(${TARGET} cosf)
            pico_wrap_function(${TARGET} sinf)
            pico_wrap_function(${TARGET} tanf)
            pico_wrap_function(${TARGET} atan2f)
            pico_wrap_function(${TARGET} expf)
            pico_wrap_function(${TARGET} logf)
-
+            pico_wrap_function(${TARGET} sincosf) # gnu
        endif()
        if (NOT WRAP_FLOAT_NO_SCI_EXTRA)
            pico_wrap_function(${TARGET} ldexpf)
            pico_wrap_function(${TARGET} copysignf)
            pico_wrap_function(${TARGET} truncf)
            pico_wrap_function(${TARGET} floorf)
            pico_wrap_function(${TARGET} ceilf)
            pico_wrap_function(${TARGET} roundf)
            pico_wrap_function(${TARGET} sincosf) # gnu
            pico_wrap_function(${TARGET} asinf)
            pico_wrap_function(${TARGET} acosf)
            pico_wrap_function(${TARGET} atanf)
@@ -93,7 +105,9 @@
    )
    target_link_libraries(pico_float_none INTERFACE pico_float_headers)
-    wrap_float_functions(pico_float_none)
+    wrap_float_functions(pico_float_none) # we wrap all functions
    # be explicit that there should be no floating point instructions
    target_compile_options(pico_float_none INTERFACE -msoft-float)
    pico_add_library(pico_float_pico)
    if (PICO_RP2040)
@@ -107,21 +121,52 @@
        target_link_libraries(pico_float_pico INTERFACE pico_bootrom pico_float_headers hardware_divider)
    elseif(NOT PICO_RISCV)
        pico_add_library(pico_float_pico_dcp)
        # todo what functions from float_math belong in each case; should some be left to GCC on RP2350?
        target_sources(pico_float_pico_dcp INTERFACE
                ${CMAKE_CURRENT_LIST_DIR}/float_math.c
                ${CMAKE_CURRENT_LIST_DIR}/float_aeabi_dcp.S
                ${CMAKE_CURRENT_LIST_DIR}/float_common_m33.S
                ${CMAKE_CURRENT_LIST_DIR}/float_sci_m33.S
                ${CMAKE_CURRENT_LIST_DIR}/float_conv_m33.S
                )
-        wrap_float_functions(pico_float_pico_dcp NO_WRAP_AEABI)
+        # NOTE the main reason for using pico_float_pico_dcp is presumably that you
        # don't want to use VFP at all, so turn off compiler support, otherwise, it will inline usages
        target_compile_options(pico_float_pico_dcp INTERFACE -msoft-float)
        wrap_float_functions(pico_float_pico_dcp
                # we wrap all functions as we don't want to use VFP (or compiler versions) at all
                #NO_AEABI_ARITHMETIC
                #NO_AEABI_CMP
                #NO_AEABI_CONV_32
                #NO_AEABI_CONV_64
                #NO_AEABI_CONV_DOUBLE
                #NO_SQRTF
                #NO_SCI
                #NO_SCI_EXTRA
        )
        pico_add_library(pico_float_pico_vfp)
        target_sources(pico_float_pico_vfp INTERFACE
                ${CMAKE_CURRENT_LIST_DIR}/float_math.c
                ${CMAKE_CURRENT_LIST_DIR}/float_conv32_vfp.S
                ${CMAKE_CURRENT_LIST_DIR}/float_common_m33.S
                ${CMAKE_CURRENT_LIST_DIR}/float_sci_m33_vfp.S
                ${CMAKE_CURRENT_LIST_DIR}/float_conv_m33.S
        )
-        wrap_float_functions(pico_float_pico_vfp NO_WRAP_AEABI)
+        wrap_float_functions(pico_float_pico_vfp
                # for these 3, arguably the compiler is probably inlining anyway, but use the compiler's
                # version for explicit AEABI calls
                NO_AEABI_ARITHMETIC
                NO_AEABI_CMP
                NO_AEABI_CONV_32
                #NO_AEABI_CONV_64   # we have optimized M33 versions
                NO_AEABI_CONV_DOUBLE
                # we don't have an optimized vfp or m33 sqrtf available
                NO_SQRTF
                #NO_SCI             # we have optimized VFP versions
                #NO_SCI_EXTRA       # todo - are our versions better than what GCC provides?
        )
        target_link_libraries(pico_float_pico INTERFACE
                pico_float_pico_vfp)
    else()
--- a/src/rp2_common/pico_float/float_aeabi_dcp.S
+++ b/src/rp2_common/pico_float/float_aeabi_dcp.S
@@ -5,15 +5,17 @@
 */
 #include "pico/asm_helper.S"
-#if HAS_DOUBLE_COPROCESSOR
+
 #if !HAS_DOUBLE_COPROCESSOR
 #error attempt to compile float_aeabi_dcp when there is no DCP
 #else
 #include "hardware/dcp_instr.inc.S"
 #include "hardware/dcp_canned.inc.S"
 pico_default_asm_setup
-// todo alignment
+// todo factor out save/restore (there is a copy in double code)
 //__pre_init __aeabi_float_init, 00020
 // factor out save/restore (there is a copy in double code)
 .macro float_section name
 #if PICO_FLOAT_IN_RAM
@@ -29,7 +31,7 @@ float_section WRAPPER_FUNC_NAME(\func)
 // ============== STATE SAVE AND RESTORE ===============
-.macro saving_func func
+.macro saving_func type func, opt_label1='-', opt_label2='-'
  // Note we are usually 32-bit aligned already at this point, as most of the
  // function bodies contain exactly two 16-bit instructions: bmi and bx lr.
  // We want the PCMP word-aligned.
@@ -41,8 +43,14 @@ float_section WRAPPER_FUNC_NAME(\func)
  push {lr}              // 16-bit instruction
  bl generic_save_state  // 32-bit instruction
  b 1f                   // 16-bit instruction
 .ifnc \opt_label1,'-'
 regular_func \opt_label1
 .endif
 .ifnc \opt_label2,'-'
 regular_func \opt_label2
 .endif
  // This is the actual entry point:
-wrapper_func \func
+\type\()_func \func
  PCMP apsr_nzcv
  bmi 1b
 1:
@@ -82,115 +90,208 @@ generic_restore_state:
 // ============== ARITHMETIC FUNCTIONS ===============
 float_wrapper_section __aeabi_fadd
-saving_func __aeabi_fadd
+saving_func wrapper __aeabi_fadd
  dcp_fadd_m r0,r0,r1
  saving_func_return
 float_wrapper_section __aeabi_fsub
-saving_func __aeabi_fsub
+saving_func wrapper __aeabi_fsub
  dcp_fsub_m r0,r0,r1
  saving_func_return
 float_wrapper_section __aeabi_frsub
-saving_func __aeabi_frsub
+saving_func wrapper __aeabi_frsub
  dcp_fsub_m r0,r1,r0
  saving_func_return
 float_wrapper_section __aeabi_fmul
-saving_func __aeabi_fmul
+saving_func wrapper __aeabi_fmul
  dcp_fmul_m r0,r0,r1,r0,r1
  saving_func_return
 float_section fdiv_fast
-saving_func fdiv_fast
+saving_func regular fdiv_fast
  dcp_fdiv_fast_m r0,r0,r1,r0,r1,r2
  saving_func_return
 float_wrapper_section __aeabi_fdiv
-saving_func __aeabi_fdiv
+saving_func wrapper __aeabi_fdiv
@ with correct rounding
  dcp_fdiv_m r0,r0,r1,r0,r1,r2,r3
  saving_func_return
 float_section sqrtf_fast
-saving_func sqrtf_fast
+saving_func regular sqrtf_fast
  dcp_fsqrt_fast_m r0,r0,r0,r1,r2,r3
  saving_func_return
 float_wrapper_section sqrtf
-saving_func sqrtf
+saving_func wrapper sqrtf
@ with correct rounding
  dcp_fsqrt_m r0,r0,r0,r1,r2,r3
  saving_func_return
-// todo not a real thing
+float_section fclassify
-float_wrapper_section __aeabi_fclassify
+saving_func regular fclassify
-saving_func __aeabi_fclassify
  dcp_fclassify_m apsr_nzcv,r0
  saving_func_return
 // ============== CONVERSION FUNCTIONS ===============
 float_wrapper_section __aeabi_f2d
-saving_func __aeabi_f2d
+saving_func wrapper __aeabi_f2d float2double
  dcp_float2double_m r0,r1,r0
  saving_func_return
 float_wrapper_section __aeabi_i2f
-saving_func __aeabi_i2f
+saving_func  wrapper __aeabi_i2f int2float
@ with rounding
  dcp_int2float_m r0,r0
  saving_func_return
 float_wrapper_section __aeabi_ui2f
-saving_func __aeabi_ui2f
+saving_func wrapper __aeabi_ui2f uint2float
@ with rounding
  dcp_uint2float_m r0,r0
  saving_func_return
 float_section float2fix_z
 // int32_t float2fix_z(float f, int e): r0 = f, r1 = e, result in r0.
 // Scales f by 2^e by adding e to the biased exponent field, then continues in the
 // truncating float -> int conversion at float2int_z_entry.
 // NOTE(review): float2int_z_entry lies after the saving_func entry sequence of
 // __aeabi_f2iz - confirm that bypassing that sequence from here is intended.
 regular_func float2fix_z
  ubfx r2, r0, #23, #8 // r2 = biased exponent field of f
  cbz r2, 2f // input is zero or denormal
  cmp r2, #0xff
  beq 3f // input infinite or nan
  adds r2, r1 // r2 = biased exponent of f * 2^e
  ble 2f // modified input is denormal so zero
  cmp r2, #0xff
  // >= rather than ==: a sum above 0xff would otherwise be truncated to 8 bits by
  // the bfi below, turning an out-of-range input into a small value
  bge 3f // modified input is infinite (or beyond)
 1:
  bfi r0, r2, #23, #8 // write the adjusted exponent back into f
  b float2int_z_entry
 2:
  movs r0, #0
  bx lr
 3:
  mvn r1, #0x80000000 // r1 = 0x7fffffff
  add r0, r1, r0, lsr#31 @ so -Inf → 0x80000000, +Inf → 0x7fffffff
  bx lr
 float_wrapper_section __aeabi_f2iz
-saving_func __aeabi_f2iz
+saving_func wrapper __aeabi_f2iz float2int_z
@ with truncation towards 0
 float2int_z_entry:
  dcp_float2int_m r0,r0
  saving_func_return
 float_section __aeabi_f2ufix
 // uint32_t float2ufix(float f, int e) / float2ufix_z(float f, int e):
 // r0 = f, r1 = e, result in r0. Both names share one body.
 // Scales f by 2^e by adding e to the biased exponent field, then continues in the
 // truncating float -> uint conversion at float2uint_z_entry.
 // NOTE(review): float2uint_z_entry lies after the saving_func entry sequence of
 // __aeabi_f2uiz - confirm that bypassing that sequence from here is intended.
 regular_func float2ufix
 regular_func float2ufix_z
  ubfx r2, r0, #23, #8 // r2 = biased exponent field of f
  cbz r2, 2f // input is zero or denormal
  cmp r2, #0xff
  beq 3f // input infinite or nan
  adds r2, r1 // r2 = biased exponent of f * 2^e
  ble 2f // modified input is denormal so zero
  cmp r2, #0xff
  // >= rather than ==: a sum above 0xff would otherwise be truncated to 8 bits by
  // the bfi below, turning an out-of-range input into a small value
  bge 3f // modified input is infinite (or beyond)
 1:
  bfi r0, r2, #23, #8 // write the adjusted exponent back into f
  b float2uint_z_entry
 2:
  movs r0, #0
  bx lr
 3:
  mvn r0, r0, asr #31 // sign clear → 0xffffffff, sign set → 0
  bx lr
 float_wrapper_section __aeabi_f2uiz
-saving_func __aeabi_f2uiz
+saving_func wrapper __aeabi_f2uiz float2uint_z float2uint
@ with truncation towards 0
 float2uint_z_entry:
  dcp_float2uint_m r0,r0
  saving_func_return
-// todo not a real thing
+float_section conv_f2fix
 // int32_t float2fix(float f, int e): r0 = f, r1 = e, result in r0.
 // Scales f by 2^e by adding e to the biased exponent field, then continues in
 // float2int (at float2int_entry), which rounds towards -infinity.
 // NOTE(review): the early-out paths below leave with a plain `bx lr` although the
 // entry is declared through saving_func - confirm this matches saving_func_return.
 saving_func regular float2fix
  ubfx r2, r0, #23, #8 // r2 = biased exponent field of f
  cbz r2, 2f // input is zero or denormal
  cmp r2, #0xff
  beq 3f // input infinite or nan
  adds r2, r1 // r2 = biased exponent of f * 2^e
  ble 2f // modified input is denormal so zero
  cmp r2, #0xff
  // >= rather than ==: a sum above 0xff would otherwise be truncated to 8 bits by
  // the bfi below, turning an out-of-range input into a small value
  bge 3f // modified input is infinite (or beyond)
 1:
  bfi r0, r2, #23, #8 // write the adjusted exponent back into f
  b float2int_entry
 2:
  movs r0, #0
  bx lr
 3:
  mvn r1, #0x80000000 // r1 = 0x7fffffff
  add r0, r1, r0, lsr#31 @ so -Inf → 0x80000000, +Inf → 0x7fffffff
  bx lr
 float_section float2int
 // int32_t float2int(float f): float -> int32, rounding towards -infinity.
 // Inputs for which truncation already gives the right answer (positive, zero,
 // denormal, integral) are forwarded to float2int_z_entry; everything else is
 // converted here and then decremented by one.
 saving_func regular float2int
 float2int_entry:
  lsls r1, r0, #1 // shift out the sign: C = sign bit, Z = input is +/-0
  // r0 = abs(zero)                   => r1 = 0x00000000
  // r0 = abs(denormal)               => r1 = 0x00xxxxxx
  // r0 = abs(1.0f)                   => r1 = 0x7f000000
  // r0 = abs(inf/nan)                => r1 = 0xffxxxxxx
  bls float2int_z_entry // input positive or zero or -zero are ok for int_z
  lsrs r1, #24 // r1 = biased exponent
  beq float2int_z_entry // input denormal is flushed to zero anyway
  subs r1, #0x7f // r1 = unbiased exponent
  bcc 1f // abs(input) < 1.0f means we need to subtract 1 after conversion
  // mask off all but fractional bits
  lsls r2, r0, r1
  lsls r2, #9
  beq float2int_z_entry // input is integer
 1:
  // negative non-integer input: convert, then subtract 1 to round down
  // NOTE(review): the raw DCP sequence below is presumably the truncating
  // float -> int conversion (cf. dcp_float2int_m) - confirm against dcp_instr.inc.S
  WXFC r0, r0
  ADD0
  ADD1
  NTDC
  RDIC r0
  subs r0, #1
 saving_func_return
 #if 0 // not sure these are super useful; if they are we should give them names
 float_wrapper_section __aeabi_f2i_r
-saving_func __aeabi_f2i_r
+// (not a real thing - kept because we use wrapper in saving_func)
 saving_func wrapper __aeabi_f2i_r
@ with rounding
  dcp_float2int_r_m r0,r0
  saving_func_return
 // todo not a real thing
 float_wrapper_section __aeabi_f2ui_r
-saving_func __aeabi_f2ui_r
+// (not a real thing - kept because we use wrapper in saving_func)
 saving_func wrapper __aeabi_f2ui_r
@ with rounding
  dcp_float2uint_r_m r0,r0
  saving_func_return
 #endif
 // ============== COMPARISON FUNCTIONS ===============
 float_wrapper_section __aeabi_fcmpun
-saving_func __aeabi_fcmpun
+saving_func wrapper __aeabi_fcmpun
  dcp_fcmp_m r0,r0,r1
  // extract unordered bit
  ubfx r0, r0, #28, #1
  saving_func_return
 float_wrapper_section __aeabi_fcmp
-saving_func __aeabi_cfrcmple
+saving_func wrapper __aeabi_cfrcmple
  dcp_fcmp_m apsr_nzcv,r1,r0 // with arguments reversed
  bvs cmp_nan
  saving_func_return
 // these next two can be the same function in the absence of exceptions
-saving_func __aeabi_cfcmple
+saving_func wrapper __aeabi_cfcmple
  dcp_fcmp_m apsr_nzcv,r0,r1
  bvs cmp_nan
  saving_func_return
@@ -198,7 +299,7 @@ saving_func __aeabi_cfcmple
 // It is not clear from the ABI documentation whether cfcmpeq must set the C flag
 // in the same way as cfcmple. If not, we could save the "bvs" below; but we
 // err on the side of caution.
-saving_func __aeabi_cfcmpeq
+saving_func wrapper __aeabi_cfcmpeq
  dcp_fcmp_m apsr_nzcv,r0,r1
  bvs cmp_nan
  saving_func_return
@@ -212,14 +313,14 @@ cmp_nan:
  saving_func_return
 float_wrapper_section __aeabi_fcmpeq
-saving_func __aeabi_fcmpeq
+saving_func wrapper __aeabi_fcmpeq
  dcp_fcmp_m r0,r0,r1
  // extract Z
  ubfx r0, r0, #30, #1
  saving_func_return
 float_wrapper_section __aeabi_fcmplt
-saving_func __aeabi_fcmplt
+saving_func wrapper __aeabi_fcmplt
  dcp_fcmp_m apsr_nzcv,r1,r0
  ite hi
  movhi r0,#1
@@ -227,7 +328,7 @@ saving_func __aeabi_fcmplt
  saving_func_return
 float_wrapper_section __aeabi_fcmple
-saving_func __aeabi_fcmple
+saving_func wrapper __aeabi_fcmple
  dcp_fcmp_m apsr_nzcv,r1,r0
  ite hs
  movhs r0,#1
@@ -235,7 +336,7 @@ saving_func __aeabi_fcmple
  saving_func_return
 float_wrapper_section __aeabi_fcmpge
-saving_func __aeabi_fcmpge
+saving_func wrapper __aeabi_fcmpge
  dcp_fcmp_m apsr_nzcv,r0,r1
  ite hs
  movhs r0,#1
@@ -243,7 +344,7 @@ saving_func __aeabi_fcmpge
  saving_func_return
 float_wrapper_section __aeabi_fcmpgt
-saving_func __aeabi_fcmpgt
+saving_func wrapper __aeabi_fcmpgt
  dcp_fcmp_m apsr_nzcv,r0,r1
  ite hi
  movhi r0,#1
--- a/src/rp2_common/pico_float/float_aeabi_rp2040.S
+++ b/src/rp2_common/pico_float/float_aeabi_rp2040.S
@@ -471,17 +471,36 @@ float_section float2int
 regular_func float2int
    shimmable_table_tail_call SF_TABLE_FLOAT2INT float2int_shim
 float_section float2fix_z
 // int32_t float2fix_z(float f, int e): r0 = f, r1 = e, result in r0.
 // Round-towards-zero variant built on the existing conversions: non-negative
 // inputs go straight to float2fix; negative inputs are converted as abs(f)
 // with float2ufix_z and the result is negated.
 regular_func float2fix_z
    cmn r0, r0 // carry = sign bit of f
    bcc float2fix // sign clear: tail-call float2fix
    push {lr}
    lsls r0, #1 // clear the sign bit: r0 = abs(f), r1 (e) is untouched
    lsrs r0, #1
    bl float2ufix_z
    cmp r0, #0
    bmi 1f // magnitude >= 0x80000000 does not fit: clamp below
    negs r0, r0 // restore the sign
    pop {pc}
 1:
    movs r0, #128 // r0 = 0x80000000
    lsls r0, #24
    pop {pc}
 float_section float2fix
 regular_func float2fix
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX float2fix_shim
 float_section float2ufix
 regular_func float2ufix
 regular_func float2ufix_z
    table_tail_call SF_TABLE_FLOAT2UFIX
 // unsigned FUNC_NAME(__aeabi_f2uiz)(float)             float (single precision) to unsigned C-style conversion [3]
 float_wrapper_section __aeabi_f2uiz
 wrapper_func __aeabi_f2uiz
 regular_func float2uint
 regular_func float2uint_z
    table_tail_call SF_TABLE_FLOAT2UINT
@@ -530,10 +549,11 @@ wrapper_func __aeabi_f2lz
 regular_func float2int64_z
    cmn r0, r0
    bcc float2int64
+    movs r1, #0
+float2fix64_z_neg:
    push {lr}
    lsls r0, #1
    lsrs r0, #1
-    movs r1, #0
    bl float2ufix64
    cmp r1, #0
    bmi 1f
@@ -553,17 +573,24 @@ regular_func float2int64
    shimmable_table_tail_call SF_TABLE_FLOAT2INT64 float2int64_shim
 float_section float2fix64
 regular_func float2fix64_z
    cmn r0, r0
    bcs float2fix64_z_neg
    // fall thru
 regular_func float2fix64
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 float2fix64_shim
 // unsigned long long FUNC_NAME(__aeabi_f2ulz)(float)     float to unsigned long long C-style conversion [3]
 float_wrapper_section __aeabi_f2ulz
 wrapper_func __aeabi_f2ulz
 regular_func float2uint64
 regular_func float2uint64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 float2uint64_shim
 float_section float2ufix64
 regular_func float2ufix64
 regular_func float2ufix64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 float2ufix64_shim
 float_wrapper_section __aeabi_f2d
--- a/src/rp2_common/pico_float/float_common_m33.S
+++ b/src/rp2_common/pico_float/float_common_m33.S
@@ -241,7 +241,52 @@ regular_func ufix642float
 bxlo r14
 b 3b
-float_wrapper_section conv_ftoi64
+float_section conv_ftoi64
 // int64_t float2int64(float f): float -> int64, rounding towards -infinity.
 // r0 = f, result in r0 (low) : r1 (high).
 // Inputs for which truncation is already correct are tail-called into
 // float2int64_z; otherwise the truncated result is decremented by one.
 regular_func float2int64
  lsls r1, r0, #1 // shift out the sign: C = sign bit, Z = input is +/-0
  // r0 = abs(zero)                   => r1 = 0x00000000
  // r0 = abs(denormal)               => r1 = 0x00xxxxxx
  // r0 = abs(1.0f)                   => r1 = 0x7f000000
  // r0 = abs(inf/nan)                => r1 = 0xffxxxxxx
  bls float2int64_z // positive or zero or -zero are ok for int64_z
  lsrs r1, #24 // r1 = biased exponent
  subs r1, #0x7f // r1 = unbiased exponent
  bcc 1f // <1 means subtract 1
  // mask off all but fractional bits
  lsls r2, r0, r1
  lsls r2, #9
  beq float2int64_z // integer
 1:
  push {lr}
  bl float2int64_z
  subs r0, #1 // 64-bit decrement of r1:r0
  sbcs r1, r1, #0
  pop {pc}
 float_section conv_ftof64
 // int64_t float2fix64(float f, int e): float -> signed 64-bit fixed point,
 // rounding towards -infinity. r0 = f, r1 = e, result in r0 (low) : r1 (high).
 // Inputs for which truncation is already correct are tail-called into
 // float2fix64_z; otherwise the truncated result is decremented by one.
 regular_func float2fix64
  lsls r2, r0, #1 // shift out the sign: C = sign bit, Z = input is +/-0
  // r0 = abs(zero)                   => r2 = 0x00000000
  // r0 = abs(denormal)               => r2 = 0x00xxxxxx
  // r0 = abs(1.0f)                   => r2 = 0x7f000000
  // r0 = abs(inf/nan)                => r2 = 0xffxxxxxx
  bls float2fix64_z // positive or zero or -zero are ok for fix64_z
  lsrs r2, #24 // r2 = biased exponent
  rsbs r3, r1, #0x7f // r3 = exponent bias - e
  subs r2, r3 // r2 = unbiased exponent of f * 2^e
  bcc 1f // <1 means subtract 1
  // mask off all but fractional bits
  lsls r2, r0, r2
  lsls r2, #9
  beq float2fix64_z // integer
 1:
  push {lr}
  bl float2fix64_z
  subs r0, #1 // 64-bit decrement of r1:r0
  sbcs r1, r1, #0
  pop {pc}
 float_wrapper_section conv_ftoi64z
@ convert float to signed int64, rounding towards 0, clamping
 wrapper_func __aeabi_f2lz
@@ -318,7 +363,7 @@ regular_func float2uint64_z
 movs r1,#0      @ fall through
@ convert float in r0 to unsigned fixed point in r0:r1, clamping
 regular_func float2ufix64
-//regular_func float2ufix64_z
+regular_func float2ufix64_z
 subs r1,#0x96 @ remove exponent bias, compensate for mantissa length
 asrs r2,r0,#23 @ sign and exponent
 sub r3,r2,#1
--- a/src/rp2_common/pico_float/float_conv32_vfp.S
+++ b/src/rp2_common/pico_float/float_conv32_vfp.S
@@ -0,0 +1,106 @@
 /*
 * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */
 #if !PICO_RP2040
 #include "pico/asm_helper.S"
 pico_default_asm_setup
 .macro float_section name
 #if PICO_FLOAT_IN_RAM
 .section RAM_SECTION_NAME(\name), "ax"
 #else
 .section SECTION_NAME(\name), "ax"
 #endif
 .endm
 float_section int2float
 // float int2float(int32_t i): r0 = i, result (float bits) in r0.
 // Single VFP conversion; s15 is used as scratch.
 regular_func int2float
 	vmov s15, r0
 	vcvt.f32.s32 s15, s15 // signed 32-bit integer -> float
 	vmov r0, s15
 	bx lr
 float_section uint2float
 // float uint2float(uint32_t i): r0 = i, result (float bits) in r0.
 // Single VFP conversion; s15 is used as scratch.
 regular_func uint2float
 	vmov s15, r0
 	vcvt.f32.u32 s15, s15 // unsigned 32-bit integer -> float
 	vmov r0, s15
 	bx lr
 float_section float2int
 // int32_t float2int(float f): r0 = f (float bits), result in r0.
 // Uses VCVTM, i.e. the conversion rounds towards -infinity.
 regular_func float2int
 	vmov s15, r0
 	vcvtm.s32.f32 s15, s15 // float -> signed 32-bit integer, round towards -infinity
 	vmov r0, s15
 	bx lr
 float_section float2int_z
 // int32_t float2int_z(float f): r0 = f (float bits), result in r0.
 // Uses plain VCVT, i.e. the conversion rounds towards zero.
 regular_func float2int_z
 	vmov s15, r0
 	vcvt.s32.f32 s15, s15 // float -> signed 32-bit integer, round towards zero
 	vmov r0, s15
 	bx lr
 float_section float2uint
 // uint32_t float2uint(float f) / float2uint_z(float f): r0 = f (float bits),
 // result in r0. Both names share one body, which uses the round-towards-zero VCVT.
 regular_func float2uint
 regular_func float2uint_z
 	vmov s15, r0
 	vcvt.u32.f32 s15, s15 // float -> unsigned 32-bit integer, round towards zero
 	vmov r0, s15
 	bx lr
 float_section float2fix_z
 // int32_t float2fix_z(float f, int e): r0 = f, r1 = e, result in r0.
 // Adds e to the biased exponent field of f (scaling by 2^e), clamping the new
 // exponent to the 0..0xff range, then tail-calls float2int_z for the conversion.
 regular_func float2fix_z
  ubfx r2, r0, #23, #8 // r2 = biased exponent field of f
  adds r2, r1 // r2 = biased exponent of f * 2^e
  asrs r3, r2, #8 // Z set when 0 <= r2 <= 0xff, N set when r2 < 0
  beq 1f // already representable
  ite pl
  movpl r2, #0xff // too large: clamp to the maximum exponent field
  movmi r2, #0 // too small: clamp to a zero exponent field
 1:
  bfi r0, r2, #23, #8 // write the adjusted exponent back into f
  b float2int_z
 float_section float2fix
 // int32_t float2fix(float f, int e): float -> signed 32-bit fixed point with e
 // fractional bits, rounding towards -infinity. r0 = f, r1 = e, result in r0.
 // Inputs for which truncation is already correct are tail-called into
 // float2fix_z; otherwise the truncated result is decremented by one.
 regular_func float2fix
  lsls r2, r0, #1 // shift out the sign: C = sign bit, Z = input is +/-0
  // r0 = abs(zero)                   => r2 = 0x00000000
  // r0 = abs(denormal)               => r2 = 0x00xxxxxx
  // r0 = abs(1.0f)                   => r2 = 0x7f000000
  // r0 = abs(inf/nan)                => r2 = 0xffxxxxxx
  bls float2fix_z // input positive or zero or -zero are ok for fix_z
  lsrs r2, #24 // r2 = biased exponent
  beq float2fix_z // input denormal will be flushed to zero
  rsbs r3, r1, #0x7f // r3 = exponent bias - e
  subs r2, r3 // r2 = unbiased exponent of f * 2^e
  bcc 1f // scaled input <1.0f means we need to subtract 1
  // mask off all but fractional bits
  lsls r2, r0, r2
  lsls r2, #9
  beq float2fix_z // input is integer
 1:
  push {lr}
  bl float2fix_z
  subs r0, #1 // result is 32-bit (r0 only), so there is no high word to borrow from
  pop {pc}
 float_section float2ufix
 // uint32_t float2ufix(float f, int e) / float2ufix_z(float f, int e):
 // r0 = f, r1 = e, result in r0. Both names share one body.
 // Adds e to the biased exponent field of f (scaling by 2^e), clamping the new
 // exponent to the 0..0xff range, then tail-calls float2uint_z for the conversion.
 regular_func float2ufix
 regular_func float2ufix_z
  ubfx r2, r0, #23, #8 // r2 = biased exponent field of f
  adds r2, r1 // r2 = biased exponent of f * 2^e
  asrs r3, r2, #8 // Z set when 0 <= r2 <= 0xff, N set when r2 < 0
  beq 1f // already representable
  ite pl
  movpl r2, #0xff // too large: clamp to the maximum exponent field
  movmi r2, #0 // too small: clamp to a zero exponent field
 1:
  bfi r0, r2, #23, #8 // write the adjusted exponent back into f
  b float2uint_z
 #endif
--- a/src/rp2_common/pico_float/include/pico/float.h
+++ b/src/rp2_common/pico_float/include/pico/float.h
@@ -21,68 +21,296 @@ extern "C" {
 *
 * \brief Optimized single-precision floating point functions
 *
-* (Replacement) optimized implementations are provided for the following compiler built-ins
+* An application can take control of the floating point routines used in the application over and above what is provided by the compiler,
-* and math library functions on Arm:
+* by depending on the pico_float library. A user might want to do this
 *
-* - __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub, __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun, __aeabi_i2f, __aeabi_l2f, __aeabi_ui2f, __aeabi_ul2f, __aeabi_f2iz, __aeabi_f2lz, __aeabi_f2uiz, __aeabi_f2ulz, __aeabi_f2d, sqrtf, cosf, sinf, tanf, atan2f, expf, logf
+* 1. To use optimized software implementations provided by the RP2-series device's bootrom or the SDK
-* - ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
+* 2. To use optimized combined software/hardware implementations utilizing custom RP2-series hardware for acceleration
-* - powintf, sincosf (GNU extensions)
+* 3. To control the amount of C compiler/library code bloat
 * 4. To make sure no floating point is called at all
 *
-* The following additional optimized functions are also provided:
+* The pico_float library comes in three main flavors:
 *
-* - int2float, uint2float, int642float, uint642float, fix2float, ufix2float, fix642float, ufix642float
+* 1. `pico_float_none` - all floating point operations cause a \ref panic - no single-precision floating point code is included
-* - float2fix, float2ufix, float2fix64, float2ufix64, float2int, float2uint, float2int64, float2uint64, float2int_z, float2int64_z, float2uint_z, float2uint64_z
+* 2. `pico_float_compiler` - no custom functions are provided; all single-precision floating point is handled by the C compiler/library
-* - exp10f, sincosf, powintf
+* 3. `pico_float_pico` - the smallest and fastest available for the platform, along with additional functionality (e.g. fixed point conversions) which are detailed below
 *
-* On RP2350 (Arm) the following additional functions are available; the _fast methods are faster but do not round correctly
+* The user can control which version they want (e.g. **pico_float_xxx**) by either setting the CMake global variable
 * `PICO_DEFAULT_FLOAT_IMPL=xxx`, or by using the CMake function `pico_set_float_implementation(<TARGET> xxx)`. Note that in the absence
 * of either, pico_float_pico is used by default.
 *
-* - float2fix64_z, fdiv_fast, fsqrt_fast,
+* \if rp2040_specific
 * On RP2040, `pico_float_pico` uses optimized hand coded implementations from the bootrom and the SDK for both
 * basic single-precision floating point operations and floating point math library functions. These implementations
 * are generally faster and smaller than those provided by the C compiler/library, though they don't support all the features of a fully compliant
 * floating point implementation; they are however usually fine for the majority of cases
 * \endif
 *
-* On RP2350 RISC-V, only a small number of compiler runtime functions are overridden with faster implementations:
+* \if rp2350_specific
 * On Arm on RP2350, there are multiple options for `pico_float_pico`:
 *
-* - __addsf3, __subsf3, __mulsf3
+* 1. `pico_float_pico_vfp` - this library leaves basic C single-precision floating point operations to the compiler
 * which can use inlined VFP (Arm FPU) code. Custom optimized versions of trigonometric and scientific functions are provided.
 * No DCP (RP2350 Double co-processor) instructions are used.
 * 2. `pico_float_pico_dcp` - this library prevents the compiler injecting inlined VFP code, and also implements
 * all single-precision floating point operations in optimized DCP or M33 code. This option is not quite as fast
 * as pico_float_pico_vfp, however it allows floating point operations without enabling the floating point co-processor
 * on the CPU; this can be beneficial in certain circumstances, e.g. where leaving stack in tasks or interrupts
 * for the floating point state is undesirable.
 *
 * Note: `pico_float_pico` is equivalent to `pico_float_pico_vfp` on RP2350, as this is the most sensible default
 * \endif
 *
 * On Arm, (replacement) optimized implementations are provided for the following compiler built-ins
 * and math library functions when using `_pico` variants of `pico_float`:
 *
 * - basic arithmetic: (except `pico_float_pico_vfp`)
 *
 *   __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub
 *
 * - comparison: (except `pico_float_pico_vfp`)
 *
 *   __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun
 *
 * - (u)int32 <-> float: (except `pico_float_pico_vfp`)
 *
 *    __aeabi_i2f, __aeabi_ui2f, __aeabi_f2iz, __aeabi_f2uiz
 *
 * - (u)int64 <-> float: (except `pico_float_pico_vfp`)
 *
 *   __aeabi_l2f, __aeabi_ul2f, __aeabi_f2lz, __aeabi_f2ulz
 *
 * - float -> double: (except `pico_float_pico_vfp`)
 *
 *   __aeabi_f2d
 *
 * - basic trigonometric:
 *
 *   sqrtf, cosf, sinf, tanf, atan2f, expf, logf
 *
 * - trigonometric and scientific
 *
 *   ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
 *
 * - GNU extensions:
 *
 *   powintf, sincosf
 *
 * On Arm, the following additional optimized functions are also provided (when using `_pico` variants of `pico_float`):
 *
 * - Conversions to/from integer types:
 *
 *   - (u)int -> float (round to nearest):
 *
 *     int2float, uint2float, int642float, uint642float
 *
 *     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they map to inline VFP code
 *
 *   - float -> (u)int (round towards zero):
 *
 *     float2int_z, float2uint_z, float2int64_z, float2uint64_z
 *
 *     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they map to inline VFP code
 *
 *   - float -> (u)int (round towards -infinity):
 *
 *     float2int, float2uint, float2int64, float2uint64
 *
 * - Conversions to/from fixed point integers:
 *
 *   - (u)fix -> float (round to nearest):
 *
 *       fix2float, ufix2float, fix642float, ufix642float
 *
 *   - float -> (u)fix (round towards zero):
 *
 *       float2fix_z, float2ufix_z, float2fix64_z, float2ufix64_z
 *
 *     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they can map to inline VFP code
 *     when the number of fractional bits is a compile time constant between 1 and 32
 *
 *   - float -> (u)fix (round towards -infinity):
 *
 *       float2fix, float2ufix, float2fix64, float2ufix64
 *
 *     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they can map to inline VFP code
 *     when the number of fractional bits is a compile time constant between 1 and 32
 *
 * - Even faster versions of divide and square-root functions that do not round correctly: (`pico_float_pico_dcp` only)
 *
 *   fdiv_fast, sqrtf_fast
 *
 * \if rp2350_specific
 * On RISC-V, (replacement) optimized implementations are provided for the following compiler built-ins when using the `pico_float_pico`
 * library (note that there are no variants of this library like there are on Arm):
 *
 * - basic arithmetic:
 *
 *   __addsf3, __subsf3, __mulsf3
 * \endif
 */
 // None of these functions are available on RISC-V:
 #if !defined(__riscv) || PICO_COMBINED_DOCS
-float int2float(int32_t f);
+#if PICO_COMBINED_DOCS || !LIB_PICO_FLOAT_COMPILER
-float uint2float(uint32_t f);
+float int2float(int32_t i);
-float int642float(int64_t f);
+float uint2float(uint32_t i);
-float uint642float(uint64_t f);
+float int642float(int64_t i);
 float uint642float(uint64_t i);
 float fix2float(int32_t m, int e);
 float ufix2float(uint32_t m, int e);
 float fix642float(int64_t m, int e);
 float ufix642float(uint64_t m, int e);
-// These methods round towards -Infinity.
+// These methods round towards 0, which IS the C way
 int32_t float2fix(float f, int e);
 uint32_t float2ufix(float f, int e);
 int64_t float2fix64(float f, int e);
 uint64_t float2ufix64(float f, int e);
 int32_t float2int(float f);
 uint32_t float2uint(float f);
 int64_t float2int64(float f);
 uint64_t float2uint64(float f);
 // These methods round towards 0.
 // The unsigned conversions return unsigned types, matching float2uint/float2uint64.
 int32_t float2int_z(float f);
 int64_t float2int64_z(float f);
 uint32_t float2uint_z(float f);
 uint64_t float2uint64_z(float f);
 // fixed point variants: e is the number of fractional bits in the result
 int32_t float2fix_z(float f, int e);
 uint32_t float2ufix_z(float f, int e);
 int64_t float2fix64_z(float f, int e);
 uint64_t float2ufix64_z(float f, int e);
 // These methods round towards -Infinity - which IS NOT the C way for negative numbers;
 // as such the naming is not ideal, however is kept for backwards compatibility
 int32_t float2int(float f);
 uint32_t float2uint(float f);
 int64_t float2int64(float f);
 uint64_t float2uint64(float f);
 int32_t float2fix(float f, int e);
 uint32_t float2ufix(float f, int e);
 int64_t float2fix64(float f, int e);
 uint64_t float2ufix64(float f, int e);
 #if LIB_PICO_FLOAT_PICO_VFP
 // a bit of a hack to inline VFP fixed point conversion when exponent is constant and in range 1-32
 #define fix2float(m, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _fix2float_inline(m, e) : fix2 ## float(m, e), fix2 ## float(m, e))
 #define ufix2float(m, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _ufix2float_inline(m, e) : ufix2 ## float(m, e), ufix2 ## float(m, e))
 #define float2fix_z(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2fix_z_inline(f, e) : float2 ## fix_z(f, e), float2 ## fix_z(f, e))
 #define float2ufix_z(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2ufix_z_inline(f, e) : float2 ## ufix_z(f, e), float2 ## ufix_z(f, e))
 #define float2fix(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2fix_inline(f, e) : float2 ## fix(f, e), float2 ## fix(f, e))
 #define float2ufix(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2ufix_inline(f, e) : float2 ## ufix(f, e), float2 ## ufix(f, e))
 #define _fix2float_inline(m, e) ({ \
    int32_t _m = m; \
    float f; \
    pico_default_asm( \
        "vmov %0, %1\n" \
        "vcvt.f32.s32 %0, %0, %2\n" \
        : "=t" (f) \
        : "r" (_m), "i" (e) \
    ); \
    f; \
 })
 #define _ufix2float_inline(m, e) ({ \
    uint32_t _m = m; \
    float f; \
    pico_default_asm( \
        "vmov %0, %1\n" \
        "vcvt.f32.u32 %0, %0, %2\n" \
        : "=t" (f) \
        : "r" (_m), "i" (e) \
    ); \
    f; \
 })
 #define _float2fix_z_inline(f, e) ({ \
    int32_t _m; \
    float _f = (f); \
    pico_default_asm( \
        "vcvt.s32.f32 %0, %0, %2\n" \
        "vmov %1, %0\n" \
        : "+t" (_f), "=r" (_m) \
        : "i" (e) \
    ); \
    _m; \
 })
 #define _float2ufix_z_inline(f, e) ({ \
    uint32_t _m; \
    float _f = (f); \
    pico_default_asm( \
        "vcvt.u32.f32 %0, %0, %2\n" \
        "vmov %1, %0\n" \
        : "+t" (_f), "=r" (_m) \
        : "i" (e) \
    ); \
    _m; \
 })
 #define _float2fix_z_inline(f, e) ({ \
    int32_t _m; \
    float _f = (f); \
    pico_default_asm( \
        "vcvt.s32.f32 %0, %0, %2\n" \
        "vmov %1, %0\n" \
        : "+t" (_f), "=r" (_m) \
        : "i" (e) \
    ); \
    _m; \
 })
 // float -> signed 32-bit fixed point with e fractional bits, rounding towards -infinity:
 // the VFP conversion truncates towards zero, then 1 is subtracted for negative non-integral inputs
 #define _float2fix_inline(f, e) ({ \
    union { float _f; int32_t _i; } _u; \
    _u._f = (f); \
    uint rc, tmp; \
    pico_default_asm( \
        "vcvt.s32.f32 %0, %0, %4\n" \
        "vmov %2, %0\n" \
        "lsls %1, #1\n" \
        "bls 2f\n" /* positive or zero or -zero are ok with the result we have */ \
        "lsrs %3, %1, #24\n" \
        "subs %3, #0x7f - %c4\n" \
        "bcc 1f\n" /* negative with abs(f) < 2^-e, so need to round down */ \
        /* mask off all but fractional bits */ \
        "lsls %1, %3\n" \
        "lsls %1, #8\n" \
        "beq 2f\n" /* integers can round towards zero */ \
        "1:\n" \
        /* need to subtract 1 from the result to round towards -infinity... */ \
        /* this will never cause an overflow, because to get here we must have had a non integer/infinite value which */ \
        /* therefore cannot have been equal to INT32_MIN when rounded towards zero */ \
        "subs %2, #1\n" \
        "2:\n" \
        : "+t" (_u._f), "+r" (_u._i), "=r" (rc), "=r" (tmp) \
        : "i" (e) \
    ); \
    rc; \
 })
 #define _float2ufix_inline(f, e) _float2ufix_z_inline((f), (e))
 #endif
 #if LIB_PICO_FLOAT_PICO_VFP
 // may as well provide inline macros for VFP
 #define int2float(i) ((float)(int32_t)(i))
 #define uint2float(i) ((float)(uint32_t)(i))
 #define float2int_z(f) ((int32_t)(f))
 #define float2uint_z(f) ((uint32_t)(f))
 #endif
 #endif
 float exp10f(float x);
 void sincosf(float x, float *sinx, float *cosx);
 float powintf(float x, int y);
 #if !PICO_RP2040 || PICO_COMBINED_DOCS
 int64_t float2fix64_z(float f, int e);
 float fdiv_fast(float n, float d);
-float fsqrt_fast(float f);
+float sqrtf_fast(float f);
 #endif
 #endif
 #if defined(__riscv) || LIB_PICO_FLOAT_COMPILER
 // when using the compiler or RISC-V, we provide as many functions as we trivially can - these will be efficient
 // when using hard-float on Arm
 // integer -> float conversions: plain C casts
 static inline float int2float(int32_t i) { return (float)i; }
 static inline float uint2float(uint32_t i) { return (float)i; }
 static inline float int642float(int64_t i) { return (float)i; }
 static inline float uint642float(uint64_t i) { return (float)i; }
 // float -> integer conversions: C casts truncate towards zero
 static inline int32_t float2int_z(float f) { return (int32_t)f; }
 static inline int64_t float2int64_z(float f) { return (int64_t)f; }
 // unsigned results are returned as unsigned types so that values above
 // INT32_MAX / INT64_MAX are not reinterpreted as negative by the caller
 static inline uint32_t float2uint_z(float f) { return (uint32_t)f; }
 static inline uint64_t float2uint64_z(float f) { return (uint64_t)f; }
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/test/pico_float_test/BUILD.bazel
+++ b/test/pico_float_test/BUILD.bazel
@@ -85,3 +85,12 @@ filegroup(
    name = "m33",
    srcs = ["m33.c"],
 )
 # TODO: Add these tests to the Bazel build.
 # For now the two custom conversion test sources are only listed in this filegroup;
 # no Bazel test/binary target compiles them.
 filegroup(
    name = "unsupported_tests",
    srcs = [
        "custom_double_funcs_test.c",
        "custom_float_funcs_test.c",
    ],
 )
--- a/test/pico_float_test/CMakeLists.txt
+++ b/test/pico_float_test/CMakeLists.txt
@@ -79,4 +79,31 @@ else ()
        target_link_libraries(m33 pico_double pico_stdlib)
        pico_add_extra_outputs(m33)
    endif()
 endif()
 # Implementations to build the custom conversion tests against: "compiler" and "pico"
 # are always built, and Arm RP2350 builds add the VFP and DCP float variants.
 set(FLOAT_TYPES compiler pico)
 set(DOUBLE_TYPES compiler pico)
 if (PICO_RP2350 AND NOT PICO_RISCV)
    list(APPEND FLOAT_TYPES pico_vfp pico_dcp)
 endif()
 # One executable per float implementation, named custom_float_funcs_test_<impl>
 foreach (FLOAT_IMPL IN LISTS FLOAT_TYPES)
    add_executable(custom_float_funcs_test_${FLOAT_IMPL} custom_float_funcs_test.c)
    pico_set_float_implementation(custom_float_funcs_test_${FLOAT_IMPL} ${FLOAT_IMPL})
    target_link_libraries(custom_float_funcs_test_${FLOAT_IMPL} PRIVATE pico_stdlib)
    pico_add_extra_outputs(custom_float_funcs_test_${FLOAT_IMPL})
    pico_set_printf_implementation(custom_float_funcs_test_${FLOAT_IMPL} compiler)
 endforeach ()
 # One executable per double implementation, named custom_double_funcs_test_<impl>
 foreach (DOUBLE_IMPL IN LISTS DOUBLE_TYPES)
    add_executable(custom_double_funcs_test_${DOUBLE_IMPL} custom_double_funcs_test.c)
    pico_set_double_implementation(custom_double_funcs_test_${DOUBLE_IMPL} ${DOUBLE_IMPL})
    target_link_libraries(custom_double_funcs_test_${DOUBLE_IMPL} PRIVATE pico_stdlib)
    pico_add_extra_outputs(custom_double_funcs_test_${DOUBLE_IMPL})
    pico_set_printf_implementation(custom_double_funcs_test_${DOUBLE_IMPL} compiler)
 endforeach ()
--- a/test/pico_float_test/custom_double_funcs_test.c
+++ b/test/pico_float_test/custom_double_funcs_test.c
@@ -0,0 +1,515 @@
 #include <stdio.h>
 #include "pico/stdlib.h"
 #include "pico/double.h"
 #include "math.h"
 // Change to "#if 1" to compile every printf in this file down to a no-op.
 #if 0
 #define printf(...) ((void)0)
 #endif
 // stop() is executed by a failing check. The disabled variant returns -1 from the enclosing function
 // straight away; the active variant only sets the enclosing function's local `rc` to 1, so the
 // remaining checks still run.
 #if 0
 #define stop() return -1
 #else
 #define stop() rc=1
 #endif
 // Check helpers: each compares `x` against `expected`, and on mismatch prints `msg` with both values and
 // invokes stop(). Note the arguments are expanded a second time in the printf on the failure path.
 // test_assert: prints the failed expression text plus file/line.
 #define test_assert(x) ({ if (!(x)) { printf("Assertion failed: ");puts(#x);printf("  at " __FILE__ ":%d\n", __LINE__); stop(); } })
 // test_checkd: compares with != and prints both values with %f.
 #define test_checkd(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %f != %f\n", msg, x, expected); stop(); } })
 // test_checki: compares with != and prints both values with %d.
 #define test_checki(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %d != %d\n", msg, x, expected); stop(); } })
 // test_checku: compares after casting both sides to uint32_t; prints the (uncast) values with %u.
 #define test_checku(x, expected, msg) ({ if ((uint32_t)(x) != (uint32_t)(expected)) { printf("  %s: %u != %u\n", msg, x, expected); stop(); } })
 // test_checki64: compares with != and prints both values cast to int64_t.
 #define test_checki64(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %lld != %lld\n", msg, (int64_t)(x), (int64_t)(expected)); stop(); } })
 // test_checku64: compares and prints after casting both sides to uint64_t.
 #define test_checku64(x, expected, msg) ({ if ((uint64_t)(x) != (uint64_t)(expected)) { printf("  %s: %llu != %llu\n", msg, (uint64_t)(x), (uint64_t)(expected)); stop(); } })
 #if !(LIB_PICO_DOUBLE_COMPILER || defined(__riscv))
 // Thin wrappers that call the fixed-point conversions with a compile-time constant exponent.
 // Only compiled when the fix<->double functions are available (not compiler/RISC-V builds).
 // Parameter and return types follow the wrapped conversion, so nothing is truncated or
 // converted on the way in or out.
 static inline double fix2double_8(int32_t m) { return fix2double(m, 8); }
 static inline double fix2double_12(int32_t m) { return fix2double(m, 12); }
 static inline double fix2double_16(int32_t m) { return fix2double(m, 16); }
 static inline double fix2double_24(int32_t m) { return fix2double(m, 24); }
 static inline double fix2double_28(int32_t m) { return fix2double(m, 28); }
 static inline double fix2double_32(int32_t m) { return fix2double(m, 32); }
 // unsigned fixed-point input
 static inline double ufix2double_12(uint32_t m) { return ufix2double(m, 12); }
 // double input, fixed-point (integer) result
 static inline int32_t double2fix_12(double d) { return double2fix(d, 12); }
 static inline uint32_t double2ufix_12(double d) { return double2ufix(d, 12); }
 #endif
 #if 1 && (LIB_PICO_DOUBLE_COMPILER || defined(__riscv))
 // In the compiler / RISC-V configuration, shadow each conversion function with a macro that copies the
 // argument into a local, passes that local through an empty asm statement as a read/write register
 // operand ("+r"), and only then calls the real function (whose name is assembled with ##).
 // NOTE(review): presumably this stops the compiler from evaluating the conversion of a constant at
 // compile time, so the tests exercise the real conversion code - confirm against pico_default_asm_volatile.
 #define double2int_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## int_z(_d); })
 #define double2uint_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## uint_z(_d); })
 #define double2int64_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## int64_z(_d); })
 #define double2uint64_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## uint64_z(_d); })
 #define int2double(i) ({ int32_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); int2 ## double(_i); })
 #define uint2double(i) ({ uint32_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); uint2 ## double(_i); })
 #define int642double(i) ({ int64_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); int642 ## double(_i); })
 #define uint642double(i) ({ uint64_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); uint642 ## double(_i); })
 #endif
 // Runs every double conversion check in sequence.
 // Each section prints the name of the function under test, then a series of test_check* calls; a failing
 // check prints its label and invokes stop().
 // Returns 0 when every check passed, non-zero otherwise.
 // Failure labels are unique and name the function actually being tested, and unsigned results are checked
 // with the unsigned check macros, so a reported failure can be traced back to a single line.
 int test() {
    int rc = 0;
 #if LIB_PICO_DOUBLE_PICO
    printf(">>> Using PICO\n");
 #endif
    printf("int2double\n");
    test_checkd(int2double(0), 0.0, "int2double1");
    test_checkd(int2double(-1), -1.0, "int2double2");
    test_checkd(int2double(1), 1.0, "int2double3");
    test_checkd(int2double(INT32_MAX), 2147483647.0, "int2double4");
    test_checkd(int2double(INT32_MIN), -2147483648.0, "int2double5");
    // these have rounding behavior on float but not double
    test_checkd(int2double(2147483391), 2147483391.0, "int2double6");
    // NOTE(review): same input as int2double6 - possibly a different value was intended
    test_checkd(int2double(2147483391), 2147483391.0, "int2double7");
    test_checkd(int2double(2147483457), 2147483457.0, "int2double8");
    test_checkd(int2double(2147483483), 2147483483.0, "int2double9");
    test_checkd(int2double(2147483584), 2147483584.0, "int2double10");
    printf("uint2double\n");
    test_checkd(uint2double(0), 0.0, "uint2double1");
    test_checkd(uint2double(1), 1.0, "uint2double2");
    test_checkd(uint2double(INT32_MAX), 2147483647.0, "uint2double3");
    // todo test correct rounding around maximum precision
    test_checkd(uint2double(UINT32_MAX), 4294967295.0, "uint2double4");
    printf("int642double\n");
    test_checkd(int642double(0), 0.0, "int642double1");
    test_checkd(int642double(-1), -1.0, "int642double2");
    test_checkd(int642double(1), 1.0, "int642double3");
    test_checkd(int642double(INT32_MAX-1), 2147483646.0, "int642double4");
    test_checkd(int642double(INT32_MAX), 2147483647.0, "int642double5");
    test_checkd(int642double(INT32_MAX+1ll), 2147483648.0, "int642double6");
    test_checkd(int642double(INT32_MIN-1ll), -2147483649.0, "int642double7");
    test_checkd(int642double(INT32_MIN), -2147483648.0, "int642double8");
    test_checkd(int642double(INT32_MIN+1ll), -2147483647.0, "int642double9");
    // todo test correct rounding around maximum precision
    test_checkd(int642double(INT64_MAX), 9223372036854775807.0, "int642double10");
    test_checkd(int642double(INT64_MIN), -9223372036854775808.0, "int642double11");
    printf("uint642double\n");
    test_checkd(uint642double(0), 0.0, "uint642double1");
    test_checkd(uint642double(1), 1.0, "uint642double2");
    test_checkd(uint642double(INT32_MAX-1), 2147483646.0, "uint642double3");
    test_checkd(uint642double(INT32_MAX), 2147483647.0, "uint642double4");
    test_checkd(uint642double(INT32_MAX+1ll), 2147483648.0, "uint642double5");
    test_checkd(uint642double(INT64_MAX), 9223372036854775807.0, "uint642double6");
    // todo test correct rounding around maximum precision
    test_checkd(uint642double(UINT64_MAX), 18446744073709551615.0, "uint642double7");
    // used to feed exact bit patterns to the double -> integer conversions
    union {
        uint64_t u;
        double d;
    } u64d;
 #if !(LIB_PICO_DOUBLE_COMPILER || defined(__riscv))
    printf("fix2double\n");
    // todo test correct rounding around maximum precision
    test_checkd(fix2double(-3, 1), -1.5, "fix2double1");
    // NOTE(review): same input as fix2double1 - possibly a different value was intended
    test_checkd(fix2double(-3, 1), -1.5, "fix2double2");
    test_checkd(fix2double(-3, -4), -48.0, "fix2double3");
    printf("ufix2double\n");
    // todo test correct rounding around maximum precision
    test_checkd(ufix2double(0xa0000000, 30), 2.5, "ufix2double1");
    test_checkd(ufix2double(3, -4), 48.0, "ufix2double2");
    printf("fix642double\n");
    // todo test correct rounding around maximum precision
    test_checkd(fix642double(-0xa000000000ll, 38), -2.5, "fix642double1");
    test_checkd(fix642double(-3, -34), -51539607552.0, "fix642double2");
    printf("ufix642double\n");
    // todo test correct rounding around maximum precision
    test_checkd(ufix642double(0xa000000000ll, 38), 2.5, "ufix642double1");
    test_checkd(ufix642double(3, -34), 51539607552.0, "ufix642double2");
    // constant-exponent wrappers
    test_checkd(fix2double_8(128), 0.5, "fix2double_8_1");
    test_checkd(fix2double_8(-128), -0.5, "fix2double_8_2");
    test_checkd(fix2double_16(8192), 0.125, "fix2double_16_1");
    test_checkd(fix2double_16(-8192), -0.125, "fix2double_16_2");
    test_checkd(fix2double_24(3<<23), 1.5, "fix2double_24_1");
    test_checkd(fix2double_24(-(3<<23)), -1.5, "fix2double_24_2");
    printf("double2fix\n");
    test_checki(double2fix(-0.5, 8), -0x80, "double2fix0");
    test_checki(double2fix(3.5, 8), 0x380, "double2fix1");
    test_checki(double2fix(-3.5, 8), -0x380, "double2fix2");
    test_checki(double2fix(32768.0, 16), INT32_MAX, "double2fix3");
    test_checki(double2fix(65536.0, 16), INT32_MAX, "double2fix4");
    test_checki(double2fix(-65536.0, 16), INT32_MIN, "double2fix4b");
    test_checki(double2fix(INFINITY, 16), INT32_MAX, "double2fix5");
    test_checki(double2fix(-INFINITY, 16), INT32_MIN, "double2fix5b");
    test_checki(double2fix(INFINITY, -16), INT32_MAX, "double2fix5c");
    test_checki(double2fix(-INFINITY, -16), INT32_MIN, "double2fix5d");
    test_checki(double2fix(3.24999, 2), 12, "double2fix6");
    test_checki(double2fix(3.25, 2), 13, "double2fix7");
    test_checki(double2fix(-3.24999, 2), -13, "double2fix8");
    test_checki(double2fix(-3.25, 2), -13, "double2fix9");
    test_checki(double2fix(-0.75, 1), -2, "double2fix10");
    test_checki(double2fix(-3.0, -1), -2, "double2fix11"); // not very useful
    test_checki(double2fix(0.0, 16), 0, "double2fix12");
    test_checki(double2fix(-0.0, 16), 0, "double2fix13");
    test_checki(double2fix(0.0, -16), 0, "double2fix14");
    test_checki(double2fix(-0.0, -16), 0, "double2fix15");
    printf("double2ufix\n");
    test_checku(double2ufix(3.5, 8), 0x380, "double2ufix1");
    test_checku(double2ufix(-3.5, 8), 0, "double2ufix2");
    // unsigned operand: 32768 << 16 does not fit in a signed int
    test_checku(double2ufix(32768.0, 16), 32768u << 16, "double2ufix3");
    test_checku(double2ufix(65536.0, 16), UINT32_MAX, "double2ufix4");
    test_checku(double2ufix(INFINITY, 16), UINT32_MAX, "double2ufix5");
    test_checku(double2ufix(-INFINITY, 16), 0, "double2ufix5b");
    test_checku(double2ufix(INFINITY, -16), UINT32_MAX, "double2ufix5c");
    test_checku(double2ufix(-INFINITY, -16), 0, "double2ufix5d");
    test_checku(double2ufix(3.24999, 2), 12, "double2ufix6");
    test_checku(double2ufix(3.25, 2), 13, "double2ufix7");
    test_checku(double2ufix(3.0, -1), 1, "double2ufix8"); // not very useful
    test_checku(double2ufix(0.0, 16), 0, "double2ufix12");
    test_checku(double2ufix(-0.0, 16), 0, "double2ufix13");
    test_checku(double2ufix(0.0, -16), 0, "double2ufix14");
    test_checku(double2ufix(-0.0, -16), 0, "double2ufix15");
    printf("double2fix64\n");
    test_checki64(double2fix64(3.5, 8), 0x380, "double2fix641");
    test_checki64(double2fix64(-3.5, 8), -0x380, "double2fix642");
    test_checki64(double2fix64(32768.0, 16), 32768ll << 16, "double2fix643");
    test_checki64(double2fix64(65536.0, 16), 65536ll << 16, "double2fix644");
    test_checki64(double2fix64(2147483648.0, 16), 2147483648ll << 16, "double2fix644b");
    test_checki64(double2fix64(65536.0 * 65536.0 * 32768.0, 16), INT64_MAX, "double2fix644c");
    test_checki64(double2fix64(INFINITY, 16), INT64_MAX, "double2fix645");
    test_checki64(double2fix64(-INFINITY, 16), INT64_MIN, "double2fix645b");
    test_checki64(double2fix64(INFINITY, -16), INT64_MAX, "double2fix645c");
    test_checki64(double2fix64(-INFINITY, -16), INT64_MIN, "double2fix645d");
    test_checki64(double2fix64(3.24999, 2), 12, "double2fix646");
    test_checki64(double2fix64(3.25, 2), 13, "double2fix647");
    test_checki64(double2fix64(-3.24999, 2), -13, "double2fix648");
    test_checki64(double2fix64(-3.25, 2), -13, "double2fix649");
    test_checki64(double2fix64(-3.0, -1), -2, "double2fix6410"); // not very useful
    test_checki64(double2fix64(2147483648.0 * 2147483648.0, 16), INT64_MAX, "double2fix6411");
    test_checki64(double2fix64(0.0, 16), 0, "double2fix6412");
    test_checki64(double2fix64(-0.0, 16), 0, "double2fix6413");
    test_checki64(double2fix64(0.0, -16), 0, "double2fix6412b");
    test_checki64(double2fix64(-0.0, -16), 0, "double2fix6413b");
    test_checki64(double2fix64(-3.25, 40), -13ll * (1ll << 38), "double2fix6414");
    // bit patterns just below -3.25, to check that discarded low mantissa bits still round down
    u64d.u = 0xc00a000000000001;
    test_checki64(double2fix64(u64d.d, 40), -13ll * (1ll << 38) - 1ll, "double2fix6414b");
    u64d.u = 0xc00a000080000001;
    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18) - 2ll, "double2fix6415c");
    u64d.u = 0xc00a000080000000;
    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix6415d");
    u64d.u = 0xc00a000000000001;
    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix6415e");
    u64d.u = 0xc00a000000000000;
    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18), "double2fix6415g");
    u64d.u = 0xc00a000080000001;
    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17) - 1ll, "double2fix6415h");
    u64d.u = 0xc00a000080000000;
    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17) - 1ll, "double2fix6415i");
    u64d.u = 0xc00a000000000001;
    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17) - 1ll, "double2fix6415j");
    u64d.u = 0xc00a000000000000;
    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17), "double2fix6415k");
    printf("double2ufix64\n");
    test_checku64(double2ufix64(3.5, 8), 0x380, "double2ufix641");
    test_checku64(double2ufix64(-3.5, 8), 0, "double2ufix642");
    test_checku64(double2ufix64(32768.0, 16), 32768ull << 16, "double2ufix643");
    test_checku64(double2ufix64(65536.0, 16), 65536ull << 16, "double2ufix644");
    test_checku64(double2ufix64(2147483648.0, 16), 2147483648ull << 16, "double2ufix644b");
    test_checku64(double2ufix64(INFINITY, 16), UINT64_MAX, "double2ufix645");
    test_checku64(double2ufix64(-INFINITY, 16), 0, "double2ufix645b");
    test_checku64(double2ufix64(INFINITY, -16), UINT64_MAX, "double2ufix645c");
    test_checku64(double2ufix64(-INFINITY, -16), 0, "double2ufix645d");
    test_checku64(double2ufix64(3.24999, 2), 12, "double2ufix646");
    test_checku64(double2ufix64(3.25, 2), 13, "double2ufix647");
    test_checku64(double2ufix64(3.0, -1), 1, "double2ufix648"); // not very useful
    test_checku64(double2ufix64(0.0, 16), 0, "double2ufix649");
    test_checku64(double2ufix64(-0.0, 16), 0, "double2ufix6410");
    printf("double2fix_z\n");
    test_checki(double2fix_z(3.5, 8), 0x380, "double2fix_z1");
    test_checki(double2fix_z(-3.5, 8), -0x380, "double2fix_z2");
    test_checki(double2fix_z(32768.0, 16), INT32_MAX, "double2fix_z3");
    test_checki(double2fix_z(65536.0, 16), INT32_MAX, "double2fix_z4");
    test_checki(double2fix_z(INFINITY, 16), INT32_MAX, "double2fix_z5");
    test_checki(double2fix_z(-INFINITY, 16), INT32_MIN, "double2fix_z5b");
    test_checki(double2fix_z(INFINITY, -50), INT32_MAX, "double2fix_z5c");
    test_checki(double2fix_z(-INFINITY, -50), INT32_MIN, "double2fix_z5d");
    test_checki(double2fix_z(3.24999, 2), 12, "double2fix_z6");
    test_checki(double2fix_z(3.25, 2), 13, "double2fix_z7");
    test_checki(double2fix_z(-3.24999, 2), -12, "double2fix_z8");
    test_checki(double2fix_z(-3.25, 2), -13, "double2fix_z9");
    test_checki(double2fix_z(-0.75, 1), -1, "double2fix_z10");
    test_checki(double2fix_z(-3.0, -1), -1, "double2fix_z11"); // not very useful
    test_checki(double2fix_z(0.0, 16), 0, "double2fix_z12");
    test_checki(double2fix_z(-0.0, 16), 0, "double2fix_z13");
    test_checki(double2fix_z(0.0, -16), 0, "double2fix_z12b");
    test_checki(double2fix_z(-0.0, -16), 0, "double2fix_z13b");
    printf("double2ufix_z\n");
    test_checku(double2ufix_z(3.5, 8), 0x380, "double2ufix_z1");
    test_checku(double2ufix_z(-3.5, 8), 0, "double2ufix_z2");
    // unsigned operand: 32768 << 16 does not fit in a signed int
    test_checku(double2ufix_z(32768.0, 16), 32768u << 16, "double2ufix_z3");
    test_checku(double2ufix_z(65536.0, 16), UINT32_MAX, "double2ufix_z4");
    test_checku(double2ufix_z(INFINITY, 16), UINT32_MAX, "double2ufix_z5");
    test_checku(double2ufix_z(-INFINITY, 16), 0, "double2ufix_z5b");
    // NOTE(review): 5c/5d repeat the inputs of 5/5b - a negative exponent was possibly intended
    test_checku(double2ufix_z(INFINITY, 16), UINT32_MAX, "double2ufix_z5c");
    test_checku(double2ufix_z(-INFINITY, 16), 0, "double2ufix_z5d");
    test_checku(double2ufix_z(3.24999, 2), 12, "double2ufix_z6");
    test_checku(double2ufix_z(3.25, 2), 13, "double2ufix_z7");
    test_checku(double2ufix_z(3.0, -1), 1, "double2ufix_z8"); // not very useful
    test_checku(double2ufix_z(0.0, 16), 0, "double2ufix_z9");
    test_checku(double2ufix_z(-0.0, 16), 0, "double2ufix_z10");
    test_checku(double2ufix_z(0.0, -16), 0, "double2ufix_z11");
    test_checku(double2ufix_z(-0.0, -16), 0, "double2ufix_z12");
    printf("double2fix64_z\n");
    test_checki64(double2fix64_z(3.5, 8), 0x380, "double2fix64_z1");
    test_checki64(double2fix64_z(-3.5, 8), -0x380, "double2fix64_z2");
    test_checki64(double2fix64_z(32768.0, 16), 32768ll << 16, "double2fix64_z3");
    test_checki64(double2fix64_z(65536.0, 16), 65536ll << 16, "double2fix64_z4");
    test_checki64(double2fix64_z(65536.0 * 65536.0 * 32768.0, 16), INT64_MAX, "double2fix64_z4b");
    test_checki64(double2fix64_z(INFINITY, 16), INT64_MAX, "double2fix64_z5");
    test_checki64(double2fix64_z(-INFINITY, 16), INT64_MIN, "double2fix64_z5b");
    // NOTE(review): 5c/5d repeat the inputs of 5/5b - a negative exponent was possibly intended
    test_checki64(double2fix64_z(INFINITY, 16), INT64_MAX, "double2fix64_z5c");
    test_checki64(double2fix64_z(-INFINITY, 16), INT64_MIN, "double2fix64_z5d");
    test_checki64(double2fix64_z(3.24999, 2), 12, "double2fix64_z6");
    test_checki64(double2fix64_z(3.25, 2), 13, "double2fix64_z7");
    test_checki64(double2fix64_z(-3.24999, 2), -12, "double2fix64_z8");
    test_checki64(double2fix64_z(-3.25, 2), -13, "double2fix64_z9");
    test_checki64(double2fix64_z(-3.0, -1), -1, "double2fix64_z10"); // not very useful
    test_checki64(double2fix64_z(0.0, 16), 0, "double2fix64_z11");
    test_checki64(double2fix64_z(-0.0, 16), 0, "double2fix64_z12");
    test_checki64(double2fix64_z(0.0, -16), 0, "double2fix64_z13");
    test_checki64(double2fix64_z(-0.0, -16), 0, "double2fix64_z14");
    test_checki64(double2fix64_z(-3.25, 40), -13ll * (1ll << 38), "double2fix64_z15");
    // same bit patterns as in the double2fix64 section, but here discarded bits must be dropped
    u64d.u = 0xc00a000000000001;
    test_checki64(double2fix64_z(u64d.d, 40), -13ll * (1ll << 38), "double2fix64_z15b");
    u64d.u = 0xc00a000080000001;
    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix64_z15c");
    u64d.u = 0xc00a000080000000;
    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix64_z15d");
    u64d.u = 0xc00a000000000001;
    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18), "double2fix64_z15e");
    u64d.u = 0xc00a000000000000;
    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18), "double2fix64_z15g");
    u64d.u = 0xc00a000080000001;
    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15h");
    u64d.u = 0xc00a000080000000;
    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15i");
    u64d.u = 0xc00a000000000001;
    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15j");
    u64d.u = 0xc00a000000000000;
    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15k");
    printf("double2ufix64_z\n");
    test_checku64(double2ufix64_z(3.5, 8), 0x380, "double2ufix64_z1");
    test_checku64(double2ufix64_z(-3.5, 8), 0, "double2ufix64_z2");
    test_checku64(double2ufix64_z(32768.0, 16), 32768ll << 16, "double2ufix64_z3");
    test_checku64(double2ufix64_z(65536.0, 16), 65536ll << 16, "double2ufix64_z4");
    test_checku64(double2ufix64_z(65536.0 * 65536.0 * 65536.0, 16), UINT64_MAX, "double2ufix64_z4b");
    test_checku64(double2ufix64_z(INFINITY, 16), UINT64_MAX, "double2ufix64_z5");
    test_checku64(double2ufix64_z(-INFINITY, 16), 0, "double2ufix64_z5b");
    // NOTE(review): 5c/5d repeat the inputs of 5/5b - a negative exponent was possibly intended
    test_checku64(double2ufix64_z(INFINITY, 16), UINT64_MAX, "double2ufix64_z5c");
    test_checku64(double2ufix64_z(-INFINITY, 16), 0, "double2ufix64_z5d");
    test_checku64(double2ufix64_z(3.24999, 2), 12, "double2ufix64_z6");
    test_checku64(double2ufix64_z(3.25, 2), 13, "double2ufix64_z7");
    test_checku64(double2ufix64_z(3.0, -1), 1, "double2ufix64_z8"); // not very useful
    test_checku64(double2ufix64_z(0.0, 16), 0, "double2ufix64_z9");
    test_checku64(double2ufix64_z(-0.0, 16), 0, "double2ufix64_z10");
    test_checku64(double2ufix64_z(0.0, -16), 0, "double2ufix64_z11");
    test_checku64(double2ufix64_z(-0.0, -16), 0, "double2ufix64_z12");
    // The expected values in the non-"_z" sections below round negative fractions downwards (e.g. -0.25 -> -1).
    printf("double2int\n");
    test_checki(double2int(0.0), 0, "double2int1");
    test_checki(double2int(0.25), 0, "double2int1b");
    test_checki(double2int(0.5), 0, "double2int2");
    test_checki(double2int(0.75), 0, "double2int2b");
    test_checki(double2int(1.0), 1, "double2int3");
    test_checki(double2int(-10.0), -10, "double2int3a");
    test_checki(double2int(-0.0), 0, "double2int3b");
    test_checki(double2int(-0.25), -1, "double2int4");
    test_checki(double2int(-0.5), -1, "double2int4b");
    test_checki(double2int(-0.75), -1, "double2int5");
    test_checki(double2int(-1.0), -1, "double2int5b");
    // todo test correct rounding around maximum precision
    test_checki(double2int(2147483646.0), INT32_MAX-1, "double2int6");
    test_checki(double2int(2147483647.0), INT32_MAX, "double2int6b");
    test_checki(double2int(21474836470.0), INT32_MAX, "double2int7");
    test_checki(double2int(-2147483648.0), INT32_MIN, "double2int8");
    test_checki(double2int(-21474836480.0), INT32_MIN, "double2int9");
    test_checki(double2int(-2.5), -3, "double2int10");
    test_checki(double2int(-2.4), -3, "double2int11");
    u64d.u = 0xc000000000000000ull;
    test_checki(double2int(u64d.d), -2, "double2int12");
    u64d.u = 0xc008000000000000ull;
    test_checki(double2int(u64d.d), -3, "double2int12b");
    u64d.u = 0xc000000000000001ull;
    test_checki(double2int(u64d.d), -3, "double2int12c");
    u64d.u = 0xc000000080000000ull;
    test_checki(double2int(u64d.d), -3, "double2int12d");
    u64d.u = 0xc000000100000000ull;
    test_checki(double2int(u64d.d), -3, "double2int12e");
    u64d.u = 0xc000000100000001ull;
    test_checki(double2int(u64d.d), -3, "double2int12f");
    test_checki(double2int(-2147483647.0), INT32_MIN+1, "double2int13");
    test_checki(double2int(-2147483647.1), INT32_MIN, "double2int14");
    test_checki(double2int(-2147483647.9), INT32_MIN, "double2int15");
    test_checki(double2int(-2147483648.0), INT32_MIN, "double2int16");
    test_checki(double2int(-2147483648.1), INT32_MIN, "double2int17");
    test_checki(double2int(-21474836480.1), INT32_MIN, "double2int18");
    printf("double2uint\n");
    test_checku(double2uint(0.0), 0, "double2uint1");
    test_checku(double2uint(0.25), 0, "double2uint2");
    test_checku(double2uint(0.5), 0, "double2uint3");
    test_checku(double2uint(0.75), 0, "double2uint4");
    test_checku(double2uint(1.0), 1, "double2uint5");
    test_checku(double2uint(2147483647.0), INT32_MAX, "double2uint6");
    test_checku(double2uint(2147483648.0), INT32_MAX+1u, "double2uint7");
    test_checku(double2uint(4294967294.5), UINT32_MAX-1, "double2uint8");
    test_checku(double2uint(4294967295.0), UINT32_MAX, "double2uint9");
    test_checku(double2uint(42949672950.0), UINT32_MAX, "double2uint10");
    printf("double2int64\n");
    test_checki64(double2int64(0.0), 0, "double2int641");
    test_checki64(double2int64(0.25), 0, "double2int641b");
    test_checki64(double2int64(0.5), 0, "double2int642");
    test_checki64(double2int64(0.75), 0, "double2int642b");
    test_checki64(double2int64(1.0), 1, "double2int643");
    test_checki64(double2int64(-10.0), -10, "double2int643a");
    test_checki64(double2int64(-0.0), 0, "double2int643b");
    test_checki64(double2int64(-0.25), -1, "double2int644");
    test_checki64(double2int64(-0.5), -1, "double2int644b");
    test_checki64(double2int64(-0.75), -1, "double2int645");
    test_checki64(double2int64(-1.0), -1, "double2int645b");
    // todo test correct rounding around maximum precision
    test_checki64(double2int64(2147483647.0), INT32_MAX, "double2int646");
    test_checki64(double2int64(21474836470.0), 21474836470ll, "double2int647");
    test_checki64(double2int64(-2147483648.0), INT32_MIN, "double2int648");
    test_checki64(double2int64(-21474836480.0), -21474836480ll, "double2int649");
    test_checki64(double2int64(-2.5), -3, "double2int6410");
    test_checki64(double2int64(-2.4), -3, "double2int6411");
    u64d.u = 0xc000000000000000ull;
    test_checki64(double2int64(u64d.d), -2, "double2int6412");
    u64d.u = 0xc008000000000000ull;
    test_checki64(double2int64(u64d.d), -3, "double2int6412b");
    u64d.u = 0xc000000000000001ull;
    test_checki64(double2int64(u64d.d), -3, "double2int6412c");
    u64d.u = 0xc000000080000000ull;
    test_checki64(double2int64(u64d.d), -3, "double2int6412d");
    u64d.u = 0xc000000100000000ull;
    test_checki64(double2int64(u64d.d), -3, "double2int6412e");
    u64d.u = 0xc000000100000001ull;
    test_checki64(double2int64(u64d.d), -3, "double2int6412f");
    printf("double2uint64\n");
    test_checku64(double2uint64(0.0), 0, "double2uint641");
    test_checku64(double2uint64(0.25), 0, "double2uint642");
    test_checku64(double2uint64(0.5), 0, "double2uint643");
    test_checku64(double2uint64(0.75), 0, "double2uint644");
    test_checku64(double2uint64(1.0), 1, "double2uint645");
    test_checku64(double2uint64(2147483647.0), INT32_MAX, "double2uint646");
    test_checku64(double2uint64(2147483648.0), INT32_MAX+1u, "double2uint647");
    // todo test correct rounding around maximum precision
    test_checku64(double2uint64(4294967294.5), 4294967294ull, "double2uint648");
    test_checku64(double2uint64(4294967295.0), 4294967295ull, "double2uint649");
    test_checku64(double2uint64(42949672950.0), 42949672950, "double2uint6410");
 #endif
    // // These methods round towards 0.
    printf("double2int_z\n");
    test_checki(double2int_z(0.0), 0, "double2int_z1");
    test_checki(double2int_z(0.25), 0, "double2int_z1b");
    test_checki(double2int_z(0.5), 0, "double2int_z2");
    test_checki(double2int_z(0.75), 0, "double2int_z2b");
    test_checki(double2int_z(1.0), 1, "double2int_z3");
    test_checki(double2int_z(-10.0), -10, "double2int_z3a");
    test_checki(double2int_z(-0.0), 0, "double2int_z3b");
    test_checki(double2int_z(-0.25), 0, "double2int_z4");
    test_checki(double2int_z(-0.5), 0, "double2int_z4b");
    test_checki(double2int_z(-0.75), 0, "double2int_z5");
    test_checki(double2int_z(-1.0), -1, "double2int_z5b");
    // todo test correct rounding around maximum precision
    test_checki(double2int_z(2147483647.0), INT32_MAX, "double2int_z6");
    test_checki(double2int_z(21474836470.0), INT32_MAX, "double2int_z7");
    test_checki(double2int_z(-2147483648.0), INT32_MIN, "double2int_z8");
    test_checki(double2int_z(-21474836480.0), INT32_MIN, "double2int_z9");
    test_checki(double2int_z(-2.5), -2, "double2int_z10");
    test_checki(double2int_z(-2.4), -2, "double2int_z11");
    u64d.u = 0xc000000000000000ull;
    test_checki(double2int_z(u64d.d), -2, "double2int_z12");
    u64d.u = 0xc008000000000000ull;
    test_checki(double2int_z(u64d.d), -3, "double2int_z12b");
    u64d.u = 0xc000000000000001ull;
    test_checki(double2int_z(u64d.d), -2, "double2int_z12c");
    u64d.u = 0xc000000080000000ull;
    test_checki(double2int_z(u64d.d), -2, "double2int_z12d");
    u64d.u = 0xc000000100000000ull;
    test_checki(double2int_z(u64d.d), -2, "double2int_z12e");
    u64d.u = 0xc000000100000001ull;
    test_checki(double2int_z(u64d.d), -2, "double2int_z12f");
    printf("double2int64_z\n");
    test_checki64(double2int64_z(0.0), 0, "double2int64_z1");
    test_checki64(double2int64_z(0.25), 0, "double2int64_z1b");
    test_checki64(double2int64_z(0.5), 0, "double2int64_z2");
    test_checki64(double2int64_z(0.75), 0, "double2int64_z2b");
    test_checki64(double2int64_z(1.0), 1, "double2int64_z3");
    test_checki64(double2int64_z(-10.0), -10, "double2int64_z3a");
    test_checki64(double2int64_z(-0.0), 0, "double2int64_z3b");
    test_checki64(double2int64_z(-0.25), 0, "double2int64_z4");
    test_checki64(double2int64_z(-0.5), 0, "double2int64_z4b");
    test_checki64(double2int64_z(-0.75), 0, "double2int64_z5");
    test_checki64(double2int64_z(-1.0), -1, "double2int64_z5b");
    // todo test correct rounding around maximum precision
    test_checki64(double2int64_z(2147483647.0), 2147483647ll, "double2int64_z6");
    test_checki64(double2int64_z(21474836470.0), 21474836470ll, "double2int64_z7");
    test_checki64(double2int64_z(-2147483648.0), INT32_MIN, "double2int64_z8");
    test_checki64(double2int64_z(-21474836480.0), -21474836480ll, "double2int64_z9");
    test_checki64(double2int64_z(-2.5), -2, "double2int64_z10");
    test_checki64(double2int64_z(-2.4), -2, "double2int64_z11");
    printf("double2uint_z\n");
    test_checku(double2uint_z(0.0), 0, "double2uint_z1");
    test_checku(double2uint_z(0.25), 0, "double2uint_z2");
    test_checku(double2uint_z(0.5), 0, "double2uint_z3");
    test_checku(double2uint_z(0.75), 0, "double2uint_z4");
    test_checku(double2uint_z(1.0), 1, "double2uint_z5");
    test_checku(double2uint_z(2147483647.0), INT32_MAX, "double2uint_z6");
    test_checku(double2uint_z(2147483648.0), INT32_MAX+1u, "double2uint_z7");
    // todo test correct rounding around maximum precision
    test_checku(double2uint_z(4294967294.5), UINT32_MAX-1u, "double2uint_z8");
    test_checku(double2uint_z(4294967295.0), UINT32_MAX, "double2uint_z9");
    test_checku(double2uint_z(42949672950.0), UINT32_MAX, "double2uint_z10");
    printf("double2uint64_z\n");
    test_checku64(double2uint64_z(0.0), 0, "double2uint64_z1");
    test_checku64(double2uint64_z(0.25), 0, "double2uint64_z2");
    test_checku64(double2uint64_z(0.5), 0, "double2uint64_z3");
    test_checku64(double2uint64_z(0.75), 0, "double2uint64_z4");
    test_checku64(double2uint64_z(1.0), 1, "double2uint64_z5");
    test_checku64(double2uint64_z(2147483647.0), INT32_MAX, "double2uint64_z6");
    test_checku64(double2uint64_z(2147483648.0), INT32_MAX+1u, "double2uint64_z7");
    // todo test correct rounding around maximum precision
    test_checku64(double2uint64_z(4294967294.5), 4294967294ull, "double2uint64_z8");
    test_checku64(double2uint64_z(4294967295.0), 4294967295ull, "double2uint64_z9");
    test_checku64(double2uint64_z(4294967296.0), 4294967296ull, "double2uint64_z9b");
    test_checku64(double2uint64_z(42949672950.0), 42949672950ull, "double2uint64_z10");
    // not yet covered by this test:
    // double exp10(double x);
    // void sincos(double x, double *sinx, double *cosx);
    // double powint(double x, int y);
    return rc;
 }
 // Entry point: bring up stdio, run the whole suite once and print the overall verdict.
 int main() {
    stdio_init_all();
    const int failed = test();
    if (!failed) {
        printf("PASSED\n");
    } else {
        // at least one check invoked stop()
        printf("FAILED\n");
    }
 }
--- a/test/pico_float_test/custom_float_funcs_test.c
+++ b/test/pico_float_test/custom_float_funcs_test.c
@@ -0,0 +1,402 @@
 #include <stdio.h>
 #include "pico/stdlib.h"
 #include "pico/float.h"
 #include "math.h"
 // Change to "#if 1" to compile out all test output.
 #if 0
 #define printf(...) ((void)0)
 #endif
 // stop() runs when a check fails: either bail out of the enclosing function
 // immediately, or (the default) record the failure in the local `rc` and carry on.
 #if 0
 #define stop() return -1
 #else
 #define stop() rc=1
 #endif
 // Check helpers. Each one compares `x` against `expected`, and on mismatch prints
 // `msg` together with both values before invoking stop(). They are GNU statement
 // expressions and evaluate their arguments more than once, so arguments should be
 // free of side effects. The values handed to printf are cast to the exact type
 // named by the format specifier (as the 64-bit variants do), because int32_t /
 // uint32_t are not necessarily plain int / unsigned int on every toolchain.
 #define test_assert(x) ({ if (!(x)) { printf("Assertion failed: ");puts(#x);printf("  at " __FILE__ ":%d\n", __LINE__); stop(); } })
 #define test_checkf(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %f != %f\n", msg, (double)(x), (double)(expected)); stop(); } })
 #define test_checki(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %d != %d\n", msg, (int)(x), (int)(expected)); stop(); } })
 #define test_checku(x, expected, msg) ({ if ((uint32_t)(x) != (uint32_t)(expected)) { printf("  %s: %u != %u\n", msg, (unsigned int)(x), (unsigned int)(expected)); stop(); } })
 #define test_checki64(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %lld != %lld\n", msg, (int64_t)(x), (int64_t)(expected)); stop(); } })
 #define test_checku64(x, expected, msg) ({ if ((uint64_t)(x) != (uint64_t)(expected)) { printf("  %s: %llu != %llu\n", msg, (uint64_t)(x), (uint64_t)(expected)); stop(); } })
 #if !(LIB_PICO_FLOAT_COMPILER || defined(__riscv))
 // Shorthands for the fixed-point conversions with a fixed binary-point position
 // (the second argument of the wrapped function). Argument and return types
 // mirror the wrapped functions: integer fixed-point values in and float out for
 // the *2float family, float in and integer fixed-point values out for the
 // float2* family.
 static inline float fix2float_8(int32_t m) { return fix2float(m, 8); }
 static inline float fix2float_12(int32_t m) { return fix2float(m, 12); }
 static inline float fix2float_16(int32_t m) { return fix2float(m, 16); }
 static inline float fix2float_24(int32_t m) { return fix2float(m, 24); }
 static inline float fix2float_28(int32_t m) { return fix2float(m, 28); }
 static inline float fix2float_32(int32_t m) { return fix2float(m, 32); }
 static inline float ufix2float_12(uint32_t m) { return ufix2float(m, 12); }
 static inline int32_t float2fix_12(float f) { return float2fix(f, 12); }
 static inline uint32_t float2ufix_12(float f) { return float2ufix(f, 12); }
 #endif
 #if 1 && (LIB_PICO_FLOAT_COMPILER || defined(__riscv))
 // FREG is the read-write asm operand constraint used for a float value:
 // "+t" (a VFP single-precision register) for hard-float Arm builds,
 // "+r" (a general-purpose register) for soft-float and RISC-V builds.
 #if !(__SOFTFP__ || defined(__riscv))
 #define FREG "+t"
 #else
 #define FREG "+r"
 #endif
 // Each wrapper copies its argument into a local, passes that local through an
 // empty asm statement as a read-write operand so the compiler can no longer
 // treat it as a known constant, and then calls the real conversion of the same
 // name (a macro is never re-expanded inside its own expansion).
 #define float2int_z(f) ({ float _v = f; pico_default_asm_volatile("" : FREG (_v)); float2int_z(_v); })
 #define float2uint_z(f) ({ float _v = f; pico_default_asm_volatile("" : FREG (_v)); float2uint_z(_v); })
 #define float2int64_z(f) ({ float _v = f; pico_default_asm_volatile("" : FREG (_v)); float2int64_z(_v); })
 #define float2uint64_z(f) ({ float _v = f; pico_default_asm_volatile("" : FREG (_v)); float2uint64_z(_v); })
 #define int2float(i) ({ int32_t _v = i; pico_default_asm_volatile("" : "+r" (_v)); int2float(_v); })
 #define uint2float(i) ({ uint32_t _v = i; pico_default_asm_volatile("" : "+r" (_v)); uint2float(_v); })
 #define int642float(i) ({ int64_t _v = i; pico_default_asm_volatile("" : "+r" (_v)); int642float(_v); })
 #define uint642float(i) ({ uint64_t _v = i; pico_default_asm_volatile("" : "+r" (_v)); uint642float(_v); })
 #endif
 #if 1 && LIB_PICO_FLOAT_VFP
 // Remove any elision-preventing wrapper macros for the 32-bit conversions, so
 // that the checks below use float2int_z / float2uint_z / int2float / uint2float
 // exactly as the headers provide them. The 64-bit wrappers are left in place.
 // NOTE(review): this guard tests LIB_PICO_FLOAT_VFP whereas test() reports on
 // LIB_PICO_FLOAT_PICO_VFP - confirm which symbol is intended.
 #undef float2int_z
 #undef float2uint_z
 #undef int2float
 #undef uint2float
 #endif
 // Exercises the pico_float conversion functions against hand-computed results.
 // Every mismatch is reported by the test_check* macro that detected it, which
 // then invokes stop(); the function returns `rc` (0 unless a check altered it).
 // The fixed-point conversions and the conversions that round towards -infinity
 // are only exercised when the SDK implementation (not the compiler's, and not
 // RISC-V) is in use; the int<->float and round-towards-zero checks always run.
 int test() {
     int rc = 0;
 #if LIB_PICO_FLOAT_PICO_DCP
     printf(">>> Using DCP\n");
 #endif
 #if LIB_PICO_FLOAT_PICO_VFP
     printf(">>> Using VFP\n");
 #endif
     printf("int2float\n");
     test_checkf(int2float(0), 0.0f, "int2float1");
     test_checkf(int2float(-1), -1.0f, "int2float2");
     test_checkf(int2float(1), 1.0f, "int2float3");
     test_checkf(int2float(INT32_MAX), 2147483647.0f, "int2float4");
     test_checkf(int2float(INT32_MIN), -2147483648.0f, "int2float5");
     // check rounding
     test_checkf(int2float(2147483391), 2147483392.0f, "int2float6");
     test_checkf(int2float(2147483456), 2147483392.0f, "int2float7");
     test_checkf(int2float(2147483457), 2147483520.0f, "int2float8");
     test_checkf(int2float(2147483483), 2147483520.0f, "int2float9");
     test_checkf(int2float(2147483584), 2147483648.0f, "int2float10");
     printf("uint2float\n");
     test_checkf(uint2float(0), 0.0f, "uint2float1");
     test_checkf(uint2float(1), 1.0f, "uint2float2");
     test_checkf(uint2float(INT32_MAX), 2147483647.0f, "uint2float3");
     // todo test correct rounding around maximum precision
     test_checkf(uint2float(UINT32_MAX), 4294967295.0f, "uint2float4");
     printf("int642float\n");
     test_checkf(int642float(0), 0.0f, "int642float1");
     test_checkf(int642float(-1), -1.0f, "int642float2");
     test_checkf(int642float(1), 1.0f, "int642float3");
     test_checkf(int642float(INT32_MAX-1), 2147483646.0f, "int642float4"); // note equality is within 1ulp
     test_checkf(int642float(INT32_MAX), 2147483647.0f, "int642float5"); // note equality is within 1ulp
     test_checkf(int642float(INT32_MAX+1ll), 2147483648.0f, "int642float6");
     test_checkf(int642float(INT32_MIN-1ll), -2147483649.0f, "int642float7"); // note equality is within 1ulp
     test_checkf(int642float(INT32_MIN), -2147483648.0f, "int642float8");
     test_checkf(int642float(INT32_MIN+1ll), -2147483647.0f, "int642float9"); // note equality is within 1ulp
     // todo test correct rounding around maximum precision
     test_checkf(int642float(INT64_MAX), 9223372036854775807.0f, "int642float10");
     test_checkf(int642float(INT64_MIN), -9223372036854775808.0f, "int642float11");
     printf("uint642float\n");
     test_checkf(uint642float(0), 0.0f, "uint642float1");
     test_checkf(uint642float(1), 1.0f, "uint642float2");
     test_checkf(uint642float(INT32_MAX-1), 2147483646.0f, "uint642float3"); // note equality is within 1ulp
     test_checkf(uint642float(INT32_MAX), 2147483647.0f, "uint642float4"); // note equality is within 1ulp
     test_checkf(uint642float(INT32_MAX+1ll), 2147483648.0f, "uint642float5");
     test_checkf(uint642float(INT64_MAX), 9223372036854775807.0f, "uint642float6");
     // todo test correct rounding around maximum precision
     test_checkf(uint642float(UINT64_MAX), 18446744073709551615.0f, "uint642float7");
 #if !(LIB_PICO_FLOAT_COMPILER || defined(__riscv))
     // lets a test build a float from an exact bit pattern; only the checks in
     // this conditional section use it, so it is declared here to avoid an
     // unused variable in the other configurations
     union {
         uint32_t u;
         float f;
     } u32f;
     printf("fix2float\n");
     // todo test correct rounding around maximum precision
     test_checkf(fix2float(-3, 1), -1.5f, "fix2float1");
     test_checkf(fix2float(3, 1), 1.5f, "fix2float2");
     test_checkf(fix2float(-3, -4), -48.0f, "fix2float3");
     printf("ufix2float\n");
     // todo test correct rounding around maximum precision
     test_checkf(ufix2float(0xa0000000, 30), 2.5f, "ufix2float1");
     test_checkf(ufix2float(3, -4), 48.0f, "ufix2float2");
     printf("fix642float\n");
     // todo test correct rounding around maximum precision
     test_checkf(fix642float(-0xa000000000ll, 38), -2.5f, "fix642float1");
     test_checkf(fix642float(-3, -34), -51539607552.0f, "fix642float2");
     printf("ufix642float\n");
     // todo test correct rounding around maximum precision
     test_checkf(ufix642float(0xa000000000ll, 38), 2.5f, "ufix642float1");
     test_checkf(ufix642float(3, -34), 51539607552.0f, "ufix642float2");
     // fixed binary-point shorthands
     test_checkf(fix2float_8(128), 0.5f, "fix2float_8_1");
     test_checkf(fix2float_8(-128), -0.5f, "fix2float_8_2");
     test_checkf(fix2float_16(8192), 0.125f, "fix2float_16_1");
     test_checkf(fix2float_16(-8192), -0.125f, "fix2float_16_2");
     test_checkf(fix2float_24(3<<23), 1.5f, "fix2float_24_1");
     test_checkf(fix2float_24(-(3<<23)), -1.5f, "fix2float_24_2");
     printf("float2fix\n");
     test_checki(float2fix(-0.5f, 8), -0x80, "float2fix0");
     test_checki(float2fix(3.5f, 8), 0x380, "float2fix1");
     test_checki(float2fix(-3.5f, 8), -0x380, "float2fix2");
     test_checki(float2fix(32768.0f, 16), INT32_MAX, "float2fix3");
     test_checki(float2fix(65536.0f, 16), INT32_MAX, "float2fix4");
     test_checki(float2fix(-65536.0f, 16), INT32_MIN, "float2fix4b");
     test_checki(float2fix(INFINITY, 16), INT32_MAX, "float2fix5");
     test_checki(float2fix(-INFINITY, 16), INT32_MIN, "float2fix5b");
     test_checki(float2fix(3.24999f, 2), 12, "float2fix6");
     test_checki(float2fix(3.25f, 2), 13, "float2fix7");
     test_checki(float2fix(-3.24999f, 2), -13, "float2fix8");
     test_checki(float2fix(-3.25f, 2), -13, "float2fix9");
     test_checki(float2fix(-0.75f, 1), -2, "float2fix10");
     test_checki(float2fix(-3.0f, -1), -2, "float2fix11"); // not very useful
     // 0x7f012345 / 0xff012345: exponent field 0xfe, i.e. very large finite values
     u32f.u = 0x7f012345;
     test_checki(float2fix(u32f.f, 1), INT32_MAX, "float2fix12");
     u32f.u = 0xff012345;
     test_checki(float2fix(u32f.f, 1), INT32_MIN, "float2fix13");
     printf("float2ufix\n");
     test_checku(float2ufix(3.5f, 8), 0x380, "float2ufix1");
     test_checku(float2ufix(-3.5f, 8), 0, "float2ufix2");
     // unsigned literal: 32768 << 16 does not fit in a signed int
     test_checku(float2ufix(32768.0f, 16), 32768u << 16, "float2ufix3");
     test_checku(float2ufix(65536.0f, 16), UINT32_MAX, "float2ufix4");
     test_checku(float2ufix(INFINITY, 16), UINT32_MAX, "float2ufix5");
     test_checku(float2ufix(3.24999f, 2), 12, "float2ufix6");
     test_checku(float2ufix(3.25f, 2), 13, "float2ufix7");
     test_checku(float2ufix(3.0f, -1), 1, "float2ufix8"); // not very useful
     printf("float2fix64\n");
     test_checki64(float2fix64(3.5f, 8), 0x380, "float2fix641");
     test_checki64(float2fix64(-3.5f, 8), -0x380, "float2fix642");
     test_checki64(float2fix64(32768.0f, 16), 32768ll << 16, "float2fix643");
     test_checki64(float2fix64(65536.0f, 16), 65536ll << 16, "float2fix644");
     test_checki64(float2fix64(2147483648.0f, 16), 2147483648ll << 16, "float2fix644b");
     test_checki64(float2fix64(65536.0f * 65536.0f * 32768.0f, 16), INT64_MAX, "float2fix644c");
     test_checki64(float2fix64(INFINITY, 16), INT64_MAX, "float2fix645");
     test_checki64(float2fix64(3.24999f, 2), 12, "float2fix646");
     test_checki64(float2fix64(3.25f, 2), 13, "float2fix647");
     test_checki64(float2fix64(-3.24999f, 2), -13, "float2fix648");
     test_checki64(float2fix64(-3.25f, 2), -13, "float2fix649");
     test_checki64(float2fix64(-3.0f, -1), -2, "float2fix6410"); // not very useful
     printf("float2ufix64\n");
     test_checku64(float2ufix64(3.5f, 8), 0x380, "float2ufix641");
     test_checku64(float2ufix64(-3.5f, 8), 0, "float2ufix642");
     test_checku64(float2ufix64(32768.0f, 16), 32768ull << 16, "float2ufix643");
     test_checku64(float2ufix64(65536.0f, 16), 65536ull << 16, "float2ufix644");
     test_checku64(float2ufix64(2147483648.0f, 16), 2147483648ull << 16, "float2ufix644b");
     test_checku64(float2ufix64(INFINITY, 16), UINT64_MAX, "float2ufix645");
     test_checku64(float2ufix64(3.24999f, 2), 12, "float2ufix646");
     test_checku64(float2ufix64(3.25f, 2), 13, "float2ufix647");
     test_checku64(float2ufix64(3.0f, -1), 1, "float2ufix648"); // not very useful
     printf("float2fix_z\n");
     test_checki(float2fix_z(3.5f, 8), 0x380, "float2fix_z1");
     test_checki(float2fix_z(-3.5f, 8), -0x380, "float2fix_z2");
     test_checki(float2fix_z(32768.0f, 16), INT32_MAX, "float2fix_z3");
     test_checki(float2fix_z(65536.0f, 16), INT32_MAX, "float2fix_z4");
     test_checki(float2fix_z(INFINITY, 16), INT32_MAX, "float2fix_z5");
     test_checki(float2fix_z(-INFINITY, 16), INT32_MIN, "float2fix_z5b");
     test_checki(float2fix_z(3.24999f, 2), 12, "float2fix_z6");
     test_checki(float2fix_z(3.25f, 2), 13, "float2fix_z7");
     test_checki(float2fix_z(-3.24999f, 2), -12, "float2fix_z8");
     test_checki(float2fix_z(-3.25f, 2), -13, "float2fix_z9");
     test_checki(float2fix_z(-0.75f, 1), -1, "float2fix_z10");
     test_checki(float2fix_z(-3.0f, -1), -1, "float2fix_z11"); // not very useful
     u32f.u = 0x7f012345;
     test_checki(float2fix_z(u32f.f, 1), INT32_MAX, "float2fix_z12");
     u32f.u = 0xff012345;
     test_checki(float2fix_z(u32f.f, 1), INT32_MIN, "float2fix_z13");
     printf("float2ufix_z\n");
     test_checku(float2ufix_z(3.5f, 8), 0x380, "float2ufix_z1");
     test_checku(float2ufix_z(-3.5f, 8), 0, "float2ufix_z2");
     // unsigned literal: 32768 << 16 does not fit in a signed int
     test_checku(float2ufix_z(32768.0f, 16), 32768u << 16, "float2ufix_z3");
     test_checku(float2ufix_z(65536.0f, 16), UINT32_MAX, "float2ufix_z4");
     test_checku(float2ufix_z(INFINITY, 16), UINT32_MAX, "float2ufix_z5");
     test_checku(float2ufix_z(3.24999f, 2), 12, "float2ufix_z6");
     test_checku(float2ufix_z(3.25f, 2), 13, "float2ufix_z7");
     test_checku(float2ufix_z(3.0f, -1), 1, "float2ufix_z8"); // not very useful
     u32f.u = 0x7f012345;
     test_checku(float2ufix_z(u32f.f, 1), UINT32_MAX, "float2ufix_z9");
     u32f.u = 0xff012345;
     test_checku(float2ufix_z(u32f.f, 1), 0, "float2ufix_z10");
     printf("float2fix64_z\n");
     test_checki64(float2fix64_z(3.5f, 8), 0x380, "float2fix64_z1");
     test_checki64(float2fix64_z(-3.5f, 8), -0x380, "float2fix64_z2");
     test_checki64(float2fix64_z(32768.0f, 16), 32768ll << 16, "float2fix64_z3");
     test_checki64(float2fix64_z(65536.0f, 16), 65536ll << 16, "float2fix64_z4");
     test_checki64(float2fix64_z(65536.0f * 65536.0f * 32768.0f, 16), INT64_MAX, "float2fix64_z4b");
     test_checki64(float2fix64_z(INFINITY, 16), INT64_MAX, "float2fix64_z5");
     test_checki64(float2fix64_z(3.24999f, 2), 12, "float2fix64_z6");
     test_checki64(float2fix64_z(3.25f, 2), 13, "float2fix64_z7");
     test_checki64(float2fix64_z(-3.24999f, 2), -12, "float2fix64_z8");
     test_checki64(float2fix64_z(-3.25f, 2), -13, "float2fix64_z9");
     test_checki64(float2fix64_z(-3.0f, -1), -1, "float2fix64_z10"); // not very useful
     printf("float2ufix64_z\n");
     // unsigned results are checked with test_checku64 so that a failure prints
     // the values as unsigned (UINT64_MAX would otherwise be shown as -1)
     test_checku64(float2ufix64_z(3.5f, 8), 0x380, "float2ufix64_z1");
     test_checku64(float2ufix64_z(-3.5f, 8), 0, "float2ufix64_z2");
     test_checku64(float2ufix64_z(32768.0f, 16), 32768ull << 16, "float2ufix64_z3");
     test_checku64(float2ufix64_z(65536.0f, 16), 65536ull << 16, "float2ufix64_z4");
     test_checku64(float2ufix64_z(65536.0f * 65536.0f * 65536.0f, 16), UINT64_MAX, "float2ufix64_z4b");
     test_checku64(float2ufix64_z(INFINITY, 16), UINT64_MAX, "float2ufix64_z5");
     test_checku64(float2ufix64_z(3.24999f, 2), 12, "float2ufix64_z6");
     test_checku64(float2ufix64_z(3.25f, 2), 13, "float2ufix64_z7");
     test_checku64(float2ufix64_z(3.0f, -1), 1, "float2ufix64_z8"); // not very useful
     // The expected values below show rounding towards -infinity (e.g. -0.25f -> -1).
     printf("float2int\n");
     test_checki(float2int(0.0f), 0, "float2int1");
     test_checki(float2int(0.25f), 0, "float2int1b");
     test_checki(float2int(0.5f), 0, "float2int2");
     test_checki(float2int(0.75f), 0, "float2int2b");
     test_checki(float2int(1.0f), 1, "float2int3");
     test_checki(float2int(-10.0f), -10, "float2int3a");
     test_checki(float2int(-0.0f), 0, "float2int3b");
     test_checki(float2int(-0.25f), -1, "float2int4");
     test_checki(float2int(-0.5f), -1, "float2int4b");
     test_checki(float2int(-0.75f), -1, "float2int5");
     test_checki(float2int(-1.0f), -1, "float2int5b");
     // todo test correct rounding around maximum precision
     test_checki(float2int(2147483647.0f), INT32_MAX, "float2int6");
     test_checki(float2int(21474836470.0f), INT32_MAX, "float2int7");
     test_checki(float2int(-2147483648.0f), INT32_MIN, "float2int8");
     test_checki(float2int(-21474836480.0f), INT32_MIN, "float2int9");
     test_checki(float2int(-2.5f), -3, "float2int10");
     test_checki(float2int(-2.4f), -3, "float2int11");
     printf("float2uint\n");
     test_checku(float2uint(0.0f), 0, "float2uint1");
     test_checku(float2uint(0.25f), 0, "float2uint2");
     test_checku(float2uint(0.5f), 0, "float2uint3");
     test_checku(float2uint(0.75f), 0, "float2uint4");
     test_checku(float2uint(1.0f), 1, "float2uint5");
     test_checku(float2uint(2147483647.0f), INT32_MAX+1u, "float2uint6"); // note loss of precision
     test_checku(float2uint(2147483648.0f), INT32_MAX+1u, "float2uint7");
     test_checku(float2uint(4294967294.5f), UINT32_MAX, "float2uint8"); // note loss of precision
     test_checku(float2uint(4294967295.0f), UINT32_MAX, "float2uint9");
     test_checku(float2uint(42949672950.0f), UINT32_MAX, "float2uint10");
     printf("float2int64\n");
     test_checki64(float2int64(0.0f), 0, "float2int641");
     test_checki64(float2int64(0.25f), 0, "float2int641b");
     test_checki64(float2int64(0.5f), 0, "float2int642");
     test_checki64(float2int64(0.75f), 0, "float2int642b");
     test_checki64(float2int64(1.0f), 1, "float2int643");
     test_checki64(float2int64(-10.0f), -10, "float2int643a");
     test_checki64(float2int64(-0.0f), 0, "float2int643b");
     test_checki64(float2int64(-0.25f), -1, "float2int644");
     test_checki64(float2int64(-0.5f), -1, "float2int644b");
     test_checki64(float2int64(-0.75f), -1, "float2int645");
     test_checki64(float2int64(-1.0f), -1, "float2int645b");
     // todo test correct rounding around maximum precision
     test_checki64(float2int64(2147483647.0f), INT32_MAX+1ll, "float2int646");
     test_checki64(float2int64(21474836470.0f), 21474836480ll, "float2int647"); // note loss of precision
     test_checki64(float2int64(-2147483648.0f), INT32_MIN, "float2int648");
     test_checki64(float2int64(-21474836480.0f), -21474836480ll, "float2int649");
     test_checki64(float2int64(-2.5f), -3, "float2int6410");
     test_checki64(float2int64(-2.4f), -3, "float2int6411");
     printf("float2uint64\n");
     test_checku64(float2uint64(0.0f), 0, "float2uint641");
     test_checku64(float2uint64(0.25f), 0, "float2uint642");
     test_checku64(float2uint64(0.5f), 0, "float2uint643");
     test_checku64(float2uint64(0.75f), 0, "float2uint644");
     test_checku64(float2uint64(1.0f), 1, "float2uint645");
     test_checku64(float2uint64(2147483647.0f), INT32_MAX+1u, "float2uint646"); // note loss of precision
     test_checku64(float2uint64(2147483648.0f), INT32_MAX+1u, "float2uint647");
     test_checku64(float2uint64(4294967294.5f), 4294967296ull, "float2uint648"); // note loss of precision
     test_checku64(float2uint64(4294967295.0f), 4294967296ull, "float2uint649"); // note loss of precision
     test_checku64(float2uint64(42949672950.0f), 42949672960ull, "float2uint6410"); // note loss of precision
 #endif
     // // These methods round towards 0.
     printf("float2int_z\n");
     test_checki(float2int_z(0.0f), 0, "float2int_z1");
     test_checki(float2int_z(0.25f), 0, "float2int_z1b");
     test_checki(float2int_z(0.5f), 0, "float2int_z2");
     test_checki(float2int_z(0.75f), 0, "float2int_z2b");
     test_checki(float2int_z(1.0f), 1, "float2int_z3");
     test_checki(float2int_z(-10.0f), -10, "float2int_z3a");
     test_checki(float2int_z(-0.0f), 0, "float2int_z3b");
     test_checki(float2int_z(-0.25f), 0, "float2int_z4");
     test_checki(float2int_z(-0.5f), 0, "float2int_z4b");
     test_checki(float2int_z(-0.75f), 0, "float2int_z5");
     test_checki(float2int_z(-1.0f), -1, "float2int_z5b");
     // todo test correct rounding around maximum precision
     test_checki(float2int_z(2147483647.0f), INT32_MAX, "float2int_z6");
     test_checki(float2int_z(21474836470.0f), INT32_MAX, "float2int_z7");
     test_checki(float2int_z(-2147483648.0f), INT32_MIN, "float2int_z8");
     test_checki(float2int_z(-21474836480.0f), INT32_MIN, "float2int_z9");
     test_checki(float2int_z(-2.5f), -2, "float2int_z10");
     test_checki(float2int_z(-2.4f), -2, "float2int_z11");
     printf("float2int64_z\n");
     test_checki64(float2int64_z(0.0f), 0, "float2int64_z1");
     test_checki64(float2int64_z(0.25f), 0, "float2int64_z1b");
     test_checki64(float2int64_z(0.5f), 0, "float2int64_z2");
     test_checki64(float2int64_z(0.75f), 0, "float2int64_z2b");
     test_checki64(float2int64_z(1.0f), 1, "float2int64_z3");
     test_checki64(float2int64_z(-10.0f), -10, "float2int64_z3a");
     test_checki64(float2int64_z(-0.0f), 0, "float2int64_z3b");
     test_checki64(float2int64_z(-0.25f), 0, "float2int64_z4");
     test_checki64(float2int64_z(-0.5f), 0, "float2int64_z4b");
     test_checki64(float2int64_z(-0.75f), 0, "float2int64_z5");
     test_checki64(float2int64_z(-1.0f), -1, "float2int64_z5b");
     test_checki64(float2int64_z(2147483647.0f), 2147483648ll, "float2int64_z6"); // note loss of precision
     test_checki64(float2int64_z(21474836470.0f), 21474836480ll, "float2int64_z7"); // note loss of precision
     test_checki64(float2int64_z(-2147483648.0f), INT32_MIN, "float2int64_z8");
     test_checki64(float2int64_z(-21474836480.0f), -21474836480ll, "float2int64_z9");
     test_checki64(float2int64_z(-2.5f), -2, "float2int64_z10");
     test_checki64(float2int64_z(-2.4f), -2, "float2int64_z11");
     printf("float2uint_z\n");
     test_checku(float2uint_z(0.0f), 0, "float2uint_z1");
     test_checku(float2uint_z(0.25f), 0, "float2uint_z2");
     test_checku(float2uint_z(0.5f), 0, "float2uint_z3");
     test_checku(float2uint_z(0.75f), 0, "float2uint_z4");
     test_checku(float2uint_z(1.0f), 1, "float2uint_z5");
     test_checku(float2uint_z(2147483647.0f), INT32_MAX+1u, "float2uint_z6"); // note loss of precision
     test_checku(float2uint_z(2147483648.0f), INT32_MAX+1u, "float2uint_z7");
     // todo test correct rounding around maximum precision
     test_checku(float2uint_z(4294967294.5f), UINT32_MAX, "float2uint_z8"); // note loss of precision
     test_checku(float2uint_z(4294967295.0f), UINT32_MAX, "float2uint_z9");
     test_checku(float2uint_z(42949672950.0f), UINT32_MAX, "float2uint_z10");
     printf("float2uint64_z\n");
     test_checku64(float2uint64_z(0.0f), 0, "float2uint64_z1");
     test_checku64(float2uint64_z(0.25f), 0, "float2uint64_z2");
     test_checku64(float2uint64_z(0.5f), 0, "float2uint64_z3");
     test_checku64(float2uint64_z(0.75f), 0, "float2uint64_z4");
     test_checku64(float2uint64_z(1.0f), 1, "float2uint64_z5");
     test_checku64(float2uint64_z(2147483647.0f), INT32_MAX+1u, "float2uint64_z6"); // note loss of precision
     test_checku64(float2uint64_z(2147483648.0f), INT32_MAX+1u, "float2uint64_z7");
     test_checku64(float2uint64_z(4294967294.5f), 4294967296ull, "float2uint64_z8"); // note loss of precision
     test_checku64(float2uint64_z(4294967295.0f), 4294967296ull, "float2uint64_z9"); // note loss of precision
     test_checku64(float2uint64_z(42949672950.0f), 42949672960ull, "float2uint64_z10"); // note loss of precision
     // float exp10f(float x);
     // void sincosf(float x, float *sinx, float *cosx);
     // float powintf(float x, int y);
     return rc;
 }
 // Program entry: initialise stdio, run the float checks, then report
 // PASSED when test() returned zero and FAILED otherwise.
 int main() {
     stdio_init_all();
     const int all_ok = (test() == 0);
     if (all_ok) {
         printf("PASSED\n");
     } else {
         printf("FAILED\n");
     }
 }