rationalize pico_float/pico_double libraries (#2208)

* on RP2350 _dcp variant now enables -msoft-float, since if you're using this at all it is likely because you don't want to use the VFP unit at all (to save stack space) * implement all float_ and double_ conversion functions in all pico_float_pico_ variants and pico_double_pico on RP2040 and RP2350 (many were missing in some combinations) * provide better granularity of what functions are wrapped in each case also marked custom_xxx_funcs_test.c as not in bazel build yet
2025-08-07 17:02:52 +03:00 · 2025-02-04 16:19:17 -06:00
parent 7d450bf097
commit e85c3e5515
17 changed files with 2012 additions and 142 deletions
--- a/src/rp2_common/hardware_dma/include/hardware/dma.h
+++ b/src/rp2_common/hardware_dma/include/hardware/dma.h
@@ -535,7 +535,7 @@ static inline void dma_channel_start(uint channel) {
 *\endcode
 *
 * \if rp2350_specific
- * RP2350 only: Due to errata RP12350-E5 (see the RP2350 datasheet for further detail), it is necessary to clear the enable bit of
+ * RP2350 only: Due to errata RP2350-E5 (see the RP2350 datasheet for further detail), it is necessary to clear the enable bit of
 * the aborted channel and any chained channels prior to the abort to prevent re-triggering.
 * \endif
 *
--- a/src/rp2_common/pico_double/double_aeabi_dcp.S
+++ b/src/rp2_common/pico_double/double_aeabi_dcp.S
@@ -7,7 +7,7 @@
 #include "pico/asm_helper.S"

 #if !HAS_DOUBLE_COPROCESSOR
-#error attempt to compile double_aeabi_rp2350 when there is no DCP
+#error attempt to compile double_aeabi_dcp when there is no DCP
 #else

 #include "hardware/dcp_instr.inc.S"
@@ -29,7 +29,7 @@ double_section WRAPPER_FUNC_NAME(\func)

 // ============== STATE SAVE AND RESTORE ===============

-.macro saving_func type func
+.macro saving_func type func, opt_label1='-', opt_label2='-'
  // Note we are usually 32-bit aligned already at this point, as most of the
  // function bodies contain exactly two 16-bit instructions: bmi and bx lr.
  // We want the PCMP word-aligned.
@@ -41,6 +41,12 @@ double_section WRAPPER_FUNC_NAME(\func)
  push {lr}              // 16-bit instruction
  bl generic_save_state  // 32-bit instruction
  b 1f                   // 16-bit instruction
+.ifnc \opt_label1,'-'
+regular_func \opt_label1
+.endif
+.ifnc \opt_label2,'-'
+regular_func \opt_label2
+.endif
  // This is the actual entry point:
 \type\()_func \func
  PCMP apsr_nzcv
@@ -128,53 +134,124 @@ saving_func wrapper sqrt
  dcp_dsqrt_m r0,r1,r0,r1,r0,r1,r2,r3,r12
  saving_func_return

-// todo not a real thing
-double_wrapper_section __aeabi_dclassify
-saving_func wrapper __aeabi_dclassify
-@ with correct rounding
+double_section dclassify
+saving_func regular dclassify
  dcp_dclassify_m apsr_nzcv,r0,r1
  saving_func_return

 // ============== CONVERSION FUNCTIONS ===============

 double_wrapper_section __aeabi_d2f
-saving_func wrapper __aeabi_d2f
+saving_func wrapper __aeabi_d2f double2float
@ with rounding
  dcp_double2float_m r0,r0,r1
  saving_func_return

 double_wrapper_section __aeabi_i2d
-saving_func wrapper __aeabi_i2d
+saving_func wrapper __aeabi_i2d int2double
  dcp_int2double_m r0,r1,r0
  saving_func_return

 double_wrapper_section __aeabi_ui2d
-saving_func wrapper __aeabi_ui2d
+saving_func wrapper __aeabi_ui2d uint2double
  dcp_uint2double_m r0,r1,r0
  saving_func_return

+double_section double2fix_z
+saving_func regular double2fix_z
+  ubfx r3, r1, #20, #11
+  adds r3, r2
+  beq 1f // very small; we don't care that we might make a denormal
+  asrs ip, r3, #11
+  beq 1f
+  ite pl
+  movpl r3, #0x7ff
+  movsmi r3, #0
+1:
+  bfi r1, r3, #20, #11
+  b double2int_z_entry
+
+double_section double2ufix
+saving_func regular double2ufix_z double2ufix
+double2ufix_z_entry:
+  ubfx r3, r1, #20, #11
+  adds r3, r2
+  beq 1f // very small; we don't care that we might make a denormal
+  asrs ip, r3, #11
+  beq 1f
+  ite pl
+  lsrspl r3, r1, #20 // 0x7ff
+  movsmi r3, #0
+1:
+  bfi r1, r3, #20, #11
+  b double2uint_z_entry
+
+double_section double2fix
+saving_func regular double2fix
+  ubfx r3, r1, #20, #11
+  cbz r3, 2f // 0 or denormal
+  adds r3, r2
+  beq 1f // very small; we don't care that we might make a denormal
+  asrs ip, r3, #11
+  beq 1f
+  ite pl
+  movpl r3, #0x7ff
+  movsmi r3, #0
+1:
+  bfi r1, r3, #20, #11
+  b double2int_entry
+2:
+  movs r0, #0
+saving_func_return
+
+
+double_section double2int
+saving_func regular double2int
+double2int_entry:
+  lsls r2, r1, #1 // r2 = high word << 1; carry = sign bit
+  bcc double2int_z_entry // positive is ok for int_z
+  lsrs r3, r2, #21 // r3 = biased exponent
+  beq double2int_z_entry // 0 or -0 or denormal is ok for int_z
+
+  lsrs r2, #21
+  adds r2, #1
+  subs r2, r2, #0x400 // r2 = biased exponent - 0x3ff
+  bcc 1f // <1 means subtract 1
+  cmp r2, #31
+  bge double2int_z_entry // must be an integer or maxed out
+  lsls r3, r1, #12
+  adds r3, r3, r0, lsr #20 // r3 now has highest 32 mantissa bits
+  lsls r3, r2
+  orrs r3, r3, r0, lsl #12 // these bits are all guaranteed to be in the fraction
+  beq double2int_z_entry // integer
+1:
+  dcp_double2int_m r0,r0,r1 // same conversion macro as used at double2int_z_entry
+  subs r0, #1 // negative non-integer input: step the result down by one
+saving_func_return
+
 double_wrapper_section __aeabi_d2iz
-saving_func wrapper __aeabi_d2iz
+saving_func wrapper __aeabi_d2iz double2int_z
+double2int_z_entry:
@ with truncation towards 0
  dcp_double2int_m r0,r0,r1
+  // note: this works with either saved or not saved call as it is just a `bx lr`
  saving_func_return

 double_wrapper_section __aeabi_d2uiz
-saving_func wrapper __aeabi_d2uiz
+saving_func wrapper __aeabi_d2uiz double2uint double2uint_z
+double2uint_z_entry:
@ with truncation towards 0
  dcp_double2uint_m r0,r0,r1
  saving_func_return

-// todo not a real thing
-double_wrapper_section __aeabi_d2i_r
-saving_func wrapper __aeabi_d2i_r
+double_section double2int_r
+saving_func regular double2int_r
@ with rounding
  dcp_double2int_r_m r0,r0,r1
  saving_func_return

-// todo not a real thing
-double_wrapper_section __aeabi_d2ui_r
-saving_func wrapper __aeabi_d2ui_r
+double_section double2uint_r
+saving_func regular double2uint_r
@ with rounding
  dcp_double2uint_r_m r0,r0,r1
  saving_func_return
@@ -189,7 +266,6 @@ saving_func wrapper __aeabi_dcmpun
  saving_func_return

 double_wrapper_section __aeabi_dcmp
-
 saving_func wrapper __aeabi_cdrcmple
  dcp_dcmp_m apsr_nzcv,r2,r3,r0,r1 // with arguments reversed
  bvs cmp_nan
--- a/src/rp2_common/pico_double/double_aeabi_rp2040.S
+++ b/src/rp2_common/pico_double/double_aeabi_rp2040.S
@@ -425,6 +425,7 @@ double_wrapper_section __aeabi_ui2d
 double_wrapper_section __aeabi_i2d

 wrapper_func __aeabi_ui2d
+regular_func uint2double
    movs r1, #0
    cmp r0, #0
    bne 2f
@@ -432,6 +433,7 @@ wrapper_func __aeabi_ui2d
    bx lr
 // double FUNC_NAME(__aeabi_i2d)(int)                     integer to double (double precision) conversion
 wrapper_func __aeabi_i2d
+regular_func int2double
    asrs r1, r0, #31
    eors r0, r1
    subs r0, r1
@@ -506,6 +508,7 @@ regular_func double2int
 // unsigned FUNC_NAME(__aeabi_d2uiz)(double)             double (double precision) to unsigned C-style conversion [3]
 double_wrapper_section __aeabi_d2uiz
 wrapper_func __aeabi_d2uiz
+regular_func double2uint_z
 regular_func double2uint
    shimmable_table_tail_call SF_TABLE_FLOAT2UINT double2uint_shim

@@ -528,11 +531,13 @@ regular_func ufix642double
 // double FUNC_NAME(__aeabi_l2d)(long long)             long long to double (double precision) conversion
 double_wrapper_section __aeabi_l2d
 wrapper_func __aeabi_l2d
+regular_func int642double
    shimmable_table_tail_call SF_TABLE_INT642FLOAT int642double_shim

 // double FUNC_NAME(__aeabi_l2f)(long long)             long long to double (double precision) conversion
 double_wrapper_section __aeabi_ul2d
 wrapper_func __aeabi_ul2d
+regular_func uint642double
    shimmable_table_tail_call SF_TABLE_UINT642FLOAT uint642double_shim

 // long long FUNC_NAME(__aeabi_d2lz)(double)             double (double precision) to long long C-style conversion [3]
@@ -566,22 +571,106 @@ regular_func double2int64
 // unsigned long long FUNC_NAME(__aeabi_d2ulz)(double)     double to unsigned long long C-style conversion [3]
 double_wrapper_section __aeabi_d2ulz
 wrapper_func __aeabi_d2ulz
+regular_func double2uint64
+regular_func double2uint64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 double2uint64_shim

+double_section double2fix64_z
+regular_func double2fix64_z
+  lsls r3, r1, #1
+  bcc double2fix64 // input positive is ok for fix64
+  mov ip, r2
+  asrs r2, r3, #21
+  beq 3f           // input zero or denormal, so just return zero
+  adds r2, #1
+  beq double2fix64 // input infinite/nan is ok for fix64
+
+  lsrs r3, #21
+  add r3, ip
+  movs r2, #1
+  negs r2, r2
+  lsrs r2, #22
+  subs r3, r2 // r3 = modified e - 0x3ff
+
+  bcc 3f // modified input < 1.0 means result is zero
+  cmp r3, #52
+  bge 2f // modified input must be an integer or infinite
+
+  adds r3, #12
+  mov r2, r1
+  lsls r2, r2, r3    // r2 has remaining fractional mantissa bits of r1
+  bne 1f             // not integer as non zero fractional bits remain
+  subs r3, #32
+  asrs r2, r3, #31
+  bics r3, r3, r2
+  movs r2, r0
+  lsls r2, r2, r3
+  bne 1f             // remaining fractional bits are non-zero, so argument was not an integer
+2:
+  // integer
+  mov r2, ip
+  b double2fix64
+3: // result is zero
+  movs r0, #0
+  movs r1, #0
+  bx lr
+1:
+  push {lr}
+  mov r2, ip
+  bl double2fix64
+  movs r2, #0
+  adds r0, #1
+  adcs r1, r2
+  pop {pc}
+
 double_section double2fix64
 regular_func double2fix64
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 double2fix64_shim

 double_section double2ufix64
 regular_func double2ufix64
+regular_func double2ufix64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 double2ufix64_shim

 double_section double2fix
 regular_func double2fix
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX double2fix_shim

+double_section double2fix_z
+regular_func double2fix_z
+  lsls r3, r1, #1 // drop the sign bit of the high word
+  asrs r3, #21 // r3 = exponent field, sign-extended (all-ones exponent becomes -1)
+  beq 2f // input is zero or denormal
+  adds r3, #1
+  beq 3f // input is infinite or nan
+
+  // extract exponent again
+  lsls r3, r1, #1
+  lsrs r3, #21
+  // adjust
+  adds r3, r2
+  ble 2f // adjusted input is zero or denormal or < 1
+  lsrs r3, r3, #11
+  bne 3f // adjusted exponent no longer fits in 11 bits, so saturate
+
+  lsls r2, r2, #20 // align exponent adjustment offset
+  adds r1, r1, r2  // we know adjustment is safe
+  b double2int_z
+2:
+  // result is zero
+  movs r0, #0
+  bx lr
+3:
+  movs r0, #0 // saturate: build 0x7fffffff, then flip to 0x80000000 if input was negative
+  subs r0, #1
+  lsrs r0, #1 // r0 = 0x7fffffff
+  asrs r1, #31 // r1 = 0 for positive input, 0xffffffff for negative
+  eors r0, r1
+  bx lr
+
 double_section double2ufix
 regular_func double2ufix
+regular_func double2ufix_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX double2ufix_shim

 double_wrapper_section __aeabi_d2f
--- a/src/rp2_common/pico_double/double_conv_m33.S
+++ b/src/rp2_common/pico_double/double_conv_m33.S
@@ -249,7 +249,69 @@ regular_func ufix2double
 movs r1,#0
 bx r14

-double_wrapper_section conv_dtoi64
+double_section conv_dtoi64
+regular_func double2int64
+  lsls r3, r1, #1 // r3 = high word << 1; carry = sign bit
+  bcc double2int64_z // input positive is ok for int64_z
+  cmp r3, #0xffe00000
+  bcs double2int64_z // exponent field is all ones: input is infinite (or NaN)
+  lsrs r3, #21 // r3 = biased exponent
+  beq 2f // input zero or denormal, means answer remains zero
+  sub r3, #0x3ff // r3 = biased exponent - 0x3ff
+  cmp r3, #0
+  blt 1f // input is less than 1.0
+  cmp r3, #52
+  bge double2int64_z // no fractional mantissa bits remain, so input must be an integer
+  adds r3, #12
+  lsls r2, r1, r3    // r2 has remaining fractional mantissa bits of r1
+  bne 1f             // not integer as non zero fractional bits remain
+  subs r3, #32
+  bics r3, r3, r3, asr #31 // map negative shift to zero
+  lsls r3, r0, r3
+  beq double2int64_z   // remaining fractional bits are 0, so argument was an integer
+1:
+  push {lr} // negative non-integer: call the _z variant, then subtract 1 from the 64-bit result
+  bl double2int64_z
+  subs r0, #1
+  sbcs r1, r1, #0 // propagate the borrow into the high word
+  pop {pc}
+2:
+  movs r0, #0
+  movs r1, #0
+  bx lr
+
+double_section conv_dtofix64
+regular_func double2fix64
+  lsls r3, r1, #1
+  bcc double2fix64_z // input positive is ok for fix64_z
+  cmp r3, #0xffe00000
+  bcs double2fix64_z // input is infinite
+  lsrs r3, #21
+  beq 2f // input zero or denormal, means answer remains zero
+  sub r3, #0x3ff
+  adds r3, r2
+  blt 1f // modified input zero or denormal, or less than 1.0
+  cmp r3, #52
+  bge double2fix64_z // modified input must be an integer or infinite
+  adds r3, #12
+  lsls ip, r1, r3    // ip has remaining fractional mantissa bits of r1
+  bne 1f             // not integer as non zero fractional bits remain
+  subs r3, #32
+  bics r3, r3, r3, asr #31 // map negative shift to zero
+  lsls r3, r0, r3
+  beq double2fix64_z   // remaining fractional bits are 0, so argument was an integer
+1:
+  push {lr}
+  bl double2fix64_z
+  subs r0, #1
+  sbcs r1, r1, #0
+  pop {pc}
+2:
+  movs r0, #0
+  movs r1, #0
+  bx lr
+
+double_wrapper_section conv_dtoi64_z

@ convert double to signed int64, rounding towards 0, clamping
 wrapper_func __aeabi_d2lz
--- a/src/rp2_common/pico_double/double_fma_dcp.S
+++ b/src/rp2_common/pico_double/double_fma_dcp.S
@@ -582,7 +582,7 @@ wrapper_func fma
 saving_func_return


-double_wrapper_section __dmla
+double_section fma_fast
@ cf saving_func macro: but here we need to record the SP before the state save possibly changes it
 1:
 push {lr}              // 16-bit instruction
@@ -592,6 +592,7 @@ double_wrapper_section __dmla
@ r0:r1 m
@ r2:r3 n
@ [r13,#0] a
+regular_func fma_fast
 regular_func mla
 mov r12,sp                  @ save the SP
 PCMP apsr_nzcv              @ test the engaged flag
--- a/src/rp2_common/pico_double/include/pico/double.h
+++ b/src/rp2_common/pico_double/include/pico/double.h
@@ -16,50 +16,153 @@ extern "C" {
 #endif

 /** \file double.h
-*  \defgroup pico_double pico_double
+* \defgroup pico_double pico_double
 *
 * \brief Optimized double-precision floating point functions
 *
-* (Replacement) optimized implementations are provided of the following compiler built-ins
-* and math library functions:
+* An application can take control of the floating point routines used in the application over and above what is provided by the compiler,
+* by depending on the pico_double library. A user might want to do this:
 *
-* - __aeabi_dadd, __aeabi_ddiv, __aeabi_dmul, __aeabi_drsub, __aeabi_dsub, __aeabi_cdcmpeq, __aeabi_cdrcmple, __aeabi_cdcmple, __aeabi_dcmpeq, __aeabi_dcmplt, __aeabi_dcmple, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmpun, __aeabi_i2d, __aeabi_l2d, __aeabi_ui2d, __aeabi_ul2d, __aeabi_d2iz, __aeabi_d2lz, __aeabi_d2uiz, __aeabi_d2ulz, __aeabi_d2f
-* - sqrt, cos, sin, tan, atan2, exp, log, ldexp, copysign, trunc, floor, ceil, round, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh, exp2, log2, exp10, log10, pow,, hypot, cbrt, fmod, drem, remainder, remquo, expm1, log1p, fma
-* - powint, sincos (GNU extensions)
+* 1. To use optimized software implementations provided by the RP2-series device's bootrom or the SDK
+* 2. To use optimized combined software/hardware implementations utilizing custom RP2-series hardware for acceleration
+* 3. To control the amount of C compiler/library code bloat
+* 4. To make sure no floating point is called at all
 *
-* The following additional optimized functions are also provided:
+* The pico_double library comes in three main flavors:
 *
-* - int2double, uint2double, int642double, uint642double, fix2double, ufix2double, fix642double, ufix642double
-* - double2fix, double2ufix, double2fix64, double2ufix64, double2int, double2uint, double2int64, double2uint64, double2int_z, double2int64_z,
-* - exp10, sincos, powint
+* 1. `pico_double_none` - all floating point operations cause a \ref panic - no double-precision floating point code is included
+* 2. `pico_double_compiler` - no custom functions are provided; all double-precision floating point is handled by the C compiler/library
+* 3. `pico_double_pico` - the smallest and fastest available for the platform, along with additional functionality (e.g. fixed point conversions) which are detailed below
 *
-* On RP2350 the following additional functions are available; the _fast methods are faster but do not round correctly"
+* The user can control which version they want (e.g. **pico_double_xxx**) by either setting the CMake global variable
+* `PICO_DEFAULT_DOUBLE_IMPL=xxx`, or by using the CMake function `pico_set_double_implementation(<TARGET> xxx)`. Note that in the absence
+* of either, pico_double_pico is used by default.
 *
-* - ddiv_fast, sqrt_fast
+* \if rp2040_specific
+* On RP2040, `pico_double_pico` uses optimized hand coded implementations from the bootrom and the SDK for both
+* basic double-precision floating point operations and floating point math library functions. These implementations
+* are generally faster and smaller than those provided by the C compiler/library, though they don't support all the features of a fully compliant
+* floating point implementation; they are however usually fine for the majority of cases
+* \endif
+*
+* \if rp2350_specific
+* On RP2350, `pico_double_pico` uses RP2350 DCP instructions (double co-processor) to implement fast versions of the basic
+* arithmetic functions, and provides optimized M33 implementations of trigonometric and scientific functions.
+* These implementations are generally faster and smaller than those provided by the C compiler/library, though they don't support all the features of a fully compliant
+* floating point implementation; they are however usually fine for the majority of cases
+* \endif
+*
+* On Arm, (replacement) optimized implementations are provided for the following compiler built-ins
+* and math library functions when using `pico_double_pico`:
+*
+* - basic arithmetic:
+*
+*   __aeabi_dadd, __aeabi_ddiv, __aeabi_dmul, __aeabi_drsub, __aeabi_dsub
+*
+* - comparison:
+*
+*   __aeabi_cdcmpeq, __aeabi_cdrcmple, __aeabi_cdcmple, __aeabi_dcmpeq, __aeabi_dcmplt, __aeabi_dcmple, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmpun
+*
+* - (u)int32 <-> double:
+*
+*    __aeabi_i2d, __aeabi_ui2d, __aeabi_d2iz, __aeabi_d2uiz
+*
+* - (u)int64 <-> double:
+*
+*   __aeabi_l2d, __aeabi_ul2d, __aeabi_d2lz, __aeabi_d2ulz
+*
+* - double -> float:
+*
+*   __aeabi_d2f
+*
+* - basic trigonometric:
+*
+*   sqrt, cos, sin, tan, atan2, exp, log
+*
+* - trigonometric and scientific
+*
+*   ldexp, copysign, trunc, floor, ceil, round, asin, acos, atan, sinh, cosh, tanh, asinh, acosh, atanh, exp2, log2, exp10, log10, pow, hypot, cbrt, fmod, drem, remainder, remquo, expm1, log1p, fma
+*
+* - GNU extensions:
+*
+*   powint, sincos
+*
+* On Arm, the following additional optimized functions are also provided when using `pico_double_pico`:
+*
+* - Conversions to/from integer types:
+*
+*   - (u)int -> double (round to nearest):
+*
+*     int2double, uint2double, int642double, uint642double
+*
+*   - double -> (u)int (round towards zero):
+*
+*     double2int_z, double2uint_z, double2int64_z, double2uint64_z
+*
+*   - double -> (u)int (round towards -infinity):
+*
+*     double2int, double2uint, double2int64, double2uint64
+*
+* - Conversions to/from fixed point integers:
+*
+*   - (u)fix -> double (round to nearest):
+*
+*       fix2double, ufix2double, fix642double, ufix642double
+*
+*   - double -> (u)fix (round towards zero):
+*
+*       double2fix_z, double2ufix_z, double2fix64_z, double2ufix64_z
+*
+*   - double -> (u)fix (round towards -infinity):
+*
+*       double2fix, double2ufix, double2fix64, double2ufix64
+*
+* - Even faster versions of divide and square-root functions that do not round correctly:
+*
+*   ddiv_fast, sqrt_fast (these do not round correctly)
+*
+* - Faster unfused multiply and accumulate:
+*
+*   fma_fast, mla (two names for the same fast, unfused fma)
+*
+* \if rp2350_specific
+* On RISC-V there is no custom double-precision floating point support, so `pico_double_pico` is equivalent to `pico_double_compiler`
+* \endif
 */
+#if !defined(__riscv) || PICO_COMBINED_DOCS

+#if PICO_COMBINED_DOCS || !LIB_PICO_DOUBLE_COMPILER
 double int2double(int32_t i);
-double uint2double(uint32_t u);
+double uint2double(uint32_t i);
 double int642double(int64_t i);
-double uint642double(uint64_t u);
+double uint642double(uint64_t i);
 double fix2double(int32_t m, int e);
 double ufix2double(uint32_t m, int e);
 double fix642double(int64_t m, int e);
 double ufix642double(uint64_t m, int e);

-// These methods round towards -Infinity.
-int32_t double2fix(double d, int e);
-uint32_t double2ufix(double d, int e);
-int64_t double2fix64(double d, int e);
-uint64_t double2ufix64(double d, int e);
-int32_t double2int(double d);
-uint32_t double2uint(double d);
-int64_t double2int64(double d);
-uint64_t double2uint64(double d);
+// These methods round towards 0, which IS the C way; the unsigned variants return unsigned types
+int32_t double2int_z(double f);
+int64_t double2int64_z(double f);
+uint32_t double2uint_z(double f);
+uint64_t double2uint64_z(double f);
+int32_t double2fix_z(double f, int e);
+uint32_t double2ufix_z(double f, int e);
+int64_t double2fix64_z(double f, int e);
+uint64_t double2ufix64_z(double f, int e);

-// These methods round towards 0.
-int32_t double2int_z(double d);
-int64_t double2int64_z(double d);
+// These methods round towards -Infinity - which IS NOT the C way for negative numbers;
+// as such the naming is not ideal, however is kept for backwards compatibility
+int32_t double2int(double f);
+uint32_t double2uint(double f);
+int64_t double2int64(double f);
+uint64_t double2uint64(double f);
+int32_t double2fix(double f, int e);
+uint32_t double2ufix(double f, int e);
+int64_t double2fix64(double f, int e);
+uint64_t double2ufix64(double f, int e);
+
+#endif

 double exp10(double x);
 void sincos(double x, double *sinx, double *cosx);
@@ -67,8 +170,24 @@ double powint(double x, int y);

 #if !PICO_RP2040 || PICO_COMBINED_DOCS
 double ddiv_fast(double n, double d);
-double sqrt_fast(double d);
-double mla(double x, double y, double z); // note this is not fused
+double sqrt_fast(double f);
+double fma_fast(double x, double y, double z); // this is not fused
+double mla(double x, double y, double z); // another name for fma_fast
+#endif
+
+#endif
+
+#if LIB_PICO_DOUBLE_COMPILER || defined(__riscv)
+// when using the compiler; we provide as many functions as we trivially can, though in the double case they are not optimal
+static inline double int2double(int32_t i) { return (double)i; }
+static inline double uint2double(uint32_t i) { return (double)i; }
+static inline double int642double(int64_t i) { return (double)i; }
+static inline double uint642double(uint64_t i) { return (double)i; }
+
+static inline int32_t double2int_z(double d) { return (int32_t)d; } // C casts truncate towards zero
+static inline int64_t double2int64_z(double d) { return (int64_t)d; }
+static inline uint32_t double2uint_z(double d) { return (uint32_t)d; } // return type matches the unsigned cast
+static inline uint64_t double2uint64_z(double d) { return (uint64_t)d; }
 #endif

 #ifdef __cplusplus
@@ -76,4 +195,3 @@ double mla(double x, double y, double z); // note this is not fused
 #endif

 #endif
-
--- a/src/rp2_common/pico_float/BUILD.bazel
+++ b/src/rp2_common/pico_float/BUILD.bazel
@@ -2,13 +2,16 @@ load("//bazel:defs.bzl", "compatible_with_rp2", "incompatible_with_config")

 package(default_visibility = ["//visibility:public"])

-_WRAP_FLOAT_AEABI_FLAGS = [
+_WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS = [
     "-Wl,--wrap=__aeabi_fadd",
     "-Wl,--wrap=__aeabi_fdiv",
     "-Wl,--wrap=__aeabi_fmul",
     "-Wl,--wrap=__aeabi_frsub",
     "-Wl,--wrap=__aeabi_fsub",
+]
+
+_WRAP_FLOAT_AEABI_CMP_FLAGS = [  # comparison helpers; same membership as the NO_AEABI_CMP group in CMakeLists.txt
     "-Wl,--wrap=__aeabi_cfcmpeq",
    "-Wl,--wrap=__aeabi_cfrcmple",
    "-Wl,--wrap=__aeabi_cfcmple",
    "-Wl,--wrap=__aeabi_fcmpeq",
@@ -17,15 +20,27 @@ _WRAP_FLOAT_AEABI_FLAGS = [
    "-Wl,--wrap=__aeabi_fcmpge",
    "-Wl,--wrap=__aeabi_fcmpgt",
    "-Wl,--wrap=__aeabi_fcmpun",
+]
+
+_WRAP_FLOAT_AEABI_CONV_32_FLAGS = [  # (u)int32 <-> float; same membership as NO_AEABI_CONV_32 in CMakeLists.txt
     "-Wl,--wrap=__aeabi_i2f",
-    "-Wl,--wrap=__aeabi_l2f",
     "-Wl,--wrap=__aeabi_ui2f",
-    "-Wl,--wrap=__aeabi_ul2f",
     "-Wl,--wrap=__aeabi_f2iz",
-    "-Wl,--wrap=__aeabi_f2lz",
     "-Wl,--wrap=__aeabi_f2uiz",
+]
+
+_WRAP_FLOAT_AEABI_CONV_64_FLAGS = [  # (u)int64 <-> float; same membership as NO_AEABI_CONV_64 in CMakeLists.txt
+    "-Wl,--wrap=__aeabi_l2f",
+    "-Wl,--wrap=__aeabi_ul2f",
+    "-Wl,--wrap=__aeabi_f2lz",
     "-Wl,--wrap=__aeabi_f2ulz",
+]
+
+_WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS = [
    "-Wl,--wrap=__aeabi_f2d",
+]
+
+_WRAP_FLOAT_SQRTF_FLAGS = [
    "-Wl,--wrap=sqrtf",
 ]

@@ -36,13 +51,16 @@ _WRAP_FLOAT_SCI_FLAGS = [
    "-Wl,--wrap=atan2f",
    "-Wl,--wrap=expf",
    "-Wl,--wrap=logf",
+    "-Wl,--wrap=sincosf",  # gnu
+]
+
+_WRAP_FLOAT_SCI_EXTRA_FLAGS = [
    "-Wl,--wrap=ldexpf",
    "-Wl,--wrap=copysignf",
    "-Wl,--wrap=truncf",
    "-Wl,--wrap=floorf",
    "-Wl,--wrap=ceilf",
    "-Wl,--wrap=roundf",
-    "-Wl,--wrap=sincosf",  # gnu
    "-Wl,--wrap=asinf",
    "-Wl,--wrap=acosf",
    "-Wl,--wrap=atanf",
@@ -114,30 +132,31 @@ _PICO_FLOAT_IMPLS = [
        ],
        "compatibility": incompatible_with_config("@platforms//cpu:riscv32") + ["//bazel/constraint:rp2040"],
        "extra_deps": [],
-        "linkopts": _WRAP_FLOAT_AEABI_FLAGS + _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS + _WRAP_FLOAT_AEABI_CMP_FLAGS + _WRAP_FLOAT_AEABI_CONV_32_FLAGS + _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS + _WRAP_FLOAT_SQRTF_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
    {
        "name": "dcp",
        "srcs": [
            "float_aeabi_dcp.S",
-            "float_conv_m33.S",
+            "float_common_m33.S",
            "float_math.c",
            "float_sci_m33.S",
        ],
        "compatibility": compatible_with_rp2() + incompatible_with_config("@platforms//cpu:riscv32") + incompatible_with_config("//bazel/constraint:rp2040"),
        "extra_deps": ["//src/rp2_common/hardware_dcp"],
-        "linkopts": _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS + _WRAP_FLOAT_AEABI_CMP_FLAGS + _WRAP_FLOAT_AEABI_CONV_32_FLAGS + _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS + _WRAP_FLOAT_SQRTF_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
    {
        "name": "vfp",
        "srcs": [
+            "float_conv32_vfp.S",
            "float_sci_m33_vfp.S",
-            "float_conv_m33.S",
+            "float_common_m33.S",
            "float_math.c",
        ],
        "compatibility": compatible_with_rp2() + incompatible_with_config("@platforms//cpu:riscv32") + incompatible_with_config("//bazel/constraint:rp2040"),
        "extra_deps": ["//src/rp2_common/hardware_dcp"],
-        "linkopts": _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
    {
        "name": "single_hazard3",
@@ -146,7 +165,7 @@ _PICO_FLOAT_IMPLS = [
        ],
        "compatibility": compatible_with_rp2() + ["@platforms//cpu:riscv32"],
        "extra_deps": ["//src/rp2_common/hardware_hazard3"],
-        "linkopts": _WRAP_FLOAT_SCI_FLAGS,
+        "linkopts": _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    },
 ]

@@ -184,7 +203,7 @@ cc_library(
    hdrs = ["include/pico/float.h"],
    defines = ["LIB_PICO_FLOAT_PICO=0"],
    includes = ["include"],
-    linkopts = _WRAP_FLOAT_AEABI_FLAGS + _WRAP_FLOAT_SCI_FLAGS,
+    linkopts = _WRAP_FLOAT_AEABI_ARITHMETIC_FLAGS + _WRAP_FLOAT_AEABI_CMP_FLAGS + _WRAP_FLOAT_AEABI_CONV_32_FLAGS + _WRAP_FLOAT_AEABI_CONV_64_FLAGS + _WRAP_FLOAT_AEABI_CONV_DOUBLE_FLAGS + _WRAP_FLOAT_SQRTF_FLAGS + _WRAP_FLOAT_SCI_FLAGS + _WRAP_FLOAT_SCI_EXTRA_FLAGS,
    target_compatible_with = compatible_with_rp2(),
    visibility = ["//visibility:private"],
    deps = [
--- a/src/rp2_common/pico_float/CMakeLists.txt
+++ b/src/rp2_common/pico_float/CMakeLists.txt
@@ -18,13 +18,15 @@
            $<IF:$<BOOL:$<TARGET_PROPERTY:PICO_TARGET_FLOAT_IMPL>>,$<TARGET_PROPERTY:PICO_TARGET_FLOAT_IMPL>,${PICO_DEFAULT_FLOAT_IMPL}>)

    function(wrap_float_functions TARGET)
-        cmake_parse_arguments(WRAP_FLOAT "NO_WRAP_AEABI;NO_WRAP_SCI" "" "" ${ARGN} )
-        if (NOT WRAP_FLOAT_NO_WRAP_AEABI)
+        cmake_parse_arguments(WRAP_FLOAT "NO_AEABI_ARITHMETIC;NO_AEABI_CMP;NO_AEABI_CONV_32;NO_AEABI_CONV_64;NO_AEABI_CONV_DOUBLE;NO_SQRTF;NO_SCI;NO_SCI_EXTRA" "" "" ${ARGN} )
+        if (NOT WRAP_FLOAT_NO_AEABI_ARITHMETIC)
            pico_wrap_function(${TARGET} __aeabi_fadd)
            pico_wrap_function(${TARGET} __aeabi_fdiv)
            pico_wrap_function(${TARGET} __aeabi_fmul)
            pico_wrap_function(${TARGET} __aeabi_frsub)
            pico_wrap_function(${TARGET} __aeabi_fsub)
+        endif()
+        if (NOT WRAP_FLOAT_NO_AEABI_CMP)
            pico_wrap_function(${TARGET} __aeabi_cfcmpeq)
            pico_wrap_function(${TARGET} __aeabi_cfrcmple)
            pico_wrap_function(${TARGET} __aeabi_cfcmple)
@@ -34,32 +36,42 @@
            pico_wrap_function(${TARGET} __aeabi_fcmpge)
            pico_wrap_function(${TARGET} __aeabi_fcmpgt)
            pico_wrap_function(${TARGET} __aeabi_fcmpun)
+        endif()
+        if (NOT WRAP_FLOAT_NO_AEABI_CONV_32)
            pico_wrap_function(${TARGET} __aeabi_i2f)
-            pico_wrap_function(${TARGET} __aeabi_l2f)
            pico_wrap_function(${TARGET} __aeabi_ui2f)
-            pico_wrap_function(${TARGET} __aeabi_ul2f)
            pico_wrap_function(${TARGET} __aeabi_f2iz)
-            pico_wrap_function(${TARGET} __aeabi_f2lz)
            pico_wrap_function(${TARGET} __aeabi_f2uiz)
+        endif()
+        if (NOT WRAP_FLOAT_NO_AEABI_CONV_64)
+            pico_wrap_function(${TARGET} __aeabi_l2f)
+            pico_wrap_function(${TARGET} __aeabi_ul2f)
+            pico_wrap_function(${TARGET} __aeabi_f2lz)
            pico_wrap_function(${TARGET} __aeabi_f2ulz)
+        endif()
+        if (NOT WRAP_FLOAT_NO_AEABI_CONV_DOUBLE)
            pico_wrap_function(${TARGET} __aeabi_f2d)
+        endif()
+        # separate as we have a direct DCP version
+        if (NOT WRAP_FLOAT_NO_SQRTF)
            pico_wrap_function(${TARGET} sqrtf)
        endif()
-        if (NOT WRAP_FLOAT_NO_WRAP_SCI)
+        if (NOT WRAP_FLOAT_NO_SCI)
            pico_wrap_function(${TARGET} cosf)
            pico_wrap_function(${TARGET} sinf)
            pico_wrap_function(${TARGET} tanf)
            pico_wrap_function(${TARGET} atan2f)
            pico_wrap_function(${TARGET} expf)
            pico_wrap_function(${TARGET} logf)
-
+            pico_wrap_function(${TARGET} sincosf) # gnu
+        endif()
+        if (NOT WRAP_FLOAT_NO_SCI_EXTRA)
            pico_wrap_function(${TARGET} ldexpf)
            pico_wrap_function(${TARGET} copysignf)
            pico_wrap_function(${TARGET} truncf)
            pico_wrap_function(${TARGET} floorf)
            pico_wrap_function(${TARGET} ceilf)
            pico_wrap_function(${TARGET} roundf)
-            pico_wrap_function(${TARGET} sincosf) # gnu
            pico_wrap_function(${TARGET} asinf)
            pico_wrap_function(${TARGET} acosf)
            pico_wrap_function(${TARGET} atanf)
@@ -93,7 +105,9 @@
    )

    target_link_libraries(pico_float_none INTERFACE pico_float_headers)
-    wrap_float_functions(pico_float_none)
+    wrap_float_functions(pico_float_none) # we wrap all functions
+    # be explicit that there should be no floating point instructions
+    target_compile_options(pico_float_none INTERFACE -msoft-float)

    pico_add_library(pico_float_pico)
    if (PICO_RP2040)
@@ -107,21 +121,52 @@
        target_link_libraries(pico_float_pico INTERFACE pico_bootrom pico_float_headers hardware_divider)
    elseif(NOT PICO_RISCV)
        pico_add_library(pico_float_pico_dcp)
+        # todo what functions from float_math belong in each case; should some be left to GCC on RP2350?
        target_sources(pico_float_pico_dcp INTERFACE
                ${CMAKE_CURRENT_LIST_DIR}/float_math.c
                ${CMAKE_CURRENT_LIST_DIR}/float_aeabi_dcp.S
+                ${CMAKE_CURRENT_LIST_DIR}/float_common_m33.S
                ${CMAKE_CURRENT_LIST_DIR}/float_sci_m33.S
-                ${CMAKE_CURRENT_LIST_DIR}/float_conv_m33.S
                )

-        wrap_float_functions(pico_float_pico_dcp NO_WRAP_AEABI)
+        # NOTE the main reason for using pico_float_pico_dcp is presumably that you
+        # don't want to use VFP at all, so turn off compiler support, otherwise, it will inline usages
+        target_compile_options(pico_float_pico_dcp INTERFACE -msoft-float)
+
+        wrap_float_functions(pico_float_pico_dcp
+                # we wrap all functions as we don't want to use VFP (or compiler versions) at all
+                #NO_AEABI_ARITHMETIC
+                #NO_AEABI_CMP
+                #NO_AEABI_CONV_32
+                #NO_AEABI_CONV_64
+                #NO_AEABI_CONV_DOUBLE
+                #NO_SQRTF
+                #NO_SCI
+                #NO_SCI_EXTRA
+        )
+
        pico_add_library(pico_float_pico_vfp)
        target_sources(pico_float_pico_vfp INTERFACE
                ${CMAKE_CURRENT_LIST_DIR}/float_math.c
+                ${CMAKE_CURRENT_LIST_DIR}/float_conv32_vfp.S
+                ${CMAKE_CURRENT_LIST_DIR}/float_common_m33.S
                ${CMAKE_CURRENT_LIST_DIR}/float_sci_m33_vfp.S
-                ${CMAKE_CURRENT_LIST_DIR}/float_conv_m33.S
        )
-        wrap_float_functions(pico_float_pico_vfp NO_WRAP_AEABI)
+        wrap_float_functions(pico_float_pico_vfp
+                # for these 3, arguably the compiler is probably inlining anyway, but use the compiler's
+                # version for explicit AEABI calls
+                NO_AEABI_ARITHMETIC
+                NO_AEABI_CMP
+                NO_AEABI_CONV_32
+                #NO_AEABI_CONV_64   # we have optimized M33 versions
+                NO_AEABI_CONV_DOUBLE
+                # we don't have an optimized vfp or m33 sqrtf available
+                NO_SQRTF
+                #NO_SCI             # we have optimized VFP versions
+                #NO_SCI_EXTRA       # todo - are our versions better than what GCC provides?
+        )
+
+
        target_link_libraries(pico_float_pico INTERFACE
                pico_float_pico_vfp)
    else()
--- a/src/rp2_common/pico_float/float_aeabi_dcp.S
+++ b/src/rp2_common/pico_float/float_aeabi_dcp.S
@@ -5,15 +5,17 @@
 */

 #include "pico/asm_helper.S"
-#if HAS_DOUBLE_COPROCESSOR
+
+#if !HAS_DOUBLE_COPROCESSOR
+#error attempt to compile float_aeabi_dcp when there is no DCP
+#else
+
 #include "hardware/dcp_instr.inc.S"
 #include "hardware/dcp_canned.inc.S"

 pico_default_asm_setup

-// todo alignment
-//__pre_init __aeabi_float_init, 00020
-// factor out save/restore (there is a copy in double code)
+// todo factor out save/restore (there is a copy in double code)

 .macro float_section name
 #if PICO_FLOAT_IN_RAM
@@ -29,7 +31,7 @@ float_section WRAPPER_FUNC_NAME(\func)

 // ============== STATE SAVE AND RESTORE ===============

-.macro saving_func func
+.macro saving_func type func, opt_label1='-', opt_label2='-'
  // Note we are usually 32-bit aligned already at this point, as most of the
  // function bodies contain exactly two 16-bit instructions: bmi and bx lr.
  // We want the PCMP word-aligned.
@@ -41,8 +43,14 @@ float_section WRAPPER_FUNC_NAME(\func)
  push {lr}              // 16-bit instruction
  bl generic_save_state  // 32-bit instruction
  b 1f                   // 16-bit instruction
+.ifnc \opt_label1,'-'
+regular_func \opt_label1
+.endif
+.ifnc \opt_label2,'-'
+regular_func \opt_label2
+.endif
  // This is the actual entry point:
-wrapper_func \func
+\type\()_func \func
  PCMP apsr_nzcv
  bmi 1b
 1:
@@ -82,115 +90,208 @@ generic_restore_state:
 // ============== ARITHMETIC FUNCTIONS ===============

 float_wrapper_section __aeabi_fadd
-saving_func __aeabi_fadd
+saving_func wrapper __aeabi_fadd
  dcp_fadd_m r0,r0,r1
  saving_func_return

 float_wrapper_section __aeabi_fsub
-saving_func __aeabi_fsub
+saving_func wrapper __aeabi_fsub
  dcp_fsub_m r0,r0,r1
  saving_func_return

 float_wrapper_section __aeabi_frsub
-saving_func __aeabi_frsub
+saving_func wrapper __aeabi_frsub
  dcp_fsub_m r0,r1,r0
  saving_func_return

 float_wrapper_section __aeabi_fmul
-saving_func __aeabi_fmul
+saving_func wrapper __aeabi_fmul
  dcp_fmul_m r0,r0,r1,r0,r1
  saving_func_return

 float_section fdiv_fast
-saving_func fdiv_fast
+saving_func regular fdiv_fast
  dcp_fdiv_fast_m r0,r0,r1,r0,r1,r2
  saving_func_return

 float_wrapper_section __aeabi_fdiv
-saving_func __aeabi_fdiv
+saving_func wrapper __aeabi_fdiv
@ with correct rounding
  dcp_fdiv_m r0,r0,r1,r0,r1,r2,r3
  saving_func_return

 float_section sqrtf_fast
-saving_func sqrtf_fast
+saving_func regular sqrtf_fast
  dcp_fsqrt_fast_m r0,r0,r0,r1,r2,r3
  saving_func_return

 float_wrapper_section sqrtf
-saving_func sqrtf
+saving_func wrapper sqrtf
@ with correct rounding
  dcp_fsqrt_m r0,r0,r0,r1,r2,r3
  saving_func_return

-// todo not a real thing
-float_wrapper_section __aeabi_fclassify
-saving_func __aeabi_fclassify
+float_section fclassify
+saving_func regular fclassify
  dcp_fclassify_m apsr_nzcv,r0
  saving_func_return

 // ============== CONVERSION FUNCTIONS ===============

 float_wrapper_section __aeabi_f2d
-saving_func __aeabi_f2d
+saving_func wrapper __aeabi_f2d float2double
  dcp_float2double_m r0,r1,r0
  saving_func_return

 float_wrapper_section __aeabi_i2f
-saving_func __aeabi_i2f
+saving_func  wrapper __aeabi_i2f int2float
@ with rounding
  dcp_int2float_m r0,r0
  saving_func_return

 float_wrapper_section __aeabi_ui2f
-saving_func __aeabi_ui2f
+saving_func wrapper __aeabi_ui2f uint2float
@ with rounding
  dcp_uint2float_m r0,r0
  saving_func_return

+float_section float2fix_z
+regular_func float2fix_z // r0 = float, r1 = number of fractional bits; int32 result in r0, rounding towards zero
+  ubfx r2, r0, #23, #8 // r2 = biased exponent of the input
+  cbz r2, 2f // input is zero or denormal
+  cmp r2, #0xff
+  beq 3f // input infinite or nan
+  adds r2, r1 // scale by 2^r1 by adjusting the exponent
+  ble 2f // modified input is denormal so zero
+  cmp r2, #0xff
+  bge 3f // modified input is infinite or larger: saturate (bfi below keeps only 8 exponent bits)
+1:
+  bfi r0, r2, #23, #8
+  b float2int_z // use the checked entry point: this function has not run the PCMP save-state test itself
+2:
+  movs r0, #0
+  bx lr
+3:
+  mvn r1, #0x80000000
+  add r0, r1, r0, lsr#31 @ so -Inf → 0x80000000, +Inf → 0x7fffffff
+  bx lr
+
 float_wrapper_section __aeabi_f2iz
-saving_func __aeabi_f2iz
+saving_func wrapper __aeabi_f2iz float2int_z
@ with truncation towards 0
+float2int_z_entry:
  dcp_float2int_m r0,r0
  saving_func_return

+float_section __aeabi_f2ufix
+regular_func float2ufix // r0 = float, r1 = number of fractional bits; uint32 result in r0
+regular_func float2ufix_z // same entry point as float2ufix
+  ubfx r2, r0, #23, #8 // r2 = biased exponent of the input
+  cbz r2, 2f // input is zero or denormal
+  cmp r2, #0xff
+  beq 3f // input infinite or nan
+  adds r2, r1 // scale by 2^r1 by adjusting the exponent
+  ble 2f // modified input is denormal so zero
+  cmp r2, #0xff
+  bge 3f // modified input is infinite or larger: saturate (bfi below keeps only 8 exponent bits)
+1:
+  bfi r0, r2, #23, #8
+  b float2uint_z // use the checked entry point: this function has not run the PCMP save-state test itself
+2:
+  movs r0, #0
+  bx lr
+3:
+  mvn r0, r0, asr #31 // sign clear → 0xffffffff, sign set → 0
+  bx lr
+
 float_wrapper_section __aeabi_f2uiz
-saving_func __aeabi_f2uiz
+saving_func wrapper __aeabi_f2uiz float2uint_z float2uint
@ with truncation towards 0
+float2uint_z_entry:
  dcp_float2uint_m r0,r0
  saving_func_return

-// todo not a real thing
+float_section conv_f2fix
+saving_func regular float2fix // r0 = float, r1 = number of fractional bits; int32 result in r0
+  ubfx r2, r0, #23, #8 // r2 = biased exponent of the input
+  cbz r2, 2f // input is zero or denormal
+  cmp r2, #0xff
+  beq 3f // input infinite or nan
+  adds r2, r1 // scale by 2^r1 by adjusting the exponent
+  ble 2f // modified input is denormal so zero
+  cmp r2, #0xff
+  bge 3f // modified input is infinite or larger: saturate (bfi below keeps only 8 exponent bits)
+1:
+  bfi r0, r2, #23, #8
+  b float2int_entry // joins float2int after its entry sequence; saving_func above already ran the state check
+2:
+  movs r0, #0
+  bx lr
+3:
+  mvn r1, #0x80000000
+  add r0, r1, r0, lsr#31 @ so -Inf → 0x80000000, +Inf → 0x7fffffff
+  bx lr
+
+float_section float2int
+// float in r0 -> int32 in r0; subtracts 1 from the converted value for negative non-integer inputs
+saving_func regular float2int
+float2int_entry: // float2fix branches here after performing its own saving_func entry sequence
+  lsls r1, r0, #1
+  // r0 = abs(zero)                   => r1 = 0x00000000
+  // r0 = abs(denormal)               => r1 = 0x00xxxxxx
+  // r0 = abs(1.0f)                   => r1 = 0x7f000000
+  // r0 = abs(inf/nan)                => r1 = 0xffxxxxxx
+  bls float2int_z_entry // input positive or zero or -zero are ok for int_z
+  lsrs r1, #24
+  beq float2int_z_entry // input denormal is flushed to zero anyway
+  subs r1, #0x7f
+  bcc 1f // input < 1.0f means we need to subtract 1 after conversion
+  // mask off all but fractional bits
+  lsls r2, r0, r1
+  lsls r2, #9
+  beq float2int_z_entry // input is integer
+1:
+  WXFC r0, r0
+  ADD0
+  ADD1
+  NTDC
+  RDIC r0
+  subs r0, #1
+saving_func_return
+
+#if 0 // not sure these are super useful; if they are we should give them names
 float_wrapper_section __aeabi_f2i_r
-saving_func __aeabi_f2i_r
+// (not a real thing - kept because we use wrapper in saving_func)
+saving_func wrapper __aeabi_f2i_r
@ with rounding
  dcp_float2int_r_m r0,r0
  saving_func_return

-// todo not a real thing
 float_wrapper_section __aeabi_f2ui_r
-saving_func __aeabi_f2ui_r
+// (not a real thing - kept because we use wrapper in saving_func)
+saving_func wrapper __aeabi_f2ui_r
@ with rounding
  dcp_float2uint_r_m r0,r0
  saving_func_return
+#endif

 // ============== COMPARISON FUNCTIONS ===============

 float_wrapper_section __aeabi_fcmpun
-saving_func __aeabi_fcmpun
+saving_func wrapper __aeabi_fcmpun
  dcp_fcmp_m r0,r0,r1
  // extract unordered bit
  ubfx r0, r0, #28, #1
  saving_func_return

 float_wrapper_section __aeabi_fcmp
-saving_func __aeabi_cfrcmple
+saving_func wrapper __aeabi_cfrcmple
  dcp_fcmp_m apsr_nzcv,r1,r0 // with arguments reversed
  bvs cmp_nan
  saving_func_return

 // these next two can be the same function in the absence of exceptions
-saving_func __aeabi_cfcmple
+saving_func wrapper __aeabi_cfcmple
  dcp_fcmp_m apsr_nzcv,r0,r1
  bvs cmp_nan
  saving_func_return
@@ -198,7 +299,7 @@ saving_func __aeabi_cfcmple
 // It is not clear from the ABI documentation whether cfcmpeq must set the C flag
 // in the same way as cfcmple. If not, we could save the "bvs" below; but we
 // err on the side of caution.
-saving_func __aeabi_cfcmpeq
+saving_func wrapper __aeabi_cfcmpeq
  dcp_fcmp_m apsr_nzcv,r0,r1
  bvs cmp_nan
  saving_func_return
@@ -212,14 +313,14 @@ cmp_nan:
  saving_func_return

 float_wrapper_section __aeabi_fcmpeq
-saving_func __aeabi_fcmpeq
+saving_func wrapper __aeabi_fcmpeq
  dcp_fcmp_m r0,r0,r1
  // extract Z
  ubfx r0, r0, #30, #1
  saving_func_return

 float_wrapper_section __aeabi_fcmplt
-saving_func __aeabi_fcmplt
+saving_func wrapper __aeabi_fcmplt
  dcp_fcmp_m apsr_nzcv,r1,r0
  ite hi
  movhi r0,#1
@@ -227,7 +328,7 @@ saving_func __aeabi_fcmplt
  saving_func_return

 float_wrapper_section __aeabi_fcmple
-saving_func __aeabi_fcmple
+saving_func wrapper __aeabi_fcmple
  dcp_fcmp_m apsr_nzcv,r1,r0
  ite hs
  movhs r0,#1
@@ -235,7 +336,7 @@ saving_func __aeabi_fcmple
  saving_func_return

 float_wrapper_section __aeabi_fcmpge
-saving_func __aeabi_fcmpge
+saving_func wrapper __aeabi_fcmpge
  dcp_fcmp_m apsr_nzcv,r0,r1
  ite hs
  movhs r0,#1
@@ -243,7 +344,7 @@ saving_func __aeabi_fcmpge
  saving_func_return

 float_wrapper_section __aeabi_fcmpgt
-saving_func __aeabi_fcmpgt
+saving_func wrapper __aeabi_fcmpgt
  dcp_fcmp_m apsr_nzcv,r0,r1
  ite hi
  movhi r0,#1
--- a/src/rp2_common/pico_float/float_aeabi_rp2040.S
+++ b/src/rp2_common/pico_float/float_aeabi_rp2040.S
@@ -471,17 +471,36 @@ float_section float2int
 regular_func float2int
    shimmable_table_tail_call SF_TABLE_FLOAT2INT float2int_shim

+float_section float2fix_z
+regular_func float2fix_z // r0 = float, r1 = fractional bits (r1 is not modified here before the calls below)
+    cmn r0, r0 // carry out = sign bit of the input
+    bcc float2fix // sign clear: tail-call float2fix
+    push {lr}
+    lsls r0, #1 // shift the sign bit out...
+    lsrs r0, #1 // ...leaving abs(input)
+    bl float2ufix_z
+    cmp r0, #0
+    bmi 1f // magnitude has bit 31 set: return 0x80000000 rather than negating
+    negs r0, r0
+    pop {pc}
+1:
+    movs r0, #128
+    lsls r0, #24 // r0 = 0x80000000
+    pop {pc}
+
 float_section float2fix
 regular_func float2fix
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX float2fix_shim

 float_section float2ufix
 regular_func float2ufix
+regular_func float2ufix_z
    table_tail_call SF_TABLE_FLOAT2UFIX

 // unsigned FUNC_NAME(__aeabi_f2uiz)(float)             float (single precision) to unsigned C-style conversion [3]
 float_wrapper_section __aeabi_f2uiz
 wrapper_func __aeabi_f2uiz
+regular_func float2uint
 regular_func float2uint_z
    table_tail_call SF_TABLE_FLOAT2UINT

@@ -530,10 +549,11 @@ wrapper_func __aeabi_f2lz
 regular_func float2int64_z
    cmn r0, r0
    bcc float2int64
+    movs r1, #0
+float2fix64_z_neg:
    push {lr}
    lsls r0, #1
    lsrs r0, #1
-    movs r1, #0
    bl float2ufix64
    cmp r1, #0
    bmi 1f
@@ -553,17 +573,24 @@ regular_func float2int64
    shimmable_table_tail_call SF_TABLE_FLOAT2INT64 float2int64_shim

 float_section float2fix64
+regular_func float2fix64_z // r0 = float, r1 = fractional bits
+    cmn r0, r0 // carry out = sign bit of the input
+    bcs float2fix64_z_neg // sign set: use the abs/convert path that follows that label
+    // fall thru
+
 regular_func float2fix64
    shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 float2fix64_shim

 // unsigned long long FUNC_NAME(__aeabi_f2ulz)(float)     float to unsigned long long C-style conversion [3]
 float_wrapper_section __aeabi_f2ulz
 wrapper_func __aeabi_f2ulz
+regular_func float2uint64
 regular_func float2uint64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 float2uint64_shim

 float_section float2ufix64
 regular_func float2ufix64
+regular_func float2ufix64_z
    shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 float2ufix64_shim

 float_wrapper_section __aeabi_f2d
--- a/src/rp2_common/pico_float/float_common_m33.S
+++ b/src/rp2_common/pico_float/float_common_m33.S
@@ -241,7 +241,52 @@ regular_func ufix642float
 bxlo r14
 b 3b

-float_wrapper_section conv_ftoi64
+float_section conv_ftoi64
+regular_func float2int64 // float in r0 -> 64-bit result in r0:r1 (low:high)
+  lsls r1, r0, #1
+  // r0 = abs(zero)                   => r1 = 0x00000000
+  // r0 = abs(denormal)               => r1 = 0x00xxxxxx
+  // r0 = abs(1.0f)                   => r1 = 0x7f000000
+  // r0 = abs(inf/nan)                => r1 = 0xffxxxxxx
+  bls float2int64_z // positive or zero or -zero are ok for int64_z
+  lsrs r1, #24 // r1 = biased exponent
+  subs r1, #0x7f
+  bcc 1f // <1 means subtract 1
+  // mask off all but fractional bits
+  lsls r2, r0, r1
+  lsls r2, #9
+  beq float2int64_z // integer
+1:
+  push {lr}
+  bl float2int64_z
+  subs r0, #1 // 64-bit decrement of r0:r1...
+  sbcs r1, r1, #0 // ...with the borrow carried into the high word
+  pop {pc}
+
+float_section conv_ftof64
+regular_func float2fix64 // r0 = float, r1 = fractional bits -> 64-bit result in r0:r1 (low:high)
+  lsls r2, r0, #1
+  // r0 = abs(zero)                   => r2 = 0x00000000
+  // r0 = abs(denormal)               => r2 = 0x00xxxxxx
+  // r0 = abs(1.0f)                   => r2 = 0x7f000000
+  // r0 = abs(inf/nan)                => r2 = 0xffxxxxxx
+  bls float2fix64_z // positive or zero or -zero are ok for fix64_z
+  lsrs r2, #24 // r2 = biased exponent
+  rsbs r3, r1, #0x7f // r3 = 0x7f - fractional bits
+  subs r2, r3 // r2 = exponent + fractional bits - 0x7f
+  bcc 1f // <1 means subtract 1
+  // mask off all but fractional bits
+  lsls r2, r0, r2
+  lsls r2, #9
+  beq float2fix64_z // integer
+1:
+  push {lr}
+  bl float2fix64_z
+  subs r0, #1 // 64-bit decrement of r0:r1...
+  sbcs r1, r1, #0 // ...with the borrow carried into the high word
+  pop {pc}
+
+float_wrapper_section conv_ftoi64z

@ convert float to signed int64, rounding towards 0, clamping
 wrapper_func __aeabi_f2lz
@@ -318,7 +363,7 @@ regular_func float2uint64_z
 movs r1,#0      @ fall through
@ convert float in r0 to unsigned fixed point in r0:r1, clamping
 regular_func float2ufix64
-//regular_func float2ufix64_z
+regular_func float2ufix64_z
 subs r1,#0x96 @ remove exponent bias, compensate for mantissa length
 asrs r2,r0,#23 @ sign and exponent
 sub r3,r2,#1
--- a/src/rp2_common/pico_float/float_conv32_vfp.S
+++ b/src/rp2_common/pico_float/float_conv32_vfp.S
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#if !PICO_RP2040
+#include "pico/asm_helper.S"
+
+pico_default_asm_setup
+
+.macro float_section name
+#if PICO_FLOAT_IN_RAM
+.section RAM_SECTION_NAME(\name), "ax"
+#else
+.section SECTION_NAME(\name), "ax"
+#endif
+.endm
+
+float_section int2float
+regular_func int2float
+	vmov s15, r0
+	vcvt.f32.s32 s15, s15
+	vmov r0, s15
+	bx lr
+
+float_section uint2float
+regular_func uint2float
+	vmov s15, r0
+	vcvt.f32.u32 s15, s15
+	vmov r0, s15
+	bx lr
+
+float_section float2int
+regular_func float2int
+	vmov s15, r0
+	vcvtm.s32.f32 s15, s15
+	vmov r0, s15
+	bx lr
+
+float_section float2int_z
+regular_func float2int_z
+	vmov s15, r0
+	vcvt.s32.f32 s15, s15
+	vmov r0, s15
+	bx lr
+
+float_section float2uint
+regular_func float2uint
+regular_func float2uint_z
+	vmov s15, r0
+	vcvt.u32.f32 s15, s15
+	vmov r0, s15
+	bx lr
+
+float_section float2fix_z
+regular_func float2fix_z // r0 = float, r1 = fractional bits; scales the exponent then tail-calls float2int_z
+  ubfx r2, r0, #23, #8 // r2 = biased exponent of the input
+  adds r2, r1 // add the fractional bit count to the exponent
+  asrs r3, r2, #8 // zero iff the new exponent still fits in 8 bits
+  beq 1f
+  ite pl
+  movpl r2, #0xff // too large: clamp exponent field to 0xff
+  movmi r2, #0 // negative: clamp exponent field to 0
+1:
+  bfi r0, r2, #23, #8 // NOTE(review): mantissa bits are kept, so a clamp to 0xff may form a NaN pattern - confirm intended
+  b float2int_z
+
+float_section float2fix
+regular_func float2fix // r0 = float, r1 = fractional bits -> int32 in r0, rounding towards -infinity
+  lsls r2, r0, #1
+  // r0 = abs(zero)                   => r2 = 0x00000000
+  // r0 = abs(denormal)               => r2 = 0x00xxxxxx
+  // r0 = abs(1.0f)                   => r2 = 0x7f000000
+  // r0 = abs(inf/nan)                => r2 = 0xffxxxxxx
+  bls float2fix_z // input positive or zero or -zero are ok for fix_z
+  lsrs r2, #24
+  beq float2fix_z // input denormal will be flushed to zero
+  rsbs r3, r1, #0x7f
+  subs r2, r3 // r2 = exponent + fractional bits - 0x7f
+  bcc 1f // scaled input <1.0f means we need to subtract 1
+  // mask off all but fractional bits
+  lsls r2, r0, r2
+  lsls r2, #9
+  beq float2fix_z // input is integer
+1:
+  push {lr}
+  bl float2fix_z
+  subs r0, #1
+  // the result is 32-bit (r0 only), so there is no high word to propagate a borrow into
+  pop {pc}
+
+float_section float2ufix
+regular_func float2ufix // r0 = float, r1 = fractional bits; scales the exponent then tail-calls float2uint_z
+regular_func float2ufix_z // same entry point as float2ufix
+  ubfx r2, r0, #23, #8 // r2 = biased exponent of the input
+  adds r2, r1 // add the fractional bit count to the exponent
+  asrs r3, r2, #8 // zero iff the new exponent still fits in 8 bits
+  beq 1f
+  ite pl
+  movpl r2, #0xff // too large: clamp exponent field to 0xff
+  movmi r2, #0 // negative: clamp exponent field to 0
+1:
+  bfi r0, r2, #23, #8 // NOTE(review): mantissa bits are kept, so a clamp to 0xff may form a NaN pattern - confirm intended
+  b float2uint_z
+#endif
--- a/src/rp2_common/pico_float/include/pico/float.h
+++ b/src/rp2_common/pico_float/include/pico/float.h
@@ -21,68 +21,296 @@ extern "C" {
 *
 * \brief Optimized single-precision floating point functions
 *
-* (Replacement) optimized implementations are provided for the following compiler built-ins
-* and math library functions on Arm:
+* An application can take control of the floating point routines used in the application over and above what is provided by the compiler,
+* by depending on the pico_float library. A user might want to do this
 *
-* - __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub, __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun, __aeabi_i2f, __aeabi_l2f, __aeabi_ui2f, __aeabi_ul2f, __aeabi_f2iz, __aeabi_f2lz, __aeabi_f2uiz, __aeabi_f2ulz, __aeabi_f2d, sqrtf, cosf, sinf, tanf, atan2f, expf, logf
-* - ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
-* - powintf, sincosf (GNU extensions)
+* 1. To use optimized software implementations provided by the RP2-series device's bootrom or the SDK
+* 2. To use optimized combined software/hardware implementations utilizing custom RP2-series hardware for acceleration
+* 3. To control the amount of C compiler/library code bloat
+* 4. To make sure no floating point is called at all
 *
-* The following additional optimized functions are also provided:
+* The pico_float library comes in three main flavors:
 *
-* - int2float, uint2float, int642float, uint642float, fix2float, ufix2float, fix642float, ufix642float
-* - float2fix, float2ufix, float2fix64, float2ufix64, float2int, float2uint, float2int64, float2uint64, float2int_z, float2int64_z, float2uint_z, float2uint64_z
-* - exp10f, sincosf, powintf
+* 1. `pico_float_none` - all floating point operations cause a \ref panic - no single-precision floating point code is included
+* 2. `pico_float_compiler` - no custom functions are provided; all single-precision floating point is handled by the C compiler/library
+* 3. `pico_float_pico` - the smallest and fastest available for the platform, along with additional functionality (e.g. fixed point conversions) which are detailed below
 *
-* On RP2350 (Arm) the following additional functions are available; the _fast methods are faster but do not round correctly
+* The user can control which version they want (e.g. **pico_float_xxx**) by either setting the CMake global variable
+* `PICO_DEFAULT_FLOAT_IMPL=xxx`, or by using the CMake function `pico_set_float_implementation(<TARGET> xxx)`. Note that in the absence
+* of either, pico_float_pico is used by default.
 *
-* - float2fix64_z, fdiv_fast, fsqrt_fast,
+* \if rp2040_specific
+* On RP2040, `pico_float_pico` uses optimized hand coded implementations from the bootrom and the SDK for both
+* basic single-precision floating point operations and floating point math library functions. These implementations
+* are generally faster and smaller than those provided by the C compiler/library, though they don't support all the features of a fully compliant
+* floating point implementation; they are however usually fine for the majority of cases
+* \endif
 *
-* On RP2350 RISC-V, only a small number of compiler runtime functions are overridden with faster implementations:
+* \if rp2350_specific
+* On Arm on RP2350, there are multiple options for `pico_float_pico`:
 *
-* - __addsf3, __subsf3, __mulsf3
+* 1. `pico_float_pico_vfp` - this library leaves basic C single-precision floating point operations to the compiler
+* which can use inlined VFP (Arm FPU) code. Custom optimized versions of trigonometric and scientific functions are provided.
+* No DCP (RP2350 Double co-processor) instructions are used.
+* 2. `pico_float_pico_dcp` - this library prevents the compiler injecting inlined VFP code, and also implements
+* all single-precision floating point operations in optimized DCP or M33 code. This option is not quite as fast
+* as pico_float_pico_vfp, however it allows floating point operations without enabling the floating point co-processor
+* on the CPU; this can be beneficial in certain circumstances, e.g. where leaving stack in tasks or interrupts
+* for the floating point state is undesirable.
+*
+* Note: `pico_float_pico` is equivalent to `pico_float_pico_vfp` on RP2350, as this is the most sensible default
+* \endif
+*
+* On Arm, (replacement) optimized implementations are provided for the following compiler built-ins
+* and math library functions when using `_pico` variants of `pico_float`:
+*
+* - basic arithmetic: (except `pico_float_pico_vfp`)
+*
+*   __aeabi_fadd, __aeabi_fdiv, __aeabi_fmul, __aeabi_frsub, __aeabi_fsub
+*
+* - comparison: (except `pico_float_pico_vfp`)
+*
+*   __aeabi_cfcmpeq, __aeabi_cfrcmple, __aeabi_cfcmple, __aeabi_fcmpeq, __aeabi_fcmplt, __aeabi_fcmple, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmpun
+*
+* - (u)int32 <-> float: (except `pico_float_pico_vfp`)
+*
+*    __aeabi_i2f, __aeabi_ui2f, __aeabi_f2iz, __aeabi_f2uiz
+*
+* - (u)int64 <-> float:
+*
+*   __aeabi_l2f, __aeabi_ul2f, __aeabi_f2lz, __aeabi_f2ulz
+*
+* - float -> double: (except `pico_float_pico_vfp`)
+*
+*   __aeabi_f2d
+*
+* - basic trigonometric:
+*
+*   sqrtf (except `pico_float_pico_vfp`), cosf, sinf, tanf, atan2f, expf, logf
+*
+* - trigonometric and scientific
+*
+*   ldexpf, copysignf, truncf, floorf, ceilf, roundf, asinf, acosf, atanf, sinhf, coshf, tanhf, asinhf, acoshf, atanhf, exp2f, log2f, exp10f, log10f, powf, hypotf, cbrtf, fmodf, dremf, remainderf, remquof, expm1f, log1pf, fmaf
+*
+* - GNU extensions:
+*
+*   powintf, sincosf
+*
+* On Arm, the following additional optimized functions are also provided (when using `_pico` variants of `pico_float`):
+*
+* - Conversions to/from integer types:
+*
+*   - (u)int -> float (round to nearest):
+*
+*     int2float, uint2float, int642float, uint642float
+*
+*     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they map to inline VFP code
+*
+*   - float -> (u)int (round towards zero):
+*
+*     float2int_z, float2uint_z, float2int64_z, float2uint64_z
+*
+*     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they map to inline VFP code
+*
+*   - float -> (u)int (round towards -infinity):
+*
+*     float2int, float2uint, float2int64, float2uint64
+*
+* - Conversions to/from fixed point integers:
+*
+*   - (u)fix -> float (round to nearest):
+*
+*       fix2float, ufix2float, fix642float, ufix642float
+*
+*   - float -> (u)fix (round towards zero):
+*
+*       float2fix_z, float2ufix_z, float2fix64_z, float2ufix64_z
+*
+*     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they can map to inline VFP code
+*     when the number of fractional bits is a compile time constant between 1 and 32
+*
+*   - float -> (u)fix (round towards -infinity):
+*
+*       float2fix, float2ufix, float2fix64, float2ufix64
+*
+*     note: on `pico_float_pico_vfp` the 32-bit functions are also provided as C macros since they can map to inline VFP code
+*     when the number of fractional bits is a compile time constant between 1 and 32
+*
+* - Even faster versions of divide and square-root functions that do not round correctly: (`pico_float_pico_dcp` only)
+*
+*   fdiv_fast, sqrtf_fast
+*
+* \if rp2350_specific
+* On RISC-V, (replacement) optimized implementations are provided for the following compiler built-ins when using the `pico_float_pico`
+* library (note that there are no variants of this library like there are on Arm):
+*
+* - basic arithmetic:
+*
+*   __addsf3, __subsf3, __mulsf3
+* \endif
 */
-
-// None of these functions are available on RISC-V:
 #if !defined(__riscv) || PICO_COMBINED_DOCS

-float int2float(int32_t f);
-float uint2float(uint32_t f);
-float int642float(int64_t f);
-float uint642float(uint64_t f);
+#if PICO_COMBINED_DOCS || !LIB_PICO_FLOAT_COMPILER
+float int2float(int32_t i);
+float uint2float(uint32_t i);
+float int642float(int64_t i);
+float uint642float(uint64_t i);
 float fix2float(int32_t m, int e);
 float ufix2float(uint32_t m, int e);
 float fix642float(int64_t m, int e);
 float ufix642float(uint64_t m, int e);

-// These methods round towards -Infinity.
-int32_t float2fix(float f, int e);
-uint32_t float2ufix(float f, int e);
-int64_t float2fix64(float f, int e);
-uint64_t float2ufix64(float f, int e);
-int32_t float2int(float f);
-uint32_t float2uint(float f);
-int64_t float2int64(float f);
-uint64_t float2uint64(float f);
-
-// These methods round towards 0.
+// These methods round towards 0, which IS the C way
 int32_t float2int_z(float f);
 int64_t float2int64_z(float f);
 int32_t float2uint_z(float f);
 int64_t float2uint64_z(float f);
+int32_t float2fix_z(float f, int e);
+uint32_t float2ufix_z(float f, int e);
+int64_t float2fix64_z(float f, int e);
+uint64_t float2ufix64_z(float f, int e);
+
+// These methods round towards -Infinity - which IS NOT the C way for negative numbers;
+// as such the naming is not ideal, however is kept for backwards compatibility
+int32_t float2int(float f);
+uint32_t float2uint(float f);
+int64_t float2int64(float f);
+uint64_t float2uint64(float f);
+int32_t float2fix(float f, int e);
+uint32_t float2ufix(float f, int e);
+int64_t float2fix64(float f, int e);
+uint64_t float2ufix64(float f, int e);
+
+#if LIB_PICO_FLOAT_PICO_VFP
+// a bit of a hack to inline VFP fixed point conversion when exponent is constant and in range 1-32
+#define fix2float(m, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _fix2float_inline(m, e) : fix2 ## float(m, e), fix2 ## float(m, e))
+#define ufix2float(m, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _ufix2float_inline(m, e) : ufix2 ## float(m, e), ufix2 ## float(m, e))
+#define float2fix_z(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2fix_z_inline(f, e) : float2 ## fix_z(f, e), float2 ## fix_z(f, e))
+#define float2ufix_z(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2ufix_z_inline(f, e) : float2 ## ufix_z(f, e), float2 ## ufix_z(f, e))
+#define float2fix(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2fix_inline(f, e) : float2 ## fix(f, e), float2 ## fix(f, e))
+#define float2ufix(f, e) __builtin_choose_expr(__builtin_constant_p(e), (e) >= 1 && (e) <= 32 ? _float2ufix_inline(f, e) : float2 ## ufix(f, e), float2 ## ufix(f, e))
+
+#define _fix2float_inline(m, e) ({ \
+    int32_t _m = m; \
+    float f; \
+    pico_default_asm( \
+        "vmov %0, %1\n" \
+        "vcvt.f32.s32 %0, %0, %2\n" \
+        : "=t" (f) \
+        : "r" (_m), "i" (e) \
+    ); \
+    f; \
+})
+#define _ufix2float_inline(m, e) ({ \
+    uint32_t _m = m; \
+    float f; \
+    pico_default_asm( \
+        "vmov %0, %1\n" \
+        "vcvt.f32.u32 %0, %0, %2\n" \
+        : "=t" (f) \
+        : "r" (_m), "i" (e) \
+    ); \
+    f; \
+})
+#define _float2fix_z_inline(f, e) ({ \
+    int32_t _m; \
+    float _f = (f); \
+    pico_default_asm( \
+        "vcvt.s32.f32 %0, %0, %2\n" \
+        "vmov %1, %0\n" \
+        : "+t" (_f), "=r" (_m) \
+        : "i" (e) \
+    ); \
+    _m; \
+})
+#define _float2ufix_z_inline(f, e) ({ \
+    uint32_t _m; \
+    float _f = (f); \
+    pico_default_asm( \
+        "vcvt.u32.f32 %0, %0, %2\n" \
+        "vmov %1, %0\n" \
+        : "+t" (_f), "=r" (_m) \
+        : "i" (e) \
+    ); \
+    _m; \
+})
+#define _float2fix_z_inline(f, e) ({ \
+    int32_t _m; \
+    float _f = (f); \
+    pico_default_asm( \
+        "vcvt.s32.f32 %0, %0, %2\n" \
+        "vmov %1, %0\n" \
+        : "+t" (_f), "=r" (_m) \
+        : "i" (e) \
+    ); \
+    _m; \
+})
+#define _float2fix_inline(f, e) ({ \
+    union { float _f; int32_t _i; } _u; /* same bits viewed as a VFP operand and as a core-register operand */ \
+    _u._f = (f); \
+    int32_t rc; uint tmp; /* rc is signed so that the macro evaluates to a signed value like float2fix() */ \
+    pico_default_asm( \
+        "vcvt.s32.f32 %0, %0, %4\n" \
+        "vmov %2, %0\n" \
+        "lsls %1, #1\n" \
+        "bls 2f\n" /* positive or zero or -zero are ok with the result we have */ \
+        "lsrs %3, %1, #24\n" \
+        "subs %3, #0x7f - %c4\n" \
+        "bcc 1f\n" /* abs(f) * 2^e < 1 (and f is negative), so need to round down */ \
+        /* mask off all but fractional bits */ \
+        "lsls %1, %3\n" \
+        "lsls %1, #8\n" \
+        "beq 2f\n" /* integers can round towards zero */ \
+        "1:\n" \
+        /* need to subtract 1 from the result to round towards -infinity... */ \
+        /* this will never cause an overflow, because to get here we must have had a non integer/infinite value which */ \
+        /* therefore cannot have been equal to INT32_MIN when rounded towards zero */ \
+        "subs %2, #1\n" \
+        "2:\n" \
+        : "+t" (_u._f), "+r" (_u._i), "=r" (rc), "=r" (tmp) \
+        : "i" (e) : "cc" /* lsls/lsrs/subs above modify the condition flags */ \
+    ); \
+    rc; \
+})
+#define _float2ufix_inline(f, e) _float2ufix_z_inline((f), (e))
+#endif
+
+#if LIB_PICO_FLOAT_PICO_VFP
+// may as well provide inline macros for VFP
+#define int2float(i) ((float)(int32_t)(i))
+#define uint2float(i) ((float)(uint32_t)(i))
+#define float2int_z(f) ((int32_t)(f))
+#define float2uint_z(f) ((uint32_t)(f))
+#endif
+
+#endif

 float exp10f(float x);
 void sincosf(float x, float *sinx, float *cosx);
 float powintf(float x, int y);

 #if !PICO_RP2040 || PICO_COMBINED_DOCS
-int64_t float2fix64_z(float f, int e);
 float fdiv_fast(float n, float d);
-float fsqrt_fast(float f);
+float sqrtf_fast(float f);
 #endif

 #endif

+#if defined(__riscv) || LIB_PICO_FLOAT_COMPILER
+// when using the compiler or RISC-V, we provide as many functions as we trivially can - these will be efficient
+// when using hard-float on Arm
+static inline float int2float(int32_t i) { return (float)i; }
+static inline float uint2float(uint32_t i) { return (float)i; }
+static inline float int642float(int64_t i) { return (float)i; }
+static inline float uint642float(uint64_t i) { return (float)i; }
+
+static inline int32_t float2int_z(float f) { return (int32_t)f; }
+static inline int64_t float2int64_z(float f) { return (int64_t)f; }
+static inline uint32_t float2uint_z(float f) { return (uint32_t)f; } // unsigned result: values above INT32_MAX stay positive
+static inline uint64_t float2uint64_z(float f) { return (uint64_t)f; } // unsigned result: values above INT64_MAX stay positive
+#endif
+
 #ifdef __cplusplus
 }
 #endif
--- a/test/pico_float_test/BUILD.bazel
+++ b/test/pico_float_test/BUILD.bazel
@@ -85,3 +85,12 @@ filegroup(
    name = "m33",
    srcs = ["m33.c"],
 )
+
+# TODO: Add these tests to the Bazel build; for now their sources are only collected in a filegroup.
+filegroup(
+    name = "unsupported_tests",
+    srcs = [
+        "custom_double_funcs_test.c",
+        "custom_float_funcs_test.c",
+    ],
+)
--- a/test/pico_float_test/CMakeLists.txt
+++ b/test/pico_float_test/CMakeLists.txt
@@ -79,4 +79,31 @@ else ()
        target_link_libraries(m33 pico_double pico_stdlib)
        pico_add_extra_outputs(m33)
    endif()
+
 endif()
+
+# Implementations to cover: every platform builds the compiler and pico variants;
+# Arm (non RISC-V) RP2350 builds additionally cover the pico_vfp and pico_dcp float variants.
+set(FLOAT_TYPES compiler pico)
+set(DOUBLE_TYPES compiler pico)
+if (PICO_RP2350 AND NOT PICO_RISCV)
+    list(APPEND FLOAT_TYPES pico_vfp pico_dcp)
+endif()
+
+foreach (FLOAT_TYPE IN LISTS FLOAT_TYPES)
+    set(CUSTOM_FLOAT_TEST custom_float_funcs_test_${FLOAT_TYPE})
+    add_executable(${CUSTOM_FLOAT_TEST} custom_float_funcs_test.c)
+    pico_set_float_implementation(${CUSTOM_FLOAT_TEST} ${FLOAT_TYPE})
+    target_link_libraries(${CUSTOM_FLOAT_TEST} PRIVATE pico_stdlib)
+    pico_add_extra_outputs(${CUSTOM_FLOAT_TEST})
+    pico_set_printf_implementation(${CUSTOM_FLOAT_TEST} compiler)
+endforeach ()
+
+foreach (DOUBLE_TYPE IN LISTS DOUBLE_TYPES)
+    set(CUSTOM_DOUBLE_TEST custom_double_funcs_test_${DOUBLE_TYPE})
+    add_executable(${CUSTOM_DOUBLE_TEST} custom_double_funcs_test.c)
+    pico_set_double_implementation(${CUSTOM_DOUBLE_TEST} ${DOUBLE_TYPE})
+    target_link_libraries(${CUSTOM_DOUBLE_TEST} PRIVATE pico_stdlib)
+    pico_add_extra_outputs(${CUSTOM_DOUBLE_TEST})
+    pico_set_printf_implementation(${CUSTOM_DOUBLE_TEST} compiler)
+endforeach ()
--- a/test/pico_float_test/custom_double_funcs_test.c
+++ b/test/pico_float_test/custom_double_funcs_test.c
@@ -0,0 +1,515 @@
+#include <stdio.h>
+#include "pico/stdlib.h"
+#include "pico/double.h"
+#include "math.h"
+
+#if 0 // change to 1 to compile out all printf output
+#define printf(...) ((void)0)
+#endif
+#if 0 // change to 1 to return from the test at the first failure instead of recording it in rc
+#define stop() return -1
+#else
+#define stop() rc=1
+#endif
+#define test_assert(x) ({ if (!(x)) { printf("Assertion failed: ");puts(#x);printf("  at " __FILE__ ":%d\n", __LINE__); stop(); } })
+#define test_checkd(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %f != %f\n", msg, (double)(x), (double)(expected)); stop(); } })
+#define test_checki(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %d != %d\n", msg, (int)(x), (int)(expected)); stop(); } })
+#define test_checku(x, expected, msg) ({ if ((uint32_t)(x) != (uint32_t)(expected)) { printf("  %s: %u != %u\n", msg, (unsigned)(x), (unsigned)(expected)); stop(); } })
+#define test_checki64(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %lld != %lld\n", msg, (int64_t)(x), (int64_t)(expected)); stop(); } })
+#define test_checku64(x, expected, msg) ({ if ((uint64_t)(x) != (uint64_t)(expected)) { printf("  %s: %llu != %llu\n", msg, (uint64_t)(x), (uint64_t)(expected)); stop(); } })
+
+#if !(LIB_PICO_DOUBLE_COMPILER || defined(__riscv))
+// Fixed-exponent wrappers: each forwards to the generic conversion with a constant number of fractional bits.
+static inline double fix2double_8(int32_t m) { return fix2double(m, 8); }
+static inline double fix2double_12(int32_t m) { return fix2double(m, 12); }
+static inline double fix2double_16(int32_t m) { return fix2double(m, 16); }
+static inline double fix2double_24(int32_t m) { return fix2double(m, 24); }
+static inline double fix2double_28(int32_t m) { return fix2double(m, 28); }
+static inline double fix2double_32(int32_t m) { return fix2double(m, 32); }
+static inline double ufix2double_12(uint32_t m) { return ufix2double(m, 12); }
+
+static inline int32_t double2fix_12(double d) { return double2fix(d, 12); }
+
+static inline uint32_t double2ufix_12(double d) { return double2ufix(d, 12); }
+#endif
+
+#if 1 && (LIB_PICO_DOUBLE_COMPILER || defined(__riscv)) // each macro copies its argument to a local, passes it through an empty asm statement, then calls the real function of the same name (spelled with ##) - presumably so the compiler cannot fold the conversion at compile time
+#define double2int_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## int_z(_d); })
+#define double2uint_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## uint_z(_d); })
+#define double2int64_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## int64_z(_d); })
+#define double2uint64_z(f) ({ double _d = f; pico_default_asm_volatile("" : "+r" (_d)); double2 ## uint64_z(_d); })
+#define int2double(i) ({ int32_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); int2 ## double(_i); })
+#define uint2double(i) ({ uint32_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); uint2 ## double(_i); })
+#define int642double(i) ({ int64_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); int642 ## double(_i); })
+#define uint642double(i) ({ uint64_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); uint642 ## double(_i); })
+#endif
+
+// Exercises the integer<->double and fixed-point<->double conversion functions from pico/double.h,
+// printing one line per mismatch. Returns 0 if every check passed, non-zero otherwise.
+// The fix/ufix checks and the rounding double2int/uint checks are compiled out for compiler and RISC-V builds.
+int test() {
+    int rc = 0;
+#if LIB_PICO_DOUBLE_PICO
+    printf(">>> Using PICO\n");
+#endif
+    printf("int2double\n");
+    test_checkd(int2double(0), 0.0, "int2double1");
+    test_checkd(int2double(-1), -1.0, "int2double2");
+    test_checkd(int2double(1), 1.0, "int2double3");
+    test_checkd(int2double(INT32_MAX), 2147483647.0, "int2double4");
+    test_checkd(int2double(INT32_MIN), -2147483648.0, "int2double5");
+    // these have rounding behavior on float but not double
+    test_checkd(int2double(2147483391), 2147483391.0, "int2double6");
+    test_checkd(int2double(2147483391), 2147483391.0, "int2double7"); // NOTE(review): same input as int2double6
+    test_checkd(int2double(2147483457), 2147483457.0, "int2double8");
+    test_checkd(int2double(2147483483), 2147483483.0, "int2double9");
+    test_checkd(int2double(2147483584), 2147483584.0, "int2double10");
+
+    printf("uint2double\n");
+    test_checkd(uint2double(0), 0.0, "uint2double1");
+    test_checkd(uint2double(1), 1.0, "uint2double2");
+    test_checkd(uint2double(INT32_MAX), 2147483647.0, "uint2double3");
+    // todo test correct rounding around maximum precision
+    test_checkd(uint2double(UINT32_MAX), 4294967295.0, "uint2double4");
+
+    printf("int642double\n");
+    test_checkd(int642double(0), 0.0, "int642double1");
+    test_checkd(int642double(-1), -1.0, "int642double2");
+    test_checkd(int642double(1), 1.0, "int642double3");
+    test_checkd(int642double(INT32_MAX-1), 2147483646.0, "int642double4");
+    test_checkd(int642double(INT32_MAX), 2147483647.0, "int642double5");
+    test_checkd(int642double(INT32_MAX+1ll), 2147483648.0, "int642double6");
+    test_checkd(int642double(INT32_MIN-1ll), -2147483649.0, "int642double7");
+    test_checkd(int642double(INT32_MIN), -2147483648.0, "int642double8");
+    test_checkd(int642double(INT32_MIN+1ll), -2147483647.0, "int642double9");
+    // todo test correct rounding around maximum precision
+    test_checkd(int642double(INT64_MAX), 9223372036854775807.0, "int642double10");
+    test_checkd(int642double(INT64_MIN), -9223372036854775808.0, "int642double11");
+
+    printf("uint642double\n");
+    test_checkd(uint642double(0), 0.0, "uint642double1");
+    test_checkd(uint642double(1), 1.0, "uint642double2");
+    test_checkd(uint642double(INT32_MAX-1), 2147483646.0, "uint642double3");
+    test_checkd(uint642double(INT32_MAX), 2147483647.0, "uint642double4");
+    test_checkd(uint642double(INT32_MAX+1ll), 2147483648.0, "uint642double5");
+    test_checkd(uint642double(INT64_MAX), 9223372036854775807.0, "uint642double6");
+    // todo test correct rounding around maximum precision
+    test_checkd(uint642double(UINT64_MAX), 18446744073709551615.0, "uint642double7");
+
+    union { uint64_t u; double d; } u64d; // lets the checks below build doubles from exact bit patterns
+
+#if !(LIB_PICO_DOUBLE_COMPILER || defined(__riscv))
+    printf("fix2double\n");
+    // todo test correct rounding around maximum precision
+    test_checkd(fix2double(-3, 1), -1.5, "fix2double1");
+    test_checkd(fix2double(-3, 1), -1.5, "fix2double2");
+    test_checkd(fix2double(-3, -4), -48.0, "fix2double3");
+
+    printf("ufix2double\n");
+    // todo test correct rounding around maximum precision
+    test_checkd(ufix2double(0xa0000000, 30), 2.5, "ufix2double1");
+    test_checkd(ufix2double(3, -4), 48.0, "ufix2double2");
+
+    printf("fix642double\n");
+    // todo test correct rounding around maximum precision
+    test_checkd(fix642double(-0xa000000000ll, 38), -2.5, "fix642double1");
+    test_checkd(fix642double(-3, -34), -51539607552.0, "fix642double2");
+
+    printf("ufix642double\n");
+    // todo test correct rounding around maximum precision
+    test_checkd(ufix642double(0xa000000000ll, 38), 2.5, "ufix642double1");
+    test_checkd(ufix642double(3, -34), 51539607552.0, "ufix642double2");
+
+    test_checkd(fix2double_8(128), 0.5, "fix2double_8_1");
+    test_checkd(fix2double_8(-128), -0.5, "fix2double_8_2");
+    test_checkd(fix2double_16(8192), 0.125, "fix2double_16_1");
+    test_checkd(fix2double_16(-8192), -0.125, "fix2double_16_2");
+    test_checkd(fix2double_24(3<<23), 1.5, "fix2double_24_1");
+    test_checkd(fix2double_24(-(3<<23)), -1.5, "fix2double_24_2");
+
+    printf("double2fix\n");
+    test_checki(double2fix(-0.5, 8), -0x80, "double2fix0");
+    test_checki(double2fix(3.5, 8), 0x380, "double2fix1");
+    test_checki(double2fix(-3.5, 8), -0x380, "double2fix2");
+    test_checki(double2fix(32768.0, 16), INT32_MAX, "double2fix3");
+    test_checki(double2fix(65536.0, 16), INT32_MAX, "double2fix4");
+    test_checki(double2fix(-65536.0, 16), INT32_MIN, "double2fix4b");
+    test_checki(double2fix(INFINITY, 16), INT32_MAX, "double2fix5");
+    test_checki(double2fix(-INFINITY, 16), INT32_MIN, "double2fix5b");
+    test_checki(double2fix(INFINITY, -16), INT32_MAX, "double2fix5c");
+    test_checki(double2fix(-INFINITY, -16), INT32_MIN, "double2fix5d");
+    test_checki(double2fix(3.24999, 2), 12, "double2fix6");
+    test_checki(double2fix(3.25, 2), 13, "double2fix7");
+    test_checki(double2fix(-3.24999, 2), -13, "double2fix8");
+    test_checki(double2fix(-3.25, 2), -13, "double2fix9");
+    test_checki(double2fix(-0.75, 1), -2, "double2fix10");
+    test_checki(double2fix(-3.0, -1), -2, "double2fix11"); // not very useful
+    test_checki(double2fix(0.0, 16), 0, "double2fix12");
+    test_checki(double2fix(-0.0, 16), 0, "double2fix13");
+    test_checki(double2fix(0.0, -16), 0, "double2fix14");
+    test_checki(double2fix(-0.0, -16), 0, "double2fix15");
+
+    printf("double2ufix\n");
+    test_checku(double2ufix(3.5, 8), 0x380, "double2ufix1");
+    test_checku(double2ufix(-3.5, 8), 0, "double2ufix2");
+    test_checku(double2ufix(32768.0, 16), 32768u << 16, "double2ufix3"); // unsigned shift: 32768 << 16 overflows int
+    test_checku(double2ufix(65536.0, 16), UINT32_MAX, "double2ufix4");
+    test_checku(double2ufix(INFINITY, 16), UINT32_MAX, "double2ufix5");
+    test_checku(double2ufix(-INFINITY, 16), 0, "double2ufix5b");
+    test_checku(double2ufix(INFINITY, -16), UINT32_MAX, "double2ufix5c");
+    test_checku(double2ufix(-INFINITY, -16), 0, "double2ufix5d");
+    test_checku(double2ufix(3.24999, 2), 12, "double2ufix6");
+    test_checku(double2ufix(3.25, 2), 13, "double2ufix7");
+    test_checku(double2ufix(3.0, -1), 1, "double2ufix8"); // not very useful
+    test_checku(double2ufix(0.0, 16), 0, "double2ufix12");
+    test_checku(double2ufix(-0.0, 16), 0, "double2ufix13");
+    test_checku(double2ufix(0.0, -16), 0, "double2ufix14");
+    test_checku(double2ufix(-0.0, -16), 0, "double2ufix15");
+
+    printf("double2fix64\n");
+    test_checki64(double2fix64(3.5, 8), 0x380, "double2fix641");
+    test_checki64(double2fix64(-3.5, 8), -0x380, "double2fix642");
+    test_checki64(double2fix64(32768.0, 16), 32768ll << 16, "double2fix643");
+    test_checki64(double2fix64(65536.0, 16), 65536ll << 16, "double2fix644");
+    test_checki64(double2fix64(2147483648.0, 16), 2147483648ll << 16, "double2fix644b");
+    test_checki64(double2fix64(65536.0 * 65536.0 * 32768.0, 16), INT64_MAX, "double2fix644c");
+    test_checki64(double2fix64(INFINITY, 16), INT64_MAX, "double2fix645");
+    test_checki64(double2fix64(-INFINITY, 16), INT64_MIN, "double2fix645b");
+    test_checki64(double2fix64(INFINITY, -16), INT64_MAX, "double2fix645c");
+    test_checki64(double2fix64(-INFINITY, -16), INT64_MIN, "double2fix645d");
+    test_checki64(double2fix64(3.24999, 2), 12, "double2fix646");
+    test_checki64(double2fix64(3.25, 2), 13, "double2fix647");
+    test_checki64(double2fix64(-3.24999, 2), -13, "double2fix648");
+    test_checki64(double2fix64(-3.25, 2), -13, "double2fix649");
+    test_checki64(double2fix64(-3.0, -1), -2, "double2fix6410"); // not very useful
+    test_checki64(double2fix64(2147483648.0 * 2147483648.0, 16), INT64_MAX, "double2fix6411");
+    test_checki64(double2fix64(0.0, 16), 0, "double2fix6412");
+    test_checki64(double2fix64(-0.0, 16), 0, "double2fix6413");
+    test_checki64(double2fix64(0.0, -16), 0, "double2fix6412b");
+    test_checki64(double2fix64(-0.0, -16), 0, "double2fix6413b");
+    test_checki64(double2fix64(-3.25, 40), -13ll * (1ll << 38), "double2fix6414");
+    u64d.u = 0xc00a000000000001;
+    test_checki64(double2fix64(u64d.d, 40), -13ll * (1ll << 38) - 1ll, "double2fix6414b");
+
+    u64d.u = 0xc00a000080000001;
+    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18) - 2ll, "double2fix6415c");
+    u64d.u = 0xc00a000080000000;
+    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix6415d");
+    u64d.u = 0xc00a000000000001;
+    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix6415e");
+    u64d.u = 0xc00a000000000000;
+    test_checki64(double2fix64(u64d.d, 20), -13ll * (1ll << 18), "double2fix6415g");
+
+    u64d.u = 0xc00a000080000001;
+    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17) - 1ll, "double2fix6415h");
+    u64d.u = 0xc00a000080000000;
+    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17) - 1ll, "double2fix6415i");
+    u64d.u = 0xc00a000000000001;
+    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17) - 1ll, "double2fix6415j");
+    u64d.u = 0xc00a000000000000;
+    test_checki64(double2fix64(u64d.d, 19), -13ll * (1ll << 17), "double2fix6415k");
+
+    printf("double2ufix64\n");
+    test_checku64(double2ufix64(3.5, 8), 0x380, "double2ufix641");
+    test_checku64(double2ufix64(-3.5, 8), 0, "double2ufix642");
+    test_checku64(double2ufix64(32768.0, 16), 32768ull << 16, "double2ufix643");
+    test_checku64(double2ufix64(65536.0, 16), 65536ull << 16, "double2ufix644");
+    test_checku64(double2ufix64(2147483648.0, 16), 2147483648ull << 16, "double2ufix644b");
+    test_checku64(double2ufix64(INFINITY, 16), UINT64_MAX, "double2ufix645");
+    test_checku64(double2ufix64(-INFINITY, 16), 0, "double2ufix645b");
+    test_checku64(double2ufix64(INFINITY, -16), UINT64_MAX, "double2ufix645c");
+    test_checku64(double2ufix64(-INFINITY, -16), 0, "double2ufix645d");
+    test_checku64(double2ufix64(3.24999, 2), 12, "double2ufix646");
+    test_checku64(double2ufix64(3.25, 2), 13, "double2ufix647");
+    test_checku64(double2ufix64(3.0, -1), 1, "double2ufix648"); // not very useful
+    test_checku64(double2ufix64(0.0, 16), 0, "double2ufix649");
+    test_checku64(double2ufix64(-0.0, 16), 0, "double2ufix6410");
+
+    printf("double2fix_z\n");
+    test_checki(double2fix_z(3.5, 8), 0x380, "double2fix_z1");
+    test_checki(double2fix_z(-3.5, 8), -0x380, "double2fix_z2");
+    test_checki(double2fix_z(32768.0, 16), INT32_MAX, "double2fix_z3");
+    test_checki(double2fix_z(65536.0, 16), INT32_MAX, "double2fix_z4");
+    test_checki(double2fix_z(INFINITY, 16), INT32_MAX, "double2fix_z5");
+    test_checki(double2fix_z(-INFINITY, 16), INT32_MIN, "double2fix_z5b");
+    test_checki(double2fix_z(INFINITY, -50), INT32_MAX, "double2fix_z5c");
+    test_checki(double2fix_z(-INFINITY, -50), INT32_MIN, "double2fix_z5d");
+    test_checki(double2fix_z(3.24999, 2), 12, "double2fix_z6");
+    test_checki(double2fix_z(3.25, 2), 13, "double2fix_z7");
+    test_checki(double2fix_z(-3.24999, 2), -12, "double2fix_z8");
+    test_checki(double2fix_z(-3.25, 2), -13, "double2fix_z9");
+    test_checki(double2fix_z(-0.75, 1), -1, "double2fix_z10");
+    test_checki(double2fix_z(-3.0, -1), -1, "double2fix_z11"); // not very useful
+    test_checki(double2fix_z(0.0, 16), 0, "double2fix_z12");
+    test_checki(double2fix_z(-0.0, 16), 0, "double2fix_z13");
+    test_checki(double2fix_z(0.0, -16), 0, "double2fix_z12b");
+    test_checki(double2fix_z(-0.0, -16), 0, "double2fix_z13b");
+
+    printf("double2ufix_z\n");
+    test_checku(double2ufix_z(3.5, 8), 0x380, "double2ufix_z1");
+    test_checku(double2ufix_z(-3.5, 8), 0, "double2ufix_z2");
+    test_checku(double2ufix_z(32768.0, 16), 32768u << 16, "double2ufix_z3"); // unsigned shift: 32768 << 16 overflows int
+    test_checku(double2ufix_z(65536.0, 16), UINT32_MAX, "double2ufix_z4");
+    test_checku(double2ufix_z(INFINITY, 16), UINT32_MAX, "double2ufix_z5");
+    test_checku(double2ufix_z(-INFINITY, 16), 0, "double2ufix_z5b");
+    test_checku(double2ufix_z(INFINITY, 16), UINT32_MAX, "double2ufix_z5c"); // NOTE(review): z5c/z5d repeat z5/z5b; a negative exponent may have been intended
+    test_checku(double2ufix_z(-INFINITY, 16), 0, "double2ufix_z5d");
+    test_checku(double2ufix_z(3.24999, 2), 12, "double2ufix_z6");
+    test_checku(double2ufix_z(3.25, 2), 13, "double2ufix_z7");
+    test_checku(double2ufix_z(3.0, -1), 1, "double2ufix_z8"); // not very useful
+    test_checku(double2ufix_z(0.0, 16), 0, "double2ufix_z9");
+    test_checku(double2ufix_z(-0.0, 16), 0, "double2ufix_z10");
+    test_checku(double2ufix_z(0.0, -16), 0, "double2ufix_z11");
+    test_checku(double2ufix_z(-0.0, -16), 0, "double2ufix_z12");
+
+    printf("double2fix64_z\n");
+    test_checki64(double2fix64_z(3.5, 8), 0x380, "double2fix64_z1");
+    test_checki64(double2fix64_z(-3.5, 8), -0x380, "double2fix64_z2");
+    test_checki64(double2fix64_z(32768.0, 16), 32768ll << 16, "double2fix64_z3");
+    test_checki64(double2fix64_z(65536.0, 16), 65536ll << 16, "double2fix64_z4");
+    test_checki64(double2fix64_z(65536.0 * 65536.0 * 32768.0, 16), INT64_MAX, "double2fix64_z4b");
+    test_checki64(double2fix64_z(INFINITY, 16), INT64_MAX, "double2fix64_z5");
+    test_checki64(double2fix64_z(-INFINITY, 16), INT64_MIN, "double2fix64_z5b");
+    test_checki64(double2fix64_z(INFINITY, 16), INT64_MAX, "double2fix64_z5c"); // NOTE(review): z5c/z5d repeat z5/z5b; a negative exponent may have been intended
+    test_checki64(double2fix64_z(-INFINITY, 16), INT64_MIN, "double2fix64_z5d");
+    test_checki64(double2fix64_z(3.24999, 2), 12, "double2fix64_z6");
+    test_checki64(double2fix64_z(3.25, 2), 13, "double2fix64_z7");
+    test_checki64(double2fix64_z(-3.24999, 2), -12, "double2fix64_z8");
+    test_checki64(double2fix64_z(-3.25, 2), -13, "double2fix64_z9");
+    test_checki64(double2fix64_z(-3.0, -1), -1, "double2fix64_z10"); // not very useful
+    test_checki64(double2fix64_z(0.0, 16), 0, "double2fix64_z11");
+    test_checki64(double2fix64_z(-0.0, 16), 0, "double2fix64_z12");
+    test_checki64(double2fix64_z(0.0, -16), 0, "double2fix64_z13");
+    test_checki64(double2fix64_z(-0.0, -16), 0, "double2fix64_z14");
+    test_checki64(double2fix64_z(-3.25, 40), -13ll * (1ll << 38), "double2fix64_z15");
+    u64d.u = 0xc00a000000000001;
+    test_checki64(double2fix64_z(u64d.d, 40), -13ll * (1ll << 38), "double2fix64_z15b");
+
+    u64d.u = 0xc00a000080000001;
+    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix64_z15c");
+    u64d.u = 0xc00a000080000000;
+    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18) - 1ll, "double2fix64_z15d");
+    u64d.u = 0xc00a000000000001;
+    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18), "double2fix64_z15e");
+    u64d.u = 0xc00a000000000000;
+    test_checki64(double2fix64_z(u64d.d, 20), -13ll * (1ll << 18), "double2fix64_z15g");
+
+    u64d.u = 0xc00a000080000001;
+    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15h");
+    u64d.u = 0xc00a000080000000;
+    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15i");
+    u64d.u = 0xc00a000000000001;
+    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15j");
+    u64d.u = 0xc00a000000000000;
+    test_checki64(double2fix64_z(u64d.d, 19), -13ll * (1ll << 17), "double2fix64_z15k");
+
+    printf("double2ufix64_z\n");
+    test_checku64(double2ufix64_z(3.5, 8), 0x380, "double2ufix64_z1");
+    test_checku64(double2ufix64_z(-3.5, 8), 0, "double2ufix64_z2");
+    test_checku64(double2ufix64_z(32768.0, 16), 32768ll << 16, "double2ufix64_z3");
+    test_checku64(double2ufix64_z(65536.0, 16), 65536ll << 16, "double2ufix64_z4");
+    test_checku64(double2ufix64_z(65536.0 * 65536.0 * 65536.0, 16), UINT64_MAX, "double2ufix64_z4b");
+    test_checku64(double2ufix64_z(INFINITY, 16), UINT64_MAX, "double2ufix64_z5");
+    test_checku64(double2ufix64_z(-INFINITY, 16), 0, "double2ufix64_z5b");
+    test_checku64(double2ufix64_z(INFINITY, 16), UINT64_MAX, "double2ufix64_z5c"); // NOTE(review): z5c/z5d repeat z5/z5b; a negative exponent may have been intended
+    test_checku64(double2ufix64_z(-INFINITY, 16), 0, "double2ufix64_z5d");
+    test_checku64(double2ufix64_z(3.24999, 2), 12, "double2ufix64_z6");
+    test_checku64(double2ufix64_z(3.25, 2), 13, "double2ufix64_z7");
+    test_checku64(double2ufix64_z(3.0, -1), 1, "double2ufix64_z8"); // not very useful
+    test_checku64(double2ufix64_z(0.0, 16), 0, "double2ufix64_z9");
+    test_checku64(double2ufix64_z(-0.0, 16), 0, "double2ufix64_z10");
+    test_checku64(double2ufix64_z(0.0, -16), 0, "double2ufix64_z11");
+    test_checku64(double2ufix64_z(-0.0, -16), 0, "double2ufix64_z12");
+
+    printf("double2int\n");
+    test_checki(double2int(0.0), 0, "double2int1");
+    test_checki(double2int(0.25), 0, "double2int1b");
+    test_checki(double2int(0.5), 0, "double2int2");
+    test_checki(double2int(0.75), 0, "double2int2b");
+    test_checki(double2int(1.0), 1, "double2int3");
+    test_checki(double2int(-10.0), -10, "double2int3a");
+    test_checki(double2int(-0.0), 0, "double2int3b");
+    test_checki(double2int(-0.25), -1, "double2int4");
+    test_checki(double2int(-0.5), -1, "double2int4b");
+    test_checki(double2int(-0.75), -1, "double2int5");
+    test_checki(double2int(-1.0), -1, "double2int5b");
+    // todo test correct rounding around maximum precision
+    test_checki(double2int(2147483646.0), INT32_MAX-1, "double2int6");
+    test_checki(double2int(2147483647.0), INT32_MAX, "double2int6b");
+    test_checki(double2int(21474836470.0), INT32_MAX, "double2int7");
+    test_checki(double2int(-2147483648.0), INT32_MIN, "double2int8");
+    test_checki(double2int(-21474836480.0), INT32_MIN, "double2int9");
+    test_checki(double2int(-2.5), -3, "double2int10");
+    test_checki(double2int(-2.4), -3, "double2int11");
+    u64d.u = 0xc000000000000000ull;
+    test_checki(double2int(u64d.d), -2, "double2int12");
+    u64d.u = 0xc008000000000000ull;
+    test_checki(double2int(u64d.d), -3, "double2int12b");
+    u64d.u = 0xc000000000000001ull;
+    test_checki(double2int(u64d.d), -3, "double2int12c");
+    u64d.u = 0xc000000080000000ull;
+    test_checki(double2int(u64d.d), -3, "double2int12d");
+    u64d.u = 0xc000000100000000ull;
+    test_checki(double2int(u64d.d), -3, "double2int12e");
+    u64d.u = 0xc000000100000001ull;
+    test_checki(double2int(u64d.d), -3, "double2int12f");
+    test_checki(double2int(-2147483647.0), INT32_MIN+1, "double2int13");
+    test_checki(double2int(-2147483647.1), INT32_MIN, "double2int14");
+    test_checki(double2int(-2147483647.9), INT32_MIN, "double2int15");
+    test_checki(double2int(-2147483648.0), INT32_MIN, "double2int16");
+    test_checki(double2int(-2147483648.1), INT32_MIN, "double2int17");
+    test_checki(double2int(-21474836480.1), INT32_MIN, "double2int18");
+
+    printf("double2uint\n");
+    test_checku(double2uint(0.0), 0, "double2uint1");
+    test_checku(double2uint(0.25), 0, "double2uint2");
+    test_checku(double2uint(0.5), 0, "double2uint3");
+    test_checku(double2uint(0.75), 0, "double2uint4");
+    test_checku(double2uint(1.0), 1, "double2uint5");
+    test_checku(double2uint(2147483647.0), INT32_MAX, "double2uint6");
+    test_checku(double2uint(2147483648.0), INT32_MAX+1u, "double2uint7");
+    test_checku(double2uint(4294967294.5), UINT32_MAX-1, "double2uint8");
+    test_checku(double2uint(4294967295.0), UINT32_MAX, "double2uint9");
+    test_checku(double2uint(42949672950.0), UINT32_MAX, "double2uint10");
+
+    printf("double2int64\n");
+    test_checki64(double2int64(0.0), 0, "double2int641");
+    test_checki64(double2int64(0.25), 0, "double2int641b");
+    test_checki64(double2int64(0.5), 0, "double2int642");
+    test_checki64(double2int64(0.75), 0, "double2int642b");
+    test_checki64(double2int64(1.0), 1, "double2int643");
+    test_checki64(double2int64(-10.0), -10, "double2int643a");
+    test_checki64(double2int64(-0.0), 0, "double2int643b");
+    test_checki64(double2int64(-0.25), -1, "double2int644");
+    test_checki64(double2int64(-0.5), -1, "double2int644b");
+    test_checki64(double2int64(-0.75), -1, "double2int645");
+    test_checki64(double2int64(-1.0), -1, "double2int645b");
+    // todo test correct rounding around maximum precision
+    test_checki64(double2int64(2147483647.0), INT32_MAX, "double2int646");
+    test_checki64(double2int64(21474836470.0), 21474836470ll, "double2int647");
+    test_checki64(double2int64(-2147483648.0), INT32_MIN, "double2int648");
+    test_checki64(double2int64(-21474836480.0), -21474836480ll, "double2int649");
+    test_checki64(double2int64(-2.5), -3, "double2int6410");
+    test_checki64(double2int64(-2.4), -3, "double2int6411");
+    u64d.u = 0xc000000000000000ull;
+    test_checki64(double2int64(u64d.d), -2, "double2int6412");
+    u64d.u = 0xc008000000000000ull;
+    test_checki64(double2int64(u64d.d), -3, "double2int6412b");
+    u64d.u = 0xc000000000000001ull;
+    test_checki64(double2int64(u64d.d), -3, "double2int6412c");
+    u64d.u = 0xc000000080000000ull;
+    test_checki64(double2int64(u64d.d), -3, "double2int6412d");
+    u64d.u = 0xc000000100000000ull;
+    test_checki64(double2int64(u64d.d), -3, "double2int6412e");
+    u64d.u = 0xc000000100000001ull;
+    test_checki64(double2int64(u64d.d), -3, "double2int6412f");
+
+    printf("double2uint64\n");
+    test_checku64(double2uint64(0.0), 0, "double2uint641");
+    test_checku64(double2uint64(0.25), 0, "double2uint642");
+    test_checku64(double2uint64(0.5), 0, "double2uint643");
+    test_checku64(double2uint64(0.75), 0, "double2uint644");
+    test_checku64(double2uint64(1.0), 1, "double2uint645");
+    test_checku64(double2uint64(2147483647.0), INT32_MAX, "double2uint646");
+    test_checku64(double2uint64(2147483648.0), INT32_MAX+1u, "double2uint647");
+    // todo test correct rounding around maximum precision
+    test_checku64(double2uint64(4294967294.5), 4294967294ull, "double2uint648");
+    test_checku64(double2uint64(4294967295.0), 4294967295ull, "double2uint649");
+    test_checku64(double2uint64(42949672950.0), 42949672950ull, "double2uint6410");
+#endif
+
+    // These methods round towards 0.
+    printf("double2int_z\n");
+    test_checki(double2int_z(0.0), 0, "double2int_z1");
+    test_checki(double2int_z(0.25), 0, "double2int_z1b");
+    test_checki(double2int_z(0.5), 0, "double2int_z2");
+    test_checki(double2int_z(0.75), 0, "double2int_z2b");
+    test_checki(double2int_z(1.0), 1, "double2int_z3");
+    test_checki(double2int_z(-10.0), -10, "double2int_z3a");
+    test_checki(double2int_z(-0.0), 0, "double2int_z3b");
+    test_checki(double2int_z(-0.25), 0, "double2int_z4");
+    test_checki(double2int_z(-0.5), 0, "double2int_z4b");
+    test_checki(double2int_z(-0.75), 0, "double2int_z5");
+    test_checki(double2int_z(-1.0), -1, "double2int_z5b");
+    // todo test correct rounding around maximum precision
+    test_checki(double2int_z(2147483647.0), INT32_MAX, "double2int_z6");
+    test_checki(double2int_z(21474836470.0), INT32_MAX, "double2int_z7");
+    test_checki(double2int_z(-2147483648.0), INT32_MIN, "double2int_z8");
+    test_checki(double2int_z(-21474836480.0), INT32_MIN, "double2int_z9");
+    test_checki(double2int_z(-2.5), -2, "double2int_z10");
+    test_checki(double2int_z(-2.4), -2, "double2int_z11");
+    u64d.u = 0xc000000000000000ull;
+    test_checki(double2int_z(u64d.d), -2, "double2int_z12");
+    u64d.u = 0xc008000000000000ull;
+    test_checki(double2int_z(u64d.d), -3, "double2int_z12b");
+    u64d.u = 0xc000000000000001ull;
+    test_checki(double2int_z(u64d.d), -2, "double2int_z12c");
+    u64d.u = 0xc000000080000000ull;
+    test_checki(double2int_z(u64d.d), -2, "double2int_z12d");
+    u64d.u = 0xc000000100000000ull;
+    test_checki(double2int_z(u64d.d), -2, "double2int_z12e");
+    u64d.u = 0xc000000100000001ull;
+    test_checki(double2int_z(u64d.d), -2, "double2int_z12f");
+
+    printf("double2int64_z\n");
+    test_checki64(double2int64_z(0.0), 0, "double2int64_z1");
+    test_checki64(double2int64_z(0.25), 0, "double2int64_z1b");
+    test_checki64(double2int64_z(0.5), 0, "double2int64_z2");
+    test_checki64(double2int64_z(0.75), 0, "double2int64_z2b");
+    test_checki64(double2int64_z(1.0), 1, "double2int64_z3");
+    test_checki64(double2int64_z(-10.0), -10, "double2int64_z3a");
+    test_checki64(double2int64_z(-0.0), 0, "double2int64_z3b");
+    test_checki64(double2int64_z(-0.25), 0, "double2int64_z4");
+    test_checki64(double2int64_z(-0.5), 0, "double2int64_z4b");
+    test_checki64(double2int64_z(-0.75), 0, "double2int64_z5");
+    test_checki64(double2int64_z(-1.0), -1, "double2int64_z5b");
+    // todo test correct rounding around maximum precision
+    test_checki64(double2int64_z(2147483647.0), 2147483647ll, "double2int64_z6");
+    test_checki64(double2int64_z(21474836470.0), 21474836470ll, "double2int64_z7");
+    test_checki64(double2int64_z(-2147483648.0), INT32_MIN, "double2int64_z8");
+    test_checki64(double2int64_z(-21474836480.0), -21474836480ll, "double2int64_z9");
+    test_checki64(double2int64_z(-2.5), -2, "double2int64_z10");
+    test_checki64(double2int64_z(-2.4), -2, "double2int64_z11");
+
+    printf("double2uint_z\n");
+    test_checku(double2uint_z(0.0), 0, "double2uint_z1");
+    test_checku(double2uint_z(0.25), 0, "double2uint_z2");
+    test_checku(double2uint_z(0.5), 0, "double2uint_z3");
+    test_checku(double2uint_z(0.75), 0, "double2uint_z4");
+    test_checku(double2uint_z(1.0), 1, "double2uint_z5");
+    test_checku(double2uint_z(2147483647.0), INT32_MAX, "double2uint_z6");
+    test_checku(double2uint_z(2147483648.0), INT32_MAX+1u, "double2uint_z7");
+    // todo test correct rounding around maximum precision
+    test_checku(double2uint_z(4294967294.5), UINT32_MAX-1u, "double2uint_z8");
+    test_checku(double2uint_z(4294967295.0), UINT32_MAX, "double2uint_z9");
+    test_checku(double2uint_z(42949672950.0), UINT32_MAX, "double2uint_z10");
+
+    printf("double2uint64_z\n");
+    test_checku64(double2uint64_z(0.0), 0, "double2uint64_z1");
+    test_checku64(double2uint64_z(0.25), 0, "double2uint64_z2");
+    test_checku64(double2uint64_z(0.5), 0, "double2uint64_z3");
+    test_checku64(double2uint64_z(0.75), 0, "double2uint64_z4");
+    test_checku64(double2uint64_z(1.0), 1, "double2uint64_z5");
+    test_checku64(double2uint64_z(2147483647.0), INT32_MAX, "double2uint64_z6");
+    test_checku64(double2uint64_z(2147483648.0), INT32_MAX+1u, "double2uint64_z7");
+    // todo test correct rounding around maximum precision
+    test_checku64(double2uint64_z(4294967294.5), 4294967294ull, "double2uint64_z8");
+    test_checku64(double2uint64_z(4294967295.0), 4294967295ull, "double2uint64_z9");
+    test_checku64(double2uint64_z(4294967296.0), 4294967296ull, "double2uint64_z9b");
+    test_checku64(double2uint64_z(42949672950.0), 42949672950ull, "double2uint64_z10");
+
+    // double exp10(double x);
+    // void sincos(double x, double *sinx, double *cosx);
+    // double powint(double x, int y);
+    return rc;
+}
+
+// Entry point: initialise stdio, run the checks and print a one-line verdict.
+// The status from test() is also returned, so a failing run no longer ends with
+// the implicit "return 0" that main() would otherwise produce.
+int main() {
+    stdio_init_all();
+    int rc = test();
+    printf("%s\n", rc ? "FAILED" : "PASSED");
+    return rc;
+}
--- a/test/pico_float_test/custom_float_funcs_test.c
+++ b/test/pico_float_test/custom_float_funcs_test.c
@@ -0,0 +1,402 @@
+#include <stdio.h>
+#include "pico/stdlib.h"
+#include "pico/float.h"
+#include "math.h"
+
+#if 0 // flip to 1 to compile every printf in this file down to a no-op
+#define printf(...) ((void)0)
+#endif
+#if 0 // flip to 1 to leave the enclosing function (return -1) at the first failed check
+#define stop() return -1
+#else // default: latch the failure in the enclosing function's local `rc` and carry on
+#define stop() rc=1
+#endif
+#define test_assert(x) ({ if (!(x)) { printf("Assertion failed: ");puts(#x);printf("  at " __FILE__ ":%d\n", __LINE__); stop(); } }) // boolean check; prints the expression text and source location
+#define test_checkf(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %f != %f\n", msg, (double)(x), (double)(expected)); stop(); } }) // exact floating point equality; args cast to double to match %f
+#define test_checki(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %d != %d\n", msg, (int)(x), (int)(expected)); stop(); } }) // signed compare; args cast to int to match %d
+#define test_checku(x, expected, msg) ({ if ((uint32_t)(x) != (uint32_t)(expected)) { printf("  %s: %u != %u\n", msg, (unsigned)(x), (unsigned)(expected)); stop(); } }) // compare as uint32_t; args cast to unsigned to match %u
+#define test_checki64(x, expected, msg) ({ if ((x) != (expected)) { printf("  %s: %lld != %lld\n", msg, (int64_t)(x), (int64_t)(expected)); stop(); } })
+#define test_checku64(x, expected, msg) ({ if ((uint64_t)(x) != (uint64_t)(expected)) { printf("  %s: %llu != %llu\n", msg, (uint64_t)(x), (uint64_t)(expected)); stop(); } })
+
+#if !(LIB_PICO_FLOAT_COMPILER || defined(__riscv)) // same condition that guards the fixed point tests in test()
+static inline float fix2float_8(int32_t m) { return fix2float(m, 8); }
+static inline float fix2float_12(int32_t m) { return fix2float(m, 12); }
+static inline float fix2float_16(int32_t m) { return fix2float(m, 16); }
+static inline float fix2float_24(int32_t m) { return fix2float(m, 24); }
+static inline float fix2float_28(int32_t m) { return fix2float(m, 28); }
+static inline float fix2float_32(int32_t m) { return fix2float(m, 32); }
+// unsigned fixed point (12 fractional bits) -> float; the raw value is unsigned, matching ufix2float
+static inline float ufix2float_12(uint32_t m) { return ufix2float(m, 12); }
+// float -> signed fixed point (12 fractional bits); takes the float and returns the integer, like float2fix
+static inline int32_t float2fix_12(float f) { return float2fix(f, 12); }
+// float -> unsigned fixed point (12 fractional bits); takes the float and returns the integer, like float2ufix
+static inline uint32_t float2ufix_12(float f) { return float2ufix(f, 12); }
+#endif
+
+#if 1 && (LIB_PICO_FLOAT_COMPILER || defined(__riscv)) // the leading `1 &&` is a manual on/off switch for these wrappers
+#if __SOFTFP__ || defined(__riscv) // choose the asm operand constraint used for float values below
+#define FREG "+r"
+#else // NOTE(review): "t" is GCC's ARM constraint for a single-precision VFP register - confirm against the toolchain in use
+#define FREG "+t"
+#endif
+// prevent the compiler from eliding the calculations: each wrapper copies its argument into a local, passes that local through an empty asm statement as a read-write operand, then calls the same-named function (the ## paste just re-forms that name)
+#define float2int_z(f) ({ float _f = f; pico_default_asm_volatile("" : FREG (_f)); float2 ## int_z(_f); })
+#define float2uint_z(f) ({ float _f = f; pico_default_asm_volatile("" : FREG (_f)); float2 ## uint_z(_f); })
+#define float2int64_z(f) ({ float _f = f; pico_default_asm_volatile("" : FREG (_f)); float2 ## int64_z(_f); })
+#define float2uint64_z(f) ({ float _f = f; pico_default_asm_volatile("" : FREG (_f)); float2 ## uint64_z(_f); })
+#define int2float(i) ({ int32_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); int2 ## float(_i); })
+#define uint2float(i) ({ uint32_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); uint2 ## float(_i); })
+#define int642float(i) ({ int64_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); int642 ## float(_i); })
+#define uint642float(i) ({ uint64_t _i = i; pico_default_asm_volatile("" : "+r" (_i)); uint642 ## float(_i); })
+#endif
+
+#if 1 && LIB_PICO_FLOAT_VFP // NOTE(review): test() checks LIB_PICO_FLOAT_PICO_VFP - confirm which macro name is intended here
+// remove the 32-bit elision-blocking wrapper macros again in this configuration (the 64-bit wrappers are left defined)
+#undef float2int_z
+#undef float2uint_z
+#undef int2float
+#undef uint2float
+#endif
+
+int test() { // runs every conversion check below; returns 0 when all passed, non-zero otherwise
+    int rc = 0; // set by stop() inside the test_check* macros when a check fails
+#if LIB_PICO_FLOAT_PICO_DCP
+    printf(">>> Using DCP\n");
+#endif
+#if LIB_PICO_FLOAT_PICO_VFP
+    printf(">>> Using VFP\n");
+#endif
+    printf("int2float\n");
+    test_checkf(int2float(0), 0.0f, "int2float1");
+    test_checkf(int2float(-1), -1.0f, "int2float2");
+    test_checkf(int2float(1), 1.0f, "int2float3");
+    test_checkf(int2float(INT32_MAX), 2147483647.0f, "int2float4");
+    test_checkf(int2float(INT32_MIN), -2147483648.0f, "int2float5");
+    // check rounding
+    test_checkf(int2float(2147483391), 2147483392.0f, "int2float6");
+    test_checkf(int2float(2147483456), 2147483392.0f, "int2float7");
+    test_checkf(int2float(2147483457), 2147483520.0f, "int2float8");
+    test_checkf(int2float(2147483483), 2147483520.0f, "int2float9");
+    test_checkf(int2float(2147483584), 2147483648.0f, "int2float10");
+
+    printf("uint2float\n");
+    test_checkf(uint2float(0), 0.0f, "uint2float1");
+    test_checkf(uint2float(1), 1.0f, "uint2float2");
+    test_checkf(uint2float(INT32_MAX), 2147483647.0f, "uint2float3");
+    // todo test correct rounding around maximum precision
+    test_checkf(uint2float(UINT32_MAX), 4294967295.0f, "uint2float4");
+
+    printf("int642float\n");
+    test_checkf(int642float(0), 0.0f, "int642float1");
+    test_checkf(int642float(-1), -1.0f, "int642float2");
+    test_checkf(int642float(1), 1.0f, "int642float3");
+    test_checkf(int642float(INT32_MAX-1), 2147483646.0f, "int642float4"); // note equality is within 1ulp
+    test_checkf(int642float(INT32_MAX), 2147483647.0f, "int642float5"); // note equality is within 1ulp
+    test_checkf(int642float(INT32_MAX+1ll), 2147483648.0f, "int642float6");
+    test_checkf(int642float(INT32_MIN-1ll), -2147483649.0f, "int642float7"); // note equality is within 1ulp
+    test_checkf(int642float(INT32_MIN), -2147483648.0f, "int642float8");
+    test_checkf(int642float(INT32_MIN+1ll), -2147483647.0f, "int642float9"); // note equality is within 1ulp
+    // todo test correct rounding around maximum precision
+    test_checkf(int642float(INT64_MAX), 9223372036854775807.0f, "int642float10");
+    test_checkf(int642float(INT64_MIN), -9223372036854775808.0f, "int642float11");
+
+    printf("uint642float\n");
+    test_checkf(uint642float(0), 0.0f, "uint642float1");
+    test_checkf(uint642float(1), 1.0f, "uint642float2");
+    test_checkf(uint642float(INT32_MAX-1), 2147483646.0f, "uint642float3"); // note equality is within 1ulp
+    test_checkf(uint642float(INT32_MAX), 2147483647.0f, "uint642float4"); // note equality is within 1ulp
+    test_checkf(uint642float(INT32_MAX+1ll), 2147483648.0f, "uint642float5");
+    test_checkf(uint642float(INT64_MAX), 9223372036854775807.0f, "uint642float6");
+    // todo test correct rounding around maximum precision
+    test_checkf(uint642float(UINT64_MAX), 18446744073709551615.0f, "uint642float7");
+
+    union { // lets the fixed point tests below build a float from a raw 32-bit pattern
+        uint32_t u;
+        float f;
+    } u32f;
+
+#if !(LIB_PICO_FLOAT_COMPILER || defined(__riscv))
+    printf("fix2float\n");
+    // todo test correct rounding around maximum precision
+    test_checkf(fix2float(-3, 1), -1.5f, "fix2float1");
+    test_checkf(fix2float(-3, 1), -1.5f, "fix2float2"); // NOTE(review): identical to fix2float1 - possibly meant to cover a different input
+    test_checkf(fix2float(-3, -4), -48.0f, "fix2float3");
+
+    printf("ufix2float\n");
+    // todo test correct rounding around maximum precision
+    test_checkf(ufix2float(0xa0000000, 30), 2.5f, "ufix2float1");
+    test_checkf(ufix2float(3, -4), 48.0f, "ufix2float2");
+
+    printf("fix642float\n");
+    // todo test correct rounding around maximum precision
+    test_checkf(fix642float(-0xa000000000ll, 38), -2.5f, "fix642float1");
+    test_checkf(fix642float(-3, -34), -51539607552.0f, "fix642float2");
+
+    printf("ufix642float\n");
+    // todo test correct rounding around maximum precision
+    test_checkf(ufix642float(0xa000000000ll, 38), 2.5f, "ufix642float1");
+    test_checkf(ufix642float(3, -34), 51539607552.0f, "ufix642float2");
+
+    test_checkf(fix2float_8(128), 0.5f, "fix2float_8_1");
+    test_checkf(fix2float_8(-128), -0.5f, "fix2float_8_2");
+    test_checkf(fix2float_16(8192), 0.125f, "fix2float_16_1");
+    test_checkf(fix2float_16(-8192), -0.125f, "fix2float_16_2");
+    test_checkf(fix2float_24(3<<23), 1.5f, "fix2float_24_1");
+    test_checkf(fix2float_24(-(3<<23)), -1.5f, "fix2float_24_2");
+
+    printf("float2fix\n");
+    test_checki(float2fix(-0.5f, 8), -0x80, "float2fix0");
+    test_checki(float2fix(3.5f, 8), 0x380, "float2fix1");
+    test_checki(float2fix(-3.5f, 8), -0x380, "float2fix2");
+    test_checki(float2fix(32768.0f, 16), INT32_MAX, "float2fix3");
+    test_checki(float2fix(65536.0f, 16), INT32_MAX, "float2fix4");
+    test_checki(float2fix(-65536.0f, 16), INT32_MIN, "float2fix4b");
+    test_checki(float2fix(INFINITY, 16), INT32_MAX, "float2fix5");
+    test_checki(float2fix(-INFINITY, 16), INT32_MIN, "float2fix5b");
+    test_checki(float2fix(3.24999f, 2), 12, "float2fix6");
+    test_checki(float2fix(3.25f, 2), 13, "float2fix7");
+    test_checki(float2fix(-3.24999f, 2), -13, "float2fix8");
+    test_checki(float2fix(-3.25f, 2), -13, "float2fix9");
+    test_checki(float2fix(-0.75f, 1), -2, "float2fix10");
+    test_checki(float2fix(-3.0f, -1), -2, "float2fix11"); // not very useful
+    u32f.u = 0x7f012345;
+    test_checki(float2fix(u32f.f, 1), INT32_MAX, "float2fix12");
+    u32f.u = 0xff012345;
+    test_checki(float2fix(u32f.f, 1), INT32_MIN, "float2fix13");
+
+    printf("float2ufix\n");
+    test_checku(float2ufix(3.5f, 8), 0x380, "float2ufix1");
+    test_checku(float2ufix(-3.5f, 8), 0, "float2ufix2");
+    test_checku(float2ufix(32768.0f, 16), 32768u << 16, "float2ufix3"); // unsigned literal: 32768 << 16 does not fit in a signed int
+    test_checku(float2ufix(65536.0f, 16), UINT32_MAX, "float2ufix4");
+    test_checku(float2ufix(INFINITY, 16), UINT32_MAX, "float2ufix5");
+    test_checku(float2ufix(3.24999f, 2), 12, "float2ufix6");
+    test_checku(float2ufix(3.25f, 2), 13, "float2ufix7");
+    test_checku(float2ufix(3.0f, -1), 1, "float2ufix8"); // not very useful
+
+    printf("float2fix64\n");
+    test_checki64(float2fix64(3.5f, 8), 0x380, "float2fix641");
+    test_checki64(float2fix64(-3.5f, 8), -0x380, "float2fix642");
+    test_checki64(float2fix64(32768.0f, 16), 32768ll << 16, "float2fix643");
+    test_checki64(float2fix64(65536.0f, 16), 65536ll << 16, "float2fix644");
+    test_checki64(float2fix64(2147483648.0f, 16), 2147483648ll << 16, "float2fix644b");
+    test_checki64(float2fix64(65536.0f * 65536.0f * 32768.0f, 16), INT64_MAX, "float2fix644c");
+    test_checki64(float2fix64(INFINITY, 16), INT64_MAX, "float2fix645");
+    test_checki64(float2fix64(3.24999f, 2), 12, "float2fix646");
+    test_checki64(float2fix64(3.25f, 2), 13, "float2fix647");
+    test_checki64(float2fix64(-3.24999f, 2), -13, "float2fix648");
+    test_checki64(float2fix64(-3.25f, 2), -13, "float2fix649");
+    test_checki64(float2fix64(-3.0f, -1), -2, "float2fix6410"); // not very useful
+
+    printf("float2ufix64\n");
+    test_checku64(float2ufix64(3.5f, 8), 0x380, "float2ufix641");
+    test_checku64(float2ufix64(-3.5f, 8), 0, "float2ufix642");
+    test_checku64(float2ufix64(32768.0f, 16), 32768ull << 16, "float2ufix643");
+    test_checku64(float2ufix64(65536.0f, 16), 65536ull << 16, "float2ufix644");
+    test_checku64(float2ufix64(2147483648.0f, 16), 2147483648ull << 16, "float2ufix644b");
+    test_checku64(float2ufix64(INFINITY, 16), UINT64_MAX, "float2ufix645");
+    test_checku64(float2ufix64(3.24999f, 2), 12, "float2ufix646");
+    test_checku64(float2ufix64(3.25f, 2), 13, "float2ufix647");
+    test_checku64(float2ufix64(3.0f, -1), 1, "float2ufix648"); // not very useful
+
+    printf("float2fix_z\n");
+    test_checki(float2fix_z(3.5f, 8), 0x380, "float2fix_z1");
+    test_checki(float2fix_z(-3.5f, 8), -0x380, "float2fix_z2");
+    test_checki(float2fix_z(32768.0f, 16), INT32_MAX, "float2fix_z3");
+    test_checki(float2fix_z(65536.0f, 16), INT32_MAX, "float2fix_z4");
+    test_checki(float2fix_z(INFINITY, 16), INT32_MAX, "float2fix_z5");
+    test_checki(float2fix_z(-INFINITY, 16), INT32_MIN, "float2fix_z5b");
+    test_checki(float2fix_z(3.24999f, 2), 12, "float2fix_z6");
+    test_checki(float2fix_z(3.25f, 2), 13, "float2fix_z7");
+    test_checki(float2fix_z(-3.24999f, 2), -12, "float2fix_z8");
+    test_checki(float2fix_z(-3.25f, 2), -13, "float2fix_z9");
+    test_checki(float2fix_z(-0.75f, 1), -1, "float2fix_z10");
+    test_checki(float2fix_z(-3.0f, -1), -1, "float2fix_z11"); // not very useful
+    u32f.u = 0x7f012345;
+    test_checki(float2fix_z(u32f.f, 1), INT32_MAX, "float2fix_z12");
+    u32f.u = 0xff012345;
+    test_checki(float2fix_z(u32f.f, 1), INT32_MIN, "float2fix_z13");
+
+    printf("float2ufix_z\n");
+    test_checku(float2ufix_z(3.5f, 8), 0x380, "float2ufix_z1");
+    test_checku(float2ufix_z(-3.5f, 8), 0, "float2ufix_z2");
+    test_checku(float2ufix_z(32768.0f, 16), 32768u << 16, "float2ufix_z3"); // unsigned literal: 32768 << 16 does not fit in a signed int
+    test_checku(float2ufix_z(65536.0f, 16), UINT32_MAX, "float2ufix_z4");
+    test_checku(float2ufix_z(INFINITY, 16), UINT32_MAX, "float2ufix_z5");
+    test_checku(float2ufix_z(3.24999f, 2), 12, "float2ufix_z6");
+    test_checku(float2ufix_z(3.25f, 2), 13, "float2ufix_z7");
+    test_checku(float2ufix_z(3.0f, -1), 1, "float2ufix_z8"); // not very useful
+    u32f.u = 0x7f012345;
+    test_checku(float2ufix_z(u32f.f, 1), UINT32_MAX, "float2ufix_z9");
+    u32f.u = 0xff012345;
+    test_checku(float2ufix_z(u32f.f, 1), 0, "float2ufix_z10");
+
+    printf("float2fix64_z\n");
+    test_checki64(float2fix64_z(3.5f, 8), 0x380, "float2fix64_z1");
+    test_checki64(float2fix64_z(-3.5f, 8), -0x380, "float2fix64_z2");
+    test_checki64(float2fix64_z(32768.0f, 16), 32768ll << 16, "float2fix64_z3");
+    test_checki64(float2fix64_z(65536.0f, 16), 65536ll << 16, "float2fix64_z4");
+    test_checki64(float2fix64_z(65536.0f * 65536.0f * 32768.0f, 16), INT64_MAX, "float2fix64_z4b");
+    test_checki64(float2fix64_z(INFINITY, 16), INT64_MAX, "float2fix64_z5");
+    test_checki64(float2fix64_z(3.24999f, 2), 12, "float2fix64_z6");
+    test_checki64(float2fix64_z(3.25f, 2), 13, "float2fix64_z7");
+    test_checki64(float2fix64_z(-3.24999f, 2), -12, "float2fix64_z8");
+    test_checki64(float2fix64_z(-3.25f, 2), -13, "float2fix64_z9");
+    test_checki64(float2fix64_z(-3.0f, -1), -1, "float2fix64_z10"); // not very useful
+
+    printf("float2ufix64_z\n");
+    test_checku64(float2ufix64_z(3.5f, 8), 0x380, "float2ufix64_z1");
+    test_checku64(float2ufix64_z(-3.5f, 8), 0, "float2ufix64_z2");
+    test_checku64(float2ufix64_z(32768.0f, 16), 32768ull << 16, "float2ufix64_z3");
+    test_checku64(float2ufix64_z(65536.0f, 16), 65536ull << 16, "float2ufix64_z4");
+    test_checku64(float2ufix64_z(65536.0f * 65536.0f * 65536.0f, 16), UINT64_MAX, "float2ufix64_z4b");
+    test_checku64(float2ufix64_z(INFINITY, 16), UINT64_MAX, "float2ufix64_z5");
+    test_checku64(float2ufix64_z(3.24999f, 2), 12, "float2ufix64_z6");
+    test_checku64(float2ufix64_z(3.25f, 2), 13, "float2ufix64_z7");
+    test_checku64(float2ufix64_z(3.0f, -1), 1, "float2ufix64_z8"); // not very useful
+
+    printf("float2int\n");
+    test_checki(float2int(0.0f), 0, "float2int1");
+    test_checki(float2int(0.25f), 0, "float2int1b");
+    test_checki(float2int(0.5f), 0, "float2int2");
+    test_checki(float2int(0.75f), 0, "float2int2b");
+    test_checki(float2int(1.0f), 1, "float2int3");
+    test_checki(float2int(-10.0f), -10, "float2int3a");
+    test_checki(float2int(-0.0f), 0, "float2int3b");
+    test_checki(float2int(-0.25f), -1, "float2int4");
+    test_checki(float2int(-0.5f), -1, "float2int4b");
+    test_checki(float2int(-0.75f), -1, "float2int5");
+    test_checki(float2int(-1.0f), -1, "float2int5b");
+    // todo test correct rounding around maximum precision
+    test_checki(float2int(2147483647.0f), INT32_MAX, "float2int6");
+    test_checki(float2int(21474836470.0f), INT32_MAX, "float2int7");
+    test_checki(float2int(-2147483648.0f), INT32_MIN, "float2int8");
+    test_checki(float2int(-21474836480.0f), INT32_MIN, "float2int9");
+    test_checki(float2int(-2.5f), -3, "float2int10");
+    test_checki(float2int(-2.4f), -3, "float2int11");
+
+    printf("float2uint\n");
+    test_checku(float2uint(0.0f), 0, "float2uint1");
+    test_checku(float2uint(0.25f), 0, "float2uint2");
+    test_checku(float2uint(0.5f), 0, "float2uint3");
+    test_checku(float2uint(0.75f), 0, "float2uint4");
+    test_checku(float2uint(1.0f), 1, "float2uint5");
+    test_checku(float2uint(2147483647.0f), INT32_MAX+1u, "float2uint6"); // note loss of precision
+    test_checku(float2uint(2147483648.0f), INT32_MAX+1u, "float2uint7");
+    test_checku(float2uint(4294967294.5f), UINT32_MAX, "float2uint8"); // note loss of precision
+    test_checku(float2uint(4294967295.0f), UINT32_MAX, "float2uint9");
+    test_checku(float2uint(42949672950.0f), UINT32_MAX, "float2uint10");
+
+    printf("float2int64\n");
+    test_checki64(float2int64(0.0f), 0, "float2int641");
+    test_checki64(float2int64(0.25f), 0, "float2int641b");
+    test_checki64(float2int64(0.5f), 0, "float2int642");
+    test_checki64(float2int64(0.75f), 0, "float2int642b");
+    test_checki64(float2int64(1.0f), 1, "float2int643");
+    test_checki64(float2int64(-10.0f), -10, "float2int643a");
+    test_checki64(float2int64(-0.0f), 0, "float2int643b");
+    test_checki64(float2int64(-0.25f), -1, "float2int644");
+    test_checki64(float2int64(-0.5f), -1, "float2int644b");
+    test_checki64(float2int64(-0.75f), -1, "float2int645");
+    test_checki64(float2int64(-1.0f), -1, "float2int645b");
+    // todo test correct rounding around maximum precision
+    test_checki64(float2int64(2147483647.0f), INT32_MAX+1ll, "float2int646");
+    test_checki64(float2int64(21474836470.0f), 21474836480ll, "float2int647"); // note loss of precision
+    test_checki64(float2int64(-2147483648.0f), INT32_MIN, "float2int648");
+    test_checki64(float2int64(-21474836480.0f), -21474836480ll, "float2int649");
+    test_checki64(float2int64(-2.5f), -3, "float2int6410");
+    test_checki64(float2int64(-2.4f), -3, "float2int6411");
+
+    printf("float2uint64\n");
+    test_checku64(float2uint64(0.0f), 0, "float2uint641");
+    test_checku64(float2uint64(0.25f), 0, "float2uint642");
+    test_checku64(float2uint64(0.5f), 0, "float2uint643");
+    test_checku64(float2uint64(0.75f), 0, "float2uint644");
+    test_checku64(float2uint64(1.0f), 1, "float2uint645");
+    test_checku64(float2uint64(2147483647.0f), INT32_MAX+1u, "float2uint646"); // note loss of precision
+    test_checku64(float2uint64(2147483648.0f), INT32_MAX+1u, "float2uint647");
+    test_checku64(float2uint64(4294967294.5f), 4294967296ull, "float2uint648"); // note loss of precision
+    test_checku64(float2uint64(4294967295.0f), 4294967296ull, "float2uint649"); // note loss of precision
+    test_checku64(float2uint64(42949672950.0f), 42949672960ull, "float2uint6410"); // note loss of precision
+#endif
+
+    // These methods round towards 0.
+    printf("float2int_z\n");
+    test_checki(float2int_z(0.0f), 0, "float2int_z1");
+    test_checki(float2int_z(0.25f), 0, "float2int_z1b");
+    test_checki(float2int_z(0.5f), 0, "float2int_z2");
+    test_checki(float2int_z(0.75f), 0, "float2int_z2b");
+    test_checki(float2int_z(1.0f), 1, "float2int_z3");
+    test_checki(float2int_z(-10.0f), -10, "float2int_z3a");
+    test_checki(float2int_z(-0.0f), 0, "float2int_z3b");
+    test_checki(float2int_z(-0.25f), 0, "float2int_z4");
+    test_checki(float2int_z(-0.5f), 0, "float2int_z4b");
+    test_checki(float2int_z(-0.75f), 0, "float2int_z5");
+    test_checki(float2int_z(-1.0f), -1, "float2int_z5b");
+    // todo test correct rounding around maximum precision
+    test_checki(float2int_z(2147483647.0f), INT32_MAX, "float2int_z6");
+    test_checki(float2int_z(21474836470.0f), INT32_MAX, "float2int_z7");
+    test_checki(float2int_z(-2147483648.0f), INT32_MIN, "float2int_z8");
+    test_checki(float2int_z(-21474836480.0f), INT32_MIN, "float2int_z9");
+    test_checki(float2int_z(-2.5f), -2, "float2int_z10");
+    test_checki(float2int_z(-2.4f), -2, "float2int_z11");
+
+    printf("float2int64_z\n");
+    test_checki64(float2int64_z(0.0f), 0, "float2int64_z1");
+    test_checki64(float2int64_z(0.25f), 0, "float2int64_z1b");
+    test_checki64(float2int64_z(0.5f), 0, "float2int64_z2");
+    test_checki64(float2int64_z(0.75f), 0, "float2int64_z2b");
+    test_checki64(float2int64_z(1.0f), 1, "float2int64_z3");
+    test_checki64(float2int64_z(-10.0f), -10, "float2int64_z3a");
+    test_checki64(float2int64_z(-0.0f), 0, "float2int64_z3b");
+    test_checki64(float2int64_z(-0.25f), 0, "float2int64_z4");
+    test_checki64(float2int64_z(-0.5f), 0, "float2int64_z4b");
+    test_checki64(float2int64_z(-0.75f), 0, "float2int64_z5");
+    test_checki64(float2int64_z(-1.0f), -1, "float2int64_z5b");
+    test_checki64(float2int64_z(2147483647.0f), 2147483648ll, "float2int64_z6"); // note loss of precision
+    test_checki64(float2int64_z(21474836470.0f), 21474836480ll, "float2int64_z7"); // note loss of precision
+    test_checki64(float2int64_z(-2147483648.0f), INT32_MIN, "float2int64_z8");
+    test_checki64(float2int64_z(-21474836480.0f), -21474836480ll, "float2int64_z9");
+    test_checki64(float2int64_z(-2.5f), -2, "float2int64_z10");
+    test_checki64(float2int64_z(-2.4f), -2, "float2int64_z11");
+
+    printf("float2uint_z\n");
+    test_checku(float2uint_z(0.0f), 0, "float2uint_z1");
+    test_checku(float2uint_z(0.25f), 0, "float2uint_z2");
+    test_checku(float2uint_z(0.5f), 0, "float2uint_z3");
+    test_checku(float2uint_z(0.75f), 0, "float2uint_z4");
+    test_checku(float2uint_z(1.0f), 1, "float2uint_z5");
+    test_checku(float2uint_z(2147483647.0f), INT32_MAX+1u, "float2uint_z6"); // note loss of precision
+    test_checku(float2uint_z(2147483648.0f), INT32_MAX+1u, "float2uint_z7");
+    // todo test correct rounding around maximum precision
+    test_checku(float2uint_z(4294967294.5f), UINT32_MAX, "float2uint_z8"); // note loss of precision
+    test_checku(float2uint_z(4294967295.0f), UINT32_MAX, "float2uint_z9");
+    test_checku(float2uint_z(42949672950.0f), UINT32_MAX, "float2uint_z10");
+
+    printf("float2uint64_z\n");
+    test_checku64(float2uint64_z(0.0f), 0, "float2uint64_z1");
+    test_checku64(float2uint64_z(0.25f), 0, "float2uint64_z2");
+    test_checku64(float2uint64_z(0.5f), 0, "float2uint64_z3");
+    test_checku64(float2uint64_z(0.75f), 0, "float2uint64_z4");
+    test_checku64(float2uint64_z(1.0f), 1, "float2uint64_z5");
+    test_checku64(float2uint64_z(2147483647.0f), INT32_MAX+1u, "float2uint64_z6"); // note loss of precision
+    test_checku64(float2uint64_z(2147483648.0f), INT32_MAX+1u, "float2uint64_z7");
+    test_checku64(float2uint64_z(4294967294.5f), 4294967296ull, "float2uint64_z8"); // note loss of precision
+    test_checku64(float2uint64_z(4294967295.0f), 4294967296ull, "float2uint64_z9"); // note loss of precision
+    test_checku64(float2uint64_z(42949672950.0f), 42949672960ull, "float2uint64_z10"); // note loss of precision
+
+    // float exp10f(float x);
+    // void sincosf(float x, float *sinx, float *cosx);
+    // float powintf(float x, int y);
+    return rc;
+}
+
+int main() {
+    // bring up stdio first so the report below has somewhere to go
+    stdio_init_all();
+
+    // test() hands back non-zero once any of its checks has been flagged
+    const int failed = test();
+
+    printf(failed ? "FAILED\n" : "PASSED\n");
+}