enable float/double benchmark on risc-v (#2885)

* enable float/double benchmark on risc-v

* fix up input COSTs for two-operand functions (subtract the input cost once per operand)
diff --git a/test/pico_float_test/CMakeLists.txt b/test/pico_float_test/CMakeLists.txt
index a80dfcc..0486e4b 100644
--- a/test/pico_float_test/CMakeLists.txt
+++ b/test/pico_float_test/CMakeLists.txt
@@ -71,15 +71,13 @@
         target_compile_options(custom_float_funcs_test_${FLOAT_TYPE} PRIVATE -fno-strict-float-cast-overflow)
     endif()
 
-    if (NOT PICO_RISCV) # todo need risc-v support too
-        add_executable(float_benchmark_${FLOAT_TYPE} float_benchmark.c)
-        pico_set_float_implementation(float_benchmark_${FLOAT_TYPE} ${FLOAT_TYPE})
-        target_link_libraries(float_benchmark_${FLOAT_TYPE} PRIVATE pico_stdlib m)
-        pico_add_extra_outputs(float_benchmark_${FLOAT_TYPE})
-        target_compile_definitions(float_benchmark_${FLOAT_TYPE} PRIVATE PICO_FLOAT_IN_RAM=1)
-        pico_set_printf_implementation(float_benchmark_${FLOAT_TYPE} compiler)
-        pico_set_binary_type(float_benchmark_${FLOAT_TYPE} copy_to_ram)
-    endif()
+    add_executable(float_benchmark_${FLOAT_TYPE} float_benchmark.c)
+    pico_set_float_implementation(float_benchmark_${FLOAT_TYPE} ${FLOAT_TYPE})
+    target_link_libraries(float_benchmark_${FLOAT_TYPE} PRIVATE pico_stdlib m)
+    pico_add_extra_outputs(float_benchmark_${FLOAT_TYPE})
+    target_compile_definitions(float_benchmark_${FLOAT_TYPE} PRIVATE PICO_FLOAT_IN_RAM=1)
+    pico_set_printf_implementation(float_benchmark_${FLOAT_TYPE} compiler)
+    pico_set_binary_type(float_benchmark_${FLOAT_TYPE} copy_to_ram)
 endforeach ()
 
 foreach (DOUBLE_TYPE IN LISTS DOUBLE_TYPES)
@@ -127,15 +125,13 @@
         endif()
     endif()
 
-    if (NOT PICO_RISCV) # todo need risc-v support too
-        add_executable(double_benchmark_${DOUBLE_TYPE} double_benchmark.c)
-        pico_set_double_implementation(double_benchmark_${DOUBLE_TYPE} ${DOUBLE_TYPE})
-        target_link_libraries(double_benchmark_${DOUBLE_TYPE} PRIVATE pico_stdlib m)
-        pico_add_extra_outputs(double_benchmark_${DOUBLE_TYPE})
-        target_compile_definitions(double_benchmark_${DOUBLE_TYPE} PRIVATE PICO_DOUBLE_IN_RAM=1)
-        pico_set_printf_implementation(double_benchmark_${DOUBLE_TYPE} compiler)
-        pico_set_binary_type(double_benchmark_${DOUBLE_TYPE} copy_to_ram)
-    endif()
+    add_executable(double_benchmark_${DOUBLE_TYPE} double_benchmark.c)
+    pico_set_double_implementation(double_benchmark_${DOUBLE_TYPE} ${DOUBLE_TYPE})
+    target_link_libraries(double_benchmark_${DOUBLE_TYPE} PRIVATE pico_stdlib m)
+    pico_add_extra_outputs(double_benchmark_${DOUBLE_TYPE})
+    target_compile_definitions(double_benchmark_${DOUBLE_TYPE} PRIVATE PICO_DOUBLE_IN_RAM=1)
+    pico_set_printf_implementation(double_benchmark_${DOUBLE_TYPE} compiler)
+    pico_set_binary_type(double_benchmark_${DOUBLE_TYPE} copy_to_ram)
 endforeach ()
 
 if (PICO_RP2350 AND NOT PICO_RISCV)
diff --git a/test/pico_float_test/double_benchmark.c b/test/pico_float_test/double_benchmark.c
index 2158ec6..90da4f1 100644
--- a/test/pico_float_test/double_benchmark.c
+++ b/test/pico_float_test/double_benchmark.c
@@ -9,9 +9,17 @@
 #endif
 
 static void init_systick() {
+#ifdef __riscv
+    // Stop, clear, then restart the 64-bit RISC-V platform timer (mtime/mtimeh); it is the benchmark's time source, read via systick_value_ptr()
+    sio_hw->mtime_ctrl = 0;
+    sio_hw->mtime = 0;
+    sio_hw->mtimeh = 0;
+    sio_hw->mtime_ctrl = SIO_MTIME_CTRL_FULLSPEED_BITS | SIO_MTIME_CTRL_EN_BITS;
+#else
     systick_hw->csr = 0;
     systick_hw->rvr = ARM_CPU_PREFIXED(SYST_RVR_RELOAD_BITS);
     systick_hw->csr = ARM_CPU_PREFIXED(SYST_CSR_CLKSOURCE_BITS) | ARM_CPU_PREFIXED(SYST_CSR_ENABLE_BITS);
+#endif
 }
 
 // Stop the compiler from constant-folding a hardware base pointer into the
@@ -25,18 +33,22 @@
     __opaque_ptr; \
 })
 
-static __force_inline uint32_t systick_value() {
-    return systick_hw->cvr;
-}
-
 static __force_inline io_ro_32 *systick_value_ptr() {
+#ifdef __riscv
+    return &sio_hw->mtime; // only mtime is sampled (mtimeh is never read); NOTE(review): no __get_opaque_ptr() here, unlike the Arm path - confirm intended
+#else
     return __get_opaque_ptr(&systick_hw->cvr);
+#endif
 }
 
 static int cycle_diff(uint32_t systick1, uint32_t systick2) {
+#ifdef __riscv
+    return systick2 - systick1 - 1; // later minus earlier (operand order is reversed vs. the Arm path); -1 presumably offsets the second timer read, as on Arm
+#else
     static_assert(ARM_CPU_PREFIXED(SYST_CVR_CURRENT_LSB) == 0, "");
     uint32_t shift = 32 - ARM_CPU_PREFIXED(SYST_CVR_CURRENT_MSB);
     return (((int32_t)((systick1 << shift) - (systick2 << shift))) >> shift) - 1; // -1 since the second systick read costs one
+#endif
 }
 
 #define timer_func_def(name) static __noinline int __not_in_flash_func(time_##name)
@@ -156,8 +168,13 @@
 // #pragma message("EMITS_VFP = " __XSTRING(EMITS_VFP))
 // #pragma message("USING_HARD_FLOAT_ABI = " __XSTRING(USING_HARD_FLOAT_ABI))
 
+#ifdef __riscv
+#define LOAD_COST 1
+#define STORE_COST 1
+#else
 #define LOAD_COST 2
 #define STORE_COST 2
+#endif
 
 #define DOUBLE_INPUT_COST (LOAD_COST * 2)
 #define DOUBLE_OUTPUT_COST (STORE_COST * 2)
@@ -913,7 +930,7 @@
     uint32_t t0 = *systick_ptr;
     volatile double x = copysign(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST;
 }
 
 timer_func_def(dtrunc)(volatile double a) {
@@ -953,7 +970,7 @@
     uint32_t t0 = *systick_ptr;
     volatile double x = fmod(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST;
 }
 
 timer_func_def(ddrem)(volatile double a, volatile double b) {
@@ -965,7 +982,7 @@
     uint32_t t0 = *systick_ptr;
     volatile double x = drem(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST;
 #endif
 }
 
@@ -974,7 +991,7 @@
     uint32_t t0 = *systick_ptr;
     volatile double x = remainder(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST;
 }
 
 timer_func_def(dremquo)(volatile double a, volatile double b) {
@@ -983,7 +1000,7 @@
     uint32_t t0 = *systick_ptr;
     volatile double x = remquo(a, b, &c);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST;
 }
 
 timer_func_def(dexp2)(volatile double a) {
@@ -1047,7 +1064,7 @@
     uint32_t t0 = *systick_ptr;
     volatile double x = pow(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST;
 }
 
 timer_func_def(dcbrt)(volatile double a) {
@@ -1089,7 +1106,7 @@
     uint32_t t0 = *systick_ptr;
     volatile double x = hypot(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST - DOUBLE_OUTPUT_COST;
+    return cycle_diff(t0, t1) - DOUBLE_INPUT_COST * 2 - DOUBLE_OUTPUT_COST;
 }
 
 timer_func_def(dasin)(volatile double a) {
@@ -1184,8 +1201,8 @@
 }
 
 int main() {
-    stdio_init_all();
     init_systick();
+    stdio_init_all();
 #if PICO_C_COMPILER_IS_CLANG
     printf("================= Clang - ");
 #else
diff --git a/test/pico_float_test/float_benchmark.c b/test/pico_float_test/float_benchmark.c
index 73f8836..2e419a8 100644
--- a/test/pico_float_test/float_benchmark.c
+++ b/test/pico_float_test/float_benchmark.c
@@ -9,9 +9,17 @@
 #endif
 
 static void init_systick() {
+#ifdef __riscv
+    // Stop, clear, then restart the 64-bit RISC-V platform timer (mtime/mtimeh); it is the benchmark's time source, read via systick_value_ptr()
+    sio_hw->mtime_ctrl = 0;
+    sio_hw->mtime = 0;
+    sio_hw->mtimeh = 0;
+    sio_hw->mtime_ctrl = SIO_MTIME_CTRL_FULLSPEED_BITS | SIO_MTIME_CTRL_EN_BITS;
+#else
     systick_hw->csr = 0;
     systick_hw->rvr = ARM_CPU_PREFIXED(SYST_RVR_RELOAD_BITS);
     systick_hw->csr = ARM_CPU_PREFIXED(SYST_CSR_CLKSOURCE_BITS) | ARM_CPU_PREFIXED(SYST_CSR_ENABLE_BITS);
+#endif
 }
 
 // Stop the compiler from constant-folding a hardware base pointer into the
@@ -25,18 +33,22 @@
     __opaque_ptr; \
 })
 
-static __force_inline uint32_t systick_value() {
-    return systick_hw->cvr;
-}
-
 static __force_inline io_ro_32 *systick_value_ptr() {
+#ifdef __riscv
+    return &sio_hw->mtime; // only mtime is sampled (mtimeh is never read); NOTE(review): no __get_opaque_ptr() here, unlike the Arm path - confirm intended
+#else
     return __get_opaque_ptr(&systick_hw->cvr);
+#endif
 }
 
 static int cycle_diff(uint32_t systick1, uint32_t systick2) {
+#ifdef __riscv
+    return systick2 - systick1 - 1; // later minus earlier (operand order is reversed vs. the Arm path); -1 presumably offsets the second timer read, as on Arm
+#else
     static_assert(ARM_CPU_PREFIXED(SYST_CVR_CURRENT_LSB) == 0, "");
     uint32_t shift = 32 - ARM_CPU_PREFIXED(SYST_CVR_CURRENT_MSB);
     return (((int32_t)((systick1 << shift) - (systick2 << shift))) >> shift) - 1; // -1 since the second systick read costs one
+#endif
 }
 
 #define timer_func_def(name) static __noinline int __not_in_flash_func(time_##name)
@@ -156,8 +168,13 @@
 // #pragma message("EMITS_VFP = " __XSTRING(EMITS_VFP))
 // #pragma message("USING_HARD_FLOAT_ABI = " __XSTRING(USING_HARD_FLOAT_ABI))
 
+#ifdef __riscv
+#define LOAD_COST 1
+#define STORE_COST 1
+#else
 #define LOAD_COST 2
 #define STORE_COST 2
+#endif
 
 #define FLOAT_INPUT_COST LOAD_COST
 #define FLOAT_OUTPUT_COST STORE_COST
@@ -902,7 +919,7 @@
     uint32_t t0 = *systick_ptr;
     volatile float x = copysignf(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST;
 }
 
 timer_func_def(ftrunc)(volatile float a) {
@@ -942,7 +959,7 @@
     uint32_t t0 = *systick_ptr;
     volatile float x = fmodf(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST;
 }
 
 timer_func_def(fdrem)(volatile float a, volatile float b) {
@@ -954,7 +971,7 @@
     uint32_t t0 = *systick_ptr;
     volatile float x = dremf(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST;
 #endif
 }
 
@@ -963,7 +980,7 @@
     uint32_t t0 = *systick_ptr;
     volatile float x = remainderf(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST;
 }
 
 timer_func_def(fremquo)(volatile float a, volatile float b) {
@@ -972,7 +989,7 @@
     uint32_t t0 = *systick_ptr;
     volatile float x = remquof(a, b, &c);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST;
 }
 
 timer_func_def(fexp2)(volatile float a) {
@@ -1036,7 +1053,7 @@
     uint32_t t0 = *systick_ptr;
     volatile float x = powf(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST;
 }
 
 timer_func_def(fcbrt)(volatile float a) {
@@ -1068,7 +1085,7 @@
     uint32_t t0 = *systick_ptr;
     volatile float x = hypotf(a, b);
     uint32_t t1 = *systick_ptr;
-    return cycle_diff(t0, t1) - FLOAT_INPUT_COST - FLOAT_OUTPUT_COST;
+    return cycle_diff(t0, t1) - FLOAT_INPUT_COST * 2 - FLOAT_OUTPUT_COST;
 }
 
 timer_func_def(fasin)(volatile float a) {
@@ -1129,8 +1146,8 @@
 
 
 int main() {
-    stdio_init_all();
     init_systick();
+    stdio_init_all();
 #if PICO_C_COMPILER_IS_CLANG
     printf("================= Clang - ");
 #else