Add C11 standard atomic support (#1645)

* Add runtime support for stdatomics

* Fix lock calculation and enable atomic_flag support
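
A minimal usage sketch (illustrative only; assumes the application links pico_stdlib and
pico_multicore, with pico_atomic pulled in automatically via pico_runtime):

    #include <stdatomic.h>
    #include <stdio.h>
    #include "pico/stdlib.h"
    #include "pico/multicore.h"

    static atomic_uint counter;                  // shared by both cores

    static void core1_entry(void) {
        for (int i = 0; i < 1000; i++)
            atomic_fetch_add(&counter, 1);       // lowered to __atomic_fetch_add_4()
    }

    int main(void) {
        stdio_init_all();
        multicore_launch_core1(core1_entry);
        for (int i = 0; i < 1000; i++)
            atomic_fetch_add(&counter, 1);       // safe against core 1's updates
        sleep_ms(100);                           // crude wait for core 1 to finish
        printf("counter = %u\n", atomic_load(&counter));   // expect 2000
    }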
diff --git a/src/rp2_common/CMakeLists.txt b/src/rp2_common/CMakeLists.txt
index 82f56f8..19c52c1 100644
--- a/src/rp2_common/CMakeLists.txt
+++ b/src/rp2_common/CMakeLists.txt
@@ -51,6 +51,7 @@
     pico_add_subdirectory(pico_malloc)
     pico_add_subdirectory(pico_printf)
     pico_add_subdirectory(pico_rand)
+    pico_add_subdirectory(pico_atomic)
 
     pico_add_subdirectory(pico_stdio)
     pico_add_subdirectory(pico_stdio_semihosting)
diff --git a/src/rp2_common/pico_atomic/CMakeLists.txt b/src/rp2_common/pico_atomic/CMakeLists.txt
new file mode 100644
index 0000000..5670430
--- /dev/null
+++ b/src/rp2_common/pico_atomic/CMakeLists.txt
@@ -0,0 +1,11 @@
+if (NOT TARGET pico_atomic)
+    pico_add_library(pico_atomic)
+
+    target_sources(pico_atomic INTERFACE
+            ${CMAKE_CURRENT_LIST_DIR}/pico_atomic.c
+            )
+
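+    # This include directory is searched before the toolchain's, so the wrapper
+    # stdatomic.h here shadows the system header (which it then pulls in via #include_next)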
+    target_include_directories(pico_atomic_headers INTERFACE ${CMAKE_CURRENT_LIST_DIR}/include)
+
+    target_link_libraries(pico_atomic INTERFACE pico_sync)
+endif()
diff --git a/src/rp2_common/pico_atomic/include/stdatomic.h b/src/rp2_common/pico_atomic/include/stdatomic.h
new file mode 100644
index 0000000..071286f
--- /dev/null
+++ b/src/rp2_common/pico_atomic/include/stdatomic.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ * Copyright (c) 2024 Stephen Street (stephen@redrocketcomputing.com).
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef __STDATOMIC_H
+#define __STDATOMIC_H
+
+#include_next <stdatomic.h>
+
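+/*
+ * ARMv6-M (Cortex-M0+) has no exclusive load/store instructions, so the compiler
+ * cannot inline the atomic_flag operations; redirect them to the lock-based
+ * helpers provided by pico_atomic.c.
+ */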
+#undef atomic_flag_test_and_set
+#undef atomic_flag_test_and_set_explicit
+#undef atomic_flag_clear
+#undef atomic_flag_clear_explicit
+
+extern _Bool __atomic_test_and_set_m0(volatile void *mem, int model);
+extern void __atomic_clear_m0 (volatile void *mem, int model);
+
+#define atomic_flag_test_and_set(PTR) __atomic_test_and_set_m0((PTR), __ATOMIC_SEQ_CST)
+#define atomic_flag_test_and_set_explicit(PTR, MO) __atomic_test_and_set_m0((PTR), (MO))
+
+#define atomic_flag_clear(PTR) __atomic_clear_m0((PTR), __ATOMIC_SEQ_CST)
+#define atomic_flag_clear_explicit(PTR, MO) __atomic_clear_m0((PTR), (MO))
+
+#endif
diff --git a/src/rp2_common/pico_atomic/pico_atomic.c b/src/rp2_common/pico_atomic/pico_atomic.c
new file mode 100644
index 0000000..8dd6b96
--- /dev/null
+++ b/src/rp2_common/pico_atomic/pico_atomic.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2024 Raspberry Pi (Trading) Ltd.
+ * Copyright (c) 2024 Stephen Street (stephen@redrocketcomputing.com).
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "hardware/address_mapped.h"
+#include "hardware/regs/watchdog.h"
+#include "hardware/sync.h"
+
+#include "pico/config.h"
+
+#ifndef __optimize
+#define __optimize __attribute__((optimize("-Os")))
+#endif
+
+/* Must be powers of 2 */
+#define ATOMIC_STRIPE 4UL
+#define ATOMIC_LOCKS 16UL
+#define ATOMIC_LOCK_WIDTH 2UL
+#define ATOMIC_LOCK_IDX_Pos ((sizeof(unsigned long) * 8) - (__builtin_clz(ATOMIC_STRIPE - 1)))
+#define ATOMIC_LOCK_IDX_Msk (ATOMIC_LOCKS - 1UL)
+#define ATOMIC_LOCK_REG ((io_rw_32 *)(WATCHDOG_BASE + WATCHDOG_SCRATCH3_OFFSET))
+
+static void __atomic_init(void) {
+    *ATOMIC_LOCK_REG = 0;
+}
+
+/* .preinit_array entries are function pointers walked by the runtime, so register a pointer to the init function rather than placing the function body in the section */
+static __used __attribute__((section(".preinit_array.00030"))) void (*__atomic_init_ptr)(void) = &__atomic_init;
+
+/*
+   To avoid interfering with existing hardware spinlock usage and to reduce multicore contention on
+   distinct atomic variables, we use one of the watchdog scratch registers (WATCHDOG_SCRATCH3) to
+   implement 16 two-bit multicore locks via a variation of Dekker's algorithm
+   (see https://en.wikipedia.org/wiki/Dekker%27s_algorithm). The lock is selected as a
+   function of the variable's address and the stripe width, which hashes variable
+   addresses to lock numbers.
+*/
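+/*
+   Worked example (illustrative address only): with ATOMIC_STRIPE = 4 the shift
+   ATOMIC_LOCK_IDX_Pos evaluates to 2, so an atomic variable at 0x20001238 maps to
+   lock_idx = (0x20001238 >> 2) & 0xF = 14, lock_pos = 28 and lock_mask = 0x30000000;
+   core 0 claims the lock by setting bit 28 and core 1 by setting bit 29.
+*/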
+static __optimize uint32_t __atomic_lock(volatile void *mem) {
+    const uint32_t core = get_core_num();
+    const uint32_t lock_idx = (((uintptr_t)mem) >> ATOMIC_LOCK_IDX_Pos) & ATOMIC_LOCK_IDX_Msk;
+    const uint32_t lock_pos = lock_idx * ATOMIC_LOCK_WIDTH;
+    const uint32_t lock_mask = ((1UL << ATOMIC_LOCK_WIDTH) - 1) << lock_pos;
+    const uint32_t locked_mask = 1UL << (lock_pos + core);
+
+    uint32_t state = save_and_disable_interrupts();
+    while (true) {
+
+        /* First set the bit */
+        hw_set_bits(ATOMIC_LOCK_REG, locked_mask);
+        __dmb();
+
+        /* Did we get the lock? */
+        if ((*ATOMIC_LOCK_REG & lock_mask) == locked_mask)
+            break;
+
+        /* Nope, clear our side */
+        __dmb();
+        hw_clear_bits(ATOMIC_LOCK_REG, locked_mask);
+
+        /* Break any ties if the cores are running in lock step (is this really required?) */
+        for (uint32_t i = core * 2; i > 0; --i)
+            asm volatile ("nop");
+    }
+
+    return state;
+}
+
+static __optimize void __atomic_unlock(volatile void *mem, uint32_t state) {
+    const uint32_t lock_idx = (((uintptr_t)mem) >> ATOMIC_LOCK_IDX_Pos) & ATOMIC_LOCK_IDX_Msk;
+    const uint32_t lock_pos = lock_idx * ATOMIC_LOCK_WIDTH;
+    const uint32_t locked_mask = 1UL << (lock_pos + get_core_num());
+
+    __dmb();
+    hw_clear_bits(ATOMIC_LOCK_REG, locked_mask);
+    restore_interrupts(state);
+}
+
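+/*
+   Note: 1-, 2- and 4-byte atomic loads and stores are single-copy atomic on the M0+ and
+   are emitted inline by the compiler, so only the read-modify-write operations, the
+   compare-exchanges and the 8-byte accesses need the out-of-line helpers below.
+*/
+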
+__optimize uint8_t __atomic_fetch_add_1(volatile void *mem, uint8_t val, __unused int model) {
+    volatile uint8_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint8_t result = *ptr;
+    *ptr += val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint8_t __atomic_fetch_sub_1(volatile void *mem, uint8_t val, __unused int model) {
+    volatile uint8_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint8_t result = *ptr;
+    *ptr -= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint8_t __atomic_fetch_and_1(volatile void *mem, uint8_t val, __unused int model) {
+    volatile uint8_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint8_t result = *ptr;
+    *ptr &= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint8_t __atomic_fetch_or_1(volatile void *mem, uint8_t val, __unused int model) {
+    volatile uint8_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint8_t result = *ptr;
+    *ptr |= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint8_t __atomic_exchange_1(volatile void *mem, uint8_t val, __unused int model) {
+    volatile uint8_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint8_t result = *ptr;
+    *ptr = val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize bool __atomic_compare_exchange_1(volatile void *mem, void *expected, uint8_t desired, __unused bool weak, __unused int success, __unused int failure) {
+    bool result = false;
+    volatile uint8_t *ptr = mem;
+    uint8_t *e_ptr = expected;
+    uint32_t state = __atomic_lock(mem);
+    if (*ptr == *e_ptr) {
+        *ptr = desired;
+        result = true;
+    } else
+        *e_ptr = *ptr;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint16_t __atomic_fetch_add_2(volatile void *mem, uint16_t val, __unused int model) {
+    volatile uint16_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint16_t result = *ptr;
+    *ptr += val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint16_t __atomic_fetch_sub_2(volatile void *mem, uint16_t val, __unused int model) {
+    volatile uint16_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint16_t result = *ptr;
+    *ptr -= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint16_t __atomic_fetch_and_2(volatile void *mem, uint16_t val, __unused int model) {
+    volatile uint16_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint16_t result = *ptr;
+    *ptr &= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint16_t __atomic_fetch_or_2(volatile void *mem, uint16_t val, __unused int model) {
+    volatile uint16_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint16_t result = *ptr;
+    *ptr |= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint16_t __atomic_exchange_2(volatile void *mem, uint16_t val, __unused int model) {
+    volatile uint16_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint16_t result = *ptr;
+    *ptr = val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize bool __atomic_compare_exchange_2(volatile void *mem, void *expected, uint16_t desired, __unused bool weak, __unused int success, __unused int failure) {
+    bool result = false;
+    volatile uint16_t *ptr = mem;
+    uint16_t *e_ptr = expected;
+    uint32_t state = __atomic_lock(mem);
+    if (*ptr == *e_ptr) {
+        *ptr = desired;
+        result = true;
+    } else
+        *e_ptr = *ptr;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint32_t __atomic_fetch_add_4(volatile void *mem, uint32_t val, __unused int model) {
+    volatile uint32_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint32_t result = *ptr;
+    *ptr += val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint32_t __atomic_fetch_sub_4(volatile void *mem, uint32_t val, __unused int model) {
+    volatile uint32_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint32_t result = *ptr;
+    *ptr -= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint32_t __atomic_fetch_and_4(volatile void *mem, uint32_t val, __unused int model) {
+    volatile uint32_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint32_t result = *ptr;
+    *ptr &= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint32_t __atomic_fetch_or_4(volatile void *mem, uint32_t val, __unused int model) {
+    volatile uint32_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint32_t result = *ptr;
+    *ptr |= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint32_t __atomic_exchange_4(volatile void *mem, uint32_t val, __unused int model) {
+    volatile uint32_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint32_t result = *ptr;
+    *ptr = val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize bool __atomic_compare_exchange_4(volatile void *mem, void *expected, uint32_t desired, __unused bool weak, __unused int success, __unused int failure) {
+    bool result = false;
+    volatile uint32_t *ptr = mem;
+    uint32_t *e_ptr = expected;
+    uint32_t state = __atomic_lock(mem);
+    if (*ptr == *e_ptr) {
+        *ptr = desired;
+        result = true;
+    } else
+        *e_ptr = *ptr;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint64_t __atomic_fetch_add_8(volatile void *mem, uint64_t val, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint64_t result = *ptr;
+    *ptr += val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint64_t __atomic_fetch_sub_8(volatile void *mem, uint64_t val, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint64_t result = *ptr;
+    *ptr -= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint64_t __atomic_fetch_and_8(volatile void *mem, uint64_t val, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint64_t result = *ptr;
+    *ptr &= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint64_t __atomic_fetch_or_8(volatile void *mem, uint64_t val, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint64_t result = *ptr;
+    *ptr |= val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint64_t __atomic_exchange_8(volatile void *mem, uint64_t val, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint64_t result = *ptr;
+    *ptr = val;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize bool __atomic_compare_exchange_8(volatile void *mem, void *expected, uint64_t desired, __unused bool weak, __unused int success, __unused int failure) {
+    bool result = false;
+    volatile uint64_t *ptr = mem;
+    uint64_t *e_ptr = expected;
+    uint32_t state = __atomic_lock(mem);
+    if (*ptr == *e_ptr) {
+        *ptr = desired;
+        result = true;
+    } else
+        *e_ptr = *ptr;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize uint64_t __atomic_load_8(volatile void *mem, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    uint64_t result = *ptr;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
+__optimize void __atomic_store_8(volatile void *mem, uint64_t val, __unused int model) {
+    volatile uint64_t *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    *ptr = val;
+    __atomic_unlock(mem, state);
+}
+
+__optimize bool __atomic_test_and_set_m0(volatile void *mem, __unused int model) {
+    volatile bool *ptr = mem;
+    uint32_t state = __atomic_lock(mem);
+    bool result = *ptr;
+    *ptr = true;
+    __atomic_unlock(mem, state);
+    return result;
+}
+
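+/* Clearing the flag is a plain aligned byte store, which is naturally atomic on
+   the M0+, so no lock is taken; the trailing barrier orders the clear against
+   subsequent memory accesses. */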
+__optimize void __atomic_clear_m0(volatile void *mem, __unused int model) {
+    volatile bool *ptr = mem;
+    *ptr = false;
+    __dmb();
+}
diff --git a/src/rp2_common/pico_runtime/CMakeLists.txt b/src/rp2_common/pico_runtime/CMakeLists.txt
index 9879ba1..791bf8f 100644
--- a/src/rp2_common/pico_runtime/CMakeLists.txt
+++ b/src/rp2_common/pico_runtime/CMakeLists.txt
@@ -35,6 +35,9 @@
 if (TARGET pico_mem_ops)
     pico_mirrored_target_link_libraries(pico_runtime INTERFACE pico_mem_ops)
 endif()
+if (TARGET pico_atomic)
+    pico_mirrored_target_link_libraries(pico_runtime INTERFACE pico_atomic)
+endif()
 if (TARGET pico_standard_link)
     pico_mirrored_target_link_libraries(pico_runtime INTERFACE pico_standard_link)
 endif()