Fix TLS and stack alignment when using picolibc (#637)

Both the TLS block and stack must be correctly aligned when using
picolibc. The architecture stack alignment is represented by the
portBYTE_ALIGNMENT_MASK and the TLS block alignment is provided by the
Picolibc _tls_align() inline function for Picolibc version 1.8 and
above. For older versions of Picolibc, we'll assume that the TLS block
requires the same alignment as the stack.

For downward growing stacks, this requires aligning the start of the
TLS block to the maximum of the stack alignment and the TLS
alignment. With this, both the TLS block and stack will now be
correctly aligned.

For upward growing stacks, the two areas must be aligned
independently; the TLS block is aligned from the start of the stack,
then the TLS space is allocated, and then the stack is aligned above
that.

It's probably useful to know here that the linker ensures that
variables within the TLS block are assigned offsets that match their
alignment requirements. If the TLS block itself is correctly aligned,
then everything within will also be.

I have only tested the downward growing stack branch of this patch.

Signed-off-by: Keith Packard <keithpac@amazon.com>
Co-authored-by: Keith Packard <keithpac@amazon.com>
Co-authored-by: Gaurav-Aggarwal-AWS <33462878+aggarg@users.noreply.github.com>
diff --git a/include/picolibc-freertos.h b/include/picolibc-freertos.h
index 472d71e..467f7a9 100644
--- a/include/picolibc-freertos.h
+++ b/include/picolibc-freertos.h
@@ -43,22 +43,43 @@
 
 #define configTLS_BLOCK_TYPE               void *
 
+#define picolibcTLS_SIZE                   ( ( portPOINTER_SIZE_TYPE ) _tls_size() )
+#define picolibcSTACK_ALIGNMENT_MASK       ( ( portPOINTER_SIZE_TYPE ) portBYTE_ALIGNMENT_MASK )
+
+#if ( __PICOLIBC_MAJOR__ > 1 ) || ( ( __PICOLIBC_MAJOR__ == 1 ) && ( __PICOLIBC_MINOR__ >= 8 ) )
+
+/* Picolibc 1.8 and newer (major > 1, or major 1 with minor >= 8) provide
+ * the TLS alignment via the _tls_align() inline; the mask is that value - 1 */
+    #define picolibcTLS_ALIGNMENT_MASK    ( ( portPOINTER_SIZE_TYPE ) ( _tls_align() - 1 ) )
+#else
+
+/* For older Picolibc versions, use the general port alignment value */
+    #define picolibcTLS_ALIGNMENT_MASK    ( ( portPOINTER_SIZE_TYPE ) portBYTE_ALIGNMENT_MASK )
+#endif
+
 /* Allocate thread local storage block off the end of the
 * stack. The _tls_size() function returns the size (in
 * bytes) of the total TLS area used by the application */
 #if ( portSTACK_GROWTH < 0 )
-    #define configINIT_TLS_BLOCK( xTLSBlock, pxTopOfStack )                                            \
-    do {                                                                                               \
-        pxTopOfStack = ( StackType_t * ) ( ( ( portPOINTER_SIZE_TYPE ) pxTopOfStack ) - _tls_size() ); \
-        xTLSBlock = pxTopOfStack;                                                                      \
-        _init_tls( xTLSBlock );                                                                        \
+
+    #define configINIT_TLS_BLOCK( xTLSBlock, pxTopOfStack )                             \
+    do { /* Stack grows down: TLS block below pxTopOfStack, TLS- and stack-aligned. */  \
+        pxTopOfStack = ( StackType_t * ) ( ( ( ( portPOINTER_SIZE_TYPE ) pxTopOfStack ) \
+                                             - picolibcTLS_SIZE ) & ~                   \
+                                           configMAX( picolibcSTACK_ALIGNMENT_MASK,     \
+                                                      picolibcTLS_ALIGNMENT_MASK ) );   \
+        xTLSBlock = pxTopOfStack;                                                       \
+        _init_tls( xTLSBlock );                                                         \
     } while( 0 )
 #else /* portSTACK_GROWTH */
-    #define configINIT_TLS_BLOCK( xTLSBlock, pxTopOfStack )                                            \
-    do {                                                                                               \
-        xTLSBlock = pxTopOfStack;                                                                      \
-        pxTopOfStack = ( StackType_t * ) ( ( ( portPOINTER_SIZE_TYPE ) pxTopOfStack ) + _tls_size() ); \
-        _init_tls( xTLSBlock );                                                                        \
+    #define configINIT_TLS_BLOCK( xTLSBlock, pxTopOfStack )                                          \
+    do { /* Upward-growing stack: TLS block first, TLS-aligned; stack starts aligned above it. */    \
+        xTLSBlock = ( void * ) ( ( ( portPOINTER_SIZE_TYPE ) pxTopOfStack +                          \
+                                   picolibcTLS_ALIGNMENT_MASK ) & ~picolibcTLS_ALIGNMENT_MASK );     \
+        pxTopOfStack = ( StackType_t * ) ( ( ( ( ( portPOINTER_SIZE_TYPE ) xTLSBlock ) +             \
+                                               picolibcTLS_SIZE ) + picolibcSTACK_ALIGNMENT_MASK ) & \
+                                           ~picolibcSTACK_ALIGNMENT_MASK );                          \
+        _init_tls( xTLSBlock );                                                                      \
     } while( 0 )
 #endif /* portSTACK_GROWTH */