portable: aarch64_sre: Add support for vApplicationFPUSafeIRQHandler

The application writer needs to name their IRQ handler as:
1. vApplicationIRQHandler if the IRQ handler does not use FPU registers.
2. vApplicationFPUSafeIRQHandler is the IRQ handler uses FPU registers.

When the application uses vApplicationFPUSafeIRQHandler, a default
implementation of vApplicationIRQHandler is used which stores FPU
registers and then calls vApplicationFPUSafeIRQHandler.

Note that recent versions of GCC may use FP/SIMD registers to optimize 16-bytes
copy and especially when using va_start()/va_arg() functions (e.g printing some thing
in IRQ handlers may trigger usage of FPU registers)

This implementation is heavily inspired by both the ARM_CA9 port and the ARM_CRx_No_GIC
port done in [1]

[1] https://github.com/FreeRTOS/FreeRTOS-Kernel/pull/1113

Signed-off-by: Marouen Ghodhbane <marouen.ghodhbane@nxp.com>
diff --git a/portable/GCC/ARM_AARCH64_SRE/port.c b/portable/GCC/ARM_AARCH64_SRE/port.c
index ffc34d7..ab9290d 100644
--- a/portable/GCC/ARM_AARCH64_SRE/port.c
+++ b/portable/GCC/ARM_AARCH64_SRE/port.c
@@ -133,6 +133,27 @@
  */
 extern void vPortRestoreTaskContext( void );
 
+/*
+ * If the application provides an implementation of vApplicationIRQHandler(),
+ * then it will get called directly without saving the FPU registers on
+ * interrupt entry, and this weak implementation of
+ * vApplicationFPUSafeIRQHandler() is just provided to remove linkage errors -
+ * it should never actually get called so its implementation contains a
+ * call to configASSERT() that will always fail.
+ *
+ * If the application provides its own implementation of
+ * vApplicationFPUSafeIRQHandler() then the implementation of
+ * vApplicationIRQHandler() provided in portASM.S will save the FPU registers
+ * before calling it.
+ *
+ * Therefore, if the application writer wants FPU registers to be saved on
+ * interrupt entry their IRQ handler must be called
+ * vApplicationFPUSafeIRQHandler(), and if the application writer does not want
+ * FPU registers to be saved on interrupt entry their IRQ handler must be
+ * called vApplicationIRQHandler().
+ */
+void vApplicationFPUSafeIRQHandler( uint32_t ulICCIAR ) __attribute__((weak) );
+
 /*-----------------------------------------------------------*/
 
 /* A variable is used to keep track of the critical section nesting.  This
@@ -495,3 +516,9 @@
 
 #endif /* configASSERT_DEFINED */
 /*-----------------------------------------------------------*/
+
+void vApplicationFPUSafeIRQHandler( uint32_t ulICCIAR )
+{
+    ( void ) ulICCIAR;
+    configASSERT( ( volatile void * ) NULL );
+}
diff --git a/portable/GCC/ARM_AARCH64_SRE/portASM.S b/portable/GCC/ARM_AARCH64_SRE/portASM.S
index 8d69b2a..f1f59cd 100644
--- a/portable/GCC/ARM_AARCH64_SRE/portASM.S
+++ b/portable/GCC/ARM_AARCH64_SRE/portASM.S
@@ -414,8 +414,82 @@
 
     ERET
 
+/******************************************************************************
+ * If the application provides an implementation of vApplicationIRQHandler(),
+ * then it will get called directly without saving the FPU registers on
+ * interrupt entry, and this weak implementation of
+ * vApplicationIRQHandler() will not get called.
+ *
+ * If the application provides its own implementation of
+ * vApplicationFPUSafeIRQHandler() then this implementation of
+ * vApplicationIRQHandler() will be called, save the FPU registers, and then
+ * call vApplicationFPUSafeIRQHandler().
+ *
+ * Therefore, if the application writer wants FPU registers to be saved on
+ * interrupt entry their IRQ handler must be called
+ * vApplicationFPUSafeIRQHandler(), and if the application writer does not want
+ * FPU registers to be saved on interrupt entry their IRQ handler must be
+ * called vApplicationIRQHandler().
+ *****************************************************************************/
 
+.align 8
+.weak vApplicationIRQHandler
+.type vApplicationIRQHandler, %function
+vApplicationIRQHandler:
+    /* Save LR and FP on the stack */
+    STP     X29, X30, [SP, #-0x10]!
 
+    /* Save FPU registers (32 128-bits + 2 64-bits configuration and status registers) */
+    STP     Q0, Q1, [SP,#-0x20]!
+    STP     Q2, Q3, [SP,#-0x20]!
+    STP     Q4, Q5, [SP,#-0x20]!
+    STP     Q6, Q7, [SP,#-0x20]!
+    STP     Q8, Q9, [SP,#-0x20]!
+    STP     Q10, Q11, [SP,#-0x20]!
+    STP     Q12, Q13, [SP,#-0x20]!
+    STP     Q14, Q15, [SP,#-0x20]!
+    STP     Q16, Q17, [SP,#-0x20]!
+    STP     Q18, Q19, [SP,#-0x20]!
+    STP     Q20, Q21, [SP,#-0x20]!
+    STP     Q22, Q23, [SP,#-0x20]!
+    STP     Q24, Q25, [SP,#-0x20]!
+    STP     Q26, Q27, [SP,#-0x20]!
+    STP     Q28, Q29, [SP,#-0x20]!
+    STP     Q30, Q31, [SP,#-0x20]!
+
+    /* Even though upper 32 bits of FPSR and FPCR are reserved, save and restore the whole 64 bits to keep 16-byte SP alignement. */
+    MRS     X9, FPSR
+    MRS     X10, FPCR
+    STP     X9, X10, [SP, #-0x10]!
+
+    /* Call the C handler. */
+    BL vApplicationFPUSafeIRQHandler
+
+    /* Restore FPU registers */
+
+    LDP     X9, X10, [SP], #0x10
+    LDP     Q30, Q31, [SP], #0x20
+    LDP     Q28, Q29, [SP], #0x20
+    LDP     Q26, Q27, [SP], #0x20
+    LDP     Q24, Q25, [SP], #0x20
+    LDP     Q22, Q23, [SP], #0x20
+    LDP     Q20, Q21, [SP], #0x20
+    LDP     Q18, Q19, [SP], #0x20
+    LDP     Q16, Q17, [SP], #0x20
+    LDP     Q14, Q15, [SP], #0x20
+    LDP     Q12, Q13, [SP], #0x20
+    LDP     Q10, Q11, [SP], #0x20
+    LDP     Q8, Q9, [SP], #0x20
+    LDP     Q6, Q7, [SP], #0x20
+    LDP     Q4, Q5, [SP], #0x20
+    LDP     Q2, Q3, [SP], #0x20
+    LDP     Q0, Q1, [SP], #0x20
+    MSR     FPSR, X9
+    MSR     FPCR, X10
+
+    /* Restore FP and LR */
+    LDP     X29, X30, [SP], #0x10
+    RET
 
 .align 8
 pxCurrentTCBConst: .dword pxCurrentTCB