kernel: handle thread self-aborts on idle thread
Fixes races where threads on another CPU join the exiting
thread: the exiting thread could still be running on its
original CPU when the joiners wake up on a different CPU.
Fixes problems where the thread object was still in use by the
kernel when fn_abort() was called, which prevented the thread
object from being recycled or freed back to a slab pool.
Fixes a race where a thread is aborted from one CPU while it
self-aborts on another CPU, which was previously worked
around with a busy-wait.
Precedent for doing this comes from FreeRTOS, which also
performs final thread cleanup in the idle thread.
Some logic in z_thread_single_abort() has been rearranged such
that once we release sched_spinlock, the kernel never
dereferences the thread object pointer again; join waiters or
the fn_abort() logic may free it immediately.
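
For illustration, the ordering constraint now looks like this
(a minimal sketch; sched_spinlock and fn_abort are real symbols
from this patch, everything else is abridged):

    void (*fn_abort)(void) = NULL;
    k_spinlock_key_t key = k_spin_lock(&sched_spinlock);

    /* ...all remaining kernel uses of 'thread' happen here,
     * including copying out the callback pointer...
     */
    fn_abort = thread->fn_abort;

    k_spin_unlock(&sched_spinlock, key);

    /* 'thread' may already have been freed or recycled by a
     * join waiter; only the local copy may be used from here
     */
    if (fn_abort != NULL) {
        fn_abort();
    }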
An assertion has been added to z_thread_single_abort() to
ensure it is never called with thread == _current outside of an ISR.
Logic has been added to ensure the cleanup work in
z_thread_single_abort() runs at most once.
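
Roughly, the self-abort hand-off works as sketched below;
pending_abort and the next_up() check are from this patch,
while z_self_abort() and the idle-side snippet are illustrative
assumptions about the surrounding code:

    /* Exiting thread (illustrative): it cannot free resources
     * it is still running on, so it records itself and swaps
     * away for the last time
     */
    void z_self_abort(void)
    {
        _current_cpu->pending_abort = _current;
        z_swap_unlocked(); /* does not return */
    }

    /* Idle thread (illustrative): next_up() guarantees it runs
     * first on this CPU, so the cleanup completes before any
     * other thread can be scheduled here
     */
    struct k_thread *victim = _current_cpu->pending_abort;

    if (victim != NULL) {
        _current_cpu->pending_abort = NULL;
        z_thread_single_abort(victim);
    }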
Fixes: #26486
Related to: #23063 #23062
Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
diff --git a/kernel/sched.c b/kernel/sched.c
index 0d1c12a..b9c7e3b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -182,7 +182,16 @@
static ALWAYS_INLINE struct k_thread *next_up(void)
{
- struct k_thread *thread = _priq_run_best(&_kernel.ready_q.runq);
+ struct k_thread *thread;
+
+ /* If a thread self-aborted we need the idle thread to clean it up
+ * before any other thread can run on this CPU
+ */
+ if (_current_cpu->pending_abort != NULL) {
+ return _current_cpu->idle_thread;
+ }
+
+ thread = _priq_run_best(&_kernel.ready_q.runq);
#if (CONFIG_NUM_METAIRQ_PRIORITIES > 0) && (CONFIG_NUM_COOP_PRIORITIES > 0)
/* MetaIRQs must always attempt to return back to a
@@ -366,7 +375,7 @@
!is_preempt(_current)) {
/* Record new preemption */
_current_cpu->metairq_preempted = _current;
- } else if (!is_metairq(thread)) {
+ } else if (!is_metairq(thread) && !z_is_idle_thread_object(thread)) {
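+ /* The idle-thread check excludes the case where next_up()
+ * diverted to the idle thread for pending abort cleanup;
+ * that is not a genuine return from the preemption, so the
+ * record must be kept
+ */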
/* Returning from existing preemption */
_current_cpu->metairq_preempted = NULL;
}
@@ -497,12 +506,25 @@
void z_thread_single_abort(struct k_thread *thread)
{
+ void (*fn_abort)(void) = NULL;
+
__ASSERT(!(thread->base.user_options & K_ESSENTIAL),
"essential thread aborted");
+ __ASSERT(thread != _current || arch_is_in_isr(),
+ "self-abort detected");
- if (thread->fn_abort != NULL) {
- thread->fn_abort();
+ /* Prevent any of the further logic in this function from running more
+ * than once
+ */
+ k_spinlock_key_t key = k_spin_lock(&sched_spinlock);
+ if ((thread->base.thread_state &
+ (_THREAD_ABORTING | _THREAD_DEAD)) != 0) {
+ LOG_DBG("Thread %p already dead or on the way out", thread);
+ k_spin_unlock(&sched_spinlock, key);
+ return;
}
+ thread->base.thread_state |= _THREAD_ABORTING;
+ k_spin_unlock(&sched_spinlock, key);
(void)z_abort_thread_timeout(thread);
@@ -511,6 +533,7 @@
}
LOCKED(&sched_spinlock) {
+ LOG_DBG("Cleanup aborting thread %p", thread);
struct k_thread *waiter;
if (z_is_thread_ready(thread)) {
@@ -529,37 +552,6 @@
}
}
- uint32_t mask = _THREAD_DEAD;
-
- /* If the abort is happening in interrupt context,
- * that means that execution will never return to the
- * thread's stack and that the abort is known to be
- * complete. Otherwise the thread still runs a bit
- * until it can swap, requiring a delay.
- */
- if (IS_ENABLED(CONFIG_SMP) && arch_is_in_isr()) {
- mask |= _THREAD_ABORTED_IN_ISR;
- }
-
- thread->base.thread_state |= mask;
-
-#ifdef CONFIG_USERSPACE
- /* Clear initialized state so that this thread object may be
- * re-used and triggers errors if API calls are made on it from
- * user threads
- *
- * For a whole host of reasons this is not ideal and should be
- * iterated on.
- */
- z_object_uninit(thread->stack_obj);
- z_object_uninit(thread);
-
- /* Revoke permissions on thread's ID so that it may be
- * recycled
- */
- z_thread_perms_all_clear(thread);
-#endif
-
/* Wake everybody up who was trying to join with this thread.
* A reschedule is invoked later by k_thread_abort().
*/
@@ -572,8 +564,49 @@
arch_thread_return_value_set(waiter, 0);
ready_thread(waiter);
}
+
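+ /* This cleanup is running on the idle thread (a deferred
+ * self-abort); refresh the scheduler's cached next-thread
+ * selection now that the abort is being resolved
+ */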
+ if (z_is_idle_thread_object(_current)) {
+ update_cache(1);
+ }
+
+ thread->base.thread_state |= _THREAD_DEAD;
+
+ /* Read this here from the thread struct now instead of
+ * after we unlock
+ */
+ fn_abort = thread->fn_abort;
+
+ /* Keep inside the spinlock as these may use the contents
+ * of the thread object. As soon as we release this spinlock,
+ * the thread object could be destroyed at any time.
+ */
sys_trace_thread_abort(thread);
z_thread_monitor_exit(thread);
+
+#ifdef CONFIG_USERSPACE
+ /* Revoke permissions on thread's ID so that it may be
+ * recycled
+ */
+ z_thread_perms_all_clear(thread);
+
+ /* Clear initialized state so that this thread object may be
+ * re-used and triggers errors if API calls are made on it from
+ * user threads
+ */
+ z_object_uninit(thread->stack_obj);
+ z_object_uninit(thread);
+#endif
+ /* Kernel should never look at the thread object again past
+ * this point unless another thread API is called. If the
+ * object doesn't get corrupted, we'll catch other
+ * k_thread_abort()s on this object, although this is
+ * somewhat undefined behavior. It must be safe to call
+ * k_thread_create() or free the object at this point.
+ */
+ }
+
+ if (fn_abort != NULL) {
+ fn_abort();
}
}
@@ -1343,9 +1376,6 @@
* it locally. Not all architectures support that, alas. If
* we don't have it, we need to wait for some other interrupt.
*/
- key = k_spin_lock(&sched_spinlock);
- thread->base.thread_state |= _THREAD_ABORTING;
- k_spin_unlock(&sched_spinlock, key);
#ifdef CONFIG_SCHED_IPI_SUPPORTED
arch_sched_ipi();
#endif
@@ -1369,16 +1399,6 @@
k_busy_wait(100);
}
}
-
- /* If the thread self-aborted (e.g. its own exit raced with
- * this external abort) then even though it is flagged DEAD,
- * it's still running until its next swap and thus the thread
- * object is still in use. We have to resort to a fallback
- * delay in that circumstance.
- */
- if ((thread->base.thread_state & _THREAD_ABORTED_IN_ISR) == 0U) {
- k_busy_wait(THREAD_ABORT_DELAY_US);
- }
}
#endif