kernel/sched: Rename/redocument wait_for_switch() -> z_sched_switch_spin()

This trick turns out to be needed by the abort/join code as well.
Promote it to a more formal-looking internal API and clean up the
documentation to (hopefully) clarify the exact behavior and better
explain the need.

This is one of the more... enchanted bits of the scheduler, and while
the trick is IMHO pretty clean, it remains a big SMP footgun.
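
For anyone meeting this trick for the first time, here is a toy,
self-contained model of the handshake.  All names in it (toy_thread,
toy_switch_spin, toy_publish_handle, saved_context) are hypothetical
and not the Zephyr sources; it only illustrates the ordering the
comments below describe: the outgoing CPU publishes the thread's
switch handle as the very last step of its context save, while the
incoming CPU, holding the scheduler lock, spins until that handle
appears before treating the thread as switched out.

  #include <stddef.h>

  struct toy_thread {
  	void *switch_handle;	/* NULL until the context save completes */
  };

  /* Incoming CPU: called with the scheduler lock held, on a thread
   * that is guaranteed to reach arch_switch() in small constant time,
   * so the spin is bounded.
   */
  static inline void toy_switch_spin(struct toy_thread *thread)
  {
  	void *volatile *shp = &thread->switch_handle;

  	while (*shp == NULL) {
  		/* busy-wait; a real kernel would use an arch relax hint */
  	}
  }

  /* Outgoing CPU: the tail of the context save.  Every earlier store
   * must be complete (and fenced on weakly ordered SMP hardware)
   * before this one, because other CPUs take it to mean "fully
   * switched out".
   */
  static inline void toy_publish_handle(struct toy_thread *thread,
  					void *saved_context)
  {
  	thread->switch_handle = saved_context;	/* must be the LAST write */
  }

The spin on the incoming side only terminates because of the property
the new kswap.h comment insists on: the target thread has already
released the scheduler lock and will reach arch_switch() in small
constant time without needing it again.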

Signed-off-by: Andy Ross <andyross@google.com>
diff --git a/arch/arc/include/swap_macros.h b/arch/arc/include/swap_macros.h
index 2328ce3..adb3b8c 100644
--- a/arch/arc/include/swap_macros.h
+++ b/arch/arc/include/swap_macros.h
@@ -416,11 +416,11 @@
 .macro _store_old_thread_callee_regs
 
 	_save_callee_saved_regs
-	/* Save old thread into switch handle which is required by wait_for_switch.
+	/* Save old thread into switch handle which is required by z_sched_switch_spin.
 	 * NOTE: we shouldn't save anything related to old thread context after this point!
 	 * TODO: we should add SMP write-after-write data memory barrier here, as we want all
 	 * previous writes completed before setting switch_handle which is polled by other cores
-	 * in wait_for_switch in case of SMP. Though it's not likely that this issue
+	 * in z_sched_switch_spin in case of SMP. Though it's not likely that this issue
 	 * will reproduce in real world as there is some gap before reading switch_handle and
 	 * reading rest of the data we've stored before.
 	 */
diff --git a/arch/arm64/core/switch.S b/arch/arm64/core/switch.S
index cb721ce..142103d 100644
--- a/arch/arm64/core/switch.S
+++ b/arch/arm64/core/switch.S
@@ -75,7 +75,7 @@
 #endif
 
 	/* save old thread into switch handle which is required by
-	 * wait_for_switch
+	 * z_sched_switch_spin()
 	 */
 	 str	x1, [x1, #___thread_t_switch_handle_OFFSET]
 
diff --git a/kernel/include/kswap.h b/kernel/include/kswap.h
index 767e979..354ef06 100644
--- a/kernel/include/kswap.h
+++ b/kernel/include/kswap.h
@@ -30,16 +30,30 @@
 /* context switching and scheduling-related routines */
 #ifdef CONFIG_USE_SWITCH
 
-/* There is an unavoidable SMP race when threads swap -- their thread
- * record is in the queue (and visible to other CPUs) before
- * arch_switch() finishes saving state.  We must spin for the switch
- * handle before entering a new thread.  See docs on arch_switch().
+/* Spin, with the scheduler lock held (!), on a thread that is known
+ * (!!) to have released the lock and be on a path where it will
+ * deterministically (!!!) reach arch_switch() in very small constant
+ * time.
+ *
+ * This exists to treat an unavoidable SMP race when threads swap --
+ * their thread record is in the queue (and visible to other CPUs)
+ * before arch_switch() finishes saving state.  We must spin for the
+ * switch handle before entering a new thread.  See docs on
+ * arch_switch().
+ *
+ * Stated differently: there's a chicken-and-egg problem with the
+ * question "is a thread running or not?".  The thread needs to mark
+ * itself "not running" from its own context, but at that moment it
+ * obviously is still running, right up until it reaches
+ * arch_switch()!  Locking can't solve this, because the scheduler
+ * lock can't be released by the switched-to thread, which is
+ * (obviously) busy running its own code and doesn't know it was
+ * switched out.
  *
  * Note: future SMP architectures may need a fence/barrier or cache
  * invalidation here.  Current ones don't, and sadly Zephyr doesn't
  * have a framework for that yet.
  */
-static inline void wait_for_switch(struct k_thread *thread)
+static inline void z_sched_switch_spin(struct k_thread *thread)
 {
 #ifdef CONFIG_SMP
 	volatile void **shp = (void *)&thread->switch_handle;
@@ -117,7 +131,7 @@
 		}
 #endif
 		z_thread_mark_switched_out();
-		wait_for_switch(new_thread);
+		z_sched_switch_spin(new_thread);
 		_current_cpu->current = new_thread;
 
 #ifdef CONFIG_TIMESLICING
@@ -131,10 +145,9 @@
 		arch_cohere_stacks(old_thread, NULL, new_thread);
 
 #ifdef CONFIG_SMP
-		/* Add _current back to the run queue HERE. After
-		 * wait_for_switch() we are guaranteed to reach the
-		 * context switch in finite time, avoiding a potential
-		 * deadlock.
+		/* Now add _current back to the run queue, once we are
+		 * guaranteed to reach the context switch in finite
+		 * time.  See z_sched_switch_spin().
 		 */
 		z_requeue_current(old_thread);
 #endif
@@ -178,6 +191,11 @@
 
 extern int arch_swap(unsigned int key);
 
+static inline void z_sched_switch_spin(struct k_thread *thread)
+{
+	ARG_UNUSED(thread);
+}
+
 static inline int z_swap_irqlock(unsigned int key)
 {
 	int ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index a13f94b..14134fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1095,7 +1095,7 @@
 
 		if (old_thread != new_thread) {
 			update_metairq_preempt(new_thread);
-			wait_for_switch(new_thread);
+			z_sched_switch_spin(new_thread);
 			arch_cohere_stacks(old_thread, interrupted, new_thread);
 
 			_current_cpu->swap_ok = 0;