kernel: optimize ms-to-ticks for certain tick frequencies

Some tick frequencies lend themselves to optimized conversions from ms
to ticks and vice-versa.

- 1000Hz which does not need any conversion
- 500Hz, 250Hz, 125Hz where the division/multiplication are a straight
  shift since they are power-of-two factors of 1000.

In addition, some more generally used values are made to use optimized
conversion equations rather than the generic one that uses 64-bit math,
and often results in calling compiler intrinsics.

These values are: 100Hz, 50Hz, 25Hz, 20Hz, 10Hz, 1Hz (the last one used
in some testing).

Avoiding the 64-bit math intrinsics has the benefit, besides increased
performance, of using significantly less stack space: a saving of
52 bytes on ARM Cortex-M and 80 bytes on x86.

Change-Id: I080eb338a2637d6b1c6838c119af1a9fa37fe869
Signed-off-by: Benjamin Walsh <benjamin.walsh@windriver.com>
diff --git a/include/kernel.h b/include/kernel.h
index 9f67834..35b4510 100644
--- a/include/kernel.h
+++ b/include/kernel.h
@@ -32,6 +32,7 @@
 #include <misc/__assert.h>
 #include <misc/dlist.h>
 #include <misc/slist.h>
+#include <misc/util.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -634,14 +635,48 @@
  * @cond INTERNAL_HIDDEN
  */
 
+/* kernel clocks */
+
+#if	(sys_clock_ticks_per_sec == 1000) || \
+	(sys_clock_ticks_per_sec == 500)  || \
+	(sys_clock_ticks_per_sec == 250)  || \
+	(sys_clock_ticks_per_sec == 125)  || \
+	(sys_clock_ticks_per_sec == 100)  || \
+	(sys_clock_ticks_per_sec == 50)   || \
+	(sys_clock_ticks_per_sec == 25)   || \
+	(sys_clock_ticks_per_sec == 20)   || \
+	(sys_clock_ticks_per_sec == 10)   || \
+	(sys_clock_ticks_per_sec == 1)
+	/* tick rate has an optimized conversion: milliseconds per tick */
+	#define _ms_per_tick (MSEC_PER_SEC / sys_clock_ticks_per_sec)
+#else
+	/* any other rate: generic 64-bit conversion math, avoid if possible */
+	#define _NON_OPTIMIZED_TICKS_PER_SEC
+#endif
+/* ms to ticks: inline for optimized tick rates, a function call otherwise */
+#ifdef _NON_OPTIMIZED_TICKS_PER_SEC
+extern int32_t _ms_to_ticks(int32_t ms);
+#else
+static ALWAYS_INLINE int32_t _ms_to_ticks(int32_t ms)
+{
+	return (int32_t)ceiling_fraction((uint32_t)ms, _ms_per_tick);
+}
+#endif
+
 /* added tick needed to account for tick in progress */
 #define _TICK_ALIGN 1
 
-static int64_t __ticks_to_ms(int64_t ticks)
+static inline int64_t __ticks_to_ms(int64_t ticks)
 {
-#if CONFIG_SYS_CLOCK_EXISTS
+#ifdef CONFIG_SYS_CLOCK_EXISTS
+
+#ifdef _NON_OPTIMIZED_TICKS_PER_SEC
 	return (MSEC_PER_SEC * (uint64_t)ticks) / sys_clock_ticks_per_sec;
 #else
+	return (uint64_t)ticks * _ms_per_tick;
+#endif
+
+#else
 	__ASSERT(ticks == 0, "");
 	return 0;
 #endif
diff --git a/include/legacy.h b/include/legacy.h
index f3d525e..b4f4624 100644
--- a/include/legacy.h
+++ b/include/legacy.h
@@ -2892,10 +2892,6 @@
  */
 #define nano_task_stack_pop nano_stack_pop
 
-/* kernel clocks */
-
-extern int32_t _ms_to_ticks(int32_t ms);
-
 /**
  * @brief Return the current system tick count.
  *
diff --git a/kernel/Kconfig b/kernel/Kconfig
index 406420d..3e175ea 100644
--- a/kernel/Kconfig
+++ b/kernel/Kconfig
@@ -399,6 +399,30 @@
 	help
 	This option specifies the frequency of the system clock in Hz.
 
+	Depending on the choice made, an amount of possibly expensive math must
+	occur when converting ticks to milliseconds and vice-versa. Some values
+	are optimized, and yield significantly less math.
+
+	The optimal values from a computational point-of-view are 1000, 500,
+	250 and 125, since in these cases there is either no computation
+	required, or it is all done via bit-shifting. These also give a
+	granularity from 1ms to 8ms.
+
+	Other good values are 100, 50, 25, 20 and 10. In these cases, some math
+	is required but is minimized. These values also result in fewer clock
+	interrupts per second, at the cost of coarser granularity (10ms to
+	100ms).
+
+	All other values require some extensive 64-bit math, and in some
+	configurations even require calls to compiler built-in functions, and
+	can require a non-trivial extra amount of stack space (e.g. around 80
+	bytes on x86).
+
+	Using the legacy API also incurs an extra penalty, since when asking
+	for a timeout, a translation is made from ticks to milliseconds to call
+	the native kernel APIs, and then another translation is made back to
+	ticks, since the kernel is tick-based.
+
 config SYS_CLOCK_HW_CYCLES_PER_SEC
 	int "System clock's h/w timer frequency"
 	help
diff --git a/kernel/include/ksched.h b/kernel/include/ksched.h
index 008ae85..7cfc08d 100644
--- a/kernel/include/ksched.h
+++ b/kernel/include/ksched.h
@@ -31,7 +31,9 @@
 extern void _pend_current_thread(_wait_q_t *wait_q, int32_t timeout);
 extern void _move_thread_to_end_of_prio_q(struct k_thread *thread);
 extern int __must_switch_threads(void);
+#ifdef _NON_OPTIMIZED_TICKS_PER_SEC
 extern int32_t _ms_to_ticks(int32_t ms);
+#endif
 extern void idle(void *, void *, void *);
 
 /* find which one is the next thread to run */
diff --git a/kernel/sched.c b/kernel/sched.c
index 2755d81..c6674c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -19,6 +19,7 @@
 #include <atomic.h>
 #include <ksched.h>
 #include <wait_q.h>
+#include <misc/util.h>
 
 /* the only struct _kernel instance */
 struct _kernel _kernel = {0};
@@ -191,15 +192,14 @@
 
 /* convert milliseconds to ticks */
 
-#define ceiling(numerator, divider) \
-	(((numerator) + ((divider) - 1)) / (divider))
-
+#ifdef _NON_OPTIMIZED_TICKS_PER_SEC
 int32_t _ms_to_ticks(int32_t ms)
 {
 	int64_t ms_ticks_per_sec = (int64_t)ms * sys_clock_ticks_per_sec;
 
-	return (int32_t)ceiling(ms_ticks_per_sec, MSEC_PER_SEC);
+	return (int32_t)ceiling_fraction(ms_ticks_per_sec, MSEC_PER_SEC);
 }
+#endif
 
 /* pend the specified thread: it must *not* be in the ready queue */
 /* must be called with interrupts locked */
diff --git a/kernel/sys_clock.c b/kernel/sys_clock.c
index c8f28d7..26cea33 100644
--- a/kernel/sys_clock.c
+++ b/kernel/sys_clock.c
@@ -24,6 +24,12 @@
 #include <drivers/system_timer.h>
 
 #ifdef CONFIG_SYS_CLOCK_EXISTS
+#ifdef _NON_OPTIMIZED_TICKS_PER_SEC
+#warning "non-optimized system clock frequency chosen: performance may suffer"
+#endif
+#endif
+
+#ifdef CONFIG_SYS_CLOCK_EXISTS
 int sys_clock_us_per_tick = 1000000 / sys_clock_ticks_per_sec;
 int sys_clock_hw_cycles_per_tick =
 	CONFIG_SYS_CLOCK_HW_CYCLES_PER_SEC / sys_clock_ticks_per_sec;