Reduce stack use for certain arithmetic operations.

commit: cab2682e695a0d5a187e6e4e6b6518a1f8fc3875 [log] [tgz]
author: Dmitri Prime <dmitri+github.com@xn13.com> Sat Oct 03 18:28:08 2020 -0700
committer: Dmitri Prime <dmitri+github.com@xn13.com> Sat Oct 03 18:28:08 2020 -0700
tree: 0bb1b9cec74e5ecc6d65dc921bc5203b709acdc0
parent: a1d6fcaf6d74a11c5ec05ebc8a4edb7f9903843e [diff] [blame]
diff --git a/runtime/cpp/emboss_arithmetic.h b/runtime/cpp/emboss_arithmetic.h
index dd570ba..9bb209a 100644
--- a/runtime/cpp/emboss_arithmetic.h
+++ b/runtime/cpp/emboss_arithmetic.h

@@ -49,12 +49,37 @@
 // can be used directly for each operation, but casting may be required in
 // between operations.
 
+
+// AllKnown(...) returns true if all of its arguments are Known().  The base
+// case is no arguments.
 inline constexpr bool AllKnown() { return true; }
 
-template <typename T, typename... RestT>
-inline constexpr bool AllKnown(T value, RestT... rest) {
-  return value.Known() && AllKnown(rest...);
-}
+// The rest of AllKnown() could be:
+//
+// template <typename T, typename... RestT>
+// inline constexpr bool AllKnown(T v, RestT... rest) {
+//   return v.Known() && AllKnown(rest...);
+// }
+//
+// ... unfortunately, some compilers do not optimize this well, and it ends
+// up using linear stack space instead of constant stack space; for complex
+// structs on systems with limited stack (such as typical microcontrollers),
+// this can cause methods like Ok() to blow the stack.
+//
+// The C++14 solution would be to use a std::initializer_list and iterate over
+// the arguments.  Unfortunately, C++11 std::initializer_list is not
+// constexpr, and C++11 constexpr does not allow iteration.
+//
+// Instead, for "small" numbers of arguments (up to 64, at time of writing), we
+// have generated overloads of the form:
+//
+// template <typename T0, ... typename TN>
+// inline constexpr bool AllKnown(T0 v0, ... TN vN) {
+//   return v0.Known() && ... && vN.Known();
+// }
+//
+// This reduces stack frames by ~64x.
+#  include "emboss_arithmetic_all_known_generated.inc"
 
 // MaybeDo implements the logic of checking for known values, unwrapping the
 // known values, passing the unwrapped values to OperatorT, and then rewrapping
@@ -150,26 +175,68 @@
 // MaximumOperation is a bit more complex, in order to handle the variable
 // number of parameters.
 struct MaximumOperation {
+  // Maximum of 1 element is just itself.
   template <typename T>
   static inline constexpr T Do(T arg) {
-    // Base case for recursive template.
     return arg;
   }
 
-  // Ideally, this would only use template<typename T>, but C++11 requires a
-  // full variadic template or C-style variadic function in order to accept a
-  // variable number of arguments.  C-style variadic functions have no intrinsic
-  // way of figuring out how many arguments they receive, so we have to use a
-  // variadic template.
+  // The rest of MaximumOperation::Do could be:
   //
-  // The static_assert ensures that all arguments are actually the same type.
-  template <typename T1, typename T2, typename... T>
-  static inline constexpr T1 Do(T1 l, T2 r, T... rest) {
+  // template <typename T, typename... RestT>
+  // static inline constexpr T Do(T v0, T v1, RestT... rest) {
+  //   return Do(v0 < v1 ? v1 : v0, rest...);
+  // }
+  //
+  // ... unfortunately, some compilers do not optimize this well, and it ends
+  // up using linear stack space instead of constant stack space; for complex
+  // structs on systems with limited stack (such as typical microcontrollers),
+  // this can cause methods like Ok() to blow the stack.
+  //
+  // The C++14 solution would be to use a std::initializer_list and iterate over
+  // the arguments.  Unfortunately, C++11 std::initializer_list is not
+  // constexpr, and C++11 constexpr does not allow iteration.
+  //
+  // Instead, we have a small number of hand-written overloads and a large
+  // number (61, at time of writing) of generated overloads, which use O(lg(N))
+  // stack for "small" numbers of arguments (128 or fewer, at time of writing),
+  // and O(N) stack for more arguments, but with a much, much smaller constant
+  // multiplier: one additional stack frame per 64 arguments, instead of one
+  // per argument.
+
+  // Maximum of 2-4 elements are special-cased.
+  template <typename T>
+  static inline constexpr T Do(T v0, T v1) {
     // C++11 std::max is not constexpr, so we can't just call it.
-    static_assert(::std::is_same<T1, T2>::value,
-                  "Expected Do to be called with a proper intermediate type.");
-    return Do(l < r ? r : l, rest...);
+    return v0 < v1 ? v1 : v0;
   }
+
+  template <typename T>
+  static inline constexpr T Do(T v0, T v1, T v2) {
+    return Do(v0 < v1 ? v1 : v0, v2);
+  }
+
+  template <typename T>
+  static inline constexpr T Do(T v0, T v1, T v2, T v3) {
+    return Do(v0 < v1 ? v1 : v0, v2 < v3 ? v3 : v2);
+  }
+
+  // The remaining overloads (5+ arguments) are generated by a script and
+  // #included, so that they do not clutter the hand-written code.
+  //
+  // They are of the form:
+  //
+  // template <typename T>
+  // static inline constexpr Do(T v0, ... T vN, T vN_plus_1, ... T v2N) {
+  //   return Do(Do(v0, ... vN), Do(vN_plus_1, ... v2N));
+  // }
+  //
+  // In each case, they cut their argument lists in half, calling Do(Do(first
+  // half), Do(second half)).
+  //
+  // Note that, if there are enough arguments, this still falls back onto
+  // linear-stack-space recursion.
+#  include "emboss_arithmetic_maximum_operation_generated.inc"
 };
 
 //// Special operations, where either un-Known() operands do not always result
commit	cab2682e695a0d5a187e6e4e6b6518a1f8fc3875	[log] [tgz]
author	Dmitri Prime <dmitri+github.com@xn13.com>	Sat Oct 03 18:28:08 2020 -0700
committer	Dmitri Prime <dmitri+github.com@xn13.com>	Sat Oct 03 18:28:08 2020 -0700
tree	0bb1b9cec74e5ecc6d65dc921bc5203b709acdc0
parent	a1d6fcaf6d74a11c5ec05ebc8a4edb7f9903843e [diff] [blame]