Run each component in a subshell and handle errors more robustly

This commit completely rewrites keep-going mode. Instead of relying
solely on "set -e", which has some subtle limitations (such as being
off anywhere inside a conditional), use an ERR trap to record errors.

Run each component in a subshell. This way a component can set
environment variables, change the current directory, etc., without
affecting other components.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh
index 20a20a3..f3494bd 100755
--- a/tests/scripts/all.sh
+++ b/tests/scripts/all.sh
@@ -59,6 +59,15 @@
 # This script must be invoked from the toplevel directory of a git
 # working copy of Mbed TLS.
 #
+# The behavior on an error depends on whether --keep-going (alias -k)
+# is in effect.
+#  * Without --keep-going: the script stops on the first error without
+#    cleaning up. This lets you work in the configuration of the failing
+#    component.
+#  * With --keep-going: the script runs all requested components and
+#    reports failures at the end. In particular the script always cleans
+#    up on exit.
+#
 # Note that the output is not saved. You may want to run
 #   script -c tests/scripts/all.sh
 # or
@@ -81,6 +90,9 @@
 #
 # Each component must start by invoking `msg` with a short informative message.
 #
+# Each component is executed in a separate shell process. The component
+# fails if any command in it returns a non-zero status.
+#
 # The framework performs some cleanup tasks after each component. This
 # means that components can assume that the working directory is in a
 # cleaned-up state, and don't need to perform the cleanup themselves.
@@ -91,19 +103,6 @@
 #   `tests/Makefile` and `programs/fuzz/Makefile` from git.
 #   This cleans up after an in-tree use of CMake.
 #
-# Any command that is expected to fail must be protected so that the
-# script keeps running in --keep-going mode despite `set -e`. In keep-going
-# mode, if a protected command fails, this is logged as a failure and the
-# script will exit with a failure status once it has run all components.
-# Commands can be protected in any of the following ways:
-# * `make` is a function which runs the `make` command with protection.
-#   Note that you must write `make VAR=value`, not `VAR=value make`,
-#   because the `VAR=value make` syntax doesn't work with functions.
-# * Put `report_status` before the command to protect it.
-# * Put `if_build_successful` before a command. This protects it, and
-#   additionally skips it if a prior invocation of `make` in the same
-#   component failed.
-#
 # The tests are roughly in order from fastest to slowest. This doesn't
 # have to be exact, but in general you should add slower tests towards
 # the end and fast checks near the beginning.
@@ -477,8 +476,9 @@
 }
 
 pre_setup_keep_going () {
-    failure_summary=
-    failure_count=0
+    failure_count=0 # Number of failed components
+    last_failure_status=0 # Last failure status in this component
+
     start_red=
     end_color=
     if [ -t 1 ]; then
@@ -489,57 +489,73 @@
                 ;;
         esac
     fi
-    record_status () {
-        if "$@"; then
-            last_status=0
-        else
-            last_status=$?
-            text="$current_section: $* -> $last_status"
-            failure_summary="$failure_summary
-$text"
-            failure_count=$((failure_count + 1))
-            echo "${start_red}^^^^$text^^^^${end_color}" >&2
-        fi
-    }
-    make () {
-        case "$*" in
-            *test|*check)
-                if [ $build_status -eq 0 ]; then
-                    record_status command make "$@"
-                else
-                    echo "(skipped because the build failed)"
-                fi
-                ;;
-            *)
-                record_status command make "$@"
-                build_status=$last_status
-                ;;
+
+    # Keep a summary of failures in a file. We'll print it out at the end.
+    failure_summary_file=$PWD/all-sh-failures-$$.log
+    : >"$failure_summary_file"
+
+    # Whether it makes sense to keep a component going after the specified
+    # command fails (test command) or not (configure or build).
+    # This doesn't have to be 100% accurate: all failures are recorded anyway.
+    can_keep_going_after_failure () {
+        case "$1" in
+            "msg "*) false;;
+            *[!A-Za-z]"test"|*[!A-Za-z]"test"[!A-Za-z]*) true;;
+            "tests/"*) true;;
+            "grep "*|"not grep "*) true;;
+            *) false;;
         esac
     }
+
+    # This function runs if there is any error in a component.
+    # It must either exit with a nonzero status, or set
+    # last_failure_status to a nonzero value.
+    err_trap () {
+        # Save $? (status of the failing command). This must be the very
+        # first thing, before $? is overridden.
+        last_failure_status=$?
+        failed_command=$BASH_COMMAND
+
+        text="$current_section: $failed_command -> $last_failure_status"
+        echo "${start_red}^^^^$text^^^^${end_color}" >&2
+        echo "$text" >>"$failure_summary_file"
+
+        # If the command is fatal (configure or build command), stop this
+        # component. Otherwise (test command) keep the component running
+        # (run more tests from the same build).
+        if ! can_keep_going_after_failure "$failed_command"; then
+            exit $last_failure_status
+        fi
+    }
+
     final_report () {
         if [ $failure_count -gt 0 ]; then
             echo
             echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
-            echo "${start_red}FAILED: $failure_count${end_color}$failure_summary"
+            echo "${start_red}FAILED: $failure_count components${end_color}"
+            cat "$failure_summary_file"
             echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
-            exit 1
         elif [ -z "${1-}" ]; then
             echo "SUCCESS :)"
         fi
         if [ -n "${1-}" ]; then
             echo "Killed by SIG$1."
         fi
+        rm -f "$failure_summary_file"
+        if [ $failure_count -gt 0 ]; then
+            exit 1
+        fi
     }
 }
 
-if_build_succeeded () {
-    if [ $build_status -eq 0 ]; then
-        record_status "$@"
-    fi
+# These functions are kept temporarily for backward compatibility.
+# Don't use them in new components.
+record_status () {
+    "$@"
 }
-
-# to be used instead of ! for commands run with
-# record_status or if_build_succeeded
+if_build_succeeded () {
+    "$@"
+}
 not() {
     ! "$@"
 }
@@ -2667,12 +2683,35 @@
     # have messed it up or shortened it.
     redirect_err dd if=/dev/urandom of=./tests/seedfile bs=64 count=1
 
-    # Run the component code.
-    if [ $QUIET -eq 1 ]; then
-        # msg() is silenced, so just print the component name here
-        echo "${current_component#component_}"
+    # Run the component in a subshell
+    if [ $KEEP_GOING -eq 1 ]; then
+        set +e
     fi
-    redirect_out "$@"
+    (
+        if [ $QUIET -eq 1 ]; then
+            # msg() will be silenced, so just print the component name here.
+            echo "${current_component#component_}"
+            exec >/dev/null
+        fi
+        if [ $KEEP_GOING -eq 1 ]; then
+            # Keep "set -e" off, and run an ERR trap instead to record failures.
+            set -E
+            trap err_trap ERR
+        fi
+        # The next line is what runs the component
+        "$@"
+        if [ $KEEP_GOING -eq 1 ]; then
+            trap - ERR
+            exit $last_failure_status
+        fi
+    )
+    component_status=$?
+    if [ $KEEP_GOING -eq 1 ]; then
+        set -e
+        if [ $component_status -ne 0 ]; then
+            failure_count=$((failure_count + 1))
+        fi
+    fi
 
     # Restore the build tree to a clean state.
     cleanup
@@ -2689,10 +2728,6 @@
 build_status=0
 if [ $KEEP_GOING -eq 1 ]; then
     pre_setup_keep_going
-else
-    record_status () {
-        "$@"
-    }
 fi
 pre_setup_quiet_redirect
 pre_prepare_outcome_file