Properly handle crashes/reboots during FabricTable commit (#20010) * Properly handle crashes/reboots during FabricTable commit - Since #19819, commits are very small and safer. There is less surface to fail during commit. The previous large-scale fail-safe behavior stored too much data, for too long and could cause larger reverts even if nothing was committed yet. FabricTable data no longer is ever persisted without commit. - The existing code deleted fabrics unwittingly when not required, such as when powering off a light during a fail-safe for an update when there was nothing committed yet, assuming we still committed immediately. This change: - Detects failed commits - Only deletes data on failed commits - Fixes Thread driver to detect stale data where a backup was done (since we cannot prevent internal commits from OpenThread) Testing done: - Added unit test to FabricTable to generate the condition - Did manual testing of all-clusters-app/chip-tool Linux that aborted on second commissioning, during commit. Found that cleanup occurred as expected on restart - Integration/cert testing (including Cirque that validates fail-safe) still pass * Restyled by clang-format * Restyled by gn * Clear commit token after deferred cleanup * Applied code review comments * Restyled by clang-format * Fix CI * Fix CI * Remove revert at restart of openthread after other changes * Apply review comments * Try to fix nRF CI * Another attempt to fix nrfconnect CI * Fix CI some more * Fix CI again Co-authored-by: Restyled.io <commits@restyled.io>

commit: b91727b6b04d006ffd751b90bf973756428daed8 [log] [tgz]
author: Tennessee Carmel-Veilleux <tennessee.carmelveilleux@gmail.com> Tue Jun 28 20:52:02 2022 -0400
committer: GitHub <noreply@github.com> Tue Jun 28 20:52:02 2022 -0400
tree: eab3646450b63e6bc716965ae1ab7d0a49a999a8
parent: d4bb6ee22dffa398a565b5f301f1682151997c12 [diff] [blame]
diff --git a/src/app/server/Server.cpp b/src/app/server/Server.cpp
index 0ea320d..8ea1db7 100644
--- a/src/app/server/Server.cpp
+++ b/src/app/server/Server.cpp

@@ -169,9 +169,6 @@
     // This initializes clusters, so should come after lower level initialization.
     InitDataModelHandler(&mExchangeMgr);
 
-    // Clean-up previously standing fail-safes
-    InitFailSafe();
-
     // Init transport before operations with secure session mgr.
     err = mTransports.Init(UdpListenParameters(DeviceLayer::UDPEndPointManager())
                                .SetAddressType(IPAddressType::kIPv6)
@@ -257,7 +254,6 @@
     else
     {
 #if CHIP_DEVICE_CONFIG_ENABLE_PAIRING_AUTOSTART
-        GetFabricTable().DeleteAllFabrics();
         SuccessOrExit(err = mCommissioningWindowManager.OpenBasicCommissioningWindow());
 #endif
     }
@@ -300,6 +296,41 @@
     RejoinExistingMulticastGroups();
 #endif // !CHIP_SYSTEM_CONFIG_USE_OPEN_THREAD_ENDPOINT
 
+    // Handle deferred clean-up of a previously armed fail-safe that occurred during FabricTable commit.
+    // This is done at the very end since at the earlier time above when FabricTable::Init() is called,
+    // the delegates could not have been registered, and the other systems were not initialized. By now,
+    // everything is initialized, so we can do a deferred clean-up.
+    {
+        FabricIndex fabricIndexDeletedOnInit = GetFabricTable().GetDeletedFabricFromCommitMarker();
+        if (fabricIndexDeletedOnInit != kUndefinedFabricIndex)
+        {
+            ChipLogError(AppServer, "FabricIndex 0x%x deleted due to restart while fail-safed. Processing a clean-up!",
+                         static_cast<unsigned>(fabricIndexDeletedOnInit));
+
+            // Always pretend it was an add, since being in the middle of an update currently breaks
+            // the validity of the fabric table. This is expected to be extremely infrequent, so
+            // this "harsher" than usual clean-up is more likely to get us in a valid state for whatever
+            // remains.
+            const bool addNocCalled    = true;
+            const bool updateNocCalled = false;
+            GetFailSafeContext().ScheduleFailSafeCleanup(fabricIndexDeletedOnInit, addNocCalled, updateNocCalled);
+
+            // Schedule clearing of the commit marker to only occur after we have processed all fail-safe clean-up.
+            // Because Matter runs a single event loop for all scheduled work, it will occur after the above has
+            // taken place. If a reset occurs before we have cleaned everything up, the next boot will still
+            // see the commit marker.
+            PlatformMgr().ScheduleWork(
+                [](intptr_t arg) {
+                    Server * server = reinterpret_cast<Server *>(arg);
+                    VerifyOrReturn(server != nullptr);
+
+                    server->GetFabricTable().ClearCommitMarker();
+                    ChipLogProgress(AppServer, "Cleared FabricTable pending commit marker");
+                },
+                reinterpret_cast<intptr_t>(this));
+        }
+    }
+
     PlatformMgr().HandleServerStarted();
 
 exit:
@@ -315,39 +346,6 @@
     return err;
 }
 
-void Server::InitFailSafe()
-{
-    bool failSafeArmed = false;
-
-    CHIP_ERROR err = CHIP_NO_ERROR;
-
-    // If the fail-safe was armed when the device last shutdown, initiate cleanup based on the pending Fail Safe Context with
-    // which the fail-safe timer was armed.
-    if (DeviceLayer::ConfigurationMgr().GetFailSafeArmed(failSafeArmed) == CHIP_NO_ERROR && failSafeArmed)
-    {
-        FabricIndex fabricIndex;
-        bool addNocCommandInvoked;
-        bool updateNocCommandInvoked;
-
-        ChipLogProgress(AppServer, "Detected fail-safe armed on reboot");
-
-        err = DeviceLayer::FailSafeContext::LoadFromStorage(fabricIndex, addNocCommandInvoked, updateNocCommandInvoked);
-        if (err == CHIP_NO_ERROR)
-        {
-            DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext().ScheduleFailSafeCleanup(
-                fabricIndex, addNocCommandInvoked, updateNocCommandInvoked);
-        }
-        else
-        {
-            // This should not happen, but we should not fail system init based on it!
-            ChipLogError(DeviceLayer, "Failed to load fail-safe context from storage (err= %" CHIP_ERROR_FORMAT "), cleaning-up!",
-                         err.Format());
-            (void) DeviceLayer::ConfigurationMgr().SetFailSafeArmed(false);
-            err = CHIP_NO_ERROR;
-        }
-    }
-}
-
 void Server::RejoinExistingMulticastGroups()
 {
     ChipLogProgress(AppServer, "Joining Multicast groups");
commit	b91727b6b04d006ffd751b90bf973756428daed8	[log] [tgz]
author	Tennessee Carmel-Veilleux <tennessee.carmelveilleux@gmail.com>	Tue Jun 28 20:52:02 2022 -0400
committer	GitHub <noreply@github.com>	Tue Jun 28 20:52:02 2022 -0400
tree	eab3646450b63e6bc716965ae1ab7d0a49a999a8
parent	d4bb6ee22dffa398a565b5f301f1682151997c12 [diff] [blame]