Add a timer to track whether we have received BDX init after a query … (#24777)

* Add a timer to track whether we have received BDX init after a query image was successful

- Currently if an ota requester has a successful query image and an image it available but
  if for any reason, the ota requester doesn't send BDX init, the provider will be stuck until
  we reboot the resident. In order to have a fail safe, we are adding a timer that starts after
  query image returns image available and waits for a BDX init to come. In case BDX init doesn't
  come, it times out and resets state

- Add code to reset state if any API fails on the provider once we prepare for BDX transfer

- Stop polling when BDX transfer reset is called

- Return QueryImageResponse status busy instead of general failure if the sdk is busy and
  gets a sexond query image so accessory can handle the error correctly and retry until the
  sdk is done

- When we send a message, handle the case where if an error happens in sending the message
  close the exchange context and reset state. If the message is a status report and it
  succeeds, null out the exchange and reset state

* Update src/darwin/Framework/CHIP/MTROTAProviderDelegateBridge.mm

Co-authored-by: Boris Zbarsky <bzbarsky@apple.com>

* Update src/darwin/Framework/CHIP/MTROTAProviderDelegateBridge.mm

Co-authored-by: Boris Zbarsky <bzbarsky@apple.com>

* Reset mStopPolling to false when we call PrepareForTransfer to start polling again

- This covers the use case where we have an image available but the requestor rejects
  the transfer for some reason. On the next query image the requestor is able to
  continue with the BDX transfer. The provider state was not being reset for this scenario
  due to the polling flag not being reset.

- Fixes #23573

* Addressed review comments

* Fix the handling of the use case when hasUpdate is false we need to send the response as is

- Log the error if the BDX timer fails to start

---------

Co-authored-by: Boris Zbarsky <bzbarsky@apple.com>
diff --git a/src/darwin/Framework/CHIP/MTROTAProviderDelegateBridge.mm b/src/darwin/Framework/CHIP/MTROTAProviderDelegateBridge.mm
index 8592a64..3cd88ab 100644
--- a/src/darwin/Framework/CHIP/MTROTAProviderDelegateBridge.mm
+++ b/src/darwin/Framework/CHIP/MTROTAProviderDelegateBridge.mm
@@ -42,6 +42,14 @@
 // TODO Expose a method onto the delegate to make that configurable.
 constexpr uint32_t kMaxBdxBlockSize = 1024;
 constexpr uint32_t kMaxBDXURILen = 256;
+
+// Since the BDX timeout is 5 minutes and we are starting this after query image is available and before the BDX init comes,
+// we just double the timeout to give enough time for the BDX init to come in a reasonable amount of time.
+constexpr System::Clock::Timeout kBdxInitReceivedTimeout = System::Clock::Seconds16(10 * 60);
+
+// Time in seconds after which the requestor should retry calling query image if busy status is receieved
+constexpr uint32_t kDelayedActionTimeSeconds = 120;
+
 constexpr System::Clock::Timeout kBdxTimeout = System::Clock::Seconds16(5 * 60); // OTA Spec mandates >= 5 minutes
 constexpr System::Clock::Timeout kBdxPollIntervalMs = System::Clock::Milliseconds32(50);
 constexpr bdx::TransferRole kBdxRole = bdx::TransferRole::kSender;
@@ -89,13 +97,12 @@
         VerifyOrReturnError(mExchangeMgr != nullptr, CHIP_ERROR_INCORRECT_STATE);
 
         mExchangeMgr->UnregisterUnsolicitedMessageHandlerForProtocol(Protocols::BDX::Id);
+        ResetState();
 
         mExchangeMgr = nullptr;
         mSystemLayer = nullptr;
         mDelegateNotificationQueue = nil;
 
-        ResetState();
-
         return CHIP_NO_ERROR;
     }
 
@@ -120,7 +127,39 @@
         }
     }
 
+    void ResetState()
+    {
+        assertChipStackLockedByCurrentThread();
+        if (mSystemLayer) {
+            mSystemLayer->CancelTimer(HandleBdxInitReceivedTimeoutExpired, this);
+        }
+        // TODO: Check if this can be removed. It seems like we can close the exchange context and reset transfer regardless.
+        if (!mInitialized) {
+            return;
+        }
+        Responder::ResetTransfer();
+        ++mTransferGeneration;
+        mFabricIndex.ClearValue();
+        mNodeId.ClearValue();
+
+        if (mExchangeCtx != nullptr) {
+            mExchangeCtx->Close();
+            mExchangeCtx = nullptr;
+        }
+
+        mInitialized = false;
+    }
+
 private:
+    /**
+     * Timer callback called when we don't receive a BDX init within a reasonable time after a successful QueryImage response.
+     */
+    static void HandleBdxInitReceivedTimeoutExpired(chip::System::Layer * systemLayer, void * state)
+    {
+        VerifyOrReturn(state != nullptr);
+        static_cast<BdxOTASender *>(state)->ResetState();
+    }
+
     CHIP_ERROR OnMessageToSend(TransferSession::OutputEvent & event)
     {
         assertChipStackLockedByCurrentThread();
@@ -137,12 +176,41 @@
         }
 
         auto & msgTypeData = event.msgTypeData;
-        return mExchangeCtx->SendMessage(msgTypeData.ProtocolId, msgTypeData.MessageType, std::move(event.MsgData), sendFlags);
+        // If there's an error sending the message, close the exchange and call ResetState.
+        // TODO: If we can remove the !mInitialized check in ResetState(), just calling ResetState() will suffice here.
+        CHIP_ERROR err
+            = mExchangeCtx->SendMessage(msgTypeData.ProtocolId, msgTypeData.MessageType, std::move(event.MsgData), sendFlags);
+        if (err != CHIP_NO_ERROR) {
+            mExchangeCtx->Close();
+            mExchangeCtx = nullptr;
+            ResetState();
+        } else if (event.msgTypeData.HasMessageType(Protocols::SecureChannel::MsgType::StatusReport)) {
+            // If the send was successful for a status report, since we are not expecting a response the exchange context is
+            // already closed. We need to null out the reference to avoid having a dangling pointer.
+            mExchangeCtx = nullptr;
+            ResetState();
+        }
+        return err;
+    }
+
+    bdx::StatusCode GetBdxStatusCodeFromChipError(CHIP_ERROR err)
+    {
+        if (err == CHIP_ERROR_INCORRECT_STATE) {
+            return bdx::StatusCode::kUnexpectedMessage;
+        }
+        if (err == CHIP_ERROR_INVALID_ARGUMENT) {
+            return bdx::StatusCode::kBadMessageContents;
+        }
+        return bdx::StatusCode::kUnknown;
     }
 
     CHIP_ERROR OnTransferSessionBegin(TransferSession::OutputEvent & event)
     {
         assertChipStackLockedByCurrentThread();
+        // Once we receive the BDX init, cancel the BDX Init timeout and start the BDX session
+        if (mSystemLayer) {
+            mSystemLayer->CancelTimer(HandleBdxInitReceivedTimeoutExpired, this);
+        }
 
         VerifyOrReturnError(mFabricIndex.HasValue(), CHIP_ERROR_INCORRECT_STATE);
         VerifyOrReturnError(mNodeId.HasValue(), CHIP_ERROR_INCORRECT_STATE);
@@ -169,8 +237,9 @@
                     }
 
                     if (error != nil) {
-                        LogErrorOnFailure([MTRError errorToCHIPErrorCode:error]);
-                        LogErrorOnFailure(mTransfer.AbortTransfer(bdx::StatusCode::kUnknown));
+                        CHIP_ERROR err = [MTRError errorToCHIPErrorCode:error];
+                        LogErrorOnFailure(err);
+                        LogErrorOnFailure(mTransfer.AbortTransfer(GetBdxStatusCodeFromChipError(err)));
                         return;
                     }
 
@@ -328,6 +397,9 @@
         switch (event.EventType) {
         case TransferSession::OutputEventType::kInitReceived:
             err = OnTransferSessionBegin(event);
+            if (err != CHIP_NO_ERROR) {
+                LogErrorOnFailure(mTransfer.AbortTransfer(GetBdxStatusCodeFromChipError(err)));
+            }
             break;
         case TransferSession::OutputEventType::kStatusReceived:
             ChipLogError(BDX, "Got StatusReport %x", static_cast<uint16_t>(event.statusData.statusCode));
@@ -370,6 +442,14 @@
             ResetState();
         }
 
+        // Start a timer to track whether we receive a BDX init after a successful query image in a reasonable amount of time
+        CHIP_ERROR err = mSystemLayer->StartTimer(kBdxInitReceivedTimeout, HandleBdxInitReceivedTimeoutExpired, this);
+        LogErrorOnFailure(err);
+
+        // The caller of this method maps CHIP_ERROR to specific BDX Status Codes (see GetBdxStatusCodeFromChipError)
+        // and those are used by the BDX session to prepare the status report.
+        VerifyOrReturnError(err == CHIP_NO_ERROR, CHIP_ERROR_INCORRECT_STATE);
+
         mFabricIndex.SetValue(fabricIndex);
         mNodeId.SetValue(nodeId);
 
@@ -378,27 +458,6 @@
         return CHIP_NO_ERROR;
     }
 
-    void ResetState()
-    {
-        assertChipStackLockedByCurrentThread();
-
-        if (!mInitialized) {
-            return;
-        }
-
-        Responder::ResetTransfer();
-        ++mTransferGeneration;
-        mFabricIndex.ClearValue();
-        mNodeId.ClearValue();
-
-        if (mExchangeCtx != nullptr) {
-            mExchangeCtx->Close();
-            mExchangeCtx = nullptr;
-        }
-
-        mInitialized = false;
-    }
-
     bool mInitialized = false;
     Optional<FabricIndex> mFabricIndex;
     Optional<NodeId> mNodeId;
@@ -549,63 +608,81 @@
     __block CommandHandler::Handle handle(commandObj);
     __block ConcreteCommandPath cachedCommandPath(commandPath.mEndpointId, commandPath.mClusterId, commandPath.mCommandId);
 
-    auto completionHandler
-        = ^(MTROTASoftwareUpdateProviderClusterQueryImageResponseParams * _Nullable data, NSError * _Nullable error) {
-              [controller
-                  asyncDispatchToMatterQueue:^() {
-                      assertChipStackLockedByCurrentThread();
+    auto completionHandler = ^(
+        MTROTASoftwareUpdateProviderClusterQueryImageResponseParams * _Nullable data, NSError * _Nullable error) {
+        [controller
+            asyncDispatchToMatterQueue:^() {
+                assertChipStackLockedByCurrentThread();
 
-                      CommandHandler * handler = EnsureValidState(handle, cachedCommandPath, "QueryImage", data, error);
-                      VerifyOrReturn(handler != nullptr);
+                CommandHandler * handler = EnsureValidState(handle, cachedCommandPath, "QueryImage", data, error);
+                VerifyOrReturn(handler != nullptr);
 
-                      ChipLogDetail(Controller, "QueryImage: application responded with: %s",
-                          [[data description] cStringUsingEncoding:NSUTF8StringEncoding]);
+                ChipLogDetail(Controller, "QueryImage: application responded with: %s",
+                    [[data description] cStringUsingEncoding:NSUTF8StringEncoding]);
 
-                      Commands::QueryImageResponse::Type response;
-                      ConvertFromQueryImageResponseParms(data, response);
+                Commands::QueryImageResponse::Type response;
+                ConvertFromQueryImageResponseParms(data, response);
 
-                      auto hasUpdate = [data.status isEqual:@(MTROtaSoftwareUpdateProviderOTAQueryStatusUpdateAvailable)];
-                      auto isBDXProtocolSupported = [commandParams.protocolsSupported
-                          containsObject:@(MTROtaSoftwareUpdateProviderOTADownloadProtocolBDXSynchronous)];
+                auto hasUpdate = [data.status isEqual:@(MTROtaSoftwareUpdateProviderOTAQueryStatusUpdateAvailable)];
+                auto isBDXProtocolSupported = [commandParams.protocolsSupported
+                    containsObject:@(MTROtaSoftwareUpdateProviderOTADownloadProtocolBDXSynchronous)];
 
-                      if (hasUpdate && isBDXProtocolSupported) {
-                          auto fabricIndex = handler->GetSubjectDescriptor().fabricIndex;
-                          auto nodeId = handler->GetSubjectDescriptor().subject;
-                          CHIP_ERROR err = gOtaSender.PrepareForTransfer(fabricIndex, nodeId);
-                          if (CHIP_NO_ERROR != err) {
-                              LogErrorOnFailure(err);
-                              handler->AddStatus(cachedCommandPath, Protocols::InteractionModel::Status::Failure);
-                              handle.Release();
-                              return;
-                          }
+                if (hasUpdate && isBDXProtocolSupported) {
+                    auto fabricIndex = handler->GetSubjectDescriptor().fabricIndex;
+                    auto nodeId = handler->GetSubjectDescriptor().subject;
+                    CHIP_ERROR err = gOtaSender.PrepareForTransfer(fabricIndex, nodeId);
+                    if (CHIP_NO_ERROR != err) {
+                        LogErrorOnFailure(err);
+                        if (err == CHIP_ERROR_BUSY) {
+                            Commands::QueryImageResponse::Type busyResponse;
+                            busyResponse.status = static_cast<OTAQueryStatus>(MTROTASoftwareUpdateProviderOTAQueryStatusBusy);
+                            busyResponse.delayedActionTime.SetValue(response.delayedActionTime.ValueOr(kDelayedActionTimeSeconds));
+                            handler->AddResponse(cachedCommandPath, busyResponse);
+                            handle.Release();
+                            return;
+                        }
+                        handler->AddStatus(cachedCommandPath, Protocols::InteractionModel::Status::Failure);
+                        handle.Release();
+                        gOtaSender.ResetState();
+                        return;
+                    }
+                    auto targetNodeId
+                        = handler->GetExchangeContext()->GetSessionHandle()->AsSecureSession()->GetLocalScopedNodeId();
 
-                          auto targetNodeId
-                              = handler->GetExchangeContext()->GetSessionHandle()->AsSecureSession()->GetLocalScopedNodeId();
+                    char uriBuffer[kMaxBDXURILen];
+                    MutableCharSpan uri(uriBuffer);
+                    err = bdx::MakeURI(targetNodeId.GetNodeId(), AsCharSpan(data.imageURI), uri);
+                    if (CHIP_NO_ERROR != err) {
+                        LogErrorOnFailure(err);
+                        handler->AddStatus(cachedCommandPath, Protocols::InteractionModel::Status::Failure);
+                        handle.Release();
+                        gOtaSender.ResetState();
+                        return;
+                    }
 
-                          char uriBuffer[kMaxBDXURILen];
-                          MutableCharSpan uri(uriBuffer);
-                          err = bdx::MakeURI(targetNodeId.GetNodeId(), AsCharSpan(data.imageURI), uri);
-                          if (CHIP_NO_ERROR != err) {
-                              LogErrorOnFailure(err);
-                              handler->AddStatus(cachedCommandPath, Protocols::InteractionModel::Status::Failure);
-                              handle.Release();
-                              return;
-                          }
+                    response.imageURI.SetValue(uri);
+                    handler->AddResponse(cachedCommandPath, response);
+                    handle.Release();
+                    return;
+                }
+                if (!hasUpdate) {
+                    // Send whatever error response our delegate decided on.
+                    handler->AddResponse(cachedCommandPath, response);
+                } else {
+                    // We must have isBDXProtocolSupported false.  Send the corresponding error status.
+                    Commands::QueryImageResponse::Type protocolNotSupportedResponse;
+                    protocolNotSupportedResponse.status
+                        = static_cast<OTAQueryStatus>(MTROTASoftwareUpdateProviderOTAQueryStatusDownloadProtocolNotSupported);
+                    handler->AddResponse(cachedCommandPath, protocolNotSupportedResponse);
+                }
+                handle.Release();
+                gOtaSender.ResetState();
+            }
 
-                          response.imageURI.SetValue(uri);
-                          handler->AddResponse(cachedCommandPath, response);
-                          handle.Release();
-                          return;
-                      }
-
-                      handler->AddResponse(cachedCommandPath, response);
-                      handle.Release();
-                  }
-
-                                errorHandler:^(NSError *) {
-                                    // Not much we can do here
-                                }];
-          };
+                          errorHandler:^(NSError *) {
+                              // Not much we can do here
+                          }];
+    };
 
     auto strongDelegate = mDelegate;
     dispatch_async(mDelegateNotificationQueue, ^{
diff --git a/src/protocols/bdx/TransferFacilitator.cpp b/src/protocols/bdx/TransferFacilitator.cpp
index 65ab653..3195bb4 100644
--- a/src/protocols/bdx/TransferFacilitator.cpp
+++ b/src/protocols/bdx/TransferFacilitator.cpp
@@ -102,6 +102,7 @@
 
     mPollFreq    = pollFreq;
     mSystemLayer = layer;
+    mStopPolling = false;
 
     ReturnErrorOnFailure(mTransfer.WaitForTransfer(role, xferControlOpts, maxBlockSize, timeout));
 
@@ -112,6 +113,7 @@
 void Responder::ResetTransfer()
 {
     mTransfer.Reset();
+    mStopPolling = true;
 }
 
 CHIP_ERROR Initiator::InitiateTransfer(System::Layer * layer, TransferRole role, const TransferSession::TransferInitData & initData,