Improve our various round-trip timeout computations. (#33587) We had a few issues: 1) Our "round-trip timeout" only accounted for one side of the round-trip needing to do MRP retransmits. So if the sender retried a few times, the last retry finally got through, then the response had to be retried a well, the sender would time out the exchange before the response was received. The fix here is to add the MRP backoff times for both the initial message and the response. 2) ReadClient could end up timing out a subscription before the server had actually given up on receiving a StatusReport in response to its ReportData, in situations where the server ended up doing MRP retries and the last MRP retry took a few seconds to get through the network. The fix here is to just estimate how long the server will be waiting for the StatusReport and not time out the subscription until then; at that point even if the server did in fact send its report on time, it will have dropped the subscription on its end.

commit: 3e93ba6e7a1830ff20623b1f892d5e6540fbb2e7 [log] [tgz]
author: Boris Zbarsky <bzbarsky@apple.com> Fri May 24 17:34:19 2024 -0400
committer: GitHub <noreply@github.com> Fri May 24 14:34:19 2024 -0700
tree: f29c40fdc8da92ced51ab216921abe86cd211ba4
parent: b81370b28a13b4fd5ea5af31fae557c99efd7915 [diff]
diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp
index 5733a42..9197a2e 100644
--- a/src/app/OperationalSessionSetup.cpp
+++ b/src/app/OperationalSessionSetup.cpp

@@ -792,6 +792,11 @@
         // but in practice for old devices BUSY often sends some hardcoded value
         // that tells us nothing about when the other side will decide it has
         // timed out.
+        //
+        // Unfortunately, we do not have the MRP config for the other side here,
+        // but in practice if the other side is using its local config to
+        // compute Sigma2 response timeouts, then it's also returning useful
+        // values with BUSY, so we will wait long enough.
         auto additionalTimeout = CASESession::ComputeSigma2ResponseTimeout(GetLocalMRPConfig().ValueOr(GetDefaultMRPConfig()));
         actualTimerDelay += additionalTimeout;
     }

diff --git a/src/app/ReadClient.cpp b/src/app/ReadClient.cpp
index e8ada20..bb058e0 100644
--- a/src/app/ReadClient.cpp
+++ b/src/app/ReadClient.cpp

@@ -938,24 +938,26 @@
 
     //
     // To calculate the duration we're willing to wait for a report to come to us, we take into account the maximum interval of
-    // the subscription AND the time it takes for the report to make it to us in the worst case. This latter bit involves
-    // computing the Ack timeout from the publisher for the ReportData message being sent to us using our IDLE interval as the
-    // basis for that computation.
+    // the subscription AND the time it takes for the report to make it to us in the worst case.
     //
-    // Make sure to use the retransmission computation that includes backoff.  For purposes of that computation, treat us as
-    // active now (since we are right now sending/receiving messages), and use the default "how long are we guaranteed to stay
-    // active" threshold for now.
+    // We have no way to estimate what the network latency will be, but we do know the other side will time out its ReportData
+    // after its computed round-trip timeout plus the processing time it gives us (app::kExpectedIMProcessingTime).  Once it
+    // times out, assuming it sent the report at all, there's no point in us thinking we still have a subscription.
     //
-    // TODO: We need to find a good home for this logic that will correctly compute this based on transport. For now, this will
-    // suffice since we don't use TCP as a transport currently and subscriptions over BLE aren't really a thing.
+    // We can't use ComputeRoundTripTimeout() on the session for two reasons: we want the roundtrip timeout from the point of
+    // view of the peer, not us, and we want to start off with the assumption the peer will likely have, which is that we are
+    // idle, whereas ComputeRoundTripTimeout() uses the current activity state of the peer.
     //
-    const auto & localMRPConfig   = GetLocalMRPConfig();
-    const auto & defaultMRPConfig = GetDefaultMRPConfig();
-    const auto & ourMrpConfig     = localMRPConfig.ValueOr(defaultMRPConfig);
-    auto publisherTransmissionTimeout =
-        GetRetransmissionTimeout(ourMrpConfig.mActiveRetransTimeout, ourMrpConfig.mIdleRetransTimeout,
-                                 System::SystemClock().GetMonotonicTimestamp(), ourMrpConfig.mActiveThresholdTime);
-    *aTimeout = System::Clock::Seconds16(mMaxInterval) + publisherTransmissionTimeout;
+    // So recompute the round-trip timeout directly.  Assume MRP, since in practice that is likely what is happening.
+    auto & peerMRPConfig = mReadPrepareParams.mSessionHolder->GetRemoteMRPConfig();
+    // Peer will assume we are idle (hence we pass kZero to GetMessageReceiptTimeout()), but will assume we treat it as active
+    // for the response, so to match the retransmission timeout computation for the message back to the peeer, we should treat
+    // it as active.
+    auto roundTripTimeout = mReadPrepareParams.mSessionHolder->GetMessageReceiptTimeout(System::Clock::kZero) +
+        kExpectedIMProcessingTime +
+        GetRetransmissionTimeout(peerMRPConfig.mActiveRetransTimeout, peerMRPConfig.mIdleRetransTimeout,
+                                 System::SystemClock().GetMonotonicTimestamp(), peerMRPConfig.mActiveThresholdTime);
+    *aTimeout = System::Clock::Seconds16(mMaxInterval) + roundTripTimeout;
     return CHIP_NO_ERROR;
 }
 

diff --git a/src/darwin/Framework/CHIPTests/MTRControllerTests.m b/src/darwin/Framework/CHIPTests/MTRControllerTests.m
index b5ca631..436a0df 100644
--- a/src/darwin/Framework/CHIPTests/MTRControllerTests.m
+++ b/src/darwin/Framework/CHIPTests/MTRControllerTests.m

@@ -1557,6 +1557,10 @@
     // Can be called before starting the factory
     XCTAssertFalse(MTRDeviceControllerFactory.sharedInstance.running);
     MTRSetMessageReliabilityParameters(@2000, @2000, @2000, @2000);
+
+    // Now reset back to the default state, so timings in other tests are not
+    // affected.
+    MTRSetMessageReliabilityParameters(nil, nil, nil, nil);
 }
 
 @end

diff --git a/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m b/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m
index 868e81e..c5e3927 100644
--- a/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m
+++ b/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m

@@ -2140,6 +2140,10 @@
     XCTAssertTrue(controller.running);
     MTRSetMessageReliabilityParameters(@2000, @2000, @2000, @2000);
     [controller shutdown];
+
+    // Now reset back to the default state, so timings in other tests are not
+    // affected.
+    MTRSetMessageReliabilityParameters(nil, nil, nil, nil);
 }
 
 // TODO: This might also want to go in a separate test file, with some shared setup for commissioning devices per test

diff --git a/src/protocols/secure_channel/CASESession.cpp b/src/protocols/secure_channel/CASESession.cpp
index ccc2bc2..b0642a9 100644
--- a/src/protocols/secure_channel/CASESession.cpp
+++ b/src/protocols/secure_channel/CASESession.cpp

@@ -2362,21 +2362,36 @@
     return err;
 }
 
+namespace {
+System::Clock::Timeout ComputeRoundTripTimeout(ExchangeContext::Timeout serverProcessingTime,
+                                               const ReliableMessageProtocolConfig & remoteMrpConfig)
+{
+    // TODO: This is duplicating logic from Session::ComputeRoundTripTimeout.  Unfortunately, it's called by
+    // consumers who do not have a session.
+    const auto & maybeLocalMRPConfig = GetLocalMRPConfig();
+    const auto & defaultMRRPConfig   = GetDefaultMRPConfig();
+    const auto & localMRPConfig      = maybeLocalMRPConfig.ValueOr(defaultMRRPConfig);
+    return GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout,
+                                    // Assume peer is idle, as a worst-case assumption (probably true for
+                                    // Sigma1, since that will be our initial message on the session, but less
+                                    // so for Sigma2).
+                                    System::Clock::kZero, remoteMrpConfig.mActiveThresholdTime) +
+        serverProcessingTime +
+        GetRetransmissionTimeout(localMRPConfig.mActiveRetransTimeout, localMRPConfig.mIdleRetransTimeout,
+                                 // Peer will assume we are active, since it's
+                                 // responding to our message.
+                                 System::SystemClock().GetMonotonicTimestamp(), localMRPConfig.mActiveThresholdTime);
+}
+} // anonymous namespace
+
 System::Clock::Timeout CASESession::ComputeSigma1ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig)
 {
-    return GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout,
-                                    // Assume peer is idle, since that's what we
-                                    // will assume for our initial message.
-                                    System::Clock::kZero, remoteMrpConfig.mActiveThresholdTime) +
-        kExpectedSigma1ProcessingTime;
+    return ComputeRoundTripTimeout(kExpectedSigma1ProcessingTime, remoteMrpConfig);
 }
 
 System::Clock::Timeout CASESession::ComputeSigma2ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig)
 {
-    return GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout,
-                                    // Assume peer is idle, as a worst-case assumption.
-                                    System::Clock::kZero, remoteMrpConfig.mActiveThresholdTime) +
-        kExpectedHighProcessingTime;
+    return ComputeRoundTripTimeout(kExpectedHighProcessingTime, remoteMrpConfig);
 }
 
 bool CASESession::InvokeBackgroundWorkWatchdog()

diff --git a/src/transport/GroupSession.h b/src/transport/GroupSession.h
index f20ff06..1c5ffec 100644
--- a/src/transport/GroupSession.h
+++ b/src/transport/GroupSession.h

@@ -75,6 +75,13 @@
         return System::Clock::Timeout();
     }
 
+    System::Clock::Milliseconds32 GetMessageReceiptTimeout(System::Clock::Timestamp ourLastActivity) const override
+    {
+        // There are no timeouts for group sessions.
+        VerifyOrDie(false);
+        return System::Clock::Timeout();
+    }
+
     GroupId GetGroupId() const { return mGroupId; }
 
 private:
@@ -127,6 +134,13 @@
         return System::Clock::Timeout();
     }
 
+    System::Clock::Milliseconds32 GetMessageReceiptTimeout(System::Clock::Timestamp ourLastActivity) const override
+    {
+        // There are no timeouts for group sessions.
+        VerifyOrDie(false);
+        return System::Clock::Timeout();
+    }
+
     GroupId GetGroupId() const { return mGroupId; }
 
 private:

diff --git a/src/transport/SecureSession.h b/src/transport/SecureSession.h
index c5e0462..fe70d67 100644
--- a/src/transport/SecureSession.h
+++ b/src/transport/SecureSession.h

@@ -179,6 +179,27 @@
         return System::Clock::Timeout();
     }
 
+    System::Clock::Milliseconds32 GetMessageReceiptTimeout(System::Clock::Timestamp ourLastActivity) const override
+    {
+        switch (mPeerAddress.GetTransportType())
+        {
+        case Transport::Type::kUdp: {
+            const auto & maybeLocalMRPConfig = GetLocalMRPConfig();
+            const auto & defaultMRRPConfig   = GetDefaultMRPConfig();
+            const auto & localMRPConfig      = maybeLocalMRPConfig.ValueOr(defaultMRRPConfig);
+            return GetRetransmissionTimeout(localMRPConfig.mActiveRetransTimeout, localMRPConfig.mIdleRetransTimeout,
+                                            ourLastActivity, localMRPConfig.mActiveThresholdTime);
+        }
+        case Transport::Type::kTcp:
+            return System::Clock::Seconds16(30);
+        case Transport::Type::kBle:
+            return System::Clock::Milliseconds32(BTP_ACK_TIMEOUT_MS);
+        default:
+            break;
+        }
+        return System::Clock::Timeout();
+    }
+
     const PeerAddress & GetPeerAddress() const { return mPeerAddress; }
     void SetPeerAddress(const PeerAddress & address) { mPeerAddress = address; }
 

diff --git a/src/transport/Session.cpp b/src/transport/Session.cpp
index 18737fb..4971b78 100644
--- a/src/transport/Session.cpp
+++ b/src/transport/Session.cpp

@@ -70,7 +70,10 @@
     {
         return System::Clock::kZero;
     }
-    return GetAckTimeout() + upperlayerProcessingTimeout;
+
+    // Treat us as active for purposes of GetMessageReceiptTimeout(), since the
+    // other side would be responding to our message.
+    return GetAckTimeout() + upperlayerProcessingTimeout + GetMessageReceiptTimeout(System::SystemClock().GetMonotonicTimestamp());
 }
 
 uint16_t Session::SessionIdForLogging() const

diff --git a/src/transport/Session.h b/src/transport/Session.h
index adaf872..fc656c8 100644
--- a/src/transport/Session.h
+++ b/src/transport/Session.h

@@ -244,7 +244,21 @@
     virtual bool AllowsLargePayload() const                              = 0;
     virtual const SessionParameters & GetRemoteSessionParameters() const = 0;
     virtual System::Clock::Timestamp GetMRPBaseTimeout() const           = 0;
-    virtual System::Clock::Milliseconds32 GetAckTimeout() const          = 0;
+    // GetAckTimeout is the estimate for how long it could take for the other
+    // side to receive our message (accounting for our MRP retransmits if it
+    // gets lost) and send a response.
+    virtual System::Clock::Milliseconds32 GetAckTimeout() const = 0;
+
+    // GetReceiptTimeout is the estimate for how long it could take for us to
+    // receive a message after the other side sends it, accounting for the MRP
+    // retransmits the other side might do if the message gets lost.
+    //
+    // The caller is expected to provide an estimate for when the peer would
+    // last have heard from us.  The most likely values to pass are
+    // System::SystemClock().GetMonotonicTimestamp() (to indicate "peer is
+    // responding to a message it just received") and System::Clock::kZero (to
+    // indicate "peer is reaching out to us, not in response to anything").
+    virtual System::Clock::Milliseconds32 GetMessageReceiptTimeout(System::Clock::Timestamp ourLastActivity) const = 0;
 
     const ReliableMessageProtocolConfig & GetRemoteMRPConfig() const { return GetRemoteSessionParameters().GetMRPConfig(); }
 

diff --git a/src/transport/UnauthenticatedSessionTable.h b/src/transport/UnauthenticatedSessionTable.h
index 9130582..d77bf70 100644
--- a/src/transport/UnauthenticatedSessionTable.h
+++ b/src/transport/UnauthenticatedSessionTable.h

@@ -107,6 +107,27 @@
         return System::Clock::Timeout();
     }
 
+    System::Clock::Milliseconds32 GetMessageReceiptTimeout(System::Clock::Timestamp ourLastActivity) const override
+    {
+        switch (mPeerAddress.GetTransportType())
+        {
+        case Transport::Type::kUdp: {
+            const auto & maybeLocalMRPConfig = GetLocalMRPConfig();
+            const auto & defaultMRRPConfig   = GetDefaultMRPConfig();
+            const auto & localMRPConfig      = maybeLocalMRPConfig.ValueOr(defaultMRRPConfig);
+            return GetRetransmissionTimeout(localMRPConfig.mActiveRetransTimeout, localMRPConfig.mIdleRetransTimeout,
+                                            ourLastActivity, localMRPConfig.mActiveThresholdTime);
+        }
+        case Transport::Type::kTcp:
+            return System::Clock::Seconds16(30);
+        case Transport::Type::kBle:
+            return System::Clock::Milliseconds32(BTP_ACK_TIMEOUT_MS);
+        default:
+            break;
+        }
+        return System::Clock::Timeout();
+    }
+
     NodeId GetPeerNodeId() const
     {
         if (mSessionRole == SessionRole::kInitiator)
commit	3e93ba6e7a1830ff20623b1f892d5e6540fbb2e7	[log] [tgz]
author	Boris Zbarsky <bzbarsky@apple.com>	Fri May 24 17:34:19 2024 -0400
committer	GitHub <noreply@github.com>	Fri May 24 14:34:19 2024 -0700
tree	f29c40fdc8da92ced51ab216921abe86cd211ba4
parent	b81370b28a13b4fd5ea5af31fae557c99efd7915 [diff]