DNS-SD: Implement browse retries for avahi (#28720)

* check

* check

* Implement retries for Avahi

The current avahi implementation uses a one-shot browse that completes
in approximately one second. This is insufficient for thread devices
that need time to propagage SRP messages through the BR. The majority
of the dnssd timeout is unused. This implements a browse retry with
backoff and also implements the StopBrowse function.

Test: chip-tool pairing started before app, now works. StopBrowse
      is checked in the TestDnssd platform tests.

* Restyled by clang-format

* Actually hook up the stop

* Fix the deletion on stop browse

---------

Co-authored-by: Restyled.io <commits@restyled.io>
diff --git a/src/platform/Linux/DnssdImpl.cpp b/src/platform/Linux/DnssdImpl.cpp
index 289ae41..fa72836 100644
--- a/src/platform/Linux/DnssdImpl.cpp
+++ b/src/platform/Linux/DnssdImpl.cpp
@@ -595,7 +595,8 @@
 }
 
 CHIP_ERROR MdnsAvahi::Browse(const char * type, DnssdServiceProtocol protocol, chip::Inet::IPAddressType addressType,
-                             chip::Inet::InterfaceId interface, DnssdBrowseCallback callback, void * context)
+                             chip::Inet::InterfaceId interface, DnssdBrowseCallback callback, void * context,
+                             intptr_t * browseIdentifier)
 {
     AvahiServiceBrowser * browser;
     BrowseContext * browseContext = chip::Platform::New<BrowseContext>();
@@ -609,18 +610,40 @@
     {
         avahiInterface = AVAHI_IF_UNSPEC;
     }
+    browseContext->mInterface     = avahiInterface;
+    browseContext->mProtocol      = GetFullType(type, protocol);
+    browseContext->mBrowseRetries = 0;
+    browseContext->mStopped.store(false);
 
-    browser = avahi_service_browser_new(mClient, avahiInterface, AVAHI_PROTO_UNSPEC, GetFullType(type, protocol).c_str(), nullptr,
+    browser = avahi_service_browser_new(mClient, avahiInterface, AVAHI_PROTO_UNSPEC, browseContext->mProtocol.c_str(), nullptr,
                                         static_cast<AvahiLookupFlags>(0), HandleBrowse, browseContext);
     // Otherwise the browser will be freed in the callback
     if (browser == nullptr)
     {
         chip::Platform::Delete(browseContext);
+        *browseIdentifier = reinterpret_cast<intptr_t>(nullptr);
+    }
+    else
+    {
+        *browseIdentifier = reinterpret_cast<intptr_t>(browseContext);
     }
 
     return browser == nullptr ? CHIP_ERROR_INTERNAL : CHIP_NO_ERROR;
 }
 
+CHIP_ERROR MdnsAvahi::StopBrowse(intptr_t browseIdentifier)
+{
+    BrowseContext * browseContext = reinterpret_cast<BrowseContext *>(browseIdentifier);
+    if (browseContext == nullptr)
+    {
+        return CHIP_ERROR_NOT_FOUND;
+    }
+    // Any running timers here will check mStopped before rescheduling. Leave the timer running
+    // so we don't race on deletion of the browse context.
+    browseContext->mStopped.store(true);
+    return CHIP_NO_ERROR;
+}
+
 DnssdServiceProtocol GetProtocolInType(const char * type)
 {
     const char * deliminator = strrchr(type, '.');
@@ -662,6 +685,27 @@
     }
 }
 
+void MdnsAvahi::BrowseRetryCallback(chip::System::Layer * aLayer, void * appState)
+{
+    BrowseContext * context = static_cast<BrowseContext *>(appState);
+    // Don't schedule anything new if we've stopped.
+    if (context->mStopped.load())
+    {
+        chip::Platform::Delete(context);
+        return;
+    }
+    AvahiServiceBrowser * newBrowser =
+        avahi_service_browser_new(context->mInstance->mClient, context->mInterface, AVAHI_PROTO_UNSPEC, context->mProtocol.c_str(),
+                                  nullptr, static_cast<AvahiLookupFlags>(0), HandleBrowse, context);
+    if (newBrowser == nullptr)
+    {
+        // If we failed to create the browser, this browse context is effectively done. We need to call the final callback and
+        // delete the context.
+        context->mCallback(context->mContext, context->mServices.data(), context->mServices.size(), true, CHIP_NO_ERROR);
+        chip::Platform::Delete(context);
+    }
+}
+
 void MdnsAvahi::HandleBrowse(AvahiServiceBrowser * browser, AvahiIfIndex interface, AvahiProtocol protocol, AvahiBrowserEvent event,
                              const char * name, const char * type, const char * domain, AvahiLookupResultFlags /*flags*/,
                              void * userdata)
@@ -695,12 +739,30 @@
             context->mServices.push_back(service);
         }
         break;
-    case AVAHI_BROWSER_ALL_FOR_NOW:
+    case AVAHI_BROWSER_ALL_FOR_NOW: {
         ChipLogProgress(DeviceLayer, "Avahi browse: all for now");
-        context->mCallback(context->mContext, context->mServices.data(), context->mServices.size(), true, CHIP_NO_ERROR);
+        bool needRetries = context->mBrowseRetries++ < kMaxBrowseRetries && !context->mStopped.load();
+        // If we were already asked to stop, no need to send a callback - no one is listening.
+        if (!context->mStopped.load())
+        {
+            context->mCallback(context->mContext, context->mServices.data(), context->mServices.size(), !needRetries,
+                               CHIP_NO_ERROR);
+        }
         avahi_service_browser_free(browser);
-        chip::Platform::Delete(context);
+        if (needRetries)
+        {
+            context->mNextRetryDelay *= 2;
+            // Hand the ownership of the context over to the timer. It will either schedule a new browse on the context,
+            // triggering this function, or it will delete and not reschedule (if stopped).
+            DeviceLayer::SystemLayer().StartTimer(context->mNextRetryDelay / 2, BrowseRetryCallback, context);
+        }
+        else
+        {
+            // We didn't schedule a timer, so we're responsible for deleting the context
+            chip::Platform::Delete(context);
+        }
         break;
+    }
     case AVAHI_BROWSER_REMOVE:
         ChipLogProgress(DeviceLayer, "Avahi browse: remove");
         if (strcmp("local", domain) == 0)
@@ -908,13 +970,12 @@
                            chip::Inet::InterfaceId interface, DnssdBrowseCallback callback, void * context,
                            intptr_t * browseIdentifier)
 {
-    *browseIdentifier = reinterpret_cast<intptr_t>(nullptr);
-    return MdnsAvahi::GetInstance().Browse(type, protocol, addressType, interface, callback, context);
+    return MdnsAvahi::GetInstance().Browse(type, protocol, addressType, interface, callback, context, browseIdentifier);
 }
 
 CHIP_ERROR ChipDnssdStopBrowse(intptr_t browseIdentifier)
 {
-    return CHIP_ERROR_NOT_IMPLEMENTED;
+    return MdnsAvahi::GetInstance().StopBrowse(browseIdentifier);
 }
 
 CHIP_ERROR ChipDnssdResolve(DnssdService * browseResult, chip::Inet::InterfaceId interface, DnssdResolveCallback callback,
diff --git a/src/platform/Linux/DnssdImpl.h b/src/platform/Linux/DnssdImpl.h
index 9bad9e8..e875e97 100644
--- a/src/platform/Linux/DnssdImpl.h
+++ b/src/platform/Linux/DnssdImpl.h
@@ -20,6 +20,7 @@
 #include <sys/select.h>
 #include <unistd.h>
 
+#include <atomic>
 #include <chrono>
 #include <map>
 #include <memory>
@@ -109,7 +110,8 @@
     CHIP_ERROR PublishService(const DnssdService & service, DnssdPublishCallback callback, void * context);
     CHIP_ERROR StopPublish();
     CHIP_ERROR Browse(const char * type, DnssdServiceProtocol protocol, chip::Inet::IPAddressType addressType,
-                      chip::Inet::InterfaceId interface, DnssdBrowseCallback callback, void * context);
+                      chip::Inet::InterfaceId interface, DnssdBrowseCallback callback, void * context, intptr_t * browseIdentifier);
+    CHIP_ERROR StopBrowse(intptr_t browseIdentifier);
     CHIP_ERROR Resolve(const char * name, const char * type, DnssdServiceProtocol protocol, chip::Inet::IPAddressType addressType,
                        chip::Inet::IPAddressType transportType, chip::Inet::InterfaceId interface, DnssdResolveCallback callback,
                        void * context);
@@ -126,6 +128,11 @@
         void * mContext;
         Inet::IPAddressType mAddressType;
         std::vector<DnssdService> mServices;
+        size_t mBrowseRetries;
+        AvahiIfIndex mInterface;
+        std::string mProtocol;
+        chip::System::Clock::Timeout mNextRetryDelay = chip::System::Clock::Seconds16(1);
+        std::atomic_bool mStopped{ false };
     };
 
     struct ResolveContext
@@ -153,6 +160,7 @@
     static void HandleBrowse(AvahiServiceBrowser * broswer, AvahiIfIndex interface, AvahiProtocol protocol, AvahiBrowserEvent event,
                              const char * name, const char * type, const char * domain, AvahiLookupResultFlags flags,
                              void * userdata);
+    static void BrowseRetryCallback(chip::System::Layer * aLayer, void * appState);
     static void HandleResolve(AvahiServiceResolver * resolver, AvahiIfIndex interface, AvahiProtocol protocol,
                               AvahiResolverEvent event, const char * name, const char * type, const char * domain,
                               const char * host_name, const AvahiAddress * address, uint16_t port, AvahiStringList * txt,
@@ -165,6 +173,7 @@
     AvahiClient * mClient;
     std::map<std::string, AvahiEntryGroup *> mPublishedGroups;
     Poller mPoller;
+    static constexpr size_t kMaxBrowseRetries = 4;
 };
 
 } // namespace Dnssd