Fix ThreadSanitizer failure in controller factory. (#26824)

* Fix ThreadSanitizer failure in controller factory.

The failure looks like this:

  WARNING: ThreadSanitizer: race on NSMutableArray (pid=11619)
    Read-only access of NSMutableArray at 0x7b0c0005f5b0 by thread T3:
      #0 -[__NSArrayM countByEnumeratingWithState:objects:count:] <null>:2 (CoreFoundation:x86_64+0x4a338)
      #1 -[MTRDeviceControllerFactory(InternalMethods) operationalInstanceAdded:] MTRDeviceControllerFactory.mm:855 (Matter:x86_64+0x1fd2a)
      #2 MTROperationalBrowser::OnBrowse(_DNSServiceRef_t*, unsigned int, unsigned int, int, char const*, char const*, char const*, void*) MTROperationalBrowser.mm:100 (Matter:x86_64+0x20ee63c)
      #3 handle_browse_response <null>:2 (libsystem_dnssd.dylib:x86_64+0x3733)
      #4 _dispatch_client_callout <null>:2 (libdispatch.dylib:x86_64+0x3316)

    Previous modifying access of NSMutableArray at 0x7b0c0005f5b0 by main thread:
      #0 -[__NSArrayM addObject:] <null>:2 (CoreFoundation:x86_64+0x2457a)
      #1 -[MTRDeviceControllerFactory createController] MTRDeviceControllerFactory.mm:719 (Matter:x86_64+0x1cee3)
      #2 -[MTRDeviceControllerFactory createControllerOnExistingFabric:error:] MTRDeviceControllerFactory.mm:534 (Matter:x86_64+0x19792)

The basic problem is that we are in the middle of adding an object to
_controllers on the API consumer thread when on the Matter thread we get our
browse notification.

The changes here don't aim to lock around all access to _controllers, but just
to make sure that our mutations of it can't race with the access on the Matter
thread.  More coarse locking would need to be done very carefully, given the
amount of dispath_sync to the Matter thread we have going on.

* Address review comments.
diff --git a/.github/workflows/darwin.yaml b/.github/workflows/darwin.yaml
index 053eb96..a7d8f31 100644
--- a/.github/workflows/darwin.yaml
+++ b/.github/workflows/darwin.yaml
@@ -155,7 +155,7 @@
                   TEST_RUNNER_ASAN_OPTIONS=__CURRENT_VALUE__:detect_stack_use_after_return=1 xcodebuild test -target "Matter" -scheme "Matter Framework Tests" -sdk macosx -enableAddressSanitizer YES -enableUndefinedBehaviorSanitizer YES OTHER_CFLAGS='${inherited} -Werror -Wconversion -Wno-incomplete-umbrella -Wno-unguarded-availability-new' CHIP_IS_UBSAN=YES > >(tee /tmp/darwin/framework-tests/darwin-tests-asan.log) 2> >(tee /tmp/darwin/framework-tests/darwin-tests-asan-err.log >&2)
                   # -enableThreadSanitizer instruments the code in Matter.framework,
                   # but to instrument the code in the underlying libCHIP we need to pass CHIP_IS_TSAN=YES
-                  #xcodebuild test -target "Matter" -scheme "Matter Framework Tests" -sdk macosx -enableThreadSanitizer YES OTHER_CFLAGS='${inherited} -Werror -Wconversion -Wno-incomplete-umbrella -Wno-unguarded-availability-new' CHIP_IS_TSAN=YES > >(tee /tmp/darwin/framework-tests/darwin-tests-tsan.log) 2> >(tee /tmp/darwin/framework-tests/darwin-tests-tsan-err.log >&2)
+                  xcodebuild test -target "Matter" -scheme "Matter Framework Tests" -sdk macosx -enableThreadSanitizer YES OTHER_CFLAGS='${inherited} -Werror -Wconversion -Wno-incomplete-umbrella -Wno-unguarded-availability-new' CHIP_IS_TSAN=YES > >(tee /tmp/darwin/framework-tests/darwin-tests-tsan.log) 2> >(tee /tmp/darwin/framework-tests/darwin-tests-tsan-err.log >&2)
               working-directory: src/darwin/Framework
             - name: Uploading log files
               uses: actions/upload-artifact@v3
diff --git a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm
index 93f7abe..5bd050b 100644
--- a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm
+++ b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm
@@ -34,6 +34,8 @@
 #import "MTRPersistentStorageDelegateBridge.h"
 #import "NSDataSpanConversion.h"
 
+#import <os/lock.h>
+
 #include <controller/CHIPDeviceControllerFactory.h>
 #include <credentials/CHIPCert.h>
 #include <credentials/FabricTable.h>
@@ -88,6 +90,22 @@
 @property () chip::Credentials::DeviceAttestationVerifier * deviceAttestationVerifier;
 @property (readonly) BOOL advertiseOperational;
 @property (nonatomic, readonly) Credentials::IgnoreCertificateValidityPeriodPolicy * certificateValidityPolicy;
+// Lock used to serialize access to the "controllers" array, since it needs to
+// be touched from both whatever queue is starting controllers and from the
+// Matter queue.  The way this lock is used assumes that:
+//
+// 1) The only mutating accesses to the controllers array happen when the
+//    current queue is not the Matter queue.  This is a good assumption, because
+//    the implementation of the fucntions that mutate the array do sync dispatch
+//    to the Matter queue, which would deadlock if they were called when that
+//    queue was the current queue.
+// 2) It's our API consumer's responsibility to serialize access to us from
+//    outside.
+//
+// This means that we only take the lock around mutations of the array and
+// accesses to the array that are from code running on the Matter queue.
+
+@property (nonatomic, readonly) os_unfair_lock controllersLock;
 
 - (BOOL)findMatchingFabric:(FabricTable &)fabricTable
                     params:(MTRDeviceControllerStartupParams *)params
@@ -123,6 +141,7 @@
     _running = NO;
     _chipWorkQueue = DeviceLayer::PlatformMgrImpl().GetWorkQueue();
     _controllerFactory = &DeviceControllerFactory::GetInstance();
+    _controllersLock = OS_UNFAIR_LOCK_INIT;
 
     _sessionKeystore = new chip::Crypto::RawKeySessionKeystore();
     if ([self checkForInitError:(_sessionKeystore != nullptr) logMsg:kErrorSessionKeystoreInit]) {
@@ -166,8 +185,15 @@
     [self cleanupInitObjects];
 }
 
+- (void)_assertCurrentQueueIsNotMatterQueue
+{
+    VerifyOrDie(!DeviceLayer::PlatformMgrImpl().IsWorkQueueCurrentQueue());
+}
+
 - (BOOL)checkIsRunning:(NSError * __autoreleasing *)error
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     if ([self isRunning]) {
         return YES;
     }
@@ -255,6 +281,8 @@
 
 - (nullable NSArray<MTRFabricInfo *> *)knownFabrics
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     if (!self.isRunning) {
         return nil;
     }
@@ -303,6 +331,8 @@
 
 - (BOOL)startControllerFactory:(MTRDeviceControllerFactoryParams *)startupParams error:(NSError * __autoreleasing *)error;
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     if ([self isRunning]) {
         MTR_LOG_DEBUG("Ignoring duplicate call to startup, Matter controller factory already started...");
         return YES;
@@ -501,6 +531,8 @@
 
 - (void)stopControllerFactory
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     if (![self isRunning]) {
         return;
     }
@@ -524,6 +556,8 @@
 - (MTRDeviceController * _Nullable)createControllerOnExistingFabric:(MTRDeviceControllerStartupParams *)startupParams
                                                               error:(NSError * __autoreleasing *)error
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     if (![self checkIsRunning:error]) {
         MTR_LOG_ERROR("Trying to start controller while Matter controller factory is not running");
         return nil;
@@ -615,6 +649,8 @@
 - (MTRDeviceController * _Nullable)createControllerOnNewFabric:(MTRDeviceControllerStartupParams *)startupParams
                                                          error:(NSError * __autoreleasing *)error
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     if (![self isRunning]) {
         MTR_LOG_ERROR("Trying to start controller while Matter controller factory is not running");
         return nil;
@@ -699,6 +735,8 @@
 
 - (MTRDeviceController * _Nullable)createController
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     MTRDeviceController * controller = [[MTRDeviceController alloc] initWithFactory:self queue:_chipWorkQueue];
     if (controller == nil) {
         MTR_LOG_ERROR("Failed to init controller");
@@ -716,7 +754,9 @@
 
     // Add the controller to _controllers now, so if we fail partway through its
     // startup we will still do the right cleanups.
+    os_unfair_lock_lock(&_controllersLock);
     [_controllers addObject:controller];
+    os_unfair_lock_unlock(&_controllersLock);
 
     return controller;
 }
@@ -733,6 +773,8 @@
                     params:(MTRDeviceControllerStartupParams *)params
                     fabric:(const FabricInfo * _Nullable * _Nonnull)fabric
 {
+    assertChipStackLockedByCurrentThread();
+
     CHIP_ERROR err = fabricTable.Init(
         { .storage = _persistentStorageDelegateBridge, .operationalKeystore = _keystore, .opCertStore = _opCertStore });
     if (err != CHIP_NO_ERROR) {
@@ -768,6 +810,8 @@
 //
 - (MTRDeviceController * _Nullable)maybeInitializeOTAProvider:(MTRDeviceController * _Nonnull)controller
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     VerifyOrReturnValue(_otaProviderDelegateBridge != nil, controller);
     VerifyOrReturnValue([_controllers count] == 1, controller);
 
@@ -791,6 +835,8 @@
 
 - (void)controllerShuttingDown:(MTRDeviceController *)controller
 {
+    [self _assertCurrentQueueIsNotMatterQueue];
+
     if (![_controllers containsObject:controller]) {
         MTR_LOG_ERROR("Controller we don't know about shutting down");
         return;
@@ -808,7 +854,9 @@
         });
     }
 
+    os_unfair_lock_lock(&_controllersLock);
     [_controllers removeObject:controller];
+    os_unfair_lock_unlock(&_controllersLock);
 
     if ([_controllers count] == 0) {
         dispatch_sync(_chipWorkQueue, ^{
@@ -841,7 +889,13 @@
 
 - (nullable MTRDeviceController *)runningControllerForFabricIndex:(chip::FabricIndex)fabricIndex
 {
-    for (MTRDeviceController * existing in _controllers) {
+    assertChipStackLockedByCurrentThread();
+
+    os_unfair_lock_lock(&_controllersLock);
+    NSArray<MTRDeviceController *> * controllersCopy = [_controllers copy];
+    os_unfair_lock_unlock(&_controllersLock);
+
+    for (MTRDeviceController * existing in controllersCopy) {
         if ([existing fabricIndex] == fabricIndex) {
             return existing;
         }
@@ -852,7 +906,13 @@
 
 - (void)operationalInstanceAdded:(chip::PeerId &)operationalID
 {
-    for (MTRDeviceController * controller in _controllers) {
+    assertChipStackLockedByCurrentThread();
+
+    os_unfair_lock_lock(&_controllersLock);
+    NSArray<MTRDeviceController *> * controllersCopy = [_controllers copy];
+    os_unfair_lock_unlock(&_controllersLock);
+
+    for (MTRDeviceController * controller in controllersCopy) {
         auto * compressedFabricId = controller.compressedFabricID;
         if (compressedFabricId != nil && compressedFabricId.unsignedLongLongValue == operationalID.GetCompressedFabricId()) {
             ChipLogProgress(Controller, "Notifying controller at fabric index %u about new operational node 0x" ChipLogFormatX64,
diff --git a/src/platform/Darwin/PlatformManagerImpl.cpp b/src/platform/Darwin/PlatformManagerImpl.cpp
index 6e78a81..f1d5e48 100644
--- a/src/platform/Darwin/PlatformManagerImpl.cpp
+++ b/src/platform/Darwin/PlatformManagerImpl.cpp
@@ -88,7 +88,7 @@
     if (!mIsWorkQueueSuspended && !mIsWorkQueueSuspensionPending)
     {
         mIsWorkQueueSuspensionPending = true;
-        if (dispatch_get_current_queue() != mWorkQueue)
+        if (!IsWorkQueueCurrentQueue())
         {
             // dispatch_sync is used in order to guarantee serialization of the caller with
             // respect to any tasks that might already be on the queue, or running.
@@ -157,10 +157,15 @@
 {
     // If we have no work queue, or it's suspended, then we assume our caller
     // knows what they are doing in terms of their own concurrency.
-    return !mWorkQueue || mIsWorkQueueSuspended || dispatch_get_current_queue() == mWorkQueue;
+    return !mWorkQueue || mIsWorkQueueSuspended || IsWorkQueueCurrentQueue();
 };
 #endif
 
+bool PlatformManagerImpl::IsWorkQueueCurrentQueue() const
+{
+    return dispatch_get_current_queue() == mWorkQueue;
+}
+
 CHIP_ERROR PlatformManagerImpl::StartBleScan(BleScannerDelegate * delegate)
 {
 #if CONFIG_NETWORK_LAYER_BLE
diff --git a/src/platform/Darwin/PlatformManagerImpl.h b/src/platform/Darwin/PlatformManagerImpl.h
index a0aff2a..79da67a 100644
--- a/src/platform/Darwin/PlatformManagerImpl.h
+++ b/src/platform/Darwin/PlatformManagerImpl.h
@@ -56,6 +56,8 @@
         return mWorkQueue;
     }
 
+    bool IsWorkQueueCurrentQueue() const;
+
     CHIP_ERROR StartBleScan(BleScannerDelegate * delegate = nullptr);
     CHIP_ERROR StopBleScan();
     CHIP_ERROR PrepareCommissioning();