absl: speed up Mutex::Lock

Currently Mutex::Lock contains a non-inlined, non-tail call chain:
TryAcquireWithSpinning -> GetMutexGlobals -> LowLevelCallOnce -> init closure
This turns the function into a non-leaf function with stack frame allocation
and additional register use. Remove this non-tail call to make the function
a leaf, and move the spinloop_iterations initialization to LockSlow.
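
For illustration, the general shape of the change is sketched below. This is a
minimal, self-contained example with made-up names (SpinLock, LockContended,
spin_count_), not the actual absl::Mutex code: the fast path is a single CAS
with no non-tail calls, and the lazily computed spin count is initialized in
the out-of-line slow path instead.

  #include <atomic>
  #include <thread>

  class SpinLock {
   public:
    void Lock() {
      // Fast path: one CAS and no non-tail calls, so an optimizing compiler
      // can emit this as a leaf function without a stack frame; the cold-path
      // call below is in tail position.
      bool expected = false;
      if (!locked_.compare_exchange_strong(expected, true,
                                           std::memory_order_acquire,
                                           std::memory_order_relaxed)) {
        LockContended();
      }
    }

    void Unlock() { locked_.store(false, std::memory_order_release); }

   private:
    // In production code this would be annotated noinline.
    void LockContended() {
      // Lazily pick the spin count here rather than in Lock(). A racy
      // re-initialization is harmless: every thread stores the same value.
      int spins = spin_count_.load(std::memory_order_relaxed);
      if (spins == 0) {
        spins = std::thread::hardware_concurrency() > 1 ? 1500 : -1;
        spin_count_.store(spins, std::memory_order_relaxed);
      }
      for (int i = 0; i < spins; ++i) {  // spins == -1: skip spinning
        bool expected = false;
        if (locked_.compare_exchange_weak(expected, true,
                                          std::memory_order_acquire,
                                          std::memory_order_relaxed)) {
          return;
        }
      }
      for (;;) {  // uniprocessor or spinning exhausted: yield between tries
        bool expected = false;
        if (locked_.compare_exchange_weak(expected, true,
                                          std::memory_order_acquire,
                                          std::memory_order_relaxed)) {
          return;
        }
        std::this_thread::yield();
      }
    }

    std::atomic<bool> locked_{false};
    static inline std::atomic<int> spin_count_{0};  // 0 = not yet initialized
  };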

Current Lock happy path:

00000000001edc20 <absl::Mutex::Lock()>:
  1edc20:	55                   	push   %rbp
  1edc21:	48 89 e5             	mov    %rsp,%rbp
  1edc24:	53                   	push   %rbx
  1edc25:	50                   	push   %rax
  1edc26:	48 89 fb             	mov    %rdi,%rbx
  1edc29:	48 8b 07             	mov    (%rdi),%rax
  1edc2c:	a8 19                	test   $0x19,%al
  1edc2e:	75 0e                	jne    1edc3e <absl::Mutex::Lock()+0x1e>
  1edc30:	48 89 c1             	mov    %rax,%rcx
  1edc33:	48 83 c9 08          	or     $0x8,%rcx
  1edc37:	f0 48 0f b1 0b       	lock cmpxchg %rcx,(%rbx)
  1edc3c:	74 42                	je     1edc80 <absl::Mutex::Lock()+0x60>
  ... unhappy path ...
  1edc80:	48 83 c4 08          	add    $0x8,%rsp
  1edc84:	5b                   	pop    %rbx
  1edc85:	5d                   	pop    %rbp
  1edc86:	c3                   	ret

New Lock happy path:

00000000001eea80 <absl::Mutex::Lock()>:
  1eea80:	48 8b 07             	mov    (%rdi),%rax
  1eea83:	a8 19                	test   $0x19,%al
  1eea85:	75 0f                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea87:	48 89 c1             	mov    %rax,%rcx
  1eea8a:	48 83 c9 08          	or     $0x8,%rcx
  1eea8e:	f0 48 0f b1 0f       	lock cmpxchg %rcx,(%rdi)
  1eea93:	75 01                	jne    1eea96 <absl::Mutex::Lock()+0x16>
  1eea95:	c3                   	ret
  ... unhappy path ...
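
The listings above can be reproduced with standard binutils once the library
is built; the exact binary or library name depends on the build system, so
<binary> below is a placeholder:

  objdump -d --demangle <binary> | grep -F -A 24 '<absl::Mutex::Lock()>:'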

PiperOrigin-RevId: 577790105
Change-Id: I20793534050302ff9f7a20aed93791c088d98562
diff --git a/absl/synchronization/mutex.cc b/absl/synchronization/mutex.cc
index 4703267..40b90b3 100644
--- a/absl/synchronization/mutex.cc
+++ b/absl/synchronization/mutex.cc
@@ -129,11 +129,15 @@
 
 struct ABSL_CACHELINE_ALIGNED MutexGlobals {
   absl::once_flag once;
-  int spinloop_iterations = 0;
+  // Note: this variable is initialized separately in Mutex::LockSlow,
+  // so that Mutex::Lock does not have a stack frame in optimized builds.
+  std::atomic<int> spinloop_iterations{0};
   int32_t mutex_sleep_spins[2] = {};
   absl::Duration mutex_sleep_time;
 };
 
+ABSL_CONST_INIT static MutexGlobals globals;
+
 absl::Duration MeasureTimeToYield() {
   absl::Time before = absl::Now();
   ABSL_INTERNAL_C_SYMBOL(AbslInternalMutexYield)();
@@ -141,33 +145,30 @@
 }
 
 const MutexGlobals& GetMutexGlobals() {
-  ABSL_CONST_INIT static MutexGlobals data;
-  absl::base_internal::LowLevelCallOnce(&data.once, [&]() {
+  absl::base_internal::LowLevelCallOnce(&globals.once, [&]() {
     if (absl::base_internal::NumCPUs() > 1) {
-      // If this is multiprocessor, allow spinning. If the mode is
-      // aggressive then spin many times before yielding. If the mode is
-      // gentle then spin only a few times before yielding. Aggressive spinning
-      // is used to ensure that an Unlock() call, which must get the spin lock
-      // for any thread to make progress gets it without undue delay.
-      data.spinloop_iterations = 1500;
-      data.mutex_sleep_spins[AGGRESSIVE] = 5000;
-      data.mutex_sleep_spins[GENTLE] = 250;
-      data.mutex_sleep_time = absl::Microseconds(10);
+      // If the mode is aggressive then spin many times before yielding.
+      // If the mode is gentle then spin only a few times before yielding.
+      // Aggressive spinning is used to ensure that an Unlock() call, which
+      // must get the spin lock for any thread to make progress, gets it
+      // without undue delay.
+      globals.mutex_sleep_spins[AGGRESSIVE] = 5000;
+      globals.mutex_sleep_spins[GENTLE] = 250;
+      globals.mutex_sleep_time = absl::Microseconds(10);
     } else {
       // If this a uniprocessor, only yield/sleep. Real-time threads are often
       // unable to yield, so the sleep time needs to be long enough to keep
       // the calling thread asleep until scheduling happens.
-      data.spinloop_iterations = 0;
-      data.mutex_sleep_spins[AGGRESSIVE] = 0;
-      data.mutex_sleep_spins[GENTLE] = 0;
-      data.mutex_sleep_time = MeasureTimeToYield() * 5;
-      data.mutex_sleep_time =
-          std::min(data.mutex_sleep_time, absl::Milliseconds(1));
-      data.mutex_sleep_time =
-          std::max(data.mutex_sleep_time, absl::Microseconds(10));
+      globals.mutex_sleep_spins[AGGRESSIVE] = 0;
+      globals.mutex_sleep_spins[GENTLE] = 0;
+      globals.mutex_sleep_time = MeasureTimeToYield() * 5;
+      globals.mutex_sleep_time =
+          std::min(globals.mutex_sleep_time, absl::Milliseconds(1));
+      globals.mutex_sleep_time =
+          std::max(globals.mutex_sleep_time, absl::Microseconds(10));
     }
   });
-  return data;
+  return globals;
 }
 }  // namespace
 
@@ -1487,7 +1488,7 @@
 // Attempt to acquire *mu, and return whether successful.  The implementation
 // may spin for a short while if the lock cannot be acquired immediately.
 static bool TryAcquireWithSpinning(std::atomic<intptr_t>* mu) {
-  int c = GetMutexGlobals().spinloop_iterations;
+  int c = globals.spinloop_iterations.load(std::memory_order_relaxed);
   do {  // do/while somewhat faster on AMD
     intptr_t v = mu->load(std::memory_order_relaxed);
     if ((v & (kMuReader | kMuEvent)) != 0) {
@@ -1507,11 +1508,12 @@
   GraphId id = DebugOnlyDeadlockCheck(this);
   intptr_t v = mu_.load(std::memory_order_relaxed);
   // try fast acquire, then spin loop
-  if ((v & (kMuWriter | kMuReader | kMuEvent)) != 0 ||
-      !mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire,
-                                   std::memory_order_relaxed)) {
+  if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
+      ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(
+          v, kMuWriter | v, std::memory_order_acquire,
+          std::memory_order_relaxed))) {
     // try spin acquire, then slow loop
-    if (!TryAcquireWithSpinning(&this->mu_)) {
+    if (ABSL_PREDICT_FALSE(!TryAcquireWithSpinning(&this->mu_))) {
       this->LockSlow(kExclusive, nullptr, 0);
     }
   }
@@ -1783,6 +1785,22 @@
 // Internal version of LockWhen().  See LockSlowWithDeadline()
 ABSL_ATTRIBUTE_NOINLINE void Mutex::LockSlow(MuHow how, const Condition* cond,
                                              int flags) {
+  // Note: we specifically initialize spinloop_iterations after the first use
+  // in TryAcquireWithSpinning so that the Lock function does not have any
+  // non-tail calls and, consequently, no stack frame. It's fine for
+  // spinloop_iterations to be uninitialized (meaning no spinning) in all
+  // initial uncontended Lock calls and in the first contended call; after
+  // that, spinloop_iterations is properly initialized.
+  if (ABSL_PREDICT_FALSE(
+          globals.spinloop_iterations.load(std::memory_order_relaxed) == 0)) {
+    if (absl::base_internal::NumCPUs() > 1) {
+      // If this is multiprocessor, allow spinning.
+      globals.spinloop_iterations.store(1500, std::memory_order_relaxed);
+    } else {
+      // If this is a uniprocessor, only yield/sleep.
+      globals.spinloop_iterations.store(-1, std::memory_order_relaxed);
+    }
+  }
   ABSL_RAW_CHECK(
       this->LockSlowWithDeadline(how, cond, KernelTimeout::Never(), flags),
       "condition untrue on return from LockSlow");