absl: speed up Mutex::Lock
Currently Mutex::Lock contains a non-inlined, non-tail call chain:
TryAcquireWithSpinning -> GetMutexGlobals -> LowLevelCallOnce -> init closure
This turns the function into a non-leaf function with stack frame allocation
and additional register use. Remove this non-tail call to make the function
a leaf function, and move the spin iteration initialization to LockSlow.
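For illustration only (this sketch is not part of the patch; FastLock, SlowLock
and g_word are made-up names): the shape of the fix is to keep the only
out-of-line call on the hot path in tail position, so the compiler can emit the
happy path as a frameless leaf function.

#include <atomic>
#include <cstdint>

std::atomic<std::intptr_t> g_word{0};  // stand-in for the mutex word

// Contention path: the one-time setup that GetMutexGlobals used to perform,
// and the blocking acquisition, both live here, off the fast path.
__attribute__((noinline)) void SlowLock() { /* ... */ }

void FastLock() {
  std::intptr_t v = g_word.load(std::memory_order_relaxed);
  // Happy path: one CAS and return. The only remaining call is SlowLock in
  // tail position on the unlikely branch (ABSL_PREDICT_FALSE in absl,
  // __builtin_expect here), so nothing has to survive a call and no stack
  // frame is needed.
  if (__builtin_expect(!g_word.compare_exchange_strong(
                           v, v | 1, std::memory_order_acquire,
                           std::memory_order_relaxed),
                       0)) {
    SlowLock();
  }
}

With optimizations on, a compiler can then emit FastLock in the same shape as
the new happy path shown below: no push/pop, no stack adjustment, just the
test, the lock cmpxchg, and ret.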
Current Lock happy path:
00000000001edc20 <absl::Mutex::Lock()>:
1edc20: 55 push %rbp
1edc21: 48 89 e5 mov %rsp,%rbp
1edc24: 53 push %rbx
1edc25: 50 push %rax
1edc26: 48 89 fb mov %rdi,%rbx
1edc29: 48 8b 07 mov (%rdi),%rax
1edc2c: a8 19 test $0x19,%al
1edc2e: 75 0e jne 1edc3e <absl::Mutex::Lock()+0x1e>
1edc30: 48 89 c1 mov %rax,%rcx
1edc33: 48 83 c9 08 or $0x8,%rcx
1edc37: f0 48 0f b1 0b lock cmpxchg %rcx,(%rbx)
1edc3c: 74 42 je 1edc80 <absl::Mutex::Lock()+0x60>
... unhappy path ...
1edc80: 48 83 c4 08 add $0x8,%rsp
1edc84: 5b pop %rbx
1edc85: 5d pop %rbp
1edc86: c3 ret
New Lock happy path:
00000000001eea80 <absl::Mutex::Lock()>:
1eea80: 48 8b 07 mov (%rdi),%rax
1eea83: a8 19 test $0x19,%al
1eea85: 75 0f jne 1eea96 <absl::Mutex::Lock()+0x16>
1eea87: 48 89 c1 mov %rax,%rcx
1eea8a: 48 83 c9 08 or $0x8,%rcx
1eea8e: f0 48 0f b1 0f lock cmpxchg %rcx,(%rdi)
1eea93: 75 01 jne 1eea96 <absl::Mutex::Lock()+0x16>
1eea95: c3 ret
... unhappy path ...
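The other half of the change moves the spin-iteration setup into LockSlow (see
the diff below). A standalone sketch of why the lazy initialization is safe
(illustrative code with made-up names; it assumes, as in the absl sources, that
the spin helper always makes one acquisition attempt and keeps retrying only
while its decremented counter stays positive):

#include <atomic>
#include <cstdint>

std::atomic<int> g_spin_iters{0};  // 0 = not yet initialized

// One acquisition attempt is always made; extra spinning happens only while
// the counter stays positive. So 0 ("not initialized yet") and -1
// ("initialized, uniprocessor") behave identically: try once, do not spin.
bool TryAcquireWithSpin(std::atomic<std::intptr_t>* word) {
  int c = g_spin_iters.load(std::memory_order_relaxed);
  do {
    std::intptr_t v = word->load(std::memory_order_relaxed);
    if ((v & 1) == 0 &&
        word->compare_exchange_strong(v, v | 1, std::memory_order_acquire,
                                      std::memory_order_relaxed)) {
      return true;
    }
  } while (--c > 0);
  return false;
}

void LockSlowPath() {
  // Lazy, racy initialization is fine: concurrent callers store the same
  // value, and until a store happens the only cost is that contended
  // acquisitions skip the spinning phase.
  if (g_spin_iters.load(std::memory_order_relaxed) == 0) {
    const bool multi_cpu = true;  // stand-in for NumCPUs() > 1
    g_spin_iters.store(multi_cpu ? 1500 : -1, std::memory_order_relaxed);
  }
  // ... block until the lock can be acquired ...
}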
PiperOrigin-RevId: 577790105
Change-Id: I20793534050302ff9f7a20aed93791c088d98562
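One prerequisite for the diff below, sketched here for clarity (illustrative
code, not from absl): the MutexGlobals instance moves to namespace scope and
stays ABSL_CONST_INIT, i.e. constant-initialized, so any Lock call can read it
directly without going through GetMutexGlobals or any once-initialization
machinery.

#include <atomic>

// constinit (C++20) is the standard spelling of what ABSL_CONST_INIT
// enforces: the object is initialized at compile/load time, so there is no
// dynamic initializer to order against and nothing to wait for.
constinit std::atomic<int> g_spin_iters{0};  // zero also means "not set yet"

int SpinIterations() {
  // A plain relaxed load; safe even before main() or while other translation
  // units' static initializers are still running.
  return g_spin_iters.load(std::memory_order_relaxed);
}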
diff --git a/absl/synchronization/mutex.cc b/absl/synchronization/mutex.cc
index 4703267..40b90b3 100644
--- a/absl/synchronization/mutex.cc
+++ b/absl/synchronization/mutex.cc
@@ -129,11 +129,15 @@
struct ABSL_CACHELINE_ALIGNED MutexGlobals {
absl::once_flag once;
- int spinloop_iterations = 0;
+ // Note: this variable is initialized separately in Mutex::LockSlow,
+ // so that Mutex::Lock does not have a stack frame in optimized builds.
+ std::atomic<int> spinloop_iterations{0};
int32_t mutex_sleep_spins[2] = {};
absl::Duration mutex_sleep_time;
};
+ABSL_CONST_INIT static MutexGlobals globals;
+
absl::Duration MeasureTimeToYield() {
absl::Time before = absl::Now();
ABSL_INTERNAL_C_SYMBOL(AbslInternalMutexYield)();
@@ -141,33 +145,30 @@
}
const MutexGlobals& GetMutexGlobals() {
- ABSL_CONST_INIT static MutexGlobals data;
- absl::base_internal::LowLevelCallOnce(&data.once, [&]() {
+ absl::base_internal::LowLevelCallOnce(&globals.once, [&]() {
if (absl::base_internal::NumCPUs() > 1) {
- // If this is multiprocessor, allow spinning. If the mode is
- // aggressive then spin many times before yielding. If the mode is
- // gentle then spin only a few times before yielding. Aggressive spinning
- // is used to ensure that an Unlock() call, which must get the spin lock
- // for any thread to make progress gets it without undue delay.
- data.spinloop_iterations = 1500;
- data.mutex_sleep_spins[AGGRESSIVE] = 5000;
- data.mutex_sleep_spins[GENTLE] = 250;
- data.mutex_sleep_time = absl::Microseconds(10);
+ // If the mode is aggressive then spin many times before yielding.
+ // If the mode is gentle then spin only a few times before yielding.
+ // Aggressive spinning is used to ensure that an Unlock() call,
+ // which must get the spin lock for any thread to make progress, gets it
+ // without undue delay.
+ globals.mutex_sleep_spins[AGGRESSIVE] = 5000;
+ globals.mutex_sleep_spins[GENTLE] = 250;
+ globals.mutex_sleep_time = absl::Microseconds(10);
} else {
// If this a uniprocessor, only yield/sleep. Real-time threads are often
// unable to yield, so the sleep time needs to be long enough to keep
// the calling thread asleep until scheduling happens.
- data.spinloop_iterations = 0;
- data.mutex_sleep_spins[AGGRESSIVE] = 0;
- data.mutex_sleep_spins[GENTLE] = 0;
- data.mutex_sleep_time = MeasureTimeToYield() * 5;
- data.mutex_sleep_time =
- std::min(data.mutex_sleep_time, absl::Milliseconds(1));
- data.mutex_sleep_time =
- std::max(data.mutex_sleep_time, absl::Microseconds(10));
+ globals.mutex_sleep_spins[AGGRESSIVE] = 0;
+ globals.mutex_sleep_spins[GENTLE] = 0;
+ globals.mutex_sleep_time = MeasureTimeToYield() * 5;
+ globals.mutex_sleep_time =
+ std::min(globals.mutex_sleep_time, absl::Milliseconds(1));
+ globals.mutex_sleep_time =
+ std::max(globals.mutex_sleep_time, absl::Microseconds(10));
}
});
- return data;
+ return globals;
}
} // namespace
@@ -1487,7 +1488,7 @@
// Attempt to acquire *mu, and return whether successful. The implementation
// may spin for a short while if the lock cannot be acquired immediately.
static bool TryAcquireWithSpinning(std::atomic<intptr_t>* mu) {
- int c = GetMutexGlobals().spinloop_iterations;
+ int c = globals.spinloop_iterations.load(std::memory_order_relaxed);
do { // do/while somewhat faster on AMD
intptr_t v = mu->load(std::memory_order_relaxed);
if ((v & (kMuReader | kMuEvent)) != 0) {
@@ -1507,11 +1508,12 @@
GraphId id = DebugOnlyDeadlockCheck(this);
intptr_t v = mu_.load(std::memory_order_relaxed);
// try fast acquire, then spin loop
- if ((v & (kMuWriter | kMuReader | kMuEvent)) != 0 ||
- !mu_.compare_exchange_strong(v, kMuWriter | v, std::memory_order_acquire,
- std::memory_order_relaxed)) {
+ if (ABSL_PREDICT_FALSE((v & (kMuWriter | kMuReader | kMuEvent)) != 0) ||
+ ABSL_PREDICT_FALSE(!mu_.compare_exchange_strong(
+ v, kMuWriter | v, std::memory_order_acquire,
+ std::memory_order_relaxed))) {
// try spin acquire, then slow loop
- if (!TryAcquireWithSpinning(&this->mu_)) {
+ if (ABSL_PREDICT_FALSE(!TryAcquireWithSpinning(&this->mu_))) {
this->LockSlow(kExclusive, nullptr, 0);
}
}
@@ -1783,6 +1785,22 @@
// Internal version of LockWhen(). See LockSlowWithDeadline()
ABSL_ATTRIBUTE_NOINLINE void Mutex::LockSlow(MuHow how, const Condition* cond,
int flags) {
+ // Note: we specifically initialize spinloop_iterations after the first use
+ // in TryAcquireWithSpinning so that the Lock function does not have any
+ // non-tail calls and, consequently, does not need a stack frame. It's fine
+ // for spinloop_iterations to be uninitialized (meaning no spinning) in all
+ // initial uncontended Lock calls and in the first contended call. After
+ // that, spinloop_iterations is properly initialized.
+ if (ABSL_PREDICT_FALSE(
+ globals.spinloop_iterations.load(std::memory_order_relaxed) == 0)) {
+ if (absl::base_internal::NumCPUs() > 1) {
+ // If this is multiprocessor, allow spinning.
+ globals.spinloop_iterations.store(1500, std::memory_order_relaxed);
+ } else {
+ // If this is a uniprocessor, only yield/sleep.
+ globals.spinloop_iterations.store(-1, std::memory_order_relaxed);
+ }
+ }
ABSL_RAW_CHECK(
this->LockSlowWithDeadline(how, cond, KernelTimeout::Never(), flags),
"condition untrue on return from LockSlow");