Optimize memory orders for weakly ordered architectures.

In `RefCount::Unref()`, `fetch_sub()` with `memory_order_release` rather than
`memory_order_acq_rel`. Then, only in the unexpected path when this was the last
decrement (due to `IsUnique()` called before, this can happen only if another
thread calls `Unref()` at the same time), `atomic_thread_fence()` with
`memory_order_acquire`.

This removes the unnecessary acquire semantics from the expected path.

In `StableDependencyDefault::EnsureAllocatedSlow()`, `compare_exchange_strong()`
with a separate `memory_order_release` for the expected success path and
`memory_order_acquire` for the unexpected failure path (which can happen only
another thread calls `EnsureAllocatd()` at the same time), instead of using
`memory_order_acq_rel` for both.

In the unexpected path there is no change because `compare_exchange_strong()`
implicitly demotes `memory_order_acq_rel` to `memory_order_acquire` there.
This removes the unnecessary acquire semantics from the expected path.

Cosmetics: in `RefCount::{Ref,Unref}()`, replace `if` with `if constexpr`,
and split a complex conditional into separate statements.
PiperOrigin-RevId: 915941386
diff --git a/riegeli/base/ref_count.h b/riegeli/base/ref_count.h
index 1551321..1d2b0d9 100644
--- a/riegeli/base/ref_count.h
+++ b/riegeli/base/ref_count.h
@@ -21,6 +21,7 @@
 #include <type_traits>
 
 #include "absl/base/nullability.h"
+#include "absl/base/optimization.h"
 #include "riegeli/base/ownership.h"
 
 ABSL_POINTERS_DEFAULT_NONNULL
@@ -41,7 +42,7 @@
   template <typename Ownership = ShareOwnership,
             std::enable_if_t<IsOwnership<Ownership>::value, int> = 0>
   void Ref() const {
-    if (std::is_same_v<Ownership, ShareOwnership>) {
+    if constexpr (std::is_same_v<Ownership, ShareOwnership>) {
       ref_count_.fetch_add(1, std::memory_order_relaxed);
     }
   }
@@ -51,17 +52,32 @@
   //
   // Does nothing and returns `false` if `Ownership` is `ShareOwnership`.
   //
-  // When `Unref()` returns `true`, the decrement can be skipped and the actual
-  // value of the reference count is unspecified. This avoids an expensive
+  // When `Unref()` returns `true`, the decrement might be skipped, leaving the
+  // actual value of the reference count unspecified. This avoids an expensive
   // atomic read-modify-write operation, making the last `Unref()` much faster,
   // at the cost of making a non-last `Unref()` a bit slower. This is in
   // contrast to `std::shared_ptr` in libc++ and libstdc++.
   template <typename Ownership = PassOwnership,
             std::enable_if_t<IsOwnership<Ownership>::value, int> = 0>
   bool Unref() const {
-    return std::is_same_v<Ownership, PassOwnership> &&
-           (HasUniqueOwner() ||
-            ref_count_.fetch_sub(1, std::memory_order_acq_rel) == 1);
+    if constexpr (std::is_same_v<Ownership, PassOwnership>) {
+      if (HasUniqueOwner()) return true;
+      if (ABSL_PREDICT_FALSE(
+              ref_count_.fetch_sub(1, std::memory_order_release) == 1)) {
+        // Even though `HasUniqueOwner()` was `false` before, another thread
+        // has just decremented the reference count. This is the last reference
+        // after all.
+#ifdef THREAD_SANITIZER
+        // TSAN does not support `std::atomic_thread_fence()`. Using `load()`
+        // instead is less efficient but also correct.
+        (void)ref_count_.load(std::memory_order_acquire);
+#else
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        return true;
+      }
+    }
+    return false;
   }
 
   // Returns `true` if there is only one owner of the object.
diff --git a/riegeli/base/stable_dependency.h b/riegeli/base/stable_dependency.h
index b0ae3b3..6add8a4 100644
--- a/riegeli/base/stable_dependency.h
+++ b/riegeli/base/stable_dependency.h
@@ -250,8 +250,10 @@
   Dependency<Handle, Manager>* const dep = new Dependency<Handle, Manager>();
   Dependency<Handle, Manager>* other_dep = nullptr;
   if (ABSL_PREDICT_FALSE(!self.dep_.compare_exchange_strong(
-          other_dep, dep, std::memory_order_acq_rel))) {
-    // We lost the race.
+          other_dep, dep, std::memory_order_release,
+          std::memory_order_acquire))) {
+    // Another thread has just called `EnsureAllocated()` and won the race.
+    // `dep` is not needed after all.
     delete dep;
     return *other_dep;
   }