Fix crash in `gil_scoped_acquire` (#5828)

* Add a test reproducing the #5827 crash

Signed-off-by: Rostan Tabet <rtabet@nvidia.com>

* Fix #5827

Signed-off-by: Rostan Tabet <rtabet@nvidia.com>

* Rename PYBIND11_HAS_BARRIER and move it to common.h

Signed-off-by: Rostan Tabet <rtabet@nvidia.com>

* In test_thread.{cpp,py}, rename has_barrier

Signed-off-by: Rostan Tabet <rtabet@nvidia.com>

---------

Signed-off-by: Rostan Tabet <rtabet@nvidia.com>
diff --git a/include/pybind11/detail/common.h b/include/pybind11/detail/common.h
index 07c0943..05d6755 100644
--- a/include/pybind11/detail/common.h
+++ b/include/pybind11/detail/common.h
@@ -103,6 +103,10 @@
 #    define PYBIND11_DTOR_CONSTEXPR
 #endif
 
+#if defined(PYBIND11_CPP20) && defined(__has_include) && __has_include(<barrier>)
+#    define PYBIND11_HAS_STD_BARRIER 1
+#endif
+
 // Compiler version assertions
 #if defined(__INTEL_COMPILER)
 #    if __INTEL_COMPILER < 1800
diff --git a/include/pybind11/gil.h b/include/pybind11/gil.h
index 4222a03..9e799b3 100644
--- a/include/pybind11/gil.h
+++ b/include/pybind11/gil.h
@@ -120,7 +120,11 @@
                 pybind11_fail("scoped_acquire::dec_ref(): internal error!");
             }
 #    endif
+            // Make sure that PyThreadState_Clear is not recursively called by finalizers.
+            // See issue #5827
+            ++tstate->gilstate_counter;
             PyThreadState_Clear(tstate);
+            --tstate->gilstate_counter;
             if (active) {
                 PyThreadState_DeleteCurrent();
             }
diff --git a/tests/test_scoped_critical_section.cpp b/tests/test_scoped_critical_section.cpp
index dc9a69e..7401eb0 100644
--- a/tests/test_scoped_critical_section.cpp
+++ b/tests/test_scoped_critical_section.cpp
@@ -7,8 +7,7 @@
 #include <thread>
 #include <utility>
 
-#if defined(PYBIND11_CPP20) && defined(__has_include) && __has_include(<barrier>)
-#    define PYBIND11_HAS_BARRIER 1
+#if defined(PYBIND11_HAS_STD_BARRIER)
 #    include <barrier>
 #endif
 
@@ -39,7 +38,7 @@
     std::atomic_bool value_{false};
 };
 
-#if defined(PYBIND11_HAS_BARRIER)
+#if defined(PYBIND11_HAS_STD_BARRIER)
 
 // Modifying the C/C++ members of a Python object from multiple threads requires a critical section
 // to ensure thread safety and data integrity.
@@ -259,7 +258,7 @@
     (void) BoolWrapperHandle.ptr(); // suppress unused variable warning
 
     m.attr("has_barrier") =
-#ifdef PYBIND11_HAS_BARRIER
+#ifdef PYBIND11_HAS_STD_BARRIER
         true;
 #else
         false;
diff --git a/tests/test_thread.cpp b/tests/test_thread.cpp
index eabf39a..131bd87 100644
--- a/tests/test_thread.cpp
+++ b/tests/test_thread.cpp
@@ -15,6 +15,10 @@
 #include <chrono>
 #include <thread>
 
+#if defined(PYBIND11_HAS_STD_BARRIER)
+#    include <barrier>
+#endif
+
 namespace py = pybind11;
 
 namespace {
@@ -34,7 +38,6 @@
 } // namespace
 
 TEST_SUBMODULE(thread, m) {
-
     py::class_<IntStruct>(m, "IntStruct").def(py::init([](const int i) { return IntStruct(i); }));
 
     // implicitly_convertible uses loader_life_support when an implicit
@@ -67,6 +70,39 @@
     py::class_<EmptyStruct>(m, "EmptyStruct")
         .def_readonly_static("SharedInstance", &SharedInstance);
 
+#if defined(PYBIND11_HAS_STD_BARRIER)
+    // In the free-threaded build, during PyThreadState_Clear, removing the thread from the biased
+    // reference counting table may call destructors. Make sure that it doesn't crash.
+    m.def("test_pythread_state_clear_destructor", [](py::type cls) {
+        py::handle obj;
+
+        std::barrier barrier{2};
+        std::thread thread1{[&]() {
+            py::gil_scoped_acquire gil;
+            obj = cls().release();
+            barrier.arrive_and_wait();
+        }};
+        std::thread thread2{[&]() {
+            py::gil_scoped_acquire gil;
+            barrier.arrive_and_wait();
+            // ob_ref_shared becomes negative; transition to the queued state
+            obj.dec_ref();
+        }};
+
+        // jthread is not supported by Apple Clang
+        thread1.join();
+        thread2.join();
+    });
+#endif
+
+    m.attr("defined_PYBIND11_HAS_STD_BARRIER") =
+#ifdef PYBIND11_HAS_STD_BARRIER
+        true;
+#else
+        false;
+#endif
+    m.def("acquire_gil", []() { py::gil_scoped_acquire gil_acquired; });
+
     // NOTE: std::string_view also uses loader_life_support to ensure that
     // the string contents remain alive, but that's a C++ 17 feature.
 }
diff --git a/tests/test_thread.py b/tests/test_thread.py
index e9d7baf..d302c38 100644
--- a/tests/test_thread.py
+++ b/tests/test_thread.py
@@ -5,6 +5,7 @@
 
 import pytest
 
+import env
 from pybind11_tests import thread as m
 
 
@@ -66,3 +67,14 @@
         thread.start()
     for thread in threads:
         thread.join()
+
+
+@pytest.mark.skipif(sys.platform.startswith("emscripten"), reason="Requires threads")
+@pytest.mark.skipif(not m.defined_PYBIND11_HAS_STD_BARRIER, reason="no <barrier>")
+@pytest.mark.skipif(env.sys_is_gil_enabled(), reason="Deadlock with the GIL")
+def test_pythread_state_clear_destructor():
+    class Foo:
+        def __del__(self):
+            m.acquire_gil()
+
+    m.test_pythread_state_clear_destructor(Foo)