From bc613e4ecb3c8f6439e666b1ddd13ac533dcef95 Mon Sep 17 00:00:00 2001
From: Brett Simmers
Date: Wed, 17 Jan 2024 17:33:49 -0800
Subject: [PATCH] gh-112175: Add `eval_breaker` to `PyThreadState`

This change adds an `eval_breaker` field to `PyThreadState`, renaming the
existing `eval_breaker` to `interp_eval_breaker` (its uses are explained
further down). The primary motivation is performance in free-threaded builds:
with thread-local eval breakers, we can stop a specific thread (e.g., for an
async exception) without interrupting other threads.

There are still two situations where we want the first available thread to
handle a request:

- Running a garbage collection: In normal builds, we set
  `_PY_GC_SCHEDULED_BIT` on the current thread. In case a thread suspends
  before handling the collection, the bit is copied to and from
  `interp_eval_breaker` on thread suspend and resume, respectively. In a
  free-threaded build, we simply iterate over all threads and set the bit. The
  first thread to check its eval breaker runs the collection, unsetting the
  bit on all threads.

  Free-threaded builds could have multiple threads attempt a GC from one
  trigger if we get very unlucky with thread scheduling. I didn't put any
  protections against this in place because a) the consequences of it
  happening are just that one or more threads will check the GC thresholds
  right after a collection finishes, which won't affect correctness, and b)
  it's incredibly, vanishingly unlikely.

- Pending calls not limited to the main thread (possible since
  python/cpython@757b402ea1c2). This is a little trickier, since the callback
  can be added from any thread, with or without the GIL held. If the targeted
  interpreter's GIL is locked, we signal the holding thread. When a thread is
  resumed, its `_PY_CALLS_TO_DO` bit is derived from the source of truth for
  pending calls (one of two `_pending_calls` structs). This handles situations
  where no thread held the GIL when the call was first added, or where the
  active thread did not handle the call before releasing the GIL. In a
  free-threaded build, all threads are signaled, similar to scheduling a GC.

The source of truth for the global instrumentation version is still in
`interp_eval_breaker`, in both normal and free-threaded builds. Threads
usually read the version from their local `eval_breaker`, where it continues
to be colocated with the eval breaker bits, and the method for keeping it up
to date depends on the build type. All builds first update the version in
`interp_eval_breaker`, and then:

- Normal builds update the version in the current thread's `eval_breaker`.
  When a thread takes the GIL, it copies the current version from
  `interp_eval_breaker` as part of the same operation that copies
  `_PY_GC_SCHEDULED_BIT`.

- Free-threaded builds again iterate over all threads in the current
  interpreter, updating the version on each one.

Instrumentation (and the specializing interpreter more generally) will need
more work to be compatible with free-threaded builds, so these changes are
just intended to maintain the status quo in normal builds for now.

Other notable changes are:

- The `_PY_*_BIT` macros now expand to the actual bit being set, rather than
  the bit's index. I think this is simpler overall. I also moved their
  definitions from `pycore_ceval.h` to `pycore_pystate.h`, since their main
  usage is on `PyThreadState`s now.

- Most manipulations of `eval_breaker` are done with a new pair of functions:
  `_PyThreadState_Signal()` and `_PyThreadState_Unsignal()`.
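  As a condensed before/after illustration (both versions appear in full in
  the header hunks below; the old helper's assert and early-exit check are
  omitted here for brevity):

  ```c
  // Before: one helper takes a bit index plus a 0/1 value, so it needs a
  // loop around an atomic compare/exchange to splice the bit into the word.
  static inline void
  _Py_set_eval_breaker_bit(PyInterpreterState *interp, uint32_t bit, uint32_t set)
  {
      uintptr_t to_set = set << bit;
      uintptr_t mask = ((uintptr_t)1) << bit;
      uintptr_t old = _Py_atomic_load_uintptr(&interp->ceval.eval_breaker);
      uintptr_t new;
      do {
          new = (old & ~mask) | to_set;
      } while (!_Py_atomic_compare_exchange_uintptr(&interp->ceval.eval_breaker,
                                                    &old, new));
  }

  // After: dedicated set/unset helpers, each a single atomic or/and.
  static inline void
  _PyThreadState_Signal(PyThreadState *tstate, uintptr_t bit)
  {
      _Py_atomic_or_uintptr(&tstate->eval_breaker, bit);
  }

  static inline void
  _PyThreadState_Unsignal(PyThreadState *tstate, uintptr_t bit)
  {
      _Py_atomic_and_uintptr(&tstate->eval_breaker, ~bit);
  }
  ```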
  Having two separate functions to set/unset a bit, rather than one function
  that takes the bit value to use, lets us use a single atomic `or`/`and`
  instead of a loop around an atomic compare/exchange like the old
  `_Py_set_eval_breaker_bit` function.

Existing tests provide pretty good coverage for most of this functionality.
The one new test I added is to make sure a GC still happens if a thread
schedules it and then drops the GIL before the GC runs. I don't love how
complicated this test ended up, so I'm open to other ideas for how to test
this (or other things to test in general).
---
 Include/cpython/pystate.h             |   5 +
 Include/internal/pycore_ceval.h       |  42 +----
 Include/internal/pycore_ceval_state.h |  13 +-
 Include/internal/pycore_gc.h          |   3 +-
 Include/internal/pycore_pystate.h     |  36 ++++
 Include/internal/pycore_runtime.h     |   3 +
 Lib/test/test_gc.py                   |  41 +++++
 Modules/_testinternalcapi.c           | 114 +++++++++++++
 Modules/signalmodule.c                |  10 +-
 Python/bytecodes.c                    |   7 +-
 Python/ceval.c                        |   2 +-
 Python/ceval_gil.c                    | 227 ++++++++++++++++----------
 Python/ceval_macros.h                 |   2 +-
 Python/executor_cases.c.h             |   2 +-
 Python/gc.c                           |  11 +-
 Python/gc_free_threading.c            |   7 +-
 Python/generated_cases.c.h            |   6 +-
 Python/instrumentation.c              |  43 +++--
 Python/pylifecycle.c                  |   1 +
 Python/pystate.c                      |  35 +++-
 20 files changed, 441 insertions(+), 169 deletions(-)

diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h
index 9bc8758e72bd8f..8d6c3ce29c30ec 100644
--- a/Include/cpython/pystate.h
+++ b/Include/cpython/pystate.h
@@ -68,6 +68,11 @@ struct _ts {
     PyThreadState *next;
     PyInterpreterState *interp;
 
+    /* The global instrumentation version in high bits, plus flags indicating
+       when to break out of the interpreter loop in lower bits. See details
+       in pycore_pystate.h. */
+    uintptr_t eval_breaker;
+
     struct {
         /* Has been initialized to a safe state.
diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h
index a66af1389541dd..8f81df5e3a7ce9 100644
--- a/Include/internal/pycore_ceval.h
+++ b/Include/internal/pycore_ceval.h
@@ -42,7 +42,7 @@ PyAPI_FUNC(int) _PyEval_MakePendingCalls(PyThreadState *);
 extern void _Py_FinishPendingCalls(PyThreadState *tstate);
 extern void _PyEval_InitState(PyInterpreterState *);
 
-extern void _PyEval_SignalReceived(PyInterpreterState *interp);
+extern void _PyEval_SignalReceived(void);
 
 // bitwise flags:
 #define _Py_PENDING_MAINTHREADONLY 1
@@ -55,7 +55,6 @@ PyAPI_FUNC(int) _PyEval_AddPendingCall(
     void *arg,
     int flags);
 
-extern void _PyEval_SignalAsyncExc(PyInterpreterState *interp);
 #ifdef HAVE_FORK
 extern PyStatus _PyEval_ReInitThreads(PyThreadState *tstate);
 #endif
@@ -181,8 +180,9 @@ extern struct _PyInterpreterFrame* _PyEval_GetFrame(void);
 extern PyObject* _Py_MakeCoro(PyFunctionObject *func);
 
 /* Handle signals, pending calls, GIL drop request
-   and asynchronous exception */
-extern int _Py_HandlePending(PyThreadState *tstate);
+   and asynchronous exception.
+   Export for '_testinternalcapi' shared extension.
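+   (The new test helpers in _testinternalcapi.c call it directly to clear
+   pending eval breaker flags before scheduling a GC.)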
*/ +PyAPI_FUNC(int) _Py_HandlePending(PyThreadState *tstate); extern PyObject * _PyEval_GetFrameLocals(void); @@ -200,40 +200,6 @@ int _PyEval_UnpackIterable(PyThreadState *tstate, PyObject *v, int argcnt, int a void _PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); -#define _PY_GIL_DROP_REQUEST_BIT 0 -#define _PY_SIGNALS_PENDING_BIT 1 -#define _PY_CALLS_TO_DO_BIT 2 -#define _PY_ASYNC_EXCEPTION_BIT 3 -#define _PY_GC_SCHEDULED_BIT 4 -#define _PY_EVAL_PLEASE_STOP_BIT 5 - -/* Reserve a few bits for future use */ -#define _PY_EVAL_EVENTS_BITS 8 -#define _PY_EVAL_EVENTS_MASK ((1 << _PY_EVAL_EVENTS_BITS)-1) - -static inline void -_Py_set_eval_breaker_bit(PyInterpreterState *interp, uint32_t bit, uint32_t set) -{ - assert(set == 0 || set == 1); - uintptr_t to_set = set << bit; - uintptr_t mask = ((uintptr_t)1) << bit; - uintptr_t old = _Py_atomic_load_uintptr(&interp->ceval.eval_breaker); - if ((old & mask) == to_set) { - return; - } - uintptr_t new; - do { - new = (old & ~mask) | to_set; - } while (!_Py_atomic_compare_exchange_uintptr(&interp->ceval.eval_breaker, &old, new)); -} - -static inline bool -_Py_eval_breaker_bit_is_set(PyInterpreterState *interp, int32_t bit) -{ - return _Py_atomic_load_uintptr_relaxed(&interp->ceval.eval_breaker) & (((uintptr_t)1) << bit); -} - - #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_ceval_state.h b/Include/internal/pycore_ceval_state.h index 28738980eb49be..d398ede5664cd3 100644 --- a/Include/internal/pycore_ceval_state.h +++ b/Include/internal/pycore_ceval_state.h @@ -78,11 +78,14 @@ struct _ceval_runtime_state { struct _ceval_state { - /* This single variable consolidates all requests to break out of - * the fast path in the eval loop. - * It is by far the hottest field in this struct and - * should be placed at the beginning. */ - uintptr_t eval_breaker; + /* This single variable holds the global instrumentation version and some + * interpreter-global requests to break out of the fast path in the eval + * loop. PyThreadState also contains an eval_breaker, which is the source + * of truth when a thread is running. + * + * It is by far the hottest field in this struct and should be placed at + * the beginning. */ + uintptr_t interp_eval_breaker; /* Avoid false sharing */ int64_t padding[7]; int recursion_limit; diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 8d0bc2a218e48d..38dcb1feabf7e0 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -287,7 +287,8 @@ extern void _PySlice_ClearCache(_PyFreeListState *state); extern void _PyDict_ClearFreeList(_PyFreeListState *state, int is_finalization); extern void _PyAsyncGen_ClearFreeLists(_PyFreeListState *state, int is_finalization); extern void _PyContext_ClearFreeList(_PyFreeListState *state, int is_finalization); -extern void _Py_ScheduleGC(PyInterpreterState *interp); +// Export for '_testinternalcapi' shared extension. 
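+// In normal builds this sets _PY_GC_SCHEDULED_BIT on the given thread's
+// eval breaker; free-threaded builds set the bit on every thread in the
+// interpreter (see gc.c and gc_free_threading.c).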
+PyAPI_FUNC(void) _Py_ScheduleGC(PyThreadState *tstate);
 extern void _Py_RunGC(PyThreadState *tstate);
 
 #ifdef __cplusplus
 }
 #endif
diff --git a/Include/internal/pycore_pystate.h b/Include/internal/pycore_pystate.h
index 289ef28f0dd9a9..41d1150fee2b3a 100644
--- a/Include/internal/pycore_pystate.h
+++ b/Include/internal/pycore_pystate.h
@@ -282,6 +282,42 @@ static inline _PyFreeListState* _PyFreeListState_GET(void)
 #endif
 }
 
+/* Bits that can be set in PyThreadState.eval_breaker */
+#define _PY_GIL_DROP_REQUEST_BIT (1U << 0)
+#define _PY_SIGNALS_PENDING_BIT (1U << 1)
+#define _PY_CALLS_TO_DO_BIT (1U << 2)
+#define _PY_ASYNC_EXCEPTION_BIT (1U << 3)
+#define _PY_GC_SCHEDULED_BIT (1U << 4)
+#define _PY_EVAL_PLEASE_STOP_BIT (1U << 5)
+
+/* Reserve a few bits for future use */
+#define _PY_EVAL_EVENTS_BITS 8
+#define _PY_EVAL_EVENTS_MASK ((1U << _PY_EVAL_EVENTS_BITS)-1)
+
+static inline void
+_PyThreadState_Signal(PyThreadState *tstate, uintptr_t bit)
+{
+    _Py_atomic_or_uintptr(&tstate->eval_breaker, bit);
+}
+
+static inline void
+_PyThreadState_Unsignal(PyThreadState *tstate, uintptr_t bit)
+{
+    _Py_atomic_and_uintptr(&tstate->eval_breaker, ~bit);
+}
+
+static inline int
+_PyThreadState_IsSignalled(PyThreadState *tstate, uintptr_t bit)
+{
+    uintptr_t b = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker);
+    return (b & bit) != 0;
+}
+
+// Free-threaded builds use these functions to set or unset a bit on all
+// threads in the given interpreter.
+void _PyInterpreterState_SignalAll(PyInterpreterState *interp, uintptr_t bit);
+void _PyInterpreterState_UnsignalAll(PyInterpreterState *interp, uintptr_t bit);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Include/internal/pycore_runtime.h b/Include/internal/pycore_runtime.h
index 7c705d1224f915..0c9c59e85b2fcf 100644
--- a/Include/internal/pycore_runtime.h
+++ b/Include/internal/pycore_runtime.h
@@ -191,7 +191,10 @@ typedef struct pyruntimestate {
         int64_t next_id;
     } interpreters;
 
+    /* Platform-specific identifier and PyThreadState, respectively, for the
+       main thread in the main interpreter. */
     unsigned long main_thread;
+    PyThreadState *main_tstate;
 
     /* ---------- IMPORTANT ---------------------------
      The fields above this line are declared as early as
diff --git a/Lib/test/test_gc.py b/Lib/test/test_gc.py
index b01f344cb14a1a..5bd827d1163094 100644
--- a/Lib/test/test_gc.py
+++ b/Lib/test/test_gc.py
@@ -6,6 +6,10 @@
 from test.support.os_helper import temp_dir, TESTFN, unlink
 from test.support.script_helper import assert_python_ok, make_script
 from test.support import threading_helper
+try:
+    import _testinternalcapi
+except ImportError:
+    _testinternalcapi = None
 
 import gc
 import sys
@@ -1418,6 +1422,43 @@ def test_ast_fini(self):
         assert_python_ok("-c", code)
 
 
+class GCSchedulingTests(unittest.TestCase):
+    @unittest.skipIf(_testinternalcapi is None,
+                     "Requires functions from _testinternalcapi")
+    @threading_helper.requires_working_threading()
+    def test_gc_schedule_before_thread_switch(self):
+        # Ensure that a scheduled collection is not lost due to thread
+        # switching. Most of the work happens in helper functions in
+        # _testinternalcapi.
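+        #
+        # The short version: the helper thread schedules a GC and releases
+        # the GIL without checking its own eval breaker; this thread must
+        # then observe _PY_GC_SCHEDULED_BIT and let the collection run.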
+
+        class Cycle:
+            def __init__(self):
+                self._self = self
+
+        thresholds = gc.get_threshold()
+        gc.enable()
+
+        try:
+            state = _testinternalcapi.schedule_gc_new_state()
+
+            def thread1():
+                _testinternalcapi.schedule_gc_do_schedule(state)
+
+            gc.set_threshold(1)
+            threads = [threading.Thread(target=thread1)]
+            with threading_helper.start_threads(threads):
+                r = weakref.ref(Cycle())
+                _testinternalcapi.schedule_gc_do_wait(state)
+
+            # Ensure that at least one GC has happened: the loop below
+            # executes enough Python code for the eval breaker to be
+            # checked, giving the scheduled collection a chance to run.
+            for i in range(5):
+                self.assertEqual(1, 1)
+            self.assertIsNone(r())
+        finally:
+            gc.disable()
+            gc.set_threshold(*thresholds)
+
+
 def setUpModule():
     global enabled, debug
     enabled = gc.isenabled()
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 0bb739b5398b11..5f64bf5c04493e 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -1650,6 +1650,117 @@ get_rare_event_counters(PyObject *self, PyObject *type)
     );
 }
 
+// The schedule_gc_* functions work together to test GC timing and the eval
+// breaker, when used by
+// test_gc.py:GCSchedulingTests.test_gc_schedule_before_thread_switch().
+//
+// The expected sequence of events is:
+// - thread 2 waits for thread 1 to be ready
+// - thread 1 waits for thread 2 to be ready
+//   (both threads are now at known locations in their respective C functions)
+// - thread 1 clears out pending eval breaker flags
+// - thread 2 checks that a GC is not scheduled
+// - thread 1 schedules a GC and releases the GIL without checking its eval
+//   breaker
+// - thread 2 checks that a GC is scheduled and returns
+// - thread 1 sees that thread 2 is done and returns, allowing Python code
+//   to run again
+typedef enum {
+    SCHEDULE_GC_INIT,
+    SCHEDULE_GC_THREAD1_READY,
+    SCHEDULE_GC_THREAD2_READY,
+    SCHEDULE_GC_THREAD1_CLEARED,
+    SCHEDULE_GC_THREAD2_VERIFIED,
+    SCHEDULE_GC_THREAD1_SCHEDULED,
+    SCHEDULE_GC_THREAD2_DONE,
+
+    SCHEDULE_GC_STOP,
+} schedule_gc_state;
+
+static void
+schedule_gc_state_destructor(PyObject *capsule)
+{
+    void *state = PyCapsule_GetPointer(capsule, NULL);
+    assert(state != NULL);
+    free(state);
+}
+
+static PyObject *
+schedule_gc_new_state(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    schedule_gc_state *state = malloc(sizeof(schedule_gc_state));
+    if (state == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Failed to allocate state");
+        return NULL;
+    }
+    *state = SCHEDULE_GC_INIT;
+    return PyCapsule_New(state, NULL, schedule_gc_state_destructor);
+}
+
+// Repeatedly release the GIL until the desired state appears in *state.
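+// PyEval_RestoreThread(PyEval_SaveThread()) drops and immediately retakes
+// the GIL, giving the other thread a chance to run and advance *state. If
+// *state becomes SCHEDULE_GC_STOP, the other thread hit an error, so give
+// up and return None from the calling function.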
+#define SCHEDULE_GC_WAIT_FOR(desired) \ + do { \ + while (*state != desired) { \ + if (*state == SCHEDULE_GC_STOP) { \ + Py_RETURN_NONE; \ + } \ + PyEval_RestoreThread(PyEval_SaveThread()); \ + } \ + } while (0) + +static PyObject * +schedule_gc_do_schedule(PyObject *self, PyObject *capsule) +{ + PyThreadState *tstate = PyThreadState_Get(); + schedule_gc_state *state = PyCapsule_GetPointer(capsule, NULL); + assert(state != NULL); + + *state = SCHEDULE_GC_THREAD1_READY; + SCHEDULE_GC_WAIT_FOR(SCHEDULE_GC_THREAD2_READY); + + if (_Py_HandlePending(tstate) < 0) { + *state = SCHEDULE_GC_STOP; + return NULL; + } + *state = SCHEDULE_GC_THREAD1_CLEARED; + SCHEDULE_GC_WAIT_FOR(SCHEDULE_GC_THREAD2_VERIFIED); + + _Py_ScheduleGC(tstate); + *state = SCHEDULE_GC_THREAD1_SCHEDULED; + SCHEDULE_GC_WAIT_FOR(SCHEDULE_GC_THREAD2_DONE); + + Py_RETURN_NONE; +} + +static PyObject * +schedule_gc_do_wait(PyObject *self, PyObject *capsule) +{ + PyThreadState *tstate = PyThreadState_Get(); + schedule_gc_state *state = PyCapsule_GetPointer(capsule, NULL); + assert(state != NULL); + + SCHEDULE_GC_WAIT_FOR(SCHEDULE_GC_THREAD1_READY); + + *state = SCHEDULE_GC_THREAD2_READY; + SCHEDULE_GC_WAIT_FOR(SCHEDULE_GC_THREAD1_CLEARED); + + if (_PyThreadState_IsSignalled(tstate, _PY_GC_SCHEDULED_BIT)) { + PyErr_SetString(PyExc_AssertionError, + "GC_SCHEDULED_BIT unexpectedly set"); + return NULL; + } + *state = SCHEDULE_GC_THREAD2_VERIFIED; + SCHEDULE_GC_WAIT_FOR(SCHEDULE_GC_THREAD1_SCHEDULED); + + if (!_PyThreadState_IsSignalled(tstate, _PY_GC_SCHEDULED_BIT)) { + PyErr_SetString(PyExc_AssertionError, + "GC_SCHEDULED_BIT not carried over from thread 1"); + return NULL; + } + *state = SCHEDULE_GC_THREAD2_DONE; + // Let the GC run naturally once we've returned to Python. + + Py_RETURN_NONE; +} + #ifdef Py_GIL_DISABLED static PyObject * @@ -1727,6 +1838,9 @@ static PyMethodDef module_functions[] = { _TESTINTERNALCAPI_TEST_LONG_NUMBITS_METHODDEF {"get_type_module_name", get_type_module_name, METH_O}, {"get_rare_event_counters", get_rare_event_counters, METH_NOARGS}, + {"schedule_gc_new_state", schedule_gc_new_state, METH_NOARGS}, + {"schedule_gc_do_schedule", schedule_gc_do_schedule, METH_O}, + {"schedule_gc_do_wait", schedule_gc_do_wait, METH_O}, #ifdef Py_GIL_DISABLED {"py_thread_id", get_py_thread_id, METH_NOARGS}, #endif diff --git a/Modules/signalmodule.c b/Modules/signalmodule.c index 394a997b20c06d..0969284d3e2af0 100644 --- a/Modules/signalmodule.c +++ b/Modules/signalmodule.c @@ -276,11 +276,7 @@ trip_signal(int sig_num) cleared in PyErr_CheckSignals() before .tripped. */ _Py_atomic_store_int(&is_tripped, 1); - /* Signals are always handled by the main interpreter */ - PyInterpreterState *interp = _PyInterpreterState_Main(); - - /* Notify ceval.c */ - _PyEval_SignalReceived(interp); + _PyEval_SignalReceived(); /* And then write to the wakeup fd *after* setting all the globals and doing the _PyEval_SignalReceived. We used to write to the wakeup fd @@ -303,6 +299,7 @@ trip_signal(int sig_num) int fd = wakeup.fd; if (fd != INVALID_FD) { + PyInterpreterState *interp = _PyInterpreterState_Main(); unsigned char byte = (unsigned char)sig_num; #ifdef MS_WINDOWS if (wakeup.use_send) { @@ -1770,8 +1767,7 @@ PyErr_CheckSignals(void) Python code to ensure signals are handled. 
Checking for the GC here allows long running native code to clean cycles created using the C-API even if it doesn't run the evaluation loop */ - if (_Py_eval_breaker_bit_is_set(tstate->interp, _PY_GC_SCHEDULED_BIT)) { - _Py_set_eval_breaker_bit(tstate->interp, _PY_GC_SCHEDULED_BIT, 0); + if (_PyThreadState_IsSignalled(tstate, _PY_GC_SCHEDULED_BIT)) { _Py_RunGC(tstate); } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 6fb4d719e43991..6ba8667fa50835 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -8,7 +8,6 @@ #include "Python.h" #include "pycore_abstract.h" // _PyIndex_Check() -#include "pycore_ceval.h" // _PyEval_SignalAsyncExc() #include "pycore_code.h" #include "pycore_emscripten_signal.h" // _Py_CHECK_EMSCRIPTEN_SIGNALS #include "pycore_function.h" @@ -144,7 +143,7 @@ dummy_func( TIER_ONE_ONLY assert(frame == tstate->current_frame); uintptr_t global_version = - _Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker) & + _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; uintptr_t code_version = _PyFrame_GetCode(frame)->_co_instrumentation_version; assert((code_version & 255) == 0); @@ -166,14 +165,14 @@ dummy_func( DEOPT_IF(_Py_emscripten_signal_clock == 0); _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif - uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker); + uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); uintptr_t version = _PyFrame_GetCode(frame)->_co_instrumentation_version; assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version); } inst(INSTRUMENTED_RESUME, (--)) { - uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker) & ~_PY_EVAL_EVENTS_MASK; + uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; uintptr_t code_version = _PyFrame_GetCode(frame)->_co_instrumentation_version; if (code_version != global_version) { if (_Py_Instrument(_PyFrame_GetCode(frame), tstate->interp)) { diff --git a/Python/ceval.c b/Python/ceval.c index 4f208009086191..28a9499694c3de 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -5,7 +5,7 @@ #include "Python.h" #include "pycore_abstract.h" // _PyIndex_Check() #include "pycore_call.h" // _PyObject_CallNoArgs() -#include "pycore_ceval.h" // _PyEval_SignalAsyncExc() +#include "pycore_ceval.h" #include "pycore_code.h" #include "pycore_emscripten_signal.h" // _Py_CHECK_EMSCRIPTEN_SIGNALS #include "pycore_function.h" diff --git a/Python/ceval_gil.c b/Python/ceval_gil.c index ad90359318761a..b5de17abd1cf2f 100644 --- a/Python/ceval_gil.c +++ b/Python/ceval_gil.c @@ -56,60 +56,78 @@ #define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL) #endif -/* bpo-40010: eval_breaker should be recomputed if there - is a pending signal: signal received by another thread which cannot - handle signals. - Similarly, we set CALLS_TO_DO and ASYNC_EXCEPTION to match the thread. -*/ +// Atomically copy the bits indicated by mask between two eval breakers. 
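+// Only the bits in mask are replaced in *to; a compare/exchange loop is
+// needed because the other bits of *to may be modified concurrently by
+// other threads.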
static inline void -update_eval_breaker_from_thread(PyInterpreterState *interp, PyThreadState *tstate) +copy_eval_breaker_bits(uintptr_t *from, uintptr_t *to, uintptr_t mask) { - if (tstate == NULL) { + uintptr_t from_bits = _Py_atomic_load_uintptr_relaxed(from) & mask; + uintptr_t old_value = _Py_atomic_load_uintptr_relaxed(to); + uintptr_t to_bits = old_value & mask; + if (from_bits == to_bits) { return; } - if (_Py_IsMainThread()) { - int32_t calls_to_do = _Py_atomic_load_int32_relaxed( - &_PyRuntime.ceval.pending_mainthread.calls_to_do); - if (calls_to_do) { - _Py_set_eval_breaker_bit(interp, _PY_CALLS_TO_DO_BIT, 1); - } - if (_Py_ThreadCanHandleSignals(interp)) { - if (_Py_atomic_load_int(&_PyRuntime.signals.is_tripped)) { - _Py_set_eval_breaker_bit(interp, _PY_SIGNALS_PENDING_BIT, 1); - } - } - } - if (tstate->async_exc != NULL) { - _Py_set_eval_breaker_bit(interp, _PY_ASYNC_EXCEPTION_BIT, 1); - } + uintptr_t new_value; + do { + new_value = (old_value & ~mask) | from_bits; + } while (!_Py_atomic_compare_exchange_uintptr(to, &old_value, new_value)); } +// When attaching a thread, set the global instrumentation version, +// _PY_CALLS_TO_DO_BIT, and _PY_GC_SCHEDULED_BIT to match the current state of +// the interpreter. static inline void -SET_GIL_DROP_REQUEST(PyInterpreterState *interp) +update_thread_eval_breaker(PyInterpreterState *interp, PyThreadState *tstate) { - _Py_set_eval_breaker_bit(interp, _PY_GIL_DROP_REQUEST_BIT, 1); -} +#ifdef Py_GIL_DISABLED + // Free-threaded builds eagerly update the eval_breaker on *all* threads as + // needed, so this function doesn't apply. + return; +#endif + if (tstate == NULL) { + return; + } -static inline void -RESET_GIL_DROP_REQUEST(PyInterpreterState *interp) -{ - _Py_set_eval_breaker_bit(interp, _PY_GIL_DROP_REQUEST_BIT, 0); -} + int32_t calls_to_do = _Py_atomic_load_int32_relaxed( + &interp->ceval.pending.calls_to_do); + if (calls_to_do) { + _PyThreadState_Signal(tstate, _PY_CALLS_TO_DO_BIT); + } + else if (_Py_IsMainThread()) { + calls_to_do = _Py_atomic_load_int32_relaxed( + &_PyRuntime.ceval.pending_mainthread.calls_to_do); + if (calls_to_do) { + _PyThreadState_Signal(tstate, _PY_CALLS_TO_DO_BIT); + } + } + // _PY_CALLS_TO_DO_BIT was derived from other state above, so the only bits + // we copy from our interpreter's eval_breaker are the instrumentation + // version number and GC bit. + const uintptr_t mask = ~_PY_EVAL_EVENTS_MASK | _PY_GC_SCHEDULED_BIT; + copy_eval_breaker_bits(&interp->ceval.interp_eval_breaker, + &tstate->eval_breaker, + mask); +} +// When detaching a thread, transfer _PY_GC_SCHEDULED_BIT to its interpreter, +// in case a GC was scheduled but not processed yet. static inline void -SIGNAL_PENDING_CALLS(PyInterpreterState *interp) -{ - _Py_set_eval_breaker_bit(interp, _PY_CALLS_TO_DO_BIT, 1); -} +update_interp_eval_breaker(PyThreadState *tstate, PyInterpreterState *interp) { +#ifdef Py_GIL_DISABLED + // Free-threaded builds eagerly update the eval_breaker on *all* threads as + // needed, so this function doesn't apply. 
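+    // (See _PyInterpreterState_SignalAll() in pystate.c.)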
+ return; +#endif + if (tstate == NULL) { + return; + } -static inline void -UNSIGNAL_PENDING_CALLS(PyInterpreterState *interp) -{ - _Py_set_eval_breaker_bit(interp, _PY_CALLS_TO_DO_BIT, 0); + copy_eval_breaker_bits(&tstate->eval_breaker, + &interp->ceval.interp_eval_breaker, + _PY_GC_SCHEDULED_BIT); } /* @@ -240,6 +258,7 @@ drop_gil(PyInterpreterState *interp, PyThreadState *tstate) } MUTEX_LOCK(gil->mutex); + update_interp_eval_breaker(tstate, interp); _Py_ANNOTATE_RWLOCK_RELEASED(&gil->locked, /*is_write=*/1); _Py_atomic_store_int_relaxed(&gil->locked, 0); COND_SIGNAL(gil->cond); @@ -254,13 +273,14 @@ drop_gil(PyInterpreterState *interp, PyThreadState *tstate) the GIL, and that's the only time we might delete the interpreter, so checking tstate first prevents the crash. See https://github.com/python/cpython/issues/104341. */ - if (tstate != NULL && _Py_eval_breaker_bit_is_set(interp, _PY_GIL_DROP_REQUEST_BIT)) { + if (tstate != NULL && + _PyThreadState_IsSignalled(tstate, _PY_GIL_DROP_REQUEST_BIT)) { MUTEX_LOCK(gil->switch_mutex); /* Not switched yet => wait */ if (((PyThreadState*)_Py_atomic_load_ptr_relaxed(&gil->last_holder)) == tstate) { assert(_PyThreadState_CheckConsistency(tstate)); - RESET_GIL_DROP_REQUEST(tstate->interp); + _PyThreadState_Unsignal(tstate, _PY_GIL_DROP_REQUEST_BIT); /* NOTE: if COND_WAIT does not atomically start waiting when releasing the mutex, another thread can run through, take the GIL and drop it again, and reset the condition @@ -321,6 +341,8 @@ take_gil(PyThreadState *tstate) _Py_atomic_load_int_relaxed(&gil->locked) && gil->switch_number == saved_switchnum) { + PyThreadState *holder_tstate = + (PyThreadState*)_Py_atomic_load_ptr_relaxed(&gil->last_holder); if (_PyThreadState_MustExit(tstate)) { MUTEX_UNLOCK(gil->mutex); // gh-96387: If the loop requested a drop request in a previous @@ -330,13 +352,13 @@ take_gil(PyThreadState *tstate) // may have to request again a drop request (iterate one more // time). if (drop_requested) { - RESET_GIL_DROP_REQUEST(interp); + _PyThreadState_Unsignal(holder_tstate, _PY_GIL_DROP_REQUEST_BIT); } PyThread_exit_thread(); } assert(_PyThreadState_CheckConsistency(tstate)); - SET_GIL_DROP_REQUEST(interp); + _PyThreadState_Signal(holder_tstate, _PY_GIL_DROP_REQUEST_BIT); drop_requested = 1; } } @@ -369,13 +391,15 @@ take_gil(PyThreadState *tstate) in take_gil() while the main thread called wait_for_thread_shutdown() from Py_Finalize(). */ MUTEX_UNLOCK(gil->mutex); - drop_gil(interp, tstate); + /* Passing NULL to drop_gil() indicates that this thread is about to + terminate and will never hold the GIL again. */ + drop_gil(interp, NULL); PyThread_exit_thread(); } assert(_PyThreadState_CheckConsistency(tstate)); - RESET_GIL_DROP_REQUEST(interp); - update_eval_breaker_from_thread(interp, tstate); + _PyThreadState_Unsignal(tstate, _PY_GIL_DROP_REQUEST_BIT); + update_thread_eval_breaker(interp, tstate); MUTEX_UNLOCK(gil->mutex); @@ -590,15 +614,6 @@ _PyEval_ReInitThreads(PyThreadState *tstate) } #endif -/* This function is used to signal that async exceptions are waiting to be - raised. 
*/ - -void -_PyEval_SignalAsyncExc(PyInterpreterState *interp) -{ - _Py_set_eval_breaker_bit(interp, _PY_ASYNC_EXCEPTION_BIT, 1); -} - PyThreadState * PyEval_SaveThread(void) { @@ -646,11 +661,9 @@ PyEval_RestoreThread(PyThreadState *tstate) */ void -_PyEval_SignalReceived(PyInterpreterState *interp) +_PyEval_SignalReceived() { - if (_Py_ThreadCanHandleSignals(interp)) { - _Py_set_eval_breaker_bit(interp, _PY_SIGNALS_PENDING_BIT, 1); - } + _PyThreadState_Signal(_PyRuntime.main_tstate, _PY_SIGNALS_PENDING_BIT); } /* Push one item onto the queue while holding the lock. */ @@ -702,6 +715,24 @@ _pop_pending_call(struct _pending_calls *pending, } } +static void +signal_active_thread(PyInterpreterState *interp, uintptr_t bit) +{ + struct _gil_runtime_state *gil = interp->ceval.gil; + + // If a thread from the targeted interpreter is holding the GIL, signal + // that thread. Otherwise, the next thread to run from the targeted + // interpreter will have its bit set as part of taking the GIL. + MUTEX_LOCK(gil->mutex); + if (_Py_atomic_load_int_relaxed(&gil->locked)) { + PyThreadState *holder = (PyThreadState*)_Py_atomic_load_ptr_relaxed(&gil->last_holder); + if (holder->interp == interp) { + _PyThreadState_Signal(holder, bit); + } + } + MUTEX_UNLOCK(gil->mutex); +} + /* This implementation is thread-safe. It allows scheduling to be made from any thread, and even from an executing callback. @@ -711,10 +742,9 @@ int _PyEval_AddPendingCall(PyInterpreterState *interp, _Py_pending_call_func func, void *arg, int flags) { - assert(!(flags & _Py_PENDING_MAINTHREADONLY) - || _Py_IsMainInterpreter(interp)); struct _pending_calls *pending = &interp->ceval.pending; - if (flags & _Py_PENDING_MAINTHREADONLY) { + int main_only = (flags & _Py_PENDING_MAINTHREADONLY) != 0; + if (main_only) { /* The main thread only exists in the main interpreter. */ assert(_Py_IsMainInterpreter(interp)); pending = &_PyRuntime.ceval.pending_mainthread; @@ -724,8 +754,16 @@ _PyEval_AddPendingCall(PyInterpreterState *interp, int result = _push_pending_call(pending, func, arg, flags); PyMutex_Unlock(&pending->mutex); - /* signal main loop */ - SIGNAL_PENDING_CALLS(interp); + if (main_only) { + _PyThreadState_Signal(_PyRuntime.main_tstate, _PY_CALLS_TO_DO_BIT); + } else { +#ifdef Py_GIL_DISABLED + _PyInterpreterState_SignalAll(interp, _PY_CALLS_TO_DO_BIT); +#else + signal_active_thread(interp, _PY_CALLS_TO_DO_BIT); +#endif + } + return result; } @@ -742,13 +780,13 @@ static int handle_signals(PyThreadState *tstate) { assert(_PyThreadState_CheckConsistency(tstate)); - _Py_set_eval_breaker_bit(tstate->interp, _PY_SIGNALS_PENDING_BIT, 0); + _PyThreadState_Unsignal(tstate, _PY_SIGNALS_PENDING_BIT); if (!_Py_ThreadCanHandleSignals(tstate->interp)) { return 0; } if (_PyErr_CheckSignalsTstate(tstate) < 0) { /* On failure, re-schedule a call to handle_signals(). 
*/ - _Py_set_eval_breaker_bit(tstate->interp, _PY_SIGNALS_PENDING_BIT, 1); + _PyThreadState_Signal(tstate, _PY_SIGNALS_PENDING_BIT); return -1; } return 0; @@ -783,9 +821,30 @@ _make_pending_calls(struct _pending_calls *pending) return 0; } +static void +signal_pending_calls(PyThreadState *tstate, PyInterpreterState *interp) +{ +#ifdef Py_GIL_DISABLED + _PyInterpreterState_SignalAll(interp, _PY_CALLS_TO_DO_BIT); +#else + _PyThreadState_Signal(tstate, _PY_CALLS_TO_DO_BIT); +#endif +} + +static void +unsignal_pending_calls(PyThreadState *tstate, PyInterpreterState *interp) +{ +#ifdef Py_GIL_DISABLED + _PyInterpreterState_UnsignalAll(interp, _PY_CALLS_TO_DO_BIT); +#else + _PyThreadState_Unsignal(tstate, _PY_CALLS_TO_DO_BIT); +#endif +} + static int -make_pending_calls(PyInterpreterState *interp) +make_pending_calls(PyThreadState *tstate) { + PyInterpreterState *interp = tstate->interp; struct _pending_calls *pending = &interp->ceval.pending; struct _pending_calls *pending_main = &_PyRuntime.ceval.pending_mainthread; @@ -811,12 +870,12 @@ make_pending_calls(PyInterpreterState *interp) /* unsignal before starting to call callbacks, so that any callback added in-between re-signals */ - UNSIGNAL_PENDING_CALLS(interp); + unsignal_pending_calls(tstate, interp); if (_make_pending_calls(pending) != 0) { pending->busy = 0; /* There might not be more calls to make, but we play it safe. */ - SIGNAL_PENDING_CALLS(interp); + signal_pending_calls(tstate, interp); return -1; } @@ -824,7 +883,7 @@ make_pending_calls(PyInterpreterState *interp) if (_make_pending_calls(pending_main) != 0) { pending->busy = 0; /* There might not be more calls to make, but we play it safe. */ - SIGNAL_PENDING_CALLS(interp); + signal_pending_calls(tstate, interp); return -1; } } @@ -839,7 +898,7 @@ _Py_FinishPendingCalls(PyThreadState *tstate) assert(PyGILState_Check()); assert(_PyThreadState_CheckConsistency(tstate)); - if (make_pending_calls(tstate->interp) < 0) { + if (make_pending_calls(tstate) < 0) { PyObject *exc = _PyErr_GetRaisedException(tstate); PyErr_BadInternalCall(); _PyErr_ChainExceptions1(exc); @@ -862,7 +921,7 @@ _PyEval_MakePendingCalls(PyThreadState *tstate) } } - res = make_pending_calls(tstate->interp); + res = make_pending_calls(tstate); if (res != 0) { return res; } @@ -956,10 +1015,11 @@ int _Py_HandlePending(PyThreadState *tstate) { PyInterpreterState *interp = tstate->interp; + uintptr_t breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); /* Stop-the-world */ - if (_Py_eval_breaker_bit_is_set(interp, _PY_EVAL_PLEASE_STOP_BIT)) { - _Py_set_eval_breaker_bit(interp, _PY_EVAL_PLEASE_STOP_BIT, 0); + if ((breaker & _PY_EVAL_PLEASE_STOP_BIT) != 0) { + _PyThreadState_Unsignal(tstate, _PY_EVAL_PLEASE_STOP_BIT); _PyThreadState_Suspend(tstate); /* The attach blocks until the stop-the-world event is complete. 
*/ @@ -967,27 +1027,26 @@ _Py_HandlePending(PyThreadState *tstate) } /* Pending signals */ - if (_Py_eval_breaker_bit_is_set(interp, _PY_SIGNALS_PENDING_BIT)) { + if ((breaker & _PY_SIGNALS_PENDING_BIT) != 0) { if (handle_signals(tstate) != 0) { return -1; } } /* Pending calls */ - if (_Py_eval_breaker_bit_is_set(interp, _PY_CALLS_TO_DO_BIT)) { - if (make_pending_calls(interp) != 0) { + if ((breaker & _PY_CALLS_TO_DO_BIT) != 0) { + if (make_pending_calls(tstate) != 0) { return -1; } } /* GC scheduled to run */ - if (_Py_eval_breaker_bit_is_set(interp, _PY_GC_SCHEDULED_BIT)) { - _Py_set_eval_breaker_bit(interp, _PY_GC_SCHEDULED_BIT, 0); + if ((breaker & _PY_GC_SCHEDULED_BIT) != 0) { _Py_RunGC(tstate); } /* GIL drop request */ - if (_Py_eval_breaker_bit_is_set(interp, _PY_GIL_DROP_REQUEST_BIT)) { + if ((breaker & _PY_GIL_DROP_REQUEST_BIT) != 0) { /* Give another thread a chance */ _PyThreadState_Detach(tstate); @@ -997,11 +1056,10 @@ _Py_HandlePending(PyThreadState *tstate) } /* Check for asynchronous exception. */ - if (_Py_eval_breaker_bit_is_set(interp, _PY_ASYNC_EXCEPTION_BIT)) { - _Py_set_eval_breaker_bit(interp, _PY_ASYNC_EXCEPTION_BIT, 0); - if (tstate->async_exc != NULL) { - PyObject *exc = tstate->async_exc; - tstate->async_exc = NULL; + if ((breaker & _PY_ASYNC_EXCEPTION_BIT) != 0) { + _PyThreadState_Unsignal(tstate, _PY_ASYNC_EXCEPTION_BIT); + PyObject *exc = _Py_atomic_exchange_ptr(&tstate->async_exc, NULL); + if (exc != NULL) { _PyErr_SetNone(tstate, exc); Py_DECREF(exc); return -1; @@ -1009,4 +1067,3 @@ _Py_HandlePending(PyThreadState *tstate) } return 0; } - diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index c2550f53ad6eaa..d5a979f801e7ba 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -117,7 +117,7 @@ #define CHECK_EVAL_BREAKER() \ _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY(); \ - if (_Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker) & _PY_EVAL_EVENTS_MASK) { \ + if (_Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & _PY_EVAL_EVENTS_MASK) { \ if (_Py_HandlePending(tstate) != 0) { \ GOTO_ERROR(error); \ } \ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 2d914b82dbf88f..14a50ab370c8dd 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -17,7 +17,7 @@ if (_Py_emscripten_signal_clock == 0) goto deoptimize; _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif - uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker); + uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); uintptr_t version = _PyFrame_GetCode(frame)->_co_instrumentation_version; assert((version & _PY_EVAL_EVENTS_MASK) == 0); if (eval_breaker != version) goto deoptimize; diff --git a/Python/gc.c b/Python/gc.c index 46646760291526..586b32d3194225 100644 --- a/Python/gc.c +++ b/Python/gc.c @@ -3,7 +3,6 @@ // See https://devguide.python.org/internals/garbage-collector/ #include "Python.h" -#include "pycore_ceval.h" // _Py_set_eval_breaker_bit() #include "pycore_context.h" #include "pycore_dict.h" // _PyDict_MaybeUntrack() #include "pycore_initconfig.h" @@ -1765,9 +1764,12 @@ PyObject_IS_GC(PyObject *obj) } void -_Py_ScheduleGC(PyInterpreterState *interp) +_Py_ScheduleGC(PyThreadState *tstate) { - _Py_set_eval_breaker_bit(interp, _PY_GC_SCHEDULED_BIT, 1); + if (!_PyThreadState_IsSignalled(tstate, _PY_GC_SCHEDULED_BIT)) + { + _PyThreadState_Signal(tstate, _PY_GC_SCHEDULED_BIT); + } } void @@ -1787,13 +1789,14 @@ _PyObject_GC_Link(PyObject 
*op) !_Py_atomic_load_int_relaxed(&gcstate->collecting) && !_PyErr_Occurred(tstate)) { - _Py_ScheduleGC(tstate->interp); + _Py_ScheduleGC(tstate); } } void _Py_RunGC(PyThreadState *tstate) { + _PyThreadState_Unsignal(tstate, _PY_GC_SCHEDULED_BIT); gc_collect_main(tstate, GENERATION_AUTO, _Py_GC_REASON_HEAP); } diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 8fbcdb15109b76..54a926c0830a0d 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1480,9 +1480,9 @@ PyObject_IS_GC(PyObject *obj) } void -_Py_ScheduleGC(PyInterpreterState *interp) +_Py_ScheduleGC(PyThreadState *tstate) { - _Py_set_eval_breaker_bit(interp, _PY_GC_SCHEDULED_BIT, 1); + _PyInterpreterState_SignalAll(tstate->interp, _PY_GC_SCHEDULED_BIT); } void @@ -1495,13 +1495,14 @@ _PyObject_GC_Link(PyObject *op) if (gc_should_collect(gcstate) && !_Py_atomic_load_int_relaxed(&gcstate->collecting)) { - _Py_ScheduleGC(tstate->interp); + _Py_ScheduleGC(tstate); } } void _Py_RunGC(PyThreadState *tstate) { + _PyInterpreterState_UnsignalAll(tstate->interp, _PY_GC_SCHEDULED_BIT); gc_collect_main(tstate, 0, _Py_GC_REASON_HEAP); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 16f1db30620d72..64054b5d1890ac 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3128,7 +3128,7 @@ _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; next_instr += 1; INSTRUCTION_STATS(INSTRUMENTED_RESUME); - uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker) & ~_PY_EVAL_EVENTS_MASK; + uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; uintptr_t code_version = _PyFrame_GetCode(frame)->_co_instrumentation_version; if (code_version != global_version) { if (_Py_Instrument(_PyFrame_GetCode(frame), tstate->interp)) { @@ -4795,7 +4795,7 @@ TIER_ONE_ONLY assert(frame == tstate->current_frame); uintptr_t global_version = - _Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker) & + _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; uintptr_t code_version = _PyFrame_GetCode(frame)->_co_instrumentation_version; assert((code_version & 255) == 0); @@ -4822,7 +4822,7 @@ DEOPT_IF(_Py_emscripten_signal_clock == 0, RESUME); _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; #endif - uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker); + uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); uintptr_t version = _PyFrame_GetCode(frame)->_co_instrumentation_version; assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version, RESUME); diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 533aece210202b..79477701536b22 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -891,18 +891,39 @@ static inline int most_significant_bit(uint8_t bits) { static uint32_t global_version(PyInterpreterState *interp) { - return interp->ceval.eval_breaker & ~_PY_EVAL_EVENTS_MASK; + return interp->ceval.interp_eval_breaker & ~_PY_EVAL_EVENTS_MASK; } static void -set_global_version(PyInterpreterState *interp, uint32_t version) +set_version_raw(uintptr_t *breaker, uint32_t version) { - assert((version & _PY_EVAL_EVENTS_MASK) == 0); - uintptr_t old = _Py_atomic_load_uintptr(&interp->ceval.eval_breaker); + uintptr_t old = _Py_atomic_load_uintptr(breaker); intptr_t new; do { new = (old & _PY_EVAL_EVENTS_MASK) | version; - } while 
(!_Py_atomic_compare_exchange_uintptr(&interp->ceval.eval_breaker, &old, new));
+    } while (!_Py_atomic_compare_exchange_uintptr(breaker, &old, new));
+}
+
+static void
+set_global_version(PyThreadState *tstate, uint32_t version)
+{
+    assert((version & _PY_EVAL_EVENTS_MASK) == 0);
+    PyInterpreterState *interp = tstate->interp;
+    set_version_raw(&interp->ceval.interp_eval_breaker, version);
+
+#ifdef Py_GIL_DISABLED
+    // Set the version on all threads in free-threaded builds.
+    _PyRuntimeState *runtime = &_PyRuntime;
+    HEAD_LOCK(runtime);
+    for (tstate = interp->threads.head; tstate != NULL;
+         tstate = PyThreadState_Next(tstate)) {
+        set_version_raw(&tstate->eval_breaker, version);
+    }
+    HEAD_UNLOCK(runtime);
+#else
+    // Normal builds take the current version from interp_eval_breaker when
+    // attaching a thread, so we only have to set the current thread's
+    // version.
+    set_version_raw(&tstate->eval_breaker, version);
+#endif
 }
 
 static bool
@@ -1566,7 +1587,7 @@ _Py_Instrument(PyCodeObject *code, PyInterpreterState *interp)
 {
     if (is_version_up_to_date(code, interp)) {
         assert(
-            (interp->ceval.eval_breaker & ~_PY_EVAL_EVENTS_MASK) == 0 ||
+            (interp->ceval.interp_eval_breaker & ~_PY_EVAL_EVENTS_MASK) == 0 ||
             instrumentation_cross_checks(interp, code)
         );
         return 0;
@@ -1778,7 +1799,8 @@ int
 _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events)
 {
     assert(0 <= tool_id && tool_id < PY_MONITORING_TOOL_IDS);
-    PyInterpreterState *interp = _PyInterpreterState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
+    PyInterpreterState *interp = tstate->interp;
     assert(events < (1 << _PY_MONITORING_UNGROUPED_EVENTS));
     if (check_tool(interp, tool_id)) {
         return -1;
@@ -1793,7 +1815,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events)
         PyErr_Format(PyExc_OverflowError, "events set too many times");
         return -1;
     }
-    set_global_version(interp, new_version);
+    set_global_version(tstate, new_version);
     _Py_Executors_InvalidateAll(interp);
     return instrument_all_executing_code_objects(interp);
 }
@@ -2122,7 +2144,8 @@ monitoring_restart_events_impl(PyObject *module)
      * last restart version > instrumented version for all code objects
      * last restart version < current version
      */
-    PyInterpreterState *interp = _PyInterpreterState_GET();
+    PyThreadState *tstate = _PyThreadState_GET();
+    PyInterpreterState *interp = tstate->interp;
     uint32_t restart_version = global_version(interp) + MONITORING_VERSION_INCREMENT;
     uint32_t new_version = restart_version + MONITORING_VERSION_INCREMENT;
     if (new_version <= MONITORING_VERSION_INCREMENT) {
@@ -2130,7 +2153,7 @@ monitoring_restart_events_impl(PyObject *module)
         return NULL;
     }
     interp->last_restart_version = restart_version;
-    set_global_version(interp, new_version);
+    set_global_version(tstate, new_version);
     if (instrument_all_executing_code_objects(interp)) {
         return NULL;
     }
diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c
index 0cac7109340129..e2c8b66a340eb6 100644
--- a/Python/pylifecycle.c
+++ b/Python/pylifecycle.c
@@ -663,6 +663,7 @@ pycore_create_interpreter(_PyRuntimeState *runtime,
     if (tstate == NULL) {
         return _PyStatus_ERR("can't make first thread");
     }
+    runtime->main_tstate = tstate;
     _PyThreadState_Bind(tstate);
 
     init_interp_create_gil(tstate, config.gil);
diff --git a/Python/pystate.c b/Python/pystate.c
index e77e5bfa7e2df8..85d57f3bc65734 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -793,7 +793,7 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
 
     // At this time, all the threads should be cleared so we don't need
    // atomic operations for
eval_breaker - interp->ceval.eval_breaker = 0; + interp->ceval.interp_eval_breaker = 0; for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { interp->monitors.tools[i] = 0; @@ -1306,6 +1306,7 @@ init_threadstate(_PyThreadStateImpl *_tstate, assert(interp != NULL); tstate->interp = interp; + tstate->eval_breaker = interp->ceval.interp_eval_breaker; // next/prev are set in add_threadstate(). assert(tstate->next == NULL); @@ -1987,8 +1988,7 @@ park_detached_threads(struct _stoptheworld_state *stw) } } else if (state == _Py_THREAD_ATTACHED && t != stw->requester) { - // TODO: set this per-thread, rather than per-interpreter. - _Py_set_eval_breaker_bit(t->interp, _PY_EVAL_PLEASE_STOP_BIT, 1); + _PyThreadState_Signal(t, _PY_EVAL_PLEASE_STOP_BIT); } } stw->thread_countdown -= num_parked; @@ -2152,18 +2152,41 @@ PyThreadState_SetAsyncExc(unsigned long id, PyObject *exc) * deadlock, we need to release head_mutex before * the decref. */ - PyObject *old_exc = tstate->async_exc; - tstate->async_exc = Py_XNewRef(exc); + Py_XINCREF(exc); + PyObject *old_exc = _Py_atomic_exchange_ptr(&tstate->async_exc, exc); HEAD_UNLOCK(runtime); Py_XDECREF(old_exc); - _PyEval_SignalAsyncExc(tstate->interp); + _PyThreadState_Signal(tstate, _PY_ASYNC_EXCEPTION_BIT); return 1; } HEAD_UNLOCK(runtime); return 0; } +void +_PyInterpreterState_SignalAll(PyInterpreterState *interp, uintptr_t bit) +{ + _PyRuntimeState *runtime = &_PyRuntime; + + HEAD_LOCK(runtime); + for (PyThreadState *tstate = interp->threads.head; tstate != NULL; tstate = tstate->next) { + _PyThreadState_Signal(tstate, bit); + } + HEAD_UNLOCK(runtime); +} + +void +_PyInterpreterState_UnsignalAll(PyInterpreterState *interp, uintptr_t bit) +{ + _PyRuntimeState *runtime = &_PyRuntime; + + HEAD_LOCK(runtime); + for (PyThreadState *tstate = interp->threads.head; tstate != NULL; tstate = tstate->next) { + _PyThreadState_Unsignal(tstate, bit); + } + HEAD_UNLOCK(runtime); +} //--------------------------------- // API for the current thread state
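
One detail worth spelling out: packing the instrumentation version and the
event flags into a single `eval_breaker` word is what lets the RESUME check
stay a single comparison. A condensed sketch of that check, adapted from the
bytecodes.c hunk above (the helper name here is made up for illustration,
not part of the patch):

```c
// tstate->eval_breaker holds the instrumentation version in its high bits
// and the event flags in its low _PY_EVAL_EVENTS_BITS bits. A code object's
// _co_instrumentation_version always has those low bits clear.
static int
resume_check_needed(PyThreadState *tstate, PyCodeObject *code)
{
    uintptr_t eval_breaker =
        _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker);
    uintptr_t version = code->_co_instrumentation_version;
    // One comparison catches both cases: a stale instrumentation version
    // and any pending event bit each make the two words differ.
    return eval_breaker != version;
}
```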