From 691c8dcb53bbc23308cab5279f02f0b5fa68ff36 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado <pablogsal@gmail.com>
Date: Tue, 23 Dec 2025 17:19:50 +0000
Subject: [PATCH] Fix flaky tests in test_external_inspection

Several tests calling unwinder.get_stack_trace() were flaky because they
used retry loops without exception handling. Transient failures like
"Failed to parse initial frame in chain" that occur when sampling at an
inopportune moment would immediately fail the test instead of being
retried.

The fix adds a _get_stack_trace_with_retry helper function and updates
seven locations to use busy_retry with contextlib.suppress for OSError
and RuntimeError, matching the existing pattern in
_get_frames_with_retry. This allows transient failures to be silently
retried while still timing out if the expected condition is never met.
---
 Lib/test/test_external_inspection.py | 171 +++++++++++++++------------
 1 file changed, 95 insertions(+), 76 deletions(-)

diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py
index 8837d3b729442c..97300bc92d40fa 100644
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@@ -249,6 +249,26 @@ def get_all_awaited_by(pid):
     raise RuntimeError("Failed to get all awaited_by after retries")
 
 
+def _get_stack_trace_with_retry(unwinder, timeout=SHORT_TIMEOUT):
+    """Get stack trace from an existing unwinder with retry for transient errors.
+
+    This handles the case where we want to reuse an existing RemoteUnwinder
+    instance but still handle transient failures like "Failed to parse initial
+    frame in chain" that can occur when sampling at an inopportune moment.
+    """
+    last_error = None
+    for _ in busy_retry(timeout):
+        try:
+            return unwinder.get_stack_trace()
+        except (OSError, RuntimeError) as e:
+            last_error = e
+            time.sleep(0.1)
+            continue
+    raise RuntimeError(
+        f"Failed to get stack trace after retries: {last_error}"
+    )
+
+
 # ============================================================================
 # Base test class with shared infrastructure
 # ============================================================================
@@ -1704,16 +1724,16 @@ def main_work():
 
                     # Get stack trace with all threads
                     unwinder_all = RemoteUnwinder(p.pid, all_threads=True)
-                    for _ in range(MAX_TRIES):
-                        all_traces = unwinder_all.get_stack_trace()
-                        found = self._find_frame_in_trace(
-                            all_traces,
-                            lambda f: f.funcname == "main_work"
-                            and f.location.lineno > 12,
-                        )
-                        if found:
-                            break
-                        time.sleep(0.1)
+                    for _ in busy_retry(SHORT_TIMEOUT):
+                        with contextlib.suppress(OSError, RuntimeError):
+                            all_traces = unwinder_all.get_stack_trace()
+                            found = self._find_frame_in_trace(
+                                all_traces,
+                                lambda f: f.funcname == "main_work"
+                                and f.location.lineno > 12,
+                            )
+                            if found:
+                                break
                     else:
                         self.fail(
                             "Main thread did not start its busy work on time"
@@ -1723,7 +1743,7 @@ def main_work():
                     unwinder_gil = RemoteUnwinder(
                         p.pid, only_active_thread=True
                     )
-                    gil_traces = unwinder_gil.get_stack_trace()
+                    gil_traces = _get_stack_trace_with_retry(unwinder_gil)
 
                     # Count threads
                     total_threads = sum(
@@ -1998,15 +2018,15 @@ def busy():
                         mode=mode,
                         skip_non_matching_threads=False,
                     )
-                    for _ in range(MAX_TRIES):
-                        traces = unwinder.get_stack_trace()
-                        statuses = self._get_thread_statuses(traces)
+                    for _ in busy_retry(SHORT_TIMEOUT):
+                        with contextlib.suppress(OSError, RuntimeError):
+                            traces = unwinder.get_stack_trace()
+                            statuses = self._get_thread_statuses(traces)
 
-                        if check_condition(
-                            statuses, sleeper_tid, busy_tid
-                        ):
-                            break
-                        time.sleep(0.5)
+                            if check_condition(
+                                statuses, sleeper_tid, busy_tid
+                            ):
+                                break
 
                     return statuses, sleeper_tid, busy_tid
             finally:
@@ -2150,29 +2170,29 @@ def busy_thread():
                         mode=PROFILING_MODE_ALL,
                         skip_non_matching_threads=False,
                     )
-                    for _ in range(MAX_TRIES):
-                        traces = unwinder.get_stack_trace()
-                        statuses = self._get_thread_statuses(traces)
-
-                        # Check ALL mode provides both GIL and CPU info
-                        if (
-                            sleeper_tid in statuses
-                            and busy_tid in statuses
-                            and not (
-                                statuses[sleeper_tid]
-                                & THREAD_STATUS_ON_CPU
-                            )
-                            and not (
-                                statuses[sleeper_tid]
-                                & THREAD_STATUS_HAS_GIL
-                            )
-                            and (statuses[busy_tid] & THREAD_STATUS_ON_CPU)
-                            and (
-                                statuses[busy_tid] & THREAD_STATUS_HAS_GIL
-                            )
-                        ):
-                            break
-                        time.sleep(0.5)
+                    for _ in busy_retry(SHORT_TIMEOUT):
+                        with contextlib.suppress(OSError, RuntimeError):
+                            traces = unwinder.get_stack_trace()
+                            statuses = self._get_thread_statuses(traces)
+
+                            # Check ALL mode provides both GIL and CPU info
+                            if (
+                                sleeper_tid in statuses
+                                and busy_tid in statuses
+                                and not (
+                                    statuses[sleeper_tid]
+                                    & THREAD_STATUS_ON_CPU
+                                )
+                                and not (
+                                    statuses[sleeper_tid]
+                                    & THREAD_STATUS_HAS_GIL
+                                )
+                                and (statuses[busy_tid] & THREAD_STATUS_ON_CPU)
+                                and (
+                                    statuses[busy_tid] & THREAD_STATUS_HAS_GIL
+                                )
+                            ):
+                                break
 
                     self.assertIsNotNone(
                         sleeper_tid, "Sleeper thread id not received"
@@ -2296,18 +2316,18 @@ def test_thread_status_exception_detection(self):
                 mode=PROFILING_MODE_ALL,
                 skip_non_matching_threads=False,
             )
-            for _ in range(MAX_TRIES):
-                traces = unwinder.get_stack_trace()
-                statuses = self._get_thread_statuses(traces)
-
-                if (
-                    exception_tid in statuses
-                    and normal_tid in statuses
-                    and (statuses[exception_tid] & THREAD_STATUS_HAS_EXCEPTION)
-                    and not (statuses[normal_tid] & THREAD_STATUS_HAS_EXCEPTION)
-                ):
-                    break
-                time.sleep(0.5)
+            for _ in busy_retry(SHORT_TIMEOUT):
+                with contextlib.suppress(OSError, RuntimeError):
+                    traces = unwinder.get_stack_trace()
+                    statuses = self._get_thread_statuses(traces)
+
+                    if (
+                        exception_tid in statuses
+                        and normal_tid in statuses
+                        and (statuses[exception_tid] & THREAD_STATUS_HAS_EXCEPTION)
+                        and not (statuses[normal_tid] & THREAD_STATUS_HAS_EXCEPTION)
+                    ):
+                        break
 
             self.assertIn(exception_tid, statuses)
             self.assertIn(normal_tid, statuses)
@@ -2339,18 +2359,18 @@ def test_thread_status_exception_mode_filtering(self):
                 mode=PROFILING_MODE_EXCEPTION,
                 skip_non_matching_threads=True,
             )
-            for _ in range(MAX_TRIES):
-                traces = unwinder.get_stack_trace()
-                statuses = self._get_thread_statuses(traces)
-
-                if exception_tid in statuses:
-                    self.assertNotIn(
-                        normal_tid,
-                        statuses,
-                        "Normal thread should be filtered out in exception mode",
-                    )
-                    return
-                time.sleep(0.5)
+            for _ in busy_retry(SHORT_TIMEOUT):
+                with contextlib.suppress(OSError, RuntimeError):
+                    traces = unwinder.get_stack_trace()
+                    statuses = self._get_thread_statuses(traces)
+
+                    if exception_tid in statuses:
+                        self.assertNotIn(
+                            normal_tid,
+                            statuses,
+                            "Normal thread should be filtered out in exception mode",
+                        )
+                        return
 
             self.fail("Never found exception thread in exception mode")
 
@@ -2504,18 +2524,17 @@ def _check_exception_status(self, p, thread_tid, expect_exception):
 
         # Collect multiple samples for reliability
         results = []
-        for _ in range(MAX_TRIES):
-            traces = unwinder.get_stack_trace()
-            statuses = self._get_thread_statuses(traces)
-
-            if thread_tid in statuses:
-                has_exc = bool(statuses[thread_tid] & THREAD_STATUS_HAS_EXCEPTION)
-                results.append(has_exc)
+        for _ in busy_retry(SHORT_TIMEOUT):
+            with contextlib.suppress(OSError, RuntimeError):
+                traces = unwinder.get_stack_trace()
+                statuses = self._get_thread_statuses(traces)
 
-                if len(results) >= 3:
-                    break
+                if thread_tid in statuses:
+                    has_exc = bool(statuses[thread_tid] & THREAD_STATUS_HAS_EXCEPTION)
+                    results.append(has_exc)
 
-            time.sleep(0.2)
+                    if len(results) >= 3:
+                        break
 
         # Check majority of samples match expected
         if not results: