Skip to content

Commit 691c8dc

Browse files
committed
Fix flaky tests in test_external_inspection
Several tests calling unwinder.get_stack_trace() were flaky because they used retry loops without exception handling: a transient failure such as "Failed to parse initial frame in chain", which can occur when sampling at an inopportune moment, would immediately fail the test instead of being retried. The fix adds a _get_stack_trace_with_retry helper function and updates seven locations: six retry loops now use busy_retry with contextlib.suppress for OSError and RuntimeError, matching the existing pattern in _get_frames_with_retry, and one single-shot call now goes through the new helper. This allows transient failures to be silently retried while still timing out if the expected condition is never met.
1 parent 81c8eb8 commit 691c8dc

File tree

1 file changed

+95
-76
lines changed

1 file changed

+95
-76
lines changed

Lib/test/test_external_inspection.py

Lines changed: 95 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,26 @@ def get_all_awaited_by(pid):
249249
raise RuntimeError("Failed to get all awaited_by after retries")
250250

251251

252+
def _get_stack_trace_with_retry(unwinder, timeout=SHORT_TIMEOUT):
253+
"""Get stack trace from an existing unwinder with retry for transient errors.
254+
255+
This handles the case where we want to reuse an existing RemoteUnwinder
256+
instance but still handle transient failures like "Failed to parse initial
257+
frame in chain" that can occur when sampling at an inopportune moment.
258+
"""
259+
last_error = None
260+
for _ in busy_retry(timeout):
261+
try:
262+
return unwinder.get_stack_trace()
263+
except (OSError, RuntimeError) as e:
264+
last_error = e
265+
time.sleep(0.1)
266+
continue
267+
raise RuntimeError(
268+
f"Failed to get stack trace after retries: {last_error}"
269+
)
270+
271+
252272
# ============================================================================
253273
# Base test class with shared infrastructure
254274
# ============================================================================
@@ -1704,16 +1724,16 @@ def main_work():
17041724

17051725
# Get stack trace with all threads
17061726
unwinder_all = RemoteUnwinder(p.pid, all_threads=True)
1707-
for _ in range(MAX_TRIES):
1708-
all_traces = unwinder_all.get_stack_trace()
1709-
found = self._find_frame_in_trace(
1710-
all_traces,
1711-
lambda f: f.funcname == "main_work"
1712-
and f.location.lineno > 12,
1713-
)
1714-
if found:
1715-
break
1716-
time.sleep(0.1)
1727+
for _ in busy_retry(SHORT_TIMEOUT):
1728+
with contextlib.suppress(OSError, RuntimeError):
1729+
all_traces = unwinder_all.get_stack_trace()
1730+
found = self._find_frame_in_trace(
1731+
all_traces,
1732+
lambda f: f.funcname == "main_work"
1733+
and f.location.lineno > 12,
1734+
)
1735+
if found:
1736+
break
17171737
else:
17181738
self.fail(
17191739
"Main thread did not start its busy work on time"
@@ -1723,7 +1743,7 @@ def main_work():
17231743
unwinder_gil = RemoteUnwinder(
17241744
p.pid, only_active_thread=True
17251745
)
1726-
gil_traces = unwinder_gil.get_stack_trace()
1746+
gil_traces = _get_stack_trace_with_retry(unwinder_gil)
17271747

17281748
# Count threads
17291749
total_threads = sum(
@@ -1998,15 +2018,15 @@ def busy():
19982018
mode=mode,
19992019
skip_non_matching_threads=False,
20002020
)
2001-
for _ in range(MAX_TRIES):
2002-
traces = unwinder.get_stack_trace()
2003-
statuses = self._get_thread_statuses(traces)
2021+
for _ in busy_retry(SHORT_TIMEOUT):
2022+
with contextlib.suppress(OSError, RuntimeError):
2023+
traces = unwinder.get_stack_trace()
2024+
statuses = self._get_thread_statuses(traces)
20042025

2005-
if check_condition(
2006-
statuses, sleeper_tid, busy_tid
2007-
):
2008-
break
2009-
time.sleep(0.5)
2026+
if check_condition(
2027+
statuses, sleeper_tid, busy_tid
2028+
):
2029+
break
20102030

20112031
return statuses, sleeper_tid, busy_tid
20122032
finally:
@@ -2150,29 +2170,29 @@ def busy_thread():
21502170
mode=PROFILING_MODE_ALL,
21512171
skip_non_matching_threads=False,
21522172
)
2153-
for _ in range(MAX_TRIES):
2154-
traces = unwinder.get_stack_trace()
2155-
statuses = self._get_thread_statuses(traces)
2156-
2157-
# Check ALL mode provides both GIL and CPU info
2158-
if (
2159-
sleeper_tid in statuses
2160-
and busy_tid in statuses
2161-
and not (
2162-
statuses[sleeper_tid]
2163-
& THREAD_STATUS_ON_CPU
2164-
)
2165-
and not (
2166-
statuses[sleeper_tid]
2167-
& THREAD_STATUS_HAS_GIL
2168-
)
2169-
and (statuses[busy_tid] & THREAD_STATUS_ON_CPU)
2170-
and (
2171-
statuses[busy_tid] & THREAD_STATUS_HAS_GIL
2172-
)
2173-
):
2174-
break
2175-
time.sleep(0.5)
2173+
for _ in busy_retry(SHORT_TIMEOUT):
2174+
with contextlib.suppress(OSError, RuntimeError):
2175+
traces = unwinder.get_stack_trace()
2176+
statuses = self._get_thread_statuses(traces)
2177+
2178+
# Check ALL mode provides both GIL and CPU info
2179+
if (
2180+
sleeper_tid in statuses
2181+
and busy_tid in statuses
2182+
and not (
2183+
statuses[sleeper_tid]
2184+
& THREAD_STATUS_ON_CPU
2185+
)
2186+
and not (
2187+
statuses[sleeper_tid]
2188+
& THREAD_STATUS_HAS_GIL
2189+
)
2190+
and (statuses[busy_tid] & THREAD_STATUS_ON_CPU)
2191+
and (
2192+
statuses[busy_tid] & THREAD_STATUS_HAS_GIL
2193+
)
2194+
):
2195+
break
21762196

21772197
self.assertIsNotNone(
21782198
sleeper_tid, "Sleeper thread id not received"
@@ -2296,18 +2316,18 @@ def test_thread_status_exception_detection(self):
22962316
mode=PROFILING_MODE_ALL,
22972317
skip_non_matching_threads=False,
22982318
)
2299-
for _ in range(MAX_TRIES):
2300-
traces = unwinder.get_stack_trace()
2301-
statuses = self._get_thread_statuses(traces)
2302-
2303-
if (
2304-
exception_tid in statuses
2305-
and normal_tid in statuses
2306-
and (statuses[exception_tid] & THREAD_STATUS_HAS_EXCEPTION)
2307-
and not (statuses[normal_tid] & THREAD_STATUS_HAS_EXCEPTION)
2308-
):
2309-
break
2310-
time.sleep(0.5)
2319+
for _ in busy_retry(SHORT_TIMEOUT):
2320+
with contextlib.suppress(OSError, RuntimeError):
2321+
traces = unwinder.get_stack_trace()
2322+
statuses = self._get_thread_statuses(traces)
2323+
2324+
if (
2325+
exception_tid in statuses
2326+
and normal_tid in statuses
2327+
and (statuses[exception_tid] & THREAD_STATUS_HAS_EXCEPTION)
2328+
and not (statuses[normal_tid] & THREAD_STATUS_HAS_EXCEPTION)
2329+
):
2330+
break
23112331

23122332
self.assertIn(exception_tid, statuses)
23132333
self.assertIn(normal_tid, statuses)
@@ -2339,18 +2359,18 @@ def test_thread_status_exception_mode_filtering(self):
23392359
mode=PROFILING_MODE_EXCEPTION,
23402360
skip_non_matching_threads=True,
23412361
)
2342-
for _ in range(MAX_TRIES):
2343-
traces = unwinder.get_stack_trace()
2344-
statuses = self._get_thread_statuses(traces)
2345-
2346-
if exception_tid in statuses:
2347-
self.assertNotIn(
2348-
normal_tid,
2349-
statuses,
2350-
"Normal thread should be filtered out in exception mode",
2351-
)
2352-
return
2353-
time.sleep(0.5)
2362+
for _ in busy_retry(SHORT_TIMEOUT):
2363+
with contextlib.suppress(OSError, RuntimeError):
2364+
traces = unwinder.get_stack_trace()
2365+
statuses = self._get_thread_statuses(traces)
2366+
2367+
if exception_tid in statuses:
2368+
self.assertNotIn(
2369+
normal_tid,
2370+
statuses,
2371+
"Normal thread should be filtered out in exception mode",
2372+
)
2373+
return
23542374

23552375
self.fail("Never found exception thread in exception mode")
23562376

@@ -2504,18 +2524,17 @@ def _check_exception_status(self, p, thread_tid, expect_exception):
25042524

25052525
# Collect multiple samples for reliability
25062526
results = []
2507-
for _ in range(MAX_TRIES):
2508-
traces = unwinder.get_stack_trace()
2509-
statuses = self._get_thread_statuses(traces)
2510-
2511-
if thread_tid in statuses:
2512-
has_exc = bool(statuses[thread_tid] & THREAD_STATUS_HAS_EXCEPTION)
2513-
results.append(has_exc)
2527+
for _ in busy_retry(SHORT_TIMEOUT):
2528+
with contextlib.suppress(OSError, RuntimeError):
2529+
traces = unwinder.get_stack_trace()
2530+
statuses = self._get_thread_statuses(traces)
25142531

2515-
if len(results) >= 3:
2516-
break
2532+
if thread_tid in statuses:
2533+
has_exc = bool(statuses[thread_tid] & THREAD_STATUS_HAS_EXCEPTION)
2534+
results.append(has_exc)
25172535

2518-
time.sleep(0.2)
2536+
if len(results) >= 3:
2537+
break
25192538

25202539
# Check majority of samples match expected
25212540
if not results:

0 commit comments

Comments
 (0)