@@ -249,6 +249,25 @@ def get_all_awaited_by(pid):
249249 raise RuntimeError ("Failed to get all awaited_by after retries" )
250250
251251
252+ def _get_stack_trace_with_retry (unwinder , timeout = SHORT_TIMEOUT ):
253+ """Get stack trace from an existing unwinder with retry for transient errors.
254+
255+ This handles the case where we want to reuse an existing RemoteUnwinder
256+ instance but still handle transient failures like "Failed to parse initial
257+ frame in chain" that can occur when sampling at an inopportune moment.
258+ """
259+ last_error = None
260+ for _ in busy_retry (timeout ):
261+ try :
262+ return unwinder .get_stack_trace ()
263+ except (OSError , RuntimeError ) as e :
264+ last_error = e
265+ continue
266+ raise RuntimeError (
267+ f"Failed to get stack trace after retries: { last_error } "
268+ )
269+
270+
252271# ============================================================================
253272# Base test class with shared infrastructure
254273# ============================================================================
@@ -1704,16 +1723,16 @@ def main_work():
17041723
17051724 # Get stack trace with all threads
17061725 unwinder_all = RemoteUnwinder (p .pid , all_threads = True )
1707- for _ in range ( MAX_TRIES ):
1708- all_traces = unwinder_all . get_stack_trace ()
1709- found = self . _find_frame_in_trace (
1710- all_traces ,
1711- lambda f : f . funcname == "main_work"
1712- and f . location . lineno > 12 ,
1713- )
1714- if found :
1715- break
1716- time . sleep ( 0.1 )
1726+ for _ in busy_retry ( SHORT_TIMEOUT ):
1727+ with contextlib . suppress ( OSError , RuntimeError ):
1728+ all_traces = unwinder_all . get_stack_trace ()
1729+ found = self . _find_frame_in_trace (
1730+ all_traces ,
1731+ lambda f : f . funcname == "main_work"
1732+ and f . location . lineno > 12 ,
1733+ )
1734+ if found :
1735+ break
17171736 else :
17181737 self .fail (
17191738 "Main thread did not start its busy work on time"
@@ -1723,7 +1742,7 @@ def main_work():
17231742 unwinder_gil = RemoteUnwinder (
17241743 p .pid , only_active_thread = True
17251744 )
1726- gil_traces = unwinder_gil . get_stack_trace ( )
1745+ gil_traces = _get_stack_trace_with_retry ( unwinder_gil )
17271746
17281747 # Count threads
17291748 total_threads = sum (
@@ -1998,15 +2017,15 @@ def busy():
19982017 mode = mode ,
19992018 skip_non_matching_threads = False ,
20002019 )
2001- for _ in range (MAX_TRIES ):
2002- traces = unwinder .get_stack_trace ()
2003- statuses = self ._get_thread_statuses (traces )
2020+ for _ in busy_retry (SHORT_TIMEOUT ):
2021+ with contextlib .suppress (OSError , RuntimeError ):
2022+ traces = unwinder .get_stack_trace ()
2023+ statuses = self ._get_thread_statuses (traces )
20042024
2005- if check_condition (
2006- statuses , sleeper_tid , busy_tid
2007- ):
2008- break
2009- time .sleep (0.5 )
2025+ if check_condition (
2026+ statuses , sleeper_tid , busy_tid
2027+ ):
2028+ break
20102029
20112030 return statuses , sleeper_tid , busy_tid
20122031 finally :
@@ -2150,29 +2169,29 @@ def busy_thread():
21502169 mode = PROFILING_MODE_ALL ,
21512170 skip_non_matching_threads = False ,
21522171 )
2153- for _ in range ( MAX_TRIES ):
2154- traces = unwinder . get_stack_trace ()
2155- statuses = self . _get_thread_statuses ( traces )
2156-
2157- # Check ALL mode provides both GIL and CPU info
2158- if (
2159- sleeper_tid in statuses
2160- and busy_tid in statuses
2161- and not (
2162- statuses [ sleeper_tid ]
2163- & THREAD_STATUS_ON_CPU
2164- )
2165- and not (
2166- statuses [ sleeper_tid ]
2167- & THREAD_STATUS_HAS_GIL
2168- )
2169- and ( statuses [ busy_tid ] & THREAD_STATUS_ON_CPU )
2170- and (
2171- statuses [ busy_tid ] & THREAD_STATUS_HAS_GIL
2172- )
2173- ):
2174- break
2175- time . sleep ( 0.5 )
2172+ for _ in busy_retry ( SHORT_TIMEOUT ):
2173+ with contextlib . suppress ( OSError , RuntimeError ):
2174+ traces = unwinder . get_stack_trace ( )
2175+ statuses = self . _get_thread_statuses ( traces )
2176+
2177+ # Check ALL mode provides both GIL and CPU info
2178+ if (
2179+ sleeper_tid in statuses
2180+ and busy_tid in statuses
2181+ and not (
2182+ statuses [ sleeper_tid ]
2183+ & THREAD_STATUS_ON_CPU
2184+ )
2185+ and not (
2186+ statuses [ sleeper_tid ]
2187+ & THREAD_STATUS_HAS_GIL
2188+ )
2189+ and (statuses [ busy_tid ] & THREAD_STATUS_ON_CPU )
2190+ and (
2191+ statuses [ busy_tid ] & THREAD_STATUS_HAS_GIL
2192+ )
2193+ ):
2194+ break
21762195
21772196 self .assertIsNotNone (
21782197 sleeper_tid , "Sleeper thread id not received"
@@ -2296,18 +2315,18 @@ def test_thread_status_exception_detection(self):
22962315 mode = PROFILING_MODE_ALL ,
22972316 skip_non_matching_threads = False ,
22982317 )
2299- for _ in range ( MAX_TRIES ):
2300- traces = unwinder . get_stack_trace ()
2301- statuses = self . _get_thread_statuses ( traces )
2302-
2303- if (
2304- exception_tid in statuses
2305- and normal_tid in statuses
2306- and ( statuses [ exception_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2307- and not (statuses [normal_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2308- ):
2309- break
2310- time . sleep ( 0.5 )
2318+ for _ in busy_retry ( SHORT_TIMEOUT ):
2319+ with contextlib . suppress ( OSError , RuntimeError ):
2320+ traces = unwinder . get_stack_trace ( )
2321+ statuses = self . _get_thread_statuses ( traces )
2322+
2323+ if (
2324+ exception_tid in statuses
2325+ and normal_tid in statuses
2326+ and (statuses [exception_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2327+ and not ( statuses [ normal_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2328+ ):
2329+ break
23112330
23122331 self .assertIn (exception_tid , statuses )
23132332 self .assertIn (normal_tid , statuses )
@@ -2339,18 +2358,18 @@ def test_thread_status_exception_mode_filtering(self):
23392358 mode = PROFILING_MODE_EXCEPTION ,
23402359 skip_non_matching_threads = True ,
23412360 )
2342- for _ in range ( MAX_TRIES ):
2343- traces = unwinder . get_stack_trace ()
2344- statuses = self . _get_thread_statuses ( traces )
2345-
2346- if exception_tid in statuses :
2347- self . assertNotIn (
2348- normal_tid ,
2349- statuses ,
2350- "Normal thread should be filtered out in exception mode" ,
2351- )
2352- return
2353- time . sleep ( 0.5 )
2361+ for _ in busy_retry ( SHORT_TIMEOUT ):
2362+ with contextlib . suppress ( OSError , RuntimeError ):
2363+ traces = unwinder . get_stack_trace ( )
2364+ statuses = self . _get_thread_statuses ( traces )
2365+
2366+ if exception_tid in statuses :
2367+ self . assertNotIn (
2368+ normal_tid ,
2369+ statuses ,
2370+ "Normal thread should be filtered out in exception mode" ,
2371+ )
2372+ return
23542373
23552374 self .fail ("Never found exception thread in exception mode" )
23562375
@@ -2504,18 +2523,17 @@ def _check_exception_status(self, p, thread_tid, expect_exception):
25042523
25052524 # Collect multiple samples for reliability
25062525 results = []
2507- for _ in range (MAX_TRIES ):
2508- traces = unwinder .get_stack_trace ()
2509- statuses = self ._get_thread_statuses (traces )
2510-
2511- if thread_tid in statuses :
2512- has_exc = bool (statuses [thread_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2513- results .append (has_exc )
2526+ for _ in busy_retry (SHORT_TIMEOUT ):
2527+ with contextlib .suppress (OSError , RuntimeError ):
2528+ traces = unwinder .get_stack_trace ()
2529+ statuses = self ._get_thread_statuses (traces )
25142530
2515- if len (results ) >= 3 :
2516- break
2531+ if thread_tid in statuses :
2532+ has_exc = bool (statuses [thread_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2533+ results .append (has_exc )
25172534
2518- time .sleep (0.2 )
2535+ if len (results ) >= 3 :
2536+ break
25192537
25202538 # Check majority of samples match expected
25212539 if not results :
0 commit comments