@@ -249,6 +249,26 @@ def get_all_awaited_by(pid):
249249 raise RuntimeError ("Failed to get all awaited_by after retries" )
250250
251251
def _get_stack_trace_with_retry(unwinder, timeout=SHORT_TIMEOUT):
    """Get stack trace from an existing unwinder with retry for transient errors.

    This handles the case where we want to reuse an existing RemoteUnwinder
    instance but still handle transient failures like "Failed to parse initial
    frame in chain" that can occur when sampling at an inopportune moment.

    Args:
        unwinder: Object exposing ``get_stack_trace()``; it is reused across
            every attempt rather than being recreated.
        timeout: Value passed to ``busy_retry()`` to bound the retry loop.

    Returns:
        The result of the first ``unwinder.get_stack_trace()`` call that does
        not raise ``OSError`` or ``RuntimeError``.

    Raises:
        RuntimeError: If the retry loop ends without a successful call. The
            last transient error, if any, is attached as ``__cause__``.
    """
    last_error = None
    for _ in busy_retry(timeout):
        try:
            return unwinder.get_stack_trace()
        except (OSError, RuntimeError) as e:
            last_error = e
            # Short pause before re-sampling so the next attempt does not hit
            # the target at the very same moment.
            time.sleep(0.1)
    # NOTE(review): busy_retry is defined outside this block; if it raises on
    # timeout by itself, this statement is only reached when the iterator
    # finishes normally -- confirm against its definition.
    # Chain the last transient failure so its traceback is not lost.
    raise RuntimeError(
        f"Failed to get stack trace after retries: {last_error}"
    ) from last_error
270+
271+
252272# ============================================================================
253273# Base test class with shared infrastructure
254274# ============================================================================
@@ -1704,16 +1724,16 @@ def main_work():
17041724
17051725 # Get stack trace with all threads
17061726 unwinder_all = RemoteUnwinder (p .pid , all_threads = True )
1707- for _ in range ( MAX_TRIES ):
1708- all_traces = unwinder_all . get_stack_trace ()
1709- found = self . _find_frame_in_trace (
1710- all_traces ,
1711- lambda f : f . funcname == "main_work"
1712- and f . location . lineno > 12 ,
1713- )
1714- if found :
1715- break
1716- time . sleep ( 0.1 )
1727+ for _ in busy_retry ( SHORT_TIMEOUT ):
1728+ with contextlib . suppress ( OSError , RuntimeError ):
1729+ all_traces = unwinder_all . get_stack_trace ()
1730+ found = self . _find_frame_in_trace (
1731+ all_traces ,
1732+ lambda f : f . funcname == "main_work"
1733+ and f . location . lineno > 12 ,
1734+ )
1735+ if found :
1736+ break
17171737 else :
17181738 self .fail (
17191739 "Main thread did not start its busy work on time"
@@ -1723,7 +1743,7 @@ def main_work():
17231743 unwinder_gil = RemoteUnwinder (
17241744 p .pid , only_active_thread = True
17251745 )
1726- gil_traces = unwinder_gil . get_stack_trace ( )
1746+ gil_traces = _get_stack_trace_with_retry ( unwinder_gil )
17271747
17281748 # Count threads
17291749 total_threads = sum (
@@ -1998,15 +2018,15 @@ def busy():
19982018 mode = mode ,
19992019 skip_non_matching_threads = False ,
20002020 )
2001- for _ in range (MAX_TRIES ):
2002- traces = unwinder .get_stack_trace ()
2003- statuses = self ._get_thread_statuses (traces )
2021+ for _ in busy_retry (SHORT_TIMEOUT ):
2022+ with contextlib .suppress (OSError , RuntimeError ):
2023+ traces = unwinder .get_stack_trace ()
2024+ statuses = self ._get_thread_statuses (traces )
20042025
2005- if check_condition (
2006- statuses , sleeper_tid , busy_tid
2007- ):
2008- break
2009- time .sleep (0.5 )
2026+ if check_condition (
2027+ statuses , sleeper_tid , busy_tid
2028+ ):
2029+ break
20102030
20112031 return statuses , sleeper_tid , busy_tid
20122032 finally :
@@ -2150,29 +2170,29 @@ def busy_thread():
21502170 mode = PROFILING_MODE_ALL ,
21512171 skip_non_matching_threads = False ,
21522172 )
2153- for _ in range ( MAX_TRIES ):
2154- traces = unwinder . get_stack_trace ()
2155- statuses = self . _get_thread_statuses ( traces )
2156-
2157- # Check ALL mode provides both GIL and CPU info
2158- if (
2159- sleeper_tid in statuses
2160- and busy_tid in statuses
2161- and not (
2162- statuses [ sleeper_tid ]
2163- & THREAD_STATUS_ON_CPU
2164- )
2165- and not (
2166- statuses [ sleeper_tid ]
2167- & THREAD_STATUS_HAS_GIL
2168- )
2169- and ( statuses [ busy_tid ] & THREAD_STATUS_ON_CPU )
2170- and (
2171- statuses [ busy_tid ] & THREAD_STATUS_HAS_GIL
2172- )
2173- ):
2174- break
2175- time . sleep ( 0.5 )
2173+ for _ in busy_retry ( SHORT_TIMEOUT ):
2174+ with contextlib . suppress ( OSError , RuntimeError ):
2175+ traces = unwinder . get_stack_trace ( )
2176+ statuses = self . _get_thread_statuses ( traces )
2177+
2178+ # Check ALL mode provides both GIL and CPU info
2179+ if (
2180+ sleeper_tid in statuses
2181+ and busy_tid in statuses
2182+ and not (
2183+ statuses [ sleeper_tid ]
2184+ & THREAD_STATUS_ON_CPU
2185+ )
2186+ and not (
2187+ statuses [ sleeper_tid ]
2188+ & THREAD_STATUS_HAS_GIL
2189+ )
2190+ and (statuses [ busy_tid ] & THREAD_STATUS_ON_CPU )
2191+ and (
2192+ statuses [ busy_tid ] & THREAD_STATUS_HAS_GIL
2193+ )
2194+ ):
2195+ break
21762196
21772197 self .assertIsNotNone (
21782198 sleeper_tid , "Sleeper thread id not received"
@@ -2296,18 +2316,18 @@ def test_thread_status_exception_detection(self):
22962316 mode = PROFILING_MODE_ALL ,
22972317 skip_non_matching_threads = False ,
22982318 )
2299- for _ in range ( MAX_TRIES ):
2300- traces = unwinder . get_stack_trace ()
2301- statuses = self . _get_thread_statuses ( traces )
2302-
2303- if (
2304- exception_tid in statuses
2305- and normal_tid in statuses
2306- and ( statuses [ exception_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2307- and not (statuses [normal_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2308- ):
2309- break
2310- time . sleep ( 0.5 )
2319+ for _ in busy_retry ( SHORT_TIMEOUT ):
2320+ with contextlib . suppress ( OSError , RuntimeError ):
2321+ traces = unwinder . get_stack_trace ( )
2322+ statuses = self . _get_thread_statuses ( traces )
2323+
2324+ if (
2325+ exception_tid in statuses
2326+ and normal_tid in statuses
2327+ and (statuses [exception_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2328+ and not ( statuses [ normal_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2329+ ):
2330+ break
23112331
23122332 self .assertIn (exception_tid , statuses )
23132333 self .assertIn (normal_tid , statuses )
@@ -2339,18 +2359,18 @@ def test_thread_status_exception_mode_filtering(self):
23392359 mode = PROFILING_MODE_EXCEPTION ,
23402360 skip_non_matching_threads = True ,
23412361 )
2342- for _ in range ( MAX_TRIES ):
2343- traces = unwinder . get_stack_trace ()
2344- statuses = self . _get_thread_statuses ( traces )
2345-
2346- if exception_tid in statuses :
2347- self . assertNotIn (
2348- normal_tid ,
2349- statuses ,
2350- "Normal thread should be filtered out in exception mode" ,
2351- )
2352- return
2353- time . sleep ( 0.5 )
2362+ for _ in busy_retry ( SHORT_TIMEOUT ):
2363+ with contextlib . suppress ( OSError , RuntimeError ):
2364+ traces = unwinder . get_stack_trace ( )
2365+ statuses = self . _get_thread_statuses ( traces )
2366+
2367+ if exception_tid in statuses :
2368+ self . assertNotIn (
2369+ normal_tid ,
2370+ statuses ,
2371+ "Normal thread should be filtered out in exception mode" ,
2372+ )
2373+ return
23542374
23552375 self .fail ("Never found exception thread in exception mode" )
23562376
@@ -2504,18 +2524,17 @@ def _check_exception_status(self, p, thread_tid, expect_exception):
25042524
25052525 # Collect multiple samples for reliability
25062526 results = []
2507- for _ in range (MAX_TRIES ):
2508- traces = unwinder .get_stack_trace ()
2509- statuses = self ._get_thread_statuses (traces )
2510-
2511- if thread_tid in statuses :
2512- has_exc = bool (statuses [thread_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2513- results .append (has_exc )
2527+ for _ in busy_retry (SHORT_TIMEOUT ):
2528+ with contextlib .suppress (OSError , RuntimeError ):
2529+ traces = unwinder .get_stack_trace ()
2530+ statuses = self ._get_thread_statuses (traces )
25142531
2515- if len (results ) >= 3 :
2516- break
2532+ if thread_tid in statuses :
2533+ has_exc = bool (statuses [thread_tid ] & THREAD_STATUS_HAS_EXCEPTION )
2534+ results .append (has_exc )
25172535
2518- time .sleep (0.2 )
2536+ if len (results ) >= 3 :
2537+ break
25192538
25202539 # Check majority of samples match expected
25212540 if not results :
0 commit comments