Address andystaples PR review feedback

Bernd Verst · Copilot · Bernd Verst · commit 345d94128777 · 2026-05-21T16:06:47.000-07:00
- Make FailureTracker thread-safe with an internal lock so multi-threaded
  sync clients can't race the consecutive-failure counter (review [3/10]).
- Track _AsyncWorkerManager pool shutdown via an explicit _pool_is_shutdown
  flag instead of reading ThreadPoolExecutor._shutdown (CPython private API,
  review [4/10]).
- Collapse identical wrap_execution/wrap_cancellation closures in the worker
  stream loop into a single wrap_with_release helper (review [5/10]).
- Promote the retired-channel close delay and jitter exponent cap to named
  module-level constants (review [7/10]).
- Key _InFlightChannelTracker on the channel object instead of id(channel)
  so the lifetime invariant is local to the tracker (review [9/10]).
- Replace TaskHubGrpcWorker._can_recreate_channel() with an _owns_channel
  attribute, matching the name the clients already use, so both files use
  the same name for the same concept (review [2/10]).
- Add regression tests for FailureTracker concurrency and for thread-pool
  recreation after manager shutdown.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/durabletask/client.py b/durabletask/client.py
@@ -166,6 +166,12 @@ def parse_orchestration_state(state: pb.OrchestrationState) -> OrchestrationStat
         failure_details)
 
 
+# Grace period before a retired SDK-owned channel is force-closed. Long enough
+# for in-flight unary RPCs to drain on their own, short enough that recreate
+# storms don't pile up dozens of half-closed channels.
+_RETIRED_CHANNEL_CLOSE_DELAY_SECONDS = 30.0
+
+
 class TaskHubGrpcClient:
     def __init__(self, *,
                  host_address: Optional[str] = None,
@@ -264,7 +270,7 @@ def _maybe_recreate_channel(self) -> None:
             self._last_recreate_time = now
             self._client_failure_tracker.record_success()
             close_timer = threading.Timer(
-                30.0,
+                _RETIRED_CHANNEL_CLOSE_DELAY_SECONDS,
                 self._close_retired_channel,
                 args=(old_channel,),
             )
@@ -730,7 +736,7 @@ async def _maybe_recreate_channel(self) -> None:
 
     async def _close_retired_channel(self, channel: grpc.aio.Channel) -> None:
         try:
-            await asyncio.sleep(30.0)
+            await asyncio.sleep(_RETIRED_CHANNEL_CLOSE_DELAY_SECONDS)
             await channel.close()
         finally:
             async with self._recreate_lock:
diff --git a/durabletask/internal/grpc_resiliency.py b/durabletask/internal/grpc_resiliency.py
@@ -2,37 +2,59 @@
 # Licensed under the MIT License.
 
 import random
-from dataclasses import dataclass
+import threading
+from dataclasses import dataclass, field
 
 import grpc
 
+# Sidecar RPCs that legitimately block on the server until an instance starts or
+# completes. ``DEADLINE_EXCEEDED`` on these is the caller's chosen timeout
+# expiring rather than a transport failure, so we do not treat it as one.
 LONG_POLL_METHODS = {"WaitForInstanceStart", "WaitForInstanceCompletion"}
 
+# Cap the attempt number fed into ``2 ** attempt`` to keep the jitter calculation
+# bounded for callers that retry indefinitely; once we hit the cap, the upper
+# bound is fully governed by ``cap_seconds``.
+_MAX_JITTER_ATTEMPT_EXPONENT = 30
+
 
 def get_full_jitter_delay_seconds(
         attempt: int,
         *,
         base_seconds: float,
         cap_seconds: float,
 ) -> float:
-    capped_attempt = min(attempt, 30)
+    capped_attempt = min(attempt, _MAX_JITTER_ATTEMPT_EXPONENT)
     upper_bound = min(cap_seconds, base_seconds * (2 ** capped_attempt))
     return random.random() * upper_bound
 
 
 @dataclass
 class FailureTracker:
+    """Counts consecutive transport failures with thread-safe mutation.
+
+    The sync ``TaskHubGrpcClient`` is commonly invoked from multiple worker
+    threads, so ``record_failure``/``record_success`` need a lock to keep the
+    increment-and-compare atomic. The async client only mutates this from a
+    single event loop, but the extra lock has negligible cost on that path.
+    """
+
     threshold: int
     consecutive_failures: int = 0
+    _lock: threading.Lock = field(
+        default_factory=threading.Lock, init=False, repr=False, compare=False
+    )
 
     def record_failure(self) -> bool:
         if self.threshold <= 0:
             return False
-        self.consecutive_failures += 1
-        return self.consecutive_failures >= self.threshold
+        with self._lock:
+            self.consecutive_failures += 1
+            return self.consecutive_failures >= self.threshold
 
     def record_success(self) -> None:
-        self.consecutive_failures = 0
+        with self._lock:
+            self.consecutive_failures = 0
 
 
 def is_client_transport_failure(method_name: str, status_code: grpc.StatusCode) -> bool:
diff --git a/durabletask/worker.py b/durabletask/worker.py
@@ -140,15 +140,19 @@ class _TrackedChannelState:
 class _InFlightChannelTracker:
     def __init__(self):
         self._lock = Lock()
-        self._states: dict[int, _TrackedChannelState] = {}
+        # Keyed on the channel itself; gRPC channels are hashable by identity
+        # and we keep a strong reference via _TrackedChannelState so reuse-after-
+        # GC isn't a concern. Using the channel directly (instead of ``id()``)
+        # makes the invariant local to this class rather than something a
+        # reader has to verify by tracing _TrackedChannelState lifetimes.
+        self._states: dict[Any, _TrackedChannelState] = {}
 
     def acquire(self, channel: Any):
-        channel_key = id(channel)
         with self._lock:
-            state = self._states.get(channel_key)
+            state = self._states.get(channel)
             if state is None:
                 state = _TrackedChannelState(channel=channel)
-                self._states[channel_key] = state
+                self._states[channel] = state
             state.ref_count += 1
 
         released = False
@@ -161,26 +165,25 @@ def release() -> None:
 
             channel_to_close = None
             with self._lock:
-                state = self._states.get(channel_key)
+                state = self._states.get(channel)
                 if state is None:
                     return
 
                 state.ref_count -= 1
                 if state.ref_count == 0:
                     if state.close_when_released:
                         channel_to_close = state.channel
-                    del self._states[channel_key]
+                    del self._states[channel]
 
             if channel_to_close is not None:
                 self._close_channel(channel_to_close)
 
         return release
 
     def retire(self, channel: Any) -> None:
-        channel_key = id(channel)
         channel_to_close = None
         with self._lock:
-            state = self._states.get(channel_key)
+            state = self._states.get(channel)
             if state is None:
                 channel_to_close = channel
             else:
@@ -533,6 +536,10 @@ def __init__(
         self._shutdown = Event()
         self._is_running = False
         self._channel = channel
+        # The SDK owns (and may recreate) the gRPC channel only when the caller
+        # did not provide one. Mirrors ``TaskHubGrpcClient._owns_channel`` so
+        # both files use the same name for the same concept.
+        self._owns_channel = channel is None
         self._secure_channel = secure_channel
         self._payload_store = payload_store
         self._channel_options = channel_options
@@ -598,9 +605,6 @@ def _should_count_worker_failure(
     ) -> bool:
         return is_worker_transport_failure(status_code)
 
-    def _can_recreate_channel(self) -> bool:
-        return self._channel is None
-
     def add_orchestrator(self, fn: task.Orchestrator[TInput, TOutput]) -> str:
         """Registers an orchestrator function with the worker."""
         if self._is_running:
@@ -742,16 +746,7 @@ def create_fresh_connection():
                 current_stub = None
                 raise
 
-        def wrap_execution(handler, release):
-            def wrapped(*args, **kwargs):
-                try:
-                    return handler(*args, **kwargs)
-                finally:
-                    release()
-
-            return wrapped
-
-        def wrap_cancellation(handler, release):
+        def wrap_with_release(handler, release):
             def wrapped(*args, **kwargs):
                 try:
                     return handler(*args, **kwargs)
@@ -772,8 +767,8 @@ def submit_work_item(
             release = in_flight_channel_tracker.acquire(channel)
             try:
                 submit_func(
-                    wrap_execution(handler, release),
-                    wrap_cancellation(cancellation_handler, release),
+                    wrap_with_release(handler, release),
+                    wrap_with_release(cancellation_handler, release),
                     request,
                     stub,
                     completion_token,
@@ -808,7 +803,7 @@ def invalidate_connection(
 
             if (
                     current_channel is not None
-                    and self._can_recreate_channel()
+                    and self._owns_channel
                     and (recreate_channel or close_channel)
             ):
                 in_flight_channel_tracker.retire(current_channel)
@@ -837,7 +832,7 @@ def should_invalidate_connection(rpc_error):
                         if self._should_count_worker_failure(error_code):
                             recreate_channel = (
                                 failure_tracker.record_failure()
-                                and self._can_recreate_channel()
+                                and self._owns_channel
                             )
                     invalidate_connection(recreate_channel=recreate_channel)
                     conn_retry_count += 1
@@ -995,7 +990,7 @@ def stream_reader():
                     )
                     recreate_channel = (
                         failure_tracker.record_failure()
-                        and self._can_recreate_channel()
+                        and self._owns_channel
                     )
                     invalidate_connection(recreate_channel=recreate_channel)
                     conn_retry_count += 1
@@ -1010,7 +1005,7 @@ def stream_reader():
                 if should_invalidate and self._should_count_worker_failure(error_code):
                     recreate_channel = (
                         failure_tracker.record_failure()
-                        and self._can_recreate_channel()
+                        and self._owns_channel
                     )
                 if should_invalidate:
                     invalidate_connection(recreate_channel=recreate_channel)
@@ -2893,6 +2888,7 @@ def __init__(self, concurrency_options: ConcurrencyOptions, logger: logging.Logg
         self._pending_orchestration_work: list = []
         self._pending_entity_batch_work: list = []
         self.thread_pool = self._create_thread_pool()
+        self._pool_is_shutdown = False
         self._shutdown = False
 
     def _create_thread_pool(self) -> ThreadPoolExecutor:
@@ -2902,8 +2898,12 @@ def _create_thread_pool(self) -> ThreadPoolExecutor:
         )
 
     def _ensure_thread_pool(self) -> None:
-        if getattr(self.thread_pool, "_shutdown", False):
+        # Track the pool's shutdown state explicitly instead of reading
+        # ``ThreadPoolExecutor._shutdown`` (which is a CPython implementation
+        # detail and not part of ``concurrent.futures``'s public API).
+        if self._pool_is_shutdown:
             self.thread_pool = self._create_thread_pool()
+            self._pool_is_shutdown = False
 
     def prepare_for_run(self) -> None:
         self._shutdown = False
@@ -3045,8 +3045,9 @@ async def run(self):
                     self._logger.error(f"Uncaught error while cancelling entity batch work item: {cancellation_exception}")
             self.shutdown()
         finally:
-            if not getattr(self.thread_pool, "_shutdown", False):
+            if not self._pool_is_shutdown:
                 self.thread_pool.shutdown(wait=True)
+                self._pool_is_shutdown = True
 
     async def _consume_queue(self, queue: asyncio.Queue, semaphore: asyncio.Semaphore):
         # List to track running tasks
diff --git a/tests/durabletask/test_grpc_resiliency.py b/tests/durabletask/test_grpc_resiliency.py
@@ -147,6 +147,26 @@ def test_failure_tracker_threshold_zero_never_trips():
     assert tracker.consecutive_failures == 0
 
 
+def test_failure_tracker_record_failure_is_thread_safe():
+    import threading
+
+    tracker = FailureTracker(threshold=10_000)
+    iterations = 500
+    workers = 8
+
+    def increment() -> None:
+        for _ in range(iterations):
+            tracker.record_failure()
+
+    threads = [threading.Thread(target=increment) for _ in range(workers)]
+    for thread in threads:
+        thread.start()
+    for thread in threads:
+        thread.join()
+
+    assert tracker.consecutive_failures == iterations * workers
+
+
 @pytest.mark.parametrize(
     "method_name",
     [
diff --git a/tests/durabletask/test_worker_resiliency.py b/tests/durabletask/test_worker_resiliency.py
@@ -131,6 +131,26 @@ async def test_async_worker_manager_honors_shutdown_requested_before_run():
     await asyncio.wait_for(manager.run(), timeout=1.0)
 
 
+@pytest.mark.asyncio
+async def test_async_worker_manager_recreates_thread_pool_after_run():
+    manager = _AsyncWorkerManager(
+        ConcurrencyOptions(maximum_thread_pool_workers=1),
+        MagicMock(),
+    )
+
+    original_pool = manager.thread_pool
+
+    manager.shutdown()
+    await asyncio.wait_for(manager.run(), timeout=1.0)
+
+    assert manager._pool_is_shutdown is True
+
+    manager.prepare_for_run()
+
+    assert manager._pool_is_shutdown is False
+    assert manager.thread_pool is not original_pool
+
+
 def test_worker_start_clears_prior_shutdown_request():
     worker = TaskHubGrpcWorker()
     worker._shutdown.set()
@@ -190,7 +210,7 @@ def test_worker_counts_only_transport_failures_for_recreation():
 
 def test_worker_does_not_recreate_caller_owned_channel():
     worker = TaskHubGrpcWorker(channel=MagicMock())
-    assert worker._can_recreate_channel() is False
+    assert worker._owns_channel is False
 
 
 @pytest.mark.asyncio