
Commit 66e920c

fixes: coroutine and threading mix caused blocking bugs
Signed-off-by: Raphael Glon <oOraph@users.noreply.github.com>
1 parent 54d2596 commit 66e920c
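
The class of bug this commit targets is easy to reproduce: a synchronous lock or blocking call executed directly inside a coroutine stalls the entire event loop. A minimal standalone demonstration, with hypothetical names, not toolkit code:

    import asyncio
    import threading
    import time

    LOCK = threading.Lock()  # a sync primitive, invisible to the event loop

    async def bad(n: int) -> None:
        with LOCK:         # held across blocking work: the loop cannot switch tasks
            time.sleep(1)  # blocks the only thread running the event loop
            print(f"request {n} done")

    async def main() -> None:
        t0 = time.perf_counter()
        await asyncio.gather(bad(1), bad(2))
        # ~2s total: the two "concurrent" requests ran strictly one after the other
        print(f"elapsed: {time.perf_counter() - t0:.1f}s")

    asyncio.run(main())

The remedy applied file by file below is to swap `threading.Lock` for `anyio.Semaphore` (awaitable, loop-friendly) and to push blocking work onto a worker thread via `anyio.to_thread.run_sync`.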

File tree: 4 files changed, +65 −35 lines

  src/huggingface_inference_toolkit/async_utils.py
  src/huggingface_inference_toolkit/handler.py
  src/huggingface_inference_toolkit/idle.py
  src/huggingface_inference_toolkit/webservice_starlette.py


src/huggingface_inference_toolkit/async_utils.py
Lines changed: 6 additions & 2 deletions

@@ -5,6 +5,8 @@
 from anyio import Semaphore
 from typing_extensions import ParamSpec
 
+from huggingface_inference_toolkit.logging import logger
+
 # To not have too many threads running (which could happen on too many concurrent
 # requests, we limit it with a semaphore.
 MAX_CONCURRENT_THREADS = 1
@@ -15,6 +17,8 @@
 
 # moves blocking call to asyncio threadpool limited to 1 to not overload the system
 # REF: https://stackoverflow.com/a/70929141
-async def async_handler_call(handler: Callable[P, T], body: Dict[str, Any]) -> T:
+async def async_call(handler: Callable[P, T], *args, **kwargs) -> T:
+    logger.info("Setting blocking call to async handler")
     async with MAX_THREADS_GUARD:
-        return await anyio.to_thread.run_sync(functools.partial(handler, body))
+        logger.info("Async call semaphore passed")
+        return await anyio.to_thread.run_sync(handler, *args, **kwargs)
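
Taken in isolation, the pattern `async_call` implements looks like the sketch below: the semaphore is awaited on the event loop, then the blocking handler runs on anyio's thread pool, so other coroutines keep making progress. `slow_handler` is an invented stand-in:

    import time
    import anyio
    from anyio import Semaphore

    MAX_THREADS_GUARD = Semaphore(1)  # at most one blocking call in flight

    def slow_handler(payload: dict) -> dict:
        time.sleep(1)  # stands in for blocking model inference
        return {"echo": payload}

    async def async_call(handler, *args):
        async with MAX_THREADS_GUARD:  # awaitable: never stalls the loop
            return await anyio.to_thread.run_sync(handler, *args)

    async def main():
        print(await async_call(slow_handler, {"inputs": "hi"}))

    anyio.run(main)

One caveat worth noting: `anyio.to_thread.run_sync` forwards only positional arguments to the target function, so the `**kwargs` accepted by the new signature would need something like `functools.partial(handler, **kwargs)` to actually reach the handler.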

src/huggingface_inference_toolkit/handler.py
Lines changed: 7 additions & 0 deletions

@@ -1,5 +1,6 @@
 import os
 from pathlib import Path
+from time import perf_counter
 from typing import Any, Dict, Literal, Optional, Union
 
 from huggingface_inference_toolkit import logging
@@ -37,7 +38,13 @@ def __call__(self, data: Dict[str, Any]):
         :data: (obj): the raw request body data.
         :return: prediction output
         """
+        start = perf_counter()
+        pred = self._timed_call(data)
+        end = perf_counter()
+        logger.info("Inference duration: %.2f ms", (end - start) * 1000)
+        return pred
 
+    def _timed_call(self, data: Dict[str, Any]):
         logger.debug("Calling HF default handler")
         # import as late as possible to reduce the footprint
         from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
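
The handler change is mechanical: the public `__call__` becomes a thin timing wrapper delegating to a new private `_timed_call`. The shape of the pattern, with hypothetical names:

    from time import perf_counter

    class TimedHandler:
        def __call__(self, data: dict):
            # public entry point: time the real work and report the duration
            start = perf_counter()
            pred = self._timed_call(data)
            print(f"Inference duration: {(perf_counter() - start) * 1000:.2f} ms")
            return pred

        def _timed_call(self, data: dict):
            # stands in for the actual prediction routine
            return {"ok": data}

    print(TimedHandler()({"inputs": "hi"}))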

src/huggingface_inference_toolkit/idle.py
Lines changed: 31 additions & 17 deletions

@@ -5,6 +5,8 @@
 import signal
 import time
 
+from anyio import Semaphore
+
 LOG = logging.getLogger(__name__)
 
 LAST_START = None
@@ -13,13 +15,16 @@
 UNLOAD_IDLE = os.getenv("UNLOAD_IDLE", "").lower() in ("1", "true")
 IDLE_TIMEOUT = int(os.getenv("IDLE_TIMEOUT", 15))
 
+MAX_REQUESTS = 1000
+REQUEST_COUNTER = Semaphore(MAX_REQUESTS)
+
 
 async def live_check_loop():
     global LAST_START, LAST_END
 
     pid = os.getpid()
 
-    LOG.debug("Starting live check loop")
+    LOG.info("Starting live check loop")
     sleep_time = max(int(IDLE_TIMEOUT // 5), 1)
 
     while True:
@@ -31,32 +36,41 @@ async def live_check_loop():
 
         LOG.debug("Checking pid %d activity", pid)
         if not last_start:
+            LOG.debug("No request yet, no need to unload")
+            continue
+
+        if REQUEST_COUNTER.value < MAX_REQUESTS:
+            LOG.info("idle checker: %s requests likely being processed for pid %d, it won't be killed",
+                     MAX_REQUESTS - REQUEST_COUNTER.value, pid)
             continue
         if not last_end or last_start >= last_end:
-            LOG.debug("Request likely being processed for pid %d", pid)
+            LOG.warning("This case should not be possible, semaphore inconsistency? "
+                        "Request likely being processed for pid %d", pid)
             continue
         now = time.time()
         last_request_age = now - last_end
         LOG.debug("Pid %d, last request age %s", pid, last_request_age)
         if last_request_age < IDLE_TIMEOUT:
             LOG.debug("Model recently active")
         else:
-            LOG.debug("Inactive for too long. Leaving live check loop")
+            LOG.info("Idle checker: worker inactive for too long. Leaving live check loop")
             break
-    LOG.debug("Aborting this worker")
+    LOG.info("Aborting this idle worker")
     os.kill(pid, signal.SIGTERM)
 
 
-@contextlib.contextmanager
-def request_witnesses():
-    global LAST_START, LAST_END
-    LOG.debug("Last request start was %s", LAST_START)
-    LOG.debug("Last request end was %s", LAST_END)
-    # Simple assignment, concurrency safe, no need for any lock
-    LAST_START = time.time()
-    LOG.debug("Current request start timestamp %s", LAST_START)
-    try:
-        yield
-    finally:
-        LAST_END = time.time()
-        LOG.debug("Current request end timestamp %s", LAST_END)
+@contextlib.asynccontextmanager
+async def request_witnesses():
+    async with REQUEST_COUNTER:
+        LOG.info("Current request count, %s", REQUEST_COUNTER.value)
+        global LAST_START, LAST_END
+        LOG.info("Last request start was %s", LAST_START)
+        LOG.info("Last request end was %s", LAST_END)
+        # Simple assignment, concurrency safe, no need for any lock
+        LAST_START = time.time()
+        LOG.info("Current request start timestamp %s", LAST_START)
+        try:
+            yield
+        finally:
+            LAST_END = time.time()
+            LOG.info("Current request end timestamp %s", LAST_END)

src/huggingface_inference_toolkit/webservice_starlette.py
Lines changed: 21 additions & 16 deletions

@@ -1,17 +1,17 @@
 import asyncio
 import base64
 import os
-import threading
 from pathlib import Path
 from time import perf_counter
 
 import orjson
+from anyio import Semaphore
 from starlette.applications import Starlette
 from starlette.responses import PlainTextResponse, Response
 from starlette.routing import Route
 
 from huggingface_inference_toolkit import idle
-from huggingface_inference_toolkit.async_utils import MAX_CONCURRENT_THREADS, MAX_THREADS_GUARD, async_handler_call
+from huggingface_inference_toolkit.async_utils import MAX_CONCURRENT_THREADS, MAX_THREADS_GUARD, async_call
 from huggingface_inference_toolkit.const import (
     HF_FRAMEWORK,
     HF_HUB_TOKEN,
@@ -32,9 +32,9 @@
     from huggingface_inference_toolkit.vertex_ai_utils import _load_repository_from_gcs
 
 INFERENCE_HANDLERS = {}
-INFERENCE_HANDLERS_LOCK = threading.Lock()
+INFERENCE_HANDLERS_SEMAPHORE = Semaphore(1)
 MODEL_DOWNLOADED = False
-MODEL_DL_LOCK = threading.Lock()
+MODEL_DL_SEMAPHORE = Semaphore(1)
 
 
 async def prepare_model_artifacts():
@@ -43,7 +43,7 @@ async def prepare_model_artifacts():
     if idle.UNLOAD_IDLE:
         asyncio.create_task(idle.live_check_loop(), name="live_check_loop")
     else:
-        _eager_model_dl()
+        await async_call(_eager_model_dl)
     logger.info(f"Initializing model from directory:{HF_MODEL_DIR}")
     # 2. determine correct inference handler
     inference_handler = get_inference_handler_either_custom_or_default_handler(
@@ -54,7 +54,7 @@
 
 
 def _eager_model_dl():
-    logger.debug("Model download")
+    logger.info("Model download")
     global MODEL_DOWNLOADED
     from huggingface_inference_toolkit.heavy_utils import load_repository_from_hf
     # 1. check if model artifacts available in HF_MODEL_DIR
@@ -83,7 +83,8 @@ def _eager_model_dl():
             HF_MODEL_DIR: {HF_MODEL_DIR} and HF_MODEL_ID:{HF_MODEL_ID}"""
         )
     else:
-        logger.debug("Model already downloaded in %s", HF_MODEL_DIR)
+        logger.info("Model already downloaded in %s", HF_MODEL_DIR)
+    logger.info("Model successfully downloaded")
     MODEL_DOWNLOADED = True
 
 
@@ -104,14 +105,19 @@ async def metrics(request):
 
 
 async def predict(request):
-    with idle.request_witnesses():
+    total_start_time = perf_counter()
+
+    async with idle.request_witnesses():
         logger.debug("Received request, scope %s", request.scope)
 
         global INFERENCE_HANDLERS
 
         if not MODEL_DOWNLOADED:
-            with MODEL_DL_LOCK:
-                await asyncio.to_thread(_eager_model_dl)
+            async with MODEL_DL_SEMAPHORE:
+                if not MODEL_DOWNLOADED:
+                    logger.info("Model dl semaphore acquired")
+                    await async_call(_eager_model_dl)
+            logger.info("Model dl semaphore released")
         try:
             task = request.path_params.get("task", HF_TASK)
             # extracts content from request
@@ -152,28 +158,27 @@ async def predict(request):
             task = "sentence-embeddings"
         inference_handler = INFERENCE_HANDLERS.get(task)
         if not inference_handler:
-            with INFERENCE_HANDLERS_LOCK:
+            async with INFERENCE_HANDLERS_SEMAPHORE:
                 if task not in INFERENCE_HANDLERS:
                     inference_handler = get_inference_handler_either_custom_or_default_handler(
                         HF_MODEL_DIR, task=task)
                     INFERENCE_HANDLERS[task] = inference_handler
                 else:
                     inference_handler = INFERENCE_HANDLERS[task]
-        # tracks request time
-        start_time = perf_counter()
 
         if should_discard_left() and isinstance(inference_handler, HuggingFaceHandler):
             deserialized_body['handler_params'] = {
                 'request': request
             }
 
-        logger.debug("Calling inference handler prediction routine")
+        logger.info("Calling inference handler prediction routine")
         # run async not blocking call
-        pred = await async_handler_call(inference_handler, deserialized_body)
+        pred = await async_call(inference_handler, deserialized_body)
 
         # log request time
+        end_time = perf_counter()
         logger.info(
-            f"POST {request.url.path} | Duration: {(perf_counter()-start_time) *1000:.2f} ms"
+            f"POST {request.url.path} Total request duration: {(end_time-total_start_time) *1000:.2f} ms"
         )
 
         if should_discard_left() and pred is None:
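
The download path now follows double-checked initialization: test the flag, await the semaphore, re-test under it, then run the blocking download in a worker thread. The old version acquired a `threading.Lock` synchronously and held it across an `await`, which can block the event loop while waiting. A condensed sketch of the fixed pattern (the download body is a stub):

    import anyio
    from anyio import Semaphore

    MODEL_DOWNLOADED = False
    MODEL_DL_SEMAPHORE = Semaphore(1)

    def _eager_model_dl() -> None:
        # stand-in for the blocking repository download
        global MODEL_DOWNLOADED
        MODEL_DOWNLOADED = True

    async def ensure_model() -> None:
        if not MODEL_DOWNLOADED:                # fast path, no contention
            async with MODEL_DL_SEMAPHORE:      # awaitable, loop-friendly
                if not MODEL_DOWNLOADED:        # re-check: only one downloader runs
                    await anyio.to_thread.run_sync(_eager_model_dl)

    anyio.run(ensure_model)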
