From 3c06c7742e663b2635d9b833cce89b2759586dda Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Wed, 1 Apr 2026 17:25:22 -0700 Subject: [PATCH 1/6] fix(cli): handle sync/async serve functions in m serve Fixes sync/async mismatch in `m serve` by detecting function type and handling appropriately: - Async serve functions are awaited directly - Sync serve functions are wrapped in asyncio.to_thread() to prevent blocking FastAPI's event loop This ensures the server can handle concurrent requests efficiently regardless of whether user-defined serve functions are sync or async. Changes: - cli/serve/app.py: Add asyncio/inspect imports, update make_chat_endpoint() to detect coroutine functions and wrap sync functions in to_thread() - test/cli/test_serve_sync_async.py: Add comprehensive test suite (9 tests) including empirical timing test that proves non-blocking behavior Signed-off-by: Mark Sturdevant --- cli/serve/app.py | 35 +++-- test/cli/test_serve_sync_async.py | 236 ++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+), 9 deletions(-) create mode 100644 test/cli/test_serve_sync_async.py diff --git a/cli/serve/app.py b/cli/serve/app.py index df9eeab76..4dfacb585 100644 --- a/cli/serve/app.py +++ b/cli/serve/app.py @@ -1,6 +1,8 @@ """A simple app that runs an OpenAI compatible server wrapped around a M program.""" +import asyncio import importlib.util +import inspect import os import sys import time @@ -58,15 +60,30 @@ async def endpoint(request: ChatCompletionRequest): completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}" created_timestamp = int(time.time()) - output = module.serve( - input=request.messages, - requirements=request.requirements, - model_options={ - k: v - for k, v in request.model_dump().items() - if k not in ["messages", "requirements"] - }, - ) + # Detect if serve is async or sync and handle accordingly + if inspect.iscoroutinefunction(module.serve): + # It's async, await it directly + output = await module.serve( + input=request.messages, + requirements=request.requirements, + model_options={ + k: v + for k, v in request.model_dump().items() + if k not in ["messages", "requirements"] + }, + ) + else: + # It's sync, run in thread pool to avoid blocking event loop + output = await asyncio.to_thread( + module.serve, + input=request.messages, + requirements=request.requirements, + model_options={ + k: v + for k, v in request.model_dump().items() + if k not in ["messages", "requirements"] + }, + ) # Extract usage information from the ModelOutputThunk if available usage = None diff --git a/test/cli/test_serve_sync_async.py b/test/cli/test_serve_sync_async.py new file mode 100644 index 000000000..1e9ece80c --- /dev/null +++ b/test/cli/test_serve_sync_async.py @@ -0,0 +1,236 @@ +"""Tests for sync/async serve function handling in m serve.""" + +import asyncio +from unittest.mock import Mock + +import pytest + +from cli.serve.app import make_chat_endpoint +from cli.serve.models import ChatCompletionRequest, ChatMessage +from mellea.core import ModelOutputThunk + + +@pytest.fixture +def mock_sync_module(): + """Create a mock module with a synchronous serve function.""" + module = Mock() + module.__name__ = "test_sync_module" + + def sync_serve(input, requirements=None, model_options=None): + """Synchronous serve function.""" + # Simulate some work + result = Mock(spec=ModelOutputThunk) + result.value = f"Sync response to: {input[-1].content}" + return result + + # Use Mock to wrap the function so we can track calls + module.serve = Mock(side_effect=sync_serve) + return module + + +@pytest.fixture +def mock_async_module(): + """Create a mock module with an asynchronous serve function.""" + module = Mock() + module.__name__ = "test_async_module" + + async def async_serve(input, requirements=None, model_options=None): + """Asynchronous serve function.""" + # Simulate async work + await asyncio.sleep(0.01) + result = Mock(spec=ModelOutputThunk) + result.value = f"Async response to: {input[-1].content}" + return result + + module.serve = async_serve + return module + + +@pytest.fixture +def mock_slow_sync_module(): + """Create a mock module with a slow synchronous serve function.""" + module = Mock() + module.__name__ = "test_slow_sync_module" + + def slow_sync_serve(input, requirements=None, model_options=None): + """Slow synchronous serve function that would block event loop.""" + import time + + time.sleep(0.1) # Simulate blocking work + result = Mock(spec=ModelOutputThunk) + result.value = f"Slow sync response to: {input[-1].content}" + return result + + module.serve = slow_sync_serve + return module + + +class TestSyncAsyncServeHandling: + """Test that serve handles both sync and async serve functions correctly.""" + + @pytest.mark.asyncio + async def test_sync_serve_function(self, mock_sync_module): + """Test that synchronous serve functions work correctly.""" + endpoint = make_chat_endpoint(mock_sync_module) + + request = ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content="Hello sync!")], + ) + + response = await endpoint(request) + + assert response.choices[0].message.content == "Sync response to: Hello sync!" + assert response.model == "test-model" + assert response.object == "chat.completion" + + @pytest.mark.asyncio + async def test_async_serve_function(self, mock_async_module): + """Test that asynchronous serve functions work correctly.""" + endpoint = make_chat_endpoint(mock_async_module) + + request = ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content="Hello async!")], + ) + + response = await endpoint(request) + + assert response.choices[0].message.content == "Async response to: Hello async!" + assert response.model == "test-model" + assert response.object == "chat.completion" + + @pytest.mark.asyncio + async def test_slow_sync_does_not_block(self, mock_slow_sync_module): + """Test that slow sync functions run in thread pool and don't block event loop. + + This test verifies non-blocking behavior by measuring timing. If the sync + function blocked the event loop, two sequential calls would take 2x the time. + With proper threading, they should overlap and take only slightly more than 1x. + """ + import time + + endpoint = make_chat_endpoint(mock_slow_sync_module) + + request = ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content="Hello slow!")], + ) + + # Time two concurrent requests + start = time.time() + results = await asyncio.gather(endpoint(request), endpoint(request)) + elapsed = time.time() - start + + # If blocking: would take ~0.2s (0.1s + 0.1s sequentially) + # If non-blocking: should take ~0.1s (both run concurrently in threads) + # Allow some overhead, but should be much less than 0.2s + assert elapsed < 0.15, ( + f"Took {elapsed:.3f}s - appears to be blocking (expected ~0.1s)" + ) + assert all( + r.choices[0].message.content == "Slow sync response to: Hello slow!" + for r in results + ) + + @pytest.mark.asyncio + async def test_concurrent_requests_with_sync_serve(self, mock_slow_sync_module): + """Test that multiple sync requests can be handled concurrently.""" + endpoint = make_chat_endpoint(mock_slow_sync_module) + + requests = [ + ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content=f"Request {i}")], + ) + for i in range(3) + ] + + # Run requests concurrently + responses = await asyncio.gather(*[endpoint(req) for req in requests]) + + # All should complete successfully + assert len(responses) == 3 + for i, response in enumerate(responses): + assert ( + response.choices[0].message.content + == f"Slow sync response to: Request {i}" + ) + + @pytest.mark.asyncio + async def test_requirements_passed_to_serve(self, mock_sync_module): + """Test that requirements are correctly passed to serve function.""" + endpoint = make_chat_endpoint(mock_sync_module) + + request = ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content="Test")], + requirements=["req1", "req2"], + ) + + await endpoint(request) + + # Verify serve was called with requirements + mock_sync_module.serve.assert_called_once() + call_kwargs = mock_sync_module.serve.call_args.kwargs + assert call_kwargs["requirements"] == ["req1", "req2"] + + @pytest.mark.asyncio + async def test_model_options_passed_to_serve(self, mock_sync_module): + """Test that model options are correctly passed to serve function.""" + endpoint = make_chat_endpoint(mock_sync_module) + + request = ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content="Test")], + temperature=0.7, + max_tokens=100, + ) + + await endpoint(request) + + # Verify serve was called with model_options + mock_sync_module.serve.assert_called_once() + call_kwargs = mock_sync_module.serve.call_args.kwargs + model_options = call_kwargs["model_options"] + assert "temperature" in model_options + assert "max_tokens" in model_options + + +class TestEndpointIntegration: + """Integration tests for the full endpoint.""" + + def test_endpoint_name_set_correctly(self, mock_sync_module): + """Test that endpoint function name is set correctly.""" + endpoint = make_chat_endpoint(mock_sync_module) + assert endpoint.__name__ == "chat_test_sync_module_endpoint" + + @pytest.mark.asyncio + async def test_completion_id_generated(self, mock_sync_module): + """Test that each response gets a unique completion ID.""" + endpoint = make_chat_endpoint(mock_sync_module) + + request = ChatCompletionRequest( + model="test-model", messages=[ChatMessage(role="user", content="Test")] + ) + + response1 = await endpoint(request) + response2 = await endpoint(request) + + assert response1.id.startswith("chatcmpl-") + assert response2.id.startswith("chatcmpl-") + assert response1.id != response2.id + + @pytest.mark.asyncio + async def test_timestamp_generated(self, mock_sync_module): + """Test that response includes a timestamp.""" + endpoint = make_chat_endpoint(mock_sync_module) + + request = ChatCompletionRequest( + model="test-model", messages=[ChatMessage(role="user", content="Test")] + ) + + response = await endpoint(request) + + assert isinstance(response.created, int) + assert response.created > 0 From a2c1958aed33c8eedcf782805ac25bb72b132b9d Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Wed, 1 Apr 2026 17:52:59 -0700 Subject: [PATCH 2/6] feat: Map OpenAI parameters to ModelOption sentinels: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - temperature → ModelOption.TEMPERATURE - max_tokens → ModelOption.MAX_NEW_TOKENS - seed → ModelOption.SEED Signed-off-by: Mark Sturdevant --- cli/serve/app.py | 41 +++++++++++++++++++++++-------- test/cli/test_serve.py | 12 +++++---- test/cli/test_serve_sync_async.py | 29 ++++++++++++++++++++-- 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/cli/serve/app.py b/cli/serve/app.py index 4dfacb585..713cc4331 100644 --- a/cli/serve/app.py +++ b/cli/serve/app.py @@ -13,6 +13,8 @@ from fastapi import FastAPI from fastapi.responses import JSONResponse +from mellea.backends.model_options import ModelOption + from .models import ( ChatCompletion, ChatCompletionMessage, @@ -55,22 +57,45 @@ def create_openai_error_response( def make_chat_endpoint(module): """Makes a chat endpoint using a custom module.""" + def _build_model_options(request: ChatCompletionRequest) -> dict: + """Build model_options dict, mapping OpenAI params to ModelOption sentinels.""" + model_options = {} + + # Map standard OpenAI parameters to ModelOption sentinels + if request.temperature is not None: + model_options[ModelOption.TEMPERATURE] = request.temperature + if request.max_tokens is not None: + model_options[ModelOption.MAX_NEW_TOKENS] = request.max_tokens + if request.seed is not None: + model_options[ModelOption.SEED] = request.seed + + # Include any other fields that aren't messages or requirements + for k, v in request.model_dump().items(): + if k not in [ + "messages", + "requirements", + "temperature", + "max_tokens", + "seed", + ]: + model_options[k] = v + + return model_options + async def endpoint(request: ChatCompletionRequest): try: completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}" created_timestamp = int(time.time()) + model_options = _build_model_options(request) + # Detect if serve is async or sync and handle accordingly if inspect.iscoroutinefunction(module.serve): # It's async, await it directly output = await module.serve( input=request.messages, requirements=request.requirements, - model_options={ - k: v - for k, v in request.model_dump().items() - if k not in ["messages", "requirements"] - }, + model_options=model_options, ) else: # It's sync, run in thread pool to avoid blocking event loop @@ -78,11 +103,7 @@ async def endpoint(request: ChatCompletionRequest): module.serve, input=request.messages, requirements=request.requirements, - model_options={ - k: v - for k, v in request.model_dump().items() - if k not in ["messages", "requirements"] - }, + model_options=model_options, ) # Extract usage information from the ModelOutputThunk if available diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py index 584be0876..5259756ac 100644 --- a/test/cli/test_serve.py +++ b/test/cli/test_serve.py @@ -122,6 +122,8 @@ async def test_system_fingerprint_always_none(self, mock_module, sample_request) @pytest.mark.asyncio async def test_model_options_passed_correctly(self, mock_module, sample_request): """Test that model options are passed to serve function correctly.""" + from mellea.backends.model_options import ModelOption + mock_output = ModelOutputThunk("Test response") mock_module.serve.return_value = mock_output @@ -134,11 +136,11 @@ async def test_model_options_passed_correctly(self, mock_module, sample_request) assert "model_options" in call_args.kwargs model_options = call_args.kwargs["model_options"] - # Should include temperature and max_tokens but not messages/requirements - assert "temperature" in model_options - assert model_options["temperature"] == 0.7 - assert "max_tokens" in model_options - assert model_options["max_tokens"] == 100 + # Should include ModelOption sentinels for temperature and max_tokens + assert ModelOption.TEMPERATURE in model_options + assert model_options[ModelOption.TEMPERATURE] == 0.7 + assert ModelOption.MAX_NEW_TOKENS in model_options + assert model_options[ModelOption.MAX_NEW_TOKENS] == 100 assert "messages" not in model_options assert "requirements" not in model_options diff --git a/test/cli/test_serve_sync_async.py b/test/cli/test_serve_sync_async.py index 1e9ece80c..d767d14ea 100644 --- a/test/cli/test_serve_sync_async.py +++ b/test/cli/test_serve_sync_async.py @@ -7,6 +7,7 @@ from cli.serve.app import make_chat_endpoint from cli.serve.models import ChatCompletionRequest, ChatMessage +from mellea.backends.model_options import ModelOption from mellea.core import ModelOutputThunk @@ -193,8 +194,32 @@ async def test_model_options_passed_to_serve(self, mock_sync_module): mock_sync_module.serve.assert_called_once() call_kwargs = mock_sync_module.serve.call_args.kwargs model_options = call_kwargs["model_options"] - assert "temperature" in model_options - assert "max_tokens" in model_options + assert ModelOption.TEMPERATURE in model_options + assert ModelOption.MAX_NEW_TOKENS in model_options + + @pytest.mark.asyncio + async def test_openai_params_mapped_to_model_options(self, mock_sync_module): + """Test that OpenAI parameters are mapped to ModelOption sentinels.""" + endpoint = make_chat_endpoint(mock_sync_module) + + request = ChatCompletionRequest( + model="test-model", + messages=[ChatMessage(role="user", content="Test")], + temperature=0.8, + max_tokens=150, + seed=42, + ) + + await endpoint(request) + + # Verify parameters are mapped correctly + mock_sync_module.serve.assert_called_once() + call_kwargs = mock_sync_module.serve.call_args.kwargs + model_options = call_kwargs["model_options"] + + assert model_options[ModelOption.TEMPERATURE] == 0.8 + assert model_options[ModelOption.MAX_NEW_TOKENS] == 150 + assert model_options[ModelOption.SEED] == 42 class TestEndpointIntegration: From 73a040fb5e31ecf92e6ee6d50492635e459a5e6f Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Fri, 3 Apr 2026 08:29:29 -0700 Subject: [PATCH 3/6] fix: catch BaseException to handle StopAsyncIteration With async added, need to fix the catch Signed-off-by: Mark Sturdevant --- mellea/helpers/async_helpers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mellea/helpers/async_helpers.py b/mellea/helpers/async_helpers.py index 0618d54c1..592b71933 100644 --- a/mellea/helpers/async_helpers.py +++ b/mellea/helpers/async_helpers.py @@ -46,7 +46,12 @@ async def send_to_queue( # Typically, nothing awaits this function directly (only through the queue). # As a result, we have to be careful about catching all errors and propagating # them to the queue. - except Exception as e: + # Note: We catch BaseException to handle StopAsyncIteration which can leak + # from async generators in some Python versions/contexts. + except BaseException as e: + # Re-raise system-exiting exceptions + if isinstance(e, (SystemExit, KeyboardInterrupt)): + raise await aqueue.put(e) From f917d24b1a8d50ffccdeb3da4b9445d5a1070512 Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Fri, 3 Apr 2026 10:21:22 -0700 Subject: [PATCH 4/6] fix: function docstring for raises but has no Raises section Fix for -- Error: function raises but has no Raises section Signed-off-by: Mark Sturdevant --- mellea/helpers/async_helpers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mellea/helpers/async_helpers.py b/mellea/helpers/async_helpers.py index 592b71933..5e16f4284 100644 --- a/mellea/helpers/async_helpers.py +++ b/mellea/helpers/async_helpers.py @@ -25,6 +25,10 @@ async def send_to_queue( co: A coroutine or async iterator producing the backend response. aqueue: The async queue to send results to. A sentinel ``None`` is appended on completion; an exception instance is appended on error. + + Raises: + SystemExit: Re-raised if caught during processing. + KeyboardInterrupt: Re-raised if caught during processing. """ try: if isinstance(co, Coroutine): From ad9e72febd786a4faae3b462979c1152b8e8ee1d Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Tue, 7 Apr 2026 17:02:21 -0700 Subject: [PATCH 5/6] fix: more mapping and filtering of model options in m serve app * filter out model, n, user, and extra * comment the filtering * use a map and ModelOption.replace_keys() for mapping * fix replace_keys to no-op with from == to Signed-off-by: Mark Sturdevant --- cli/serve/app.py | 46 ++++++++++++++++---------------- mellea/backends/model_options.py | 4 +++ mellea/helpers/async_helpers.py | 11 +------- test/cli/test_serve.py | 3 ++- 4 files changed, 30 insertions(+), 34 deletions(-) diff --git a/cli/serve/app.py b/cli/serve/app.py index 713cc4331..7dc375beb 100644 --- a/cli/serve/app.py +++ b/cli/serve/app.py @@ -58,29 +58,29 @@ def make_chat_endpoint(module): """Makes a chat endpoint using a custom module.""" def _build_model_options(request: ChatCompletionRequest) -> dict: - """Build model_options dict, mapping OpenAI params to ModelOption sentinels.""" - model_options = {} - - # Map standard OpenAI parameters to ModelOption sentinels - if request.temperature is not None: - model_options[ModelOption.TEMPERATURE] = request.temperature - if request.max_tokens is not None: - model_options[ModelOption.MAX_NEW_TOKENS] = request.max_tokens - if request.seed is not None: - model_options[ModelOption.SEED] = request.seed - - # Include any other fields that aren't messages or requirements - for k, v in request.model_dump().items(): - if k not in [ - "messages", - "requirements", - "temperature", - "max_tokens", - "seed", - ]: - model_options[k] = v - - return model_options + """Build model_options dict from OpenAI-compatible request parameters.""" + excluded_fields = { + # Request structure fields (handled separately) + "messages", # Chat messages - passed separately to serve() + "requirements", # Mellea requirements - passed separately to serve() + # Routing/metadata fields (not generation parameters) + "model", # Model identifier - used for routing, not generation + "n", # Number of completions - not supported in Mellea's model_options + "user", # User tracking ID - metadata, not a generation parameter + "extra", # Pydantic's extra fields dict - unused (see model_config) + } + openai_to_model_option = { + "temperature": ModelOption.TEMPERATURE, + "max_tokens": ModelOption.MAX_NEW_TOKENS, + "seed": ModelOption.SEED, + } + + filtered_options = { + key: value + for key, value in request.model_dump().items() + if key not in excluded_fields + } + return ModelOption.replace_keys(filtered_options, openai_to_model_option) async def endpoint(request: ChatCompletionRequest): try: diff --git a/mellea/backends/model_options.py b/mellea/backends/model_options.py index 428dcab00..2350797ff 100644 --- a/mellea/backends/model_options.py +++ b/mellea/backends/model_options.py @@ -91,6 +91,10 @@ def replace_keys(options: dict, from_to: dict[str, str]) -> dict[str, Any]: # This will usually be a @@@<>@@@ ModelOption.<> key. new_key = from_to.get(old_key, None) if new_key: + # Skip if old_key and new_key are the same (no-op replacement) + if old_key == new_key: + continue + if new_options.get(new_key, None) is not None: # The key already has a value associated with it in the dict. Leave it be. conflict_log.append( diff --git a/mellea/helpers/async_helpers.py b/mellea/helpers/async_helpers.py index 5e16f4284..0618d54c1 100644 --- a/mellea/helpers/async_helpers.py +++ b/mellea/helpers/async_helpers.py @@ -25,10 +25,6 @@ async def send_to_queue( co: A coroutine or async iterator producing the backend response. aqueue: The async queue to send results to. A sentinel ``None`` is appended on completion; an exception instance is appended on error. - - Raises: - SystemExit: Re-raised if caught during processing. - KeyboardInterrupt: Re-raised if caught during processing. """ try: if isinstance(co, Coroutine): @@ -50,12 +46,7 @@ async def send_to_queue( # Typically, nothing awaits this function directly (only through the queue). # As a result, we have to be careful about catching all errors and propagating # them to the queue. - # Note: We catch BaseException to handle StopAsyncIteration which can leak - # from async generators in some Python versions/contexts. - except BaseException as e: - # Re-raise system-exiting exceptions - if isinstance(e, (SystemExit, KeyboardInterrupt)): - raise + except Exception as e: await aqueue.put(e) diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py index 5259756ac..fb609dc5f 100644 --- a/test/cli/test_serve.py +++ b/test/cli/test_serve.py @@ -136,7 +136,8 @@ async def test_model_options_passed_correctly(self, mock_module, sample_request) assert "model_options" in call_args.kwargs model_options = call_args.kwargs["model_options"] - # Should include ModelOption sentinels for temperature and max_tokens + # Should include ModelOption keys for temperature and max_tokens + # Note: TEMPERATURE is just "temperature" (not a sentinel), so it stays as-is assert ModelOption.TEMPERATURE in model_options assert model_options[ModelOption.TEMPERATURE] == 0.7 assert ModelOption.MAX_NEW_TOKENS in model_options From 9fc4444f04e193bd0e07bde2ce21726a3171f633 Mon Sep 17 00:00:00 2001 From: Mark Sturdevant Date: Tue, 7 Apr 2026 17:51:10 -0700 Subject: [PATCH 6/6] fix: improve tests per review * Use whole seconds for timing test so it should be less flakey * Return a real ModelOutputThunk instead of a misleading Mock * Use AsyncMock with side_effect for consistency Signed-off-by: Mark Sturdevant --- test/cli/test_serve_sync_async.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/test/cli/test_serve_sync_async.py b/test/cli/test_serve_sync_async.py index d767d14ea..8e0dab9f8 100644 --- a/test/cli/test_serve_sync_async.py +++ b/test/cli/test_serve_sync_async.py @@ -1,7 +1,7 @@ """Tests for sync/async serve function handling in m serve.""" import asyncio -from unittest.mock import Mock +from unittest.mock import AsyncMock, Mock import pytest @@ -20,9 +20,7 @@ def mock_sync_module(): def sync_serve(input, requirements=None, model_options=None): """Synchronous serve function.""" # Simulate some work - result = Mock(spec=ModelOutputThunk) - result.value = f"Sync response to: {input[-1].content}" - return result + return ModelOutputThunk(f"Sync response to: {input[-1].content}") # Use Mock to wrap the function so we can track calls module.serve = Mock(side_effect=sync_serve) @@ -39,11 +37,9 @@ async def async_serve(input, requirements=None, model_options=None): """Asynchronous serve function.""" # Simulate async work await asyncio.sleep(0.01) - result = Mock(spec=ModelOutputThunk) - result.value = f"Async response to: {input[-1].content}" - return result + return ModelOutputThunk(f"Async response to: {input[-1].content}") - module.serve = async_serve + module.serve = AsyncMock(side_effect=async_serve) return module @@ -57,10 +53,8 @@ def slow_sync_serve(input, requirements=None, model_options=None): """Slow synchronous serve function that would block event loop.""" import time - time.sleep(0.1) # Simulate blocking work - result = Mock(spec=ModelOutputThunk) - result.value = f"Slow sync response to: {input[-1].content}" - return result + time.sleep(1) # Simulate blocking work with a clearer timing signal + return ModelOutputThunk(f"Slow sync response to: {input[-1].content}") module.serve = slow_sync_serve return module @@ -123,11 +117,11 @@ async def test_slow_sync_does_not_block(self, mock_slow_sync_module): results = await asyncio.gather(endpoint(request), endpoint(request)) elapsed = time.time() - start - # If blocking: would take ~0.2s (0.1s + 0.1s sequentially) - # If non-blocking: should take ~0.1s (both run concurrently in threads) - # Allow some overhead, but should be much less than 0.2s - assert elapsed < 0.15, ( - f"Took {elapsed:.3f}s - appears to be blocking (expected ~0.1s)" + # If blocking: would take ~2s (1s + 1s sequentially) + # If non-blocking: should take ~1s (both run concurrently in threads) + # Allow some overhead, but should still be well below the blocking case. + assert elapsed < 2, ( + f"Took {elapsed:.3f}s - appears to be blocking (expected ~1s)" ) assert all( r.choices[0].message.content == "Slow sync response to: Hello slow!"