From 3c06c7742e663b2635d9b833cce89b2759586dda Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Wed, 1 Apr 2026 17:25:22 -0700
Subject: [PATCH 1/6] fix(cli): handle sync/async serve functions in m serve

Fixes sync/async mismatch in `m serve` by detecting function type and
handling appropriately:
- Async serve functions are awaited directly
- Sync serve functions are wrapped in asyncio.to_thread() to prevent
  blocking FastAPI's event loop

This ensures the server can handle concurrent requests efficiently
regardless of whether user-defined serve functions are sync or async.

Changes:
- cli/serve/app.py: Add asyncio/inspect imports, update make_chat_endpoint()
  to detect coroutine functions and wrap sync functions in to_thread()
- test/cli/test_serve_sync_async.py: Add comprehensive test suite (9 tests)
  including empirical timing test that proves non-blocking behavior

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 cli/serve/app.py                  |  35 +++--
 test/cli/test_serve_sync_async.py | 236 ++++++++++++++++++++++++++++++
 2 files changed, 262 insertions(+), 9 deletions(-)
 create mode 100644 test/cli/test_serve_sync_async.py

diff --git a/cli/serve/app.py b/cli/serve/app.py
index df9eeab76..4dfacb585 100644
--- a/cli/serve/app.py
+++ b/cli/serve/app.py
@@ -1,6 +1,8 @@
 """A simple app that runs an OpenAI compatible server wrapped around a M program."""
 
+import asyncio
 import importlib.util
+import inspect
 import os
 import sys
 import time
@@ -58,15 +60,30 @@ async def endpoint(request: ChatCompletionRequest):
             completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
             created_timestamp = int(time.time())
 
-            output = module.serve(
-                input=request.messages,
-                requirements=request.requirements,
-                model_options={
-                    k: v
-                    for k, v in request.model_dump().items()
-                    if k not in ["messages", "requirements"]
-                },
-            )
+            # Detect if serve is async or sync and handle accordingly
+            if inspect.iscoroutinefunction(module.serve):
+                # It's async, await it directly
+                output = await module.serve(
+                    input=request.messages,
+                    requirements=request.requirements,
+                    model_options={
+                        k: v
+                        for k, v in request.model_dump().items()
+                        if k not in ["messages", "requirements"]
+                    },
+                )
+            else:
+                # It's sync, run in thread pool to avoid blocking event loop
+                output = await asyncio.to_thread(
+                    module.serve,
+                    input=request.messages,
+                    requirements=request.requirements,
+                    model_options={
+                        k: v
+                        for k, v in request.model_dump().items()
+                        if k not in ["messages", "requirements"]
+                    },
+                )
 
             # Extract usage information from the ModelOutputThunk if available
             usage = None
diff --git a/test/cli/test_serve_sync_async.py b/test/cli/test_serve_sync_async.py
new file mode 100644
index 000000000..1e9ece80c
--- /dev/null
+++ b/test/cli/test_serve_sync_async.py
@@ -0,0 +1,236 @@
+"""Tests for sync/async serve function handling in m serve."""
+
+import asyncio
+from unittest.mock import Mock
+
+import pytest
+
+from cli.serve.app import make_chat_endpoint
+from cli.serve.models import ChatCompletionRequest, ChatMessage
+from mellea.core import ModelOutputThunk
+
+
+@pytest.fixture
+def mock_sync_module():
+    """Create a mock module with a synchronous serve function."""
+    module = Mock()
+    module.__name__ = "test_sync_module"
+
+    def sync_serve(input, requirements=None, model_options=None):
+        """Synchronous serve function."""
+        # Simulate some work
+        result = Mock(spec=ModelOutputThunk)
+        result.value = f"Sync response to: {input[-1].content}"
+        return result
+
+    # Use Mock to wrap the function so we can track calls
+    module.serve = Mock(side_effect=sync_serve)
+    return module
+
+
+@pytest.fixture
+def mock_async_module():
+    """Create a mock module with an asynchronous serve function."""
+    module = Mock()
+    module.__name__ = "test_async_module"
+
+    async def async_serve(input, requirements=None, model_options=None):
+        """Asynchronous serve function."""
+        # Simulate async work
+        await asyncio.sleep(0.01)
+        result = Mock(spec=ModelOutputThunk)
+        result.value = f"Async response to: {input[-1].content}"
+        return result
+
+    module.serve = async_serve
+    return module
+
+
+@pytest.fixture
+def mock_slow_sync_module():
+    """Create a mock module with a slow synchronous serve function."""
+    module = Mock()
+    module.__name__ = "test_slow_sync_module"
+
+    def slow_sync_serve(input, requirements=None, model_options=None):
+        """Slow synchronous serve function that would block event loop."""
+        import time
+
+        time.sleep(0.1)  # Simulate blocking work
+        result = Mock(spec=ModelOutputThunk)
+        result.value = f"Slow sync response to: {input[-1].content}"
+        return result
+
+    module.serve = slow_sync_serve
+    return module
+
+
+class TestSyncAsyncServeHandling:
+    """Test that serve handles both sync and async serve functions correctly."""
+
+    @pytest.mark.asyncio
+    async def test_sync_serve_function(self, mock_sync_module):
+        """Test that synchronous serve functions work correctly."""
+        endpoint = make_chat_endpoint(mock_sync_module)
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=[ChatMessage(role="user", content="Hello sync!")],
+        )
+
+        response = await endpoint(request)
+
+        assert response.choices[0].message.content == "Sync response to: Hello sync!"
+        assert response.model == "test-model"
+        assert response.object == "chat.completion"
+
+    @pytest.mark.asyncio
+    async def test_async_serve_function(self, mock_async_module):
+        """Test that asynchronous serve functions work correctly."""
+        endpoint = make_chat_endpoint(mock_async_module)
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=[ChatMessage(role="user", content="Hello async!")],
+        )
+
+        response = await endpoint(request)
+
+        assert response.choices[0].message.content == "Async response to: Hello async!"
+        assert response.model == "test-model"
+        assert response.object == "chat.completion"
+
+    @pytest.mark.asyncio
+    async def test_slow_sync_does_not_block(self, mock_slow_sync_module):
+        """Test that slow sync functions run in thread pool and don't block event loop.
+
+        This test verifies non-blocking behavior by measuring timing. If the sync
+        function blocked the event loop, two sequential calls would take 2x the time.
+        With proper threading, they should overlap and take only slightly more than 1x.
+        """
+        import time
+
+        endpoint = make_chat_endpoint(mock_slow_sync_module)
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=[ChatMessage(role="user", content="Hello slow!")],
+        )
+
+        # Time two concurrent requests
+        start = time.time()
+        results = await asyncio.gather(endpoint(request), endpoint(request))
+        elapsed = time.time() - start
+
+        # If blocking: would take ~0.2s (0.1s + 0.1s sequentially)
+        # If non-blocking: should take ~0.1s (both run concurrently in threads)
+        # Allow some overhead, but should be much less than 0.2s
+        assert elapsed < 0.15, (
+            f"Took {elapsed:.3f}s - appears to be blocking (expected ~0.1s)"
+        )
+        assert all(
+            r.choices[0].message.content == "Slow sync response to: Hello slow!"
+            for r in results
+        )
+
+    @pytest.mark.asyncio
+    async def test_concurrent_requests_with_sync_serve(self, mock_slow_sync_module):
+        """Test that multiple sync requests can be handled concurrently."""
+        endpoint = make_chat_endpoint(mock_slow_sync_module)
+
+        requests = [
+            ChatCompletionRequest(
+                model="test-model",
+                messages=[ChatMessage(role="user", content=f"Request {i}")],
+            )
+            for i in range(3)
+        ]
+
+        # Run requests concurrently
+        responses = await asyncio.gather(*[endpoint(req) for req in requests])
+
+        # All should complete successfully
+        assert len(responses) == 3
+        for i, response in enumerate(responses):
+            assert (
+                response.choices[0].message.content
+                == f"Slow sync response to: Request {i}"
+            )
+
+    @pytest.mark.asyncio
+    async def test_requirements_passed_to_serve(self, mock_sync_module):
+        """Test that requirements are correctly passed to serve function."""
+        endpoint = make_chat_endpoint(mock_sync_module)
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=[ChatMessage(role="user", content="Test")],
+            requirements=["req1", "req2"],
+        )
+
+        await endpoint(request)
+
+        # Verify serve was called with requirements
+        mock_sync_module.serve.assert_called_once()
+        call_kwargs = mock_sync_module.serve.call_args.kwargs
+        assert call_kwargs["requirements"] == ["req1", "req2"]
+
+    @pytest.mark.asyncio
+    async def test_model_options_passed_to_serve(self, mock_sync_module):
+        """Test that model options are correctly passed to serve function."""
+        endpoint = make_chat_endpoint(mock_sync_module)
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=[ChatMessage(role="user", content="Test")],
+            temperature=0.7,
+            max_tokens=100,
+        )
+
+        await endpoint(request)
+
+        # Verify serve was called with model_options
+        mock_sync_module.serve.assert_called_once()
+        call_kwargs = mock_sync_module.serve.call_args.kwargs
+        model_options = call_kwargs["model_options"]
+        assert "temperature" in model_options
+        assert "max_tokens" in model_options
+
+
+class TestEndpointIntegration:
+    """Integration tests for the full endpoint."""
+
+    def test_endpoint_name_set_correctly(self, mock_sync_module):
+        """Test that endpoint function name is set correctly."""
+        endpoint = make_chat_endpoint(mock_sync_module)
+        assert endpoint.__name__ == "chat_test_sync_module_endpoint"
+
+    @pytest.mark.asyncio
+    async def test_completion_id_generated(self, mock_sync_module):
+        """Test that each response gets a unique completion ID."""
+        endpoint = make_chat_endpoint(mock_sync_module)
+
+        request = ChatCompletionRequest(
+            model="test-model", messages=[ChatMessage(role="user", content="Test")]
+        )
+
+        response1 = await endpoint(request)
+        response2 = await endpoint(request)
+
+        assert response1.id.startswith("chatcmpl-")
+        assert response2.id.startswith("chatcmpl-")
+        assert response1.id != response2.id
+
+    @pytest.mark.asyncio
+    async def test_timestamp_generated(self, mock_sync_module):
+        """Test that response includes a timestamp."""
+        endpoint = make_chat_endpoint(mock_sync_module)
+
+        request = ChatCompletionRequest(
+            model="test-model", messages=[ChatMessage(role="user", content="Test")]
+        )
+
+        response = await endpoint(request)
+
+        assert isinstance(response.created, int)
+        assert response.created > 0

From a2c1958aed33c8eedcf782805ac25bb72b132b9d Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Wed, 1 Apr 2026 17:52:59 -0700
Subject: [PATCH 2/6] feat: Map OpenAI parameters to ModelOption sentinels:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

   - temperature → ModelOption.TEMPERATURE
   - max_tokens → ModelOption.MAX_NEW_TOKENS
   - seed → ModelOption.SEED

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 cli/serve/app.py                  | 41 +++++++++++++++++++++++--------
 test/cli/test_serve.py            | 12 +++++----
 test/cli/test_serve_sync_async.py | 29 ++++++++++++++++++++--
 3 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/cli/serve/app.py b/cli/serve/app.py
index 4dfacb585..713cc4331 100644
--- a/cli/serve/app.py
+++ b/cli/serve/app.py
@@ -13,6 +13,8 @@
 from fastapi import FastAPI
 from fastapi.responses import JSONResponse
 
+from mellea.backends.model_options import ModelOption
+
 from .models import (
     ChatCompletion,
     ChatCompletionMessage,
@@ -55,22 +57,45 @@ def create_openai_error_response(
 def make_chat_endpoint(module):
     """Makes a chat endpoint using a custom module."""
 
+    def _build_model_options(request: ChatCompletionRequest) -> dict:
+        """Build model_options dict, mapping OpenAI params to ModelOption sentinels."""
+        model_options = {}
+
+        # Map standard OpenAI parameters to ModelOption sentinels
+        if request.temperature is not None:
+            model_options[ModelOption.TEMPERATURE] = request.temperature
+        if request.max_tokens is not None:
+            model_options[ModelOption.MAX_NEW_TOKENS] = request.max_tokens
+        if request.seed is not None:
+            model_options[ModelOption.SEED] = request.seed
+
+        # Include any other fields that aren't messages or requirements
+        for k, v in request.model_dump().items():
+            if k not in [
+                "messages",
+                "requirements",
+                "temperature",
+                "max_tokens",
+                "seed",
+            ]:
+                model_options[k] = v
+
+        return model_options
+
     async def endpoint(request: ChatCompletionRequest):
         try:
             completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
             created_timestamp = int(time.time())
 
+            model_options = _build_model_options(request)
+
             # Detect if serve is async or sync and handle accordingly
             if inspect.iscoroutinefunction(module.serve):
                 # It's async, await it directly
                 output = await module.serve(
                     input=request.messages,
                     requirements=request.requirements,
-                    model_options={
-                        k: v
-                        for k, v in request.model_dump().items()
-                        if k not in ["messages", "requirements"]
-                    },
+                    model_options=model_options,
                 )
             else:
                 # It's sync, run in thread pool to avoid blocking event loop
@@ -78,11 +103,7 @@ async def endpoint(request: ChatCompletionRequest):
                     module.serve,
                     input=request.messages,
                     requirements=request.requirements,
-                    model_options={
-                        k: v
-                        for k, v in request.model_dump().items()
-                        if k not in ["messages", "requirements"]
-                    },
+                    model_options=model_options,
                 )
 
             # Extract usage information from the ModelOutputThunk if available
diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
index 584be0876..5259756ac 100644
--- a/test/cli/test_serve.py
+++ b/test/cli/test_serve.py
@@ -122,6 +122,8 @@ async def test_system_fingerprint_always_none(self, mock_module, sample_request)
     @pytest.mark.asyncio
     async def test_model_options_passed_correctly(self, mock_module, sample_request):
         """Test that model options are passed to serve function correctly."""
+        from mellea.backends.model_options import ModelOption
+
         mock_output = ModelOutputThunk("Test response")
         mock_module.serve.return_value = mock_output
 
@@ -134,11 +136,11 @@ async def test_model_options_passed_correctly(self, mock_module, sample_request)
         assert "model_options" in call_args.kwargs
         model_options = call_args.kwargs["model_options"]
 
-        # Should include temperature and max_tokens but not messages/requirements
-        assert "temperature" in model_options
-        assert model_options["temperature"] == 0.7
-        assert "max_tokens" in model_options
-        assert model_options["max_tokens"] == 100
+        # Should include ModelOption sentinels for temperature and max_tokens
+        assert ModelOption.TEMPERATURE in model_options
+        assert model_options[ModelOption.TEMPERATURE] == 0.7
+        assert ModelOption.MAX_NEW_TOKENS in model_options
+        assert model_options[ModelOption.MAX_NEW_TOKENS] == 100
         assert "messages" not in model_options
         assert "requirements" not in model_options
 
diff --git a/test/cli/test_serve_sync_async.py b/test/cli/test_serve_sync_async.py
index 1e9ece80c..d767d14ea 100644
--- a/test/cli/test_serve_sync_async.py
+++ b/test/cli/test_serve_sync_async.py
@@ -7,6 +7,7 @@
 
 from cli.serve.app import make_chat_endpoint
 from cli.serve.models import ChatCompletionRequest, ChatMessage
+from mellea.backends.model_options import ModelOption
 from mellea.core import ModelOutputThunk
 
 
@@ -193,8 +194,32 @@ async def test_model_options_passed_to_serve(self, mock_sync_module):
         mock_sync_module.serve.assert_called_once()
         call_kwargs = mock_sync_module.serve.call_args.kwargs
         model_options = call_kwargs["model_options"]
-        assert "temperature" in model_options
-        assert "max_tokens" in model_options
+        assert ModelOption.TEMPERATURE in model_options
+        assert ModelOption.MAX_NEW_TOKENS in model_options
+
+    @pytest.mark.asyncio
+    async def test_openai_params_mapped_to_model_options(self, mock_sync_module):
+        """Test that OpenAI parameters are mapped to ModelOption sentinels."""
+        endpoint = make_chat_endpoint(mock_sync_module)
+
+        request = ChatCompletionRequest(
+            model="test-model",
+            messages=[ChatMessage(role="user", content="Test")],
+            temperature=0.8,
+            max_tokens=150,
+            seed=42,
+        )
+
+        await endpoint(request)
+
+        # Verify parameters are mapped correctly
+        mock_sync_module.serve.assert_called_once()
+        call_kwargs = mock_sync_module.serve.call_args.kwargs
+        model_options = call_kwargs["model_options"]
+
+        assert model_options[ModelOption.TEMPERATURE] == 0.8
+        assert model_options[ModelOption.MAX_NEW_TOKENS] == 150
+        assert model_options[ModelOption.SEED] == 42
 
 
 class TestEndpointIntegration:

From 73a040fb5e31ecf92e6ee6d50492635e459a5e6f Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Fri, 3 Apr 2026 08:29:29 -0700
Subject: [PATCH 3/6] fix: catch BaseException to handle StopAsyncIteration

With async added, need to fix the catch

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 mellea/helpers/async_helpers.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mellea/helpers/async_helpers.py b/mellea/helpers/async_helpers.py
index 0618d54c1..592b71933 100644
--- a/mellea/helpers/async_helpers.py
+++ b/mellea/helpers/async_helpers.py
@@ -46,7 +46,12 @@ async def send_to_queue(
     # Typically, nothing awaits this function directly (only through the queue).
     # As a result, we have to be careful about catching all errors and propagating
     # them to the queue.
-    except Exception as e:
+    # Note: We catch BaseException to handle StopAsyncIteration which can leak
+    # from async generators in some Python versions/contexts.
+    except BaseException as e:
+        # Re-raise system-exiting exceptions
+        if isinstance(e, (SystemExit, KeyboardInterrupt)):
+            raise
         await aqueue.put(e)
 
 

From f917d24b1a8d50ffccdeb3da4b9445d5a1070512 Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Fri, 3 Apr 2026 10:21:22 -0700
Subject: [PATCH 4/6] fix: function docstring for raises but has no Raises
 section

Fix for -- Error: function raises but has no Raises section

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 mellea/helpers/async_helpers.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mellea/helpers/async_helpers.py b/mellea/helpers/async_helpers.py
index 592b71933..5e16f4284 100644
--- a/mellea/helpers/async_helpers.py
+++ b/mellea/helpers/async_helpers.py
@@ -25,6 +25,10 @@ async def send_to_queue(
         co: A coroutine or async iterator producing the backend response.
         aqueue: The async queue to send results to. A sentinel ``None`` is appended on
             completion; an exception instance is appended on error.
+
+    Raises:
+        SystemExit: Re-raised if caught during processing.
+        KeyboardInterrupt: Re-raised if caught during processing.
     """
     try:
         if isinstance(co, Coroutine):

From ad9e72febd786a4faae3b462979c1152b8e8ee1d Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Tue, 7 Apr 2026 17:02:21 -0700
Subject: [PATCH 5/6] fix: more mapping and filtering of model options in m
 serve app

* filter out model, n, user, and extra
* comment the filtering
* use a map and ModelOption.replace_keys() for mapping
* fix replace_keys to no-op with from == to

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 cli/serve/app.py                 | 46 ++++++++++++++++----------------
 mellea/backends/model_options.py |  4 +++
 mellea/helpers/async_helpers.py  | 11 +-------
 test/cli/test_serve.py           |  3 ++-
 4 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/cli/serve/app.py b/cli/serve/app.py
index 713cc4331..7dc375beb 100644
--- a/cli/serve/app.py
+++ b/cli/serve/app.py
@@ -58,29 +58,29 @@ def make_chat_endpoint(module):
     """Makes a chat endpoint using a custom module."""
 
     def _build_model_options(request: ChatCompletionRequest) -> dict:
-        """Build model_options dict, mapping OpenAI params to ModelOption sentinels."""
-        model_options = {}
-
-        # Map standard OpenAI parameters to ModelOption sentinels
-        if request.temperature is not None:
-            model_options[ModelOption.TEMPERATURE] = request.temperature
-        if request.max_tokens is not None:
-            model_options[ModelOption.MAX_NEW_TOKENS] = request.max_tokens
-        if request.seed is not None:
-            model_options[ModelOption.SEED] = request.seed
-
-        # Include any other fields that aren't messages or requirements
-        for k, v in request.model_dump().items():
-            if k not in [
-                "messages",
-                "requirements",
-                "temperature",
-                "max_tokens",
-                "seed",
-            ]:
-                model_options[k] = v
-
-        return model_options
+        """Build model_options dict from OpenAI-compatible request parameters."""
+        excluded_fields = {
+            # Request structure fields (handled separately)
+            "messages",  # Chat messages - passed separately to serve()
+            "requirements",  # Mellea requirements - passed separately to serve()
+            # Routing/metadata fields (not generation parameters)
+            "model",  # Model identifier - used for routing, not generation
+            "n",  # Number of completions - not supported in Mellea's model_options
+            "user",  # User tracking ID - metadata, not a generation parameter
+            "extra",  # Pydantic's extra fields dict - unused (see model_config)
+        }
+        openai_to_model_option = {
+            "temperature": ModelOption.TEMPERATURE,
+            "max_tokens": ModelOption.MAX_NEW_TOKENS,
+            "seed": ModelOption.SEED,
+        }
+
+        filtered_options = {
+            key: value
+            for key, value in request.model_dump().items()
+            if key not in excluded_fields
+        }
+        return ModelOption.replace_keys(filtered_options, openai_to_model_option)
 
     async def endpoint(request: ChatCompletionRequest):
         try:
diff --git a/mellea/backends/model_options.py b/mellea/backends/model_options.py
index 428dcab00..2350797ff 100644
--- a/mellea/backends/model_options.py
+++ b/mellea/backends/model_options.py
@@ -91,6 +91,10 @@ def replace_keys(options: dict, from_to: dict[str, str]) -> dict[str, Any]:
             # This will usually be a @@@<>@@@ ModelOption.<> key.
             new_key = from_to.get(old_key, None)
             if new_key:
+                # Skip if old_key and new_key are the same (no-op replacement)
+                if old_key == new_key:
+                    continue
+
                 if new_options.get(new_key, None) is not None:
                     # The key already has a value associated with it in the dict. Leave it be.
                     conflict_log.append(
diff --git a/mellea/helpers/async_helpers.py b/mellea/helpers/async_helpers.py
index 5e16f4284..0618d54c1 100644
--- a/mellea/helpers/async_helpers.py
+++ b/mellea/helpers/async_helpers.py
@@ -25,10 +25,6 @@ async def send_to_queue(
         co: A coroutine or async iterator producing the backend response.
         aqueue: The async queue to send results to. A sentinel ``None`` is appended on
             completion; an exception instance is appended on error.
-
-    Raises:
-        SystemExit: Re-raised if caught during processing.
-        KeyboardInterrupt: Re-raised if caught during processing.
     """
     try:
         if isinstance(co, Coroutine):
@@ -50,12 +46,7 @@ async def send_to_queue(
     # Typically, nothing awaits this function directly (only through the queue).
     # As a result, we have to be careful about catching all errors and propagating
     # them to the queue.
-    # Note: We catch BaseException to handle StopAsyncIteration which can leak
-    # from async generators in some Python versions/contexts.
-    except BaseException as e:
-        # Re-raise system-exiting exceptions
-        if isinstance(e, (SystemExit, KeyboardInterrupt)):
-            raise
+    except Exception as e:
         await aqueue.put(e)
 
 
diff --git a/test/cli/test_serve.py b/test/cli/test_serve.py
index 5259756ac..fb609dc5f 100644
--- a/test/cli/test_serve.py
+++ b/test/cli/test_serve.py
@@ -136,7 +136,8 @@ async def test_model_options_passed_correctly(self, mock_module, sample_request)
         assert "model_options" in call_args.kwargs
         model_options = call_args.kwargs["model_options"]
 
-        # Should include ModelOption sentinels for temperature and max_tokens
+        # Should include ModelOption keys for temperature and max_tokens
+        # Note: TEMPERATURE is just "temperature" (not a sentinel), so it stays as-is
         assert ModelOption.TEMPERATURE in model_options
         assert model_options[ModelOption.TEMPERATURE] == 0.7
         assert ModelOption.MAX_NEW_TOKENS in model_options

From 9fc4444f04e193bd0e07bde2ce21726a3171f633 Mon Sep 17 00:00:00 2001
From: Mark Sturdevant <mark.sturdevant@ibm.com>
Date: Tue, 7 Apr 2026 17:51:10 -0700
Subject: [PATCH 6/6] fix: improve tests per review

* Use whole seconds for timing test so it should be less flakey
* Return a real ModelOutputThunk instead of a misleading Mock
* Use AsyncMock with side_effect for consistency

Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
---
 test/cli/test_serve_sync_async.py | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/test/cli/test_serve_sync_async.py b/test/cli/test_serve_sync_async.py
index d767d14ea..8e0dab9f8 100644
--- a/test/cli/test_serve_sync_async.py
+++ b/test/cli/test_serve_sync_async.py
@@ -1,7 +1,7 @@
 """Tests for sync/async serve function handling in m serve."""
 
 import asyncio
-from unittest.mock import Mock
+from unittest.mock import AsyncMock, Mock
 
 import pytest
 
@@ -20,9 +20,7 @@ def mock_sync_module():
     def sync_serve(input, requirements=None, model_options=None):
         """Synchronous serve function."""
         # Simulate some work
-        result = Mock(spec=ModelOutputThunk)
-        result.value = f"Sync response to: {input[-1].content}"
-        return result
+        return ModelOutputThunk(f"Sync response to: {input[-1].content}")
 
     # Use Mock to wrap the function so we can track calls
     module.serve = Mock(side_effect=sync_serve)
@@ -39,11 +37,9 @@ async def async_serve(input, requirements=None, model_options=None):
         """Asynchronous serve function."""
         # Simulate async work
         await asyncio.sleep(0.01)
-        result = Mock(spec=ModelOutputThunk)
-        result.value = f"Async response to: {input[-1].content}"
-        return result
+        return ModelOutputThunk(f"Async response to: {input[-1].content}")
 
-    module.serve = async_serve
+    module.serve = AsyncMock(side_effect=async_serve)
     return module
 
 
@@ -57,10 +53,8 @@ def slow_sync_serve(input, requirements=None, model_options=None):
         """Slow synchronous serve function that would block event loop."""
         import time
 
-        time.sleep(0.1)  # Simulate blocking work
-        result = Mock(spec=ModelOutputThunk)
-        result.value = f"Slow sync response to: {input[-1].content}"
-        return result
+        time.sleep(1)  # Simulate blocking work with a clearer timing signal
+        return ModelOutputThunk(f"Slow sync response to: {input[-1].content}")
 
     module.serve = slow_sync_serve
     return module
@@ -123,11 +117,11 @@ async def test_slow_sync_does_not_block(self, mock_slow_sync_module):
         results = await asyncio.gather(endpoint(request), endpoint(request))
         elapsed = time.time() - start
 
-        # If blocking: would take ~0.2s (0.1s + 0.1s sequentially)
-        # If non-blocking: should take ~0.1s (both run concurrently in threads)
-        # Allow some overhead, but should be much less than 0.2s
-        assert elapsed < 0.15, (
-            f"Took {elapsed:.3f}s - appears to be blocking (expected ~0.1s)"
+        # If blocking: would take ~2s (1s + 1s sequentially)
+        # If non-blocking: should take ~1s (both run concurrently in threads)
+        # Allow some overhead, but should still be well below the blocking case.
+        assert elapsed < 2, (
+            f"Took {elapsed:.3f}s - appears to be blocking (expected ~1s)"
         )
         assert all(
             r.choices[0].message.content == "Slow sync response to: Hello slow!"