From b23e6ce1cf79bf7c593164b5f40c45ef1a8b0264 Mon Sep 17 00:00:00 2001 From: royalfig Date: Fri, 5 Jun 2026 15:46:00 -0400 Subject: [PATCH] Update starter to use new turn detection model --- README.md | 8 +++++--- pyproject.toml | 2 +- src/agent.py | 22 +++++++++------------- taskfile.yaml | 4 ++-- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d6a3c0c..d8980e7 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The starter project includes: - A voice AI pipeline built on [LiveKit Inference](https://docs.livekit.io/agents/models/inference) with [models](https://docs.livekit.io/agents/models) from OpenAI, Cartesia, and Deepgram. More than 50 other model providers are supported, including [Realtime models](https://docs.livekit.io/agents/models/realtime) - Eval suite based on the LiveKit Agents [testing & evaluation framework](https://docs.livekit.io/agents/start/testing/) -- [LiveKit Turn Detector](https://docs.livekit.io/agents/logic/turns/turn-detector/) for contextually-aware speaker detection, with multilingual support +- [LiveKit Turn Detector](https://docs.livekit.io/agents/logic/turns/turn-detector/), a multimodal end-of-turn model that listens to the user's audio directly, combining semantic understanding with acoustic cues for state-of-the-art accuracy across 14 languages - [Background voice cancellation](https://docs.livekit.io/transport/media/noise-cancellation/) - Deep session insights from LiveKit [Agent Observability](https://docs.livekit.io/deploy/observability/) - A Dockerfile ready for [production deployment to LiveKit Cloud](https://docs.livekit.io/deploy/agents/) @@ -92,12 +92,14 @@ lk app env -w -d .env.local ## Run the agent -Before your first run, you must download certain models such as [Silero VAD](https://docs.livekit.io/agents/logic/turns/vad/) and the [LiveKit turn detector](https://docs.livekit.io/agents/logic/turns/turn-detector/): +Before your first run, download the [ai-coustics noise cancellation](https://docs.livekit.io/transport/media/noise-cancellation/) model used by the agent: ```console -uv run python src/agent.py download-files +uv run --module livekit.agents download-files ``` +The [LiveKit turn detector](https://docs.livekit.io/agents/logic/turns/turn-detector/) and the agent's voice activity detection both run on [LiveKit Inference](https://docs.livekit.io/agents/models/inference) and are built into the Agents SDK, so they don't require a separate download. + Next, run this command to speak to your agent directly in your terminal: ```console diff --git a/pyproject.toml b/pyproject.toml index dab8f85..19ef737 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ description = "Simple voice AI assistant built with LiveKit Agents for Python" requires-python = ">=3.10, <3.15" dependencies = [ - "livekit-agents[silero,turn-detector]==1.5.17", + "livekit-agents==1.6.0", "livekit-plugins-ai-coustics~=0.2", "python-dotenv", ] diff --git a/src/agent.py b/src/agent.py index 1076905..8a74311 100644 --- a/src/agent.py +++ b/src/agent.py @@ -7,13 +7,12 @@ AgentServer, AgentSession, JobContext, - JobProcess, + TurnHandlingOptions, cli, inference, room_io, ) -from livekit.plugins import ai_coustics, silero -from livekit.plugins.turn_detector.multilingual import MultilingualModel +from livekit.plugins import ai_coustics logger = logging.getLogger("agent") @@ -92,13 +91,6 @@ def __init__(self) -> None: server = AgentServer() -def prewarm(proc: JobProcess): - proc.userdata["vad"] = silero.VAD.load() - - -server.setup_fnc = prewarm - - @server.rtc_session(agent_name="my-agent") async def my_agent(ctx: JobContext): # Logging setup @@ -117,10 +109,14 @@ async def my_agent(ctx: JobContext): tts=inference.TTS( model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc" ), - # VAD and turn detection are used to determine when the user is speaking and when the agent should respond + # The LiveKit turn detector determines when the user is done speaking and the agent should respond. + # AudioTurnDetector is a multimodal model that listens to the user's audio directly, combining + # semantic understanding with acoustic cues (intonation, pitch, rhythm) for state-of-the-art accuracy. + # AgentSession supplies the required VAD automatically. # See more at https://docs.livekit.io/agents/build/turns - turn_detection=MultilingualModel(), - vad=ctx.proc.userdata["vad"], + turn_handling=TurnHandlingOptions( + turn_detection=inference.AudioTurnDetector(), + ), # allow the LLM to generate a response while waiting for the end of turn # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation preemptive_generation=True, diff --git a/taskfile.yaml b/taskfile.yaml index 512bbc6..0be77e0 100644 --- a/taskfile.yaml +++ b/taskfile.yaml @@ -43,7 +43,7 @@ tasks: - echo '' - echo '{{ indent .INDENT "cd" }} {{ .REL_PATH }}' - echo '{{ indent .INDENT "uv sync" }}' - - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} download-files' + - echo '{{ indent .INDENT "uv run --module livekit.agents download-files" }}' - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} console' help_open_web_console: @@ -57,7 +57,7 @@ tasks: - echo '' - echo '{{ indent .INDENT "cd" }} {{ .REL_PATH }}' - echo '{{ indent .INDENT "uv sync" }}' - - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} download-files' + - echo '{{ indent .INDENT "uv run --module livekit.agents download-files" }}' - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} dev' - echo '' - echo 'Then visit:'