From b23e6ce1cf79bf7c593164b5f40c45ef1a8b0264 Mon Sep 17 00:00:00 2001
From: royalfig <ryan.feigenbaum@gmail.com>
Date: Fri, 5 Jun 2026 15:46:00 -0400
Subject: [PATCH] Update starter to use new turn detection model

---
 README.md      |  8 +++++---
 pyproject.toml |  2 +-
 src/agent.py   | 22 +++++++++-------------
 taskfile.yaml  |  4 ++--
 4 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index d6a3c0c..d8980e7 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The starter project includes:
 - A voice AI pipeline built on [LiveKit Inference](https://docs.livekit.io/agents/models/inference)
   with [models](https://docs.livekit.io/agents/models) from OpenAI, Cartesia, and Deepgram. More than 50 other model providers are supported, including [Realtime models](https://docs.livekit.io/agents/models/realtime)
 - Eval suite based on the LiveKit Agents [testing & evaluation framework](https://docs.livekit.io/agents/start/testing/)
-- [LiveKit Turn Detector](https://docs.livekit.io/agents/logic/turns/turn-detector/) for contextually-aware speaker detection, with multilingual support
+- [LiveKit Turn Detector](https://docs.livekit.io/agents/logic/turns/turn-detector/), a multimodal end-of-turn model that listens to the user's audio directly, combining semantic understanding with acoustic cues for state-of-the-art accuracy across 14 languages
 - [Background voice cancellation](https://docs.livekit.io/transport/media/noise-cancellation/)
 - Deep session insights from LiveKit [Agent Observability](https://docs.livekit.io/deploy/observability/)
 - A Dockerfile ready for [production deployment to LiveKit Cloud](https://docs.livekit.io/deploy/agents/)
@@ -92,12 +92,14 @@ lk app env -w -d .env.local
 
 ## Run the agent
 
-Before your first run, you must download certain models such as [Silero VAD](https://docs.livekit.io/agents/logic/turns/vad/) and the [LiveKit turn detector](https://docs.livekit.io/agents/logic/turns/turn-detector/):
+Before your first run, download the [ai-coustics noise cancellation](https://docs.livekit.io/transport/media/noise-cancellation/) model used by the agent:
 
 ```console
-uv run python src/agent.py download-files
+uv run --module livekit.agents download-files
 ```
 
+The [LiveKit turn detector](https://docs.livekit.io/agents/logic/turns/turn-detector/) and the agent's voice activity detection (VAD) are built into the Agents SDK and run on [LiveKit Inference](https://docs.livekit.io/agents/models/inference), so neither requires a separate download.
+
 Next, run this command to speak to your agent directly in your terminal:
 
 ```console
diff --git a/pyproject.toml b/pyproject.toml
index dab8f85..19ef737 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ description = "Simple voice AI assistant built with LiveKit Agents for Python"
 requires-python = ">=3.10, <3.15"
 
 dependencies = [
-    "livekit-agents[silero,turn-detector]==1.5.17",
+    "livekit-agents==1.6.0",
     "livekit-plugins-ai-coustics~=0.2",
     "python-dotenv",
 ]
diff --git a/src/agent.py b/src/agent.py
index 1076905..8a74311 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -7,13 +7,12 @@
     AgentServer,
     AgentSession,
     JobContext,
-    JobProcess,
+    TurnHandlingOptions,
     cli,
     inference,
     room_io,
 )
-from livekit.plugins import ai_coustics, silero
-from livekit.plugins.turn_detector.multilingual import MultilingualModel
+from livekit.plugins import ai_coustics
 
 logger = logging.getLogger("agent")
 
@@ -92,13 +91,6 @@ def __init__(self) -> None:
 server = AgentServer()
 
 
-def prewarm(proc: JobProcess):
-    proc.userdata["vad"] = silero.VAD.load()
-
-
-server.setup_fnc = prewarm
-
-
 @server.rtc_session(agent_name="my-agent")
 async def my_agent(ctx: JobContext):
     # Logging setup
@@ -117,10 +109,14 @@ async def my_agent(ctx: JobContext):
         tts=inference.TTS(
             model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
         ),
-        # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
+        # The LiveKit turn detector decides when the user has finished speaking and the agent should respond.
+        # AudioTurnDetector is a multimodal model that listens to the user's audio directly, combining
+        # semantic understanding with acoustic cues (intonation, pitch, rhythm) for state-of-the-art accuracy.
+        # AgentSession supplies the required VAD automatically.
         # See more at https://docs.livekit.io/agents/build/turns
-        turn_detection=MultilingualModel(),
-        vad=ctx.proc.userdata["vad"],
+        turn_handling=TurnHandlingOptions(
+            turn_detection=inference.AudioTurnDetector(),
+        ),
         # allow the LLM to generate a response while waiting for the end of turn
         # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
         preemptive_generation=True,
diff --git a/taskfile.yaml b/taskfile.yaml
index 512bbc6..0be77e0 100644
--- a/taskfile.yaml
+++ b/taskfile.yaml
@@ -43,7 +43,7 @@ tasks:
       - echo ''
       - echo '{{ indent .INDENT "cd" }} {{ .REL_PATH }}'
       - echo '{{ indent .INDENT "uv sync" }}'
-      - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} download-files'
+      - echo '{{ indent .INDENT "uv run --module livekit.agents download-files" }}'
       - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} console'
 
   help_open_web_console:
@@ -57,7 +57,7 @@ tasks:
       - echo ''
       - echo '{{ indent .INDENT "cd" }} {{ .REL_PATH }}'
       - echo '{{ indent .INDENT "uv sync" }}'
-      - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} download-files'
+      - echo '{{ indent .INDENT "uv run --module livekit.agents download-files" }}'
       - echo '{{ indent .INDENT "uv run" }} {{ .PYTHON_MAIN }} dev'
       - echo ''
       - echo 'Then visit:'