From b56149c761952ea01d516ef783689eb033357fb4 Mon Sep 17 00:00:00 2001 From: Jinni Gu Date: Tue, 28 Oct 2025 20:29:28 -0700 Subject: [PATCH] Add live audio single-agent support and tutorial for inputAudioTranscription --- .../java/com/google/adk/runner/Runner.java | 4 +- .../runner/InputAudioTranscriptionTest.java | 55 ++++++++- pom.xml | 1 + tutorials/live-audio-single-agent/README.md | 64 +++++++++++ tutorials/live-audio-single-agent/pom.xml | 52 +++++++++ .../adk/tutorials/LiveAudioSingleAgent.java | 107 ++++++++++++++++++ 6 files changed, 277 insertions(+), 6 deletions(-) create mode 100644 tutorials/live-audio-single-agent/README.md create mode 100644 tutorials/live-audio-single-agent/pom.xml create mode 100644 tutorials/live-audio-single-agent/src/main/java/com/google/adk/tutorials/LiveAudioSingleAgent.java diff --git a/core/src/main/java/com/google/adk/runner/Runner.java b/core/src/main/java/com/google/adk/runner/Runner.java index 19e6a06f9..08a2bb50b 100644 --- a/core/src/main/java/com/google/adk/runner/Runner.java +++ b/core/src/main/java/com/google/adk/runner/Runner.java @@ -397,9 +397,7 @@ private void copySessionStates(Session source, Session target) { private InvocationContext newInvocationContextForLive( Session session, Optional liveRequestQueue, RunConfig runConfig) { RunConfig.Builder runConfigBuilder = RunConfig.builder(runConfig); - if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty()) { - // Parity with Python: apply modality defaults and transcription settings - // only for multi-agent live scenarios. + if (liveRequestQueue.isPresent()) { // Default to AUDIO modality if not specified. 
if (CollectionUtils.isNullOrEmpty(runConfig.responseModalities())) { runConfigBuilder.setResponseModalities( diff --git a/core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java b/core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java index 5719355f1..55d41916f 100644 --- a/core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java +++ b/core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java @@ -57,8 +57,7 @@ private InvocationContext invokeNewInvocationContextForLive( } @Test - public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTranscription() - throws Exception { + public void newInvocationContextForLive_autoConfiguresInputAudioTranscription() throws Exception { TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response"))); LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build(); LlmAgent rootAgent = @@ -86,7 +85,7 @@ public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTrans } @Test - public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTranscription() + public void newInvocationContextForLive_multiAgent_preservesUserInputAudioTranscription() throws Exception { TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response"))); LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build(); @@ -113,4 +112,54 @@ public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTr assertThat(context.runConfig().inputAudioTranscription()).isSameInstanceAs(userConfig); } + + @Test + public void newInvocationContextForLive_singleAgent_autoConfiguresInputAudioTranscription() + throws Exception { + TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response"))); + // Single agent with NO sub-agents + LlmAgent singleAgent = createTestAgentBuilder(testLlm).name("weather_agent").build(); + + Runner runner = new InMemoryRunner(singleAgent, 
"test", ImmutableList.of()); + Session session = runner.sessionService().createSession("test", "user").blockingGet(); + + RunConfig initialConfig = + RunConfig.builder() + .setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO))) + .setStreamingMode(RunConfig.StreamingMode.BIDI) + .build(); + + assertThat(initialConfig.inputAudioTranscription()).isNull(); + + LiveRequestQueue liveQueue = new LiveRequestQueue(); + InvocationContext context = + invokeNewInvocationContextForLive(runner, session, liveQueue, initialConfig); + + assertThat(context.runConfig().inputAudioTranscription()).isNotNull(); + } + + @Test + public void newInvocationContextForLive_singleAgent_preservesUserInputAudioTranscription() + throws Exception { + TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response"))); + // Single agent with NO sub-agents + LlmAgent singleAgent = createTestAgentBuilder(testLlm).name("weather_agent").build(); + + Runner runner = new InMemoryRunner(singleAgent, "test", ImmutableList.of()); + Session session = runner.sessionService().createSession("test", "user").blockingGet(); + + AudioTranscriptionConfig userConfig = AudioTranscriptionConfig.builder().build(); + RunConfig configWithUserSetting = + RunConfig.builder() + .setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO))) + .setStreamingMode(RunConfig.StreamingMode.BIDI) + .setInputAudioTranscription(userConfig) + .build(); + + LiveRequestQueue liveQueue = new LiveRequestQueue(); + InvocationContext context = + invokeNewInvocationContextForLive(runner, session, liveQueue, configWithUserSetting); + + assertThat(context.runConfig().inputAudioTranscription()).isSameInstanceAs(userConfig); + } } diff --git a/pom.xml b/pom.xml index d174f6ad8..b431a00e7 100644 --- a/pom.xml +++ b/pom.xml @@ -31,6 +31,7 @@ contrib/langchain4j contrib/samples tutorials/city-time-weather + tutorials/live-audio-single-agent a2a a2a/webservice diff --git 
a/tutorials/live-audio-single-agent/README.md b/tutorials/live-audio-single-agent/README.md
new file mode 100644
index 000000000..50626fa1c
--- /dev/null
+++ b/tutorials/live-audio-single-agent/README.md
@@ -0,0 +1,64 @@
+# Live Audio Single-Agent
+
+A tutorial demonstrating how the ADK (Agent Development Kit) automatically configures **inputAudioTranscription** and **outputAudioTranscription** for single-agent live scenarios. This tutorial showcases that the feature now works for all live scenarios, not just multi-agent scenarios.
+
+## What This Demonstrates
+
+This tutorial verifies the feature change in `Runner.java` that enables automatic transcription configuration for all live scenarios:
+
+**Before:** Only multi-agent scenarios got automatic transcription
+```java
+if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty())
+```
+
+**After:** All live scenarios (including single-agent) get automatic transcription
+```java
+if (liveRequestQueue.isPresent())
+```
+
+When you use this single agent with live audio, the ADK automatically configures:
+- **inputAudioTranscription** - Transcribes user speech to text
+- **outputAudioTranscription** - Transcribes agent speech to text
+
+## Set Up the API Key
+
+```shell
+export GOOGLE_GENAI_API_KEY={YOUR-KEY}
+```
+
+## Go to Tutorial Directory
+
+```shell
+cd tutorials/live-audio-single-agent
+```
+
+## Running the Agent
+
+Start the server:
+
+```shell
+mvn exec:java
+```
+
+This starts the ADK web server with a single weather agent (`weather_agent`) that supports live audio using the `gemini-2.0-flash-live-001` model.
+
+## Usage
+
+Once running, you can interact with the agent through:
+- **Web interface:** `http://localhost:8080`
+- **Agent name:** `weather_agent`
+- **Try asking:** "What's the weather in Tokyo?" or "How's the weather in New York?"
+
+### Testing with Live Audio
+
+1. Open the web interface at `http://localhost:8080`
+2. Enable your microphone
+3.
Speak to the agent: "What's the weather in Tokyo?" +4. The agent will: + - Automatically transcribe your speech to text (inputAudioTranscription) + - Process the request and call the `getWeather` tool + - Respond with audio (automatically transcribed via outputAudioTranscription) + +## Learn More + +See https://google.github.io/adk-docs/get-started/quickstart/#java for more information. diff --git a/tutorials/live-audio-single-agent/pom.xml b/tutorials/live-audio-single-agent/pom.xml new file mode 100644 index 000000000..7c08cd755 --- /dev/null +++ b/tutorials/live-audio-single-agent/pom.xml @@ -0,0 +1,52 @@ + + + + 4.0.0 + + + com.google.adk + google-adk-parent + 0.3.1-SNAPSHOT + ../../pom.xml + + + google-adk-tutorials-live-audio-single-agent + Agent Development Kit - Tutorial: Live Audio Single-Agent + + + com.google.adk.tutorials.LiveAudioSingleAgent + + + + + com.google.adk + google-adk-dev + ${project.version} + + + ch.qos.logback + logback-classic + + + + + org.slf4j + slf4j-simple + + + + diff --git a/tutorials/live-audio-single-agent/src/main/java/com/google/adk/tutorials/LiveAudioSingleAgent.java b/tutorials/live-audio-single-agent/src/main/java/com/google/adk/tutorials/LiveAudioSingleAgent.java new file mode 100644 index 000000000..85e8914a7 --- /dev/null +++ b/tutorials/live-audio-single-agent/src/main/java/com/google/adk/tutorials/LiveAudioSingleAgent.java @@ -0,0 +1,107 @@ +/* + * Copyright 2025 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.adk.tutorials; + +import com.google.adk.agents.BaseAgent; +import com.google.adk.agents.LlmAgent; +import com.google.adk.tools.Annotations.Schema; +import com.google.adk.tools.FunctionTool; +import com.google.adk.web.AdkWebServer; +import java.util.Map; + +public class LiveAudioSingleAgent { + + public static final BaseAgent WEATHER_AGENT = + LlmAgent.builder() + .name("weather_agent") + .model("gemini-2.0-flash-live-001") + .description("A helpful weather assistant that provides weather information.") + .instruction( + "You are a friendly weather assistant. When users ask about weather, " + + "you MUST call the getWeather tool with the location name. " + + "Extract the location from the user's question. " + + "ALWAYS use the getWeather tool to get accurate information - never make up weather data. " + + "After getting the tool result, provide a friendly and descriptive response. " + + "For general conversation or greetings, respond naturally and helpfully. 
" + + "Do NOT use code execution for anything.") + .tools(FunctionTool.create(LiveAudioSingleAgent.class, "getWeather")) + .build(); + + public static Map getWeather( + @Schema(name = "location", description = "The location for which to retrieve weather") + String location) { + + Map> weatherData = + Map.of( + "new york", + Map.of( + "status", + "success", + "temperature", + "72°F (22°C)", + "condition", + "Partly cloudy", + "report", + "The weather in New York is partly cloudy with a temperature of 72°F (22°C)."), + "london", + Map.of( + "status", + "success", + "temperature", + "59°F (15°C)", + "condition", + "Rainy", + "report", + "The weather in London is rainy with a temperature of 59°F (15°C)."), + "tokyo", + Map.of( + "status", + "success", + "temperature", + "68°F (20°C)", + "condition", + "Clear", + "report", + "The weather in Tokyo is clear with a temperature of 68°F (20°C)."), + "sydney", + Map.of( + "status", + "success", + "temperature", + "77°F (25°C)", + "condition", + "Sunny", + "report", + "The weather in Sydney is sunny with a temperature of 77°F (25°C).")); + + String normalizedLocation = location.toLowerCase().trim(); + + return weatherData.getOrDefault( + normalizedLocation, + Map.of( + "status", + "error", + "report", + String.format( + "Weather information for '%s' is not available. Try New York, London, Tokyo, or" + + " Sydney.", + location))); + } + + public static void main(String[] args) { + AdkWebServer.start(WEATHER_AGENT); + } +}