Add live audio single-agent support and tutorial for inputAudioTranscription

jinnigu · jinnigu · commit b56149c76195 · 2025-10-29T22:28:55.000-07:00
diff --git a/core/src/main/java/com/google/adk/runner/Runner.java b/core/src/main/java/com/google/adk/runner/Runner.java
@@ -397,9 +397,7 @@ private void copySessionStates(Session source, Session target) {
   private InvocationContext newInvocationContextForLive(
       Session session, Optional<LiveRequestQueue> liveRequestQueue, RunConfig runConfig) {
     RunConfig.Builder runConfigBuilder = RunConfig.builder(runConfig);
-    if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty()) {
-      // Parity with Python: apply modality defaults and transcription settings
-      // only for multi-agent live scenarios.
+    if (liveRequestQueue.isPresent()) {
       // Default to AUDIO modality if not specified.
       if (CollectionUtils.isNullOrEmpty(runConfig.responseModalities())) {
         runConfigBuilder.setResponseModalities(
diff --git a/core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java b/core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java
@@ -57,8 +57,7 @@ private InvocationContext invokeNewInvocationContextForLive(
   }
 
   @Test
-  public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTranscription()
-      throws Exception {
+  public void newInvocationContextForLive_autoConfiguresInputAudioTranscription() throws Exception {
     TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
     LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build();
     LlmAgent rootAgent =
@@ -86,7 +85,7 @@ public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTrans
   }
 
   @Test
-  public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTranscription()
+  public void newInvocationContextForLive_multiAgent_preservesUserInputAudioTranscription()
       throws Exception {
     TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
     LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build();
@@ -113,4 +112,54 @@ public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTr
 
     assertThat(context.runConfig().inputAudioTranscription()).isSameInstanceAs(userConfig);
   }
+
+  @Test
+  public void newInvocationContextForLive_singleAgent_autoConfiguresInputAudioTranscription()
+      throws Exception {
+    // Arrange: a root agent that is built without any sub-agents.
+    TestLlm llm = createTestLlm(createLlmResponse(createContent("response")));
+    LlmAgent rootOnlyAgent = createTestAgentBuilder(llm).name("weather_agent").build();
+    Runner liveRunner = new InMemoryRunner(rootOnlyAgent, "test", ImmutableList.of());
+    Session liveSession = liveRunner.sessionService().createSession("test", "user").blockingGet();
+
+    // The caller asks for audio over BIDI streaming but leaves input transcription unset.
+    RunConfig.Builder requested = RunConfig.builder();
+    requested.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)));
+    requested.setStreamingMode(RunConfig.StreamingMode.BIDI);
+    RunConfig withoutTranscription = requested.build();
+    assertThat(withoutTranscription.inputAudioTranscription()).isNull();
+
+    // Act: build the live invocation context through the test helper.
+    InvocationContext liveContext =
+        invokeNewInvocationContextForLive(
+            liveRunner, liveSession, new LiveRequestQueue(), withoutTranscription);
+
+    // Assert: a transcription config is present even though the caller never set one.
+    assertThat(liveContext.runConfig().inputAudioTranscription()).isNotNull();
+  }
+
+  @Test
+  public void newInvocationContextForLive_singleAgent_preservesUserInputAudioTranscription()
+      throws Exception {
+    // Arrange: a root agent that is built without any sub-agents.
+    TestLlm llm = createTestLlm(createLlmResponse(createContent("response")));
+    LlmAgent rootOnlyAgent = createTestAgentBuilder(llm).name("weather_agent").build();
+    Runner liveRunner = new InMemoryRunner(rootOnlyAgent, "test", ImmutableList.of());
+    Session liveSession = liveRunner.sessionService().createSession("test", "user").blockingGet();
+
+    // The caller supplies its own transcription config alongside audio over BIDI streaming.
+    AudioTranscriptionConfig callerConfig = AudioTranscriptionConfig.builder().build();
+    RunConfig.Builder requested = RunConfig.builder();
+    requested.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)));
+    requested.setStreamingMode(RunConfig.StreamingMode.BIDI);
+    requested.setInputAudioTranscription(callerConfig);
+    RunConfig withCallerConfig = requested.build();
+
+    // Act: build the live invocation context through the test helper.
+    InvocationContext liveContext =
+        invokeNewInvocationContextForLive(
+            liveRunner, liveSession, new LiveRequestQueue(), withCallerConfig);
+    // Assert: the very same config instance comes back, i.e. it was not replaced.
+    assertThat(liveContext.runConfig().inputAudioTranscription()).isSameInstanceAs(callerConfig);
+  }
 }
diff --git a/pom.xml b/pom.xml
@@ -31,6 +31,7 @@
     <module>contrib/langchain4j</module>
     <module>contrib/samples</module>
     <module>tutorials/city-time-weather</module>
+    <module>tutorials/live-audio-single-agent</module>
     <module>a2a</module>
     <module>a2a/webservice</module>
   </modules>
diff --git a/tutorials/live-audio-single-agent/README.md b/tutorials/live-audio-single-agent/README.md
@@ -0,0 +1,64 @@
+# Live Audio Single-Agent
+
+A tutorial demonstrating how the ADK (Agent Development Kit) automatically configures **inputAudioTranscription** and **outputAudioTranscription** for single-agent live scenarios. This tutorial showcases that the feature now works for all live scenarios, not just multi-agent scenarios.
+
+## What This Demonstrates
+
+This tutorial exercises the change in `Runner.java` that enables automatic transcription configuration for all live scenarios:
+
+**Before:** Only multi-agent scenarios got automatic transcription
+```java
+if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty())
+```
+
+**After:** All live scenarios (including single-agent) get automatic transcription
+```java
+if (liveRequestQueue.isPresent())
+```
+
+When you run this single agent with live audio, the ADK automatically configures:
+- **inputAudioTranscription** - Transcribes user speech to text
+- **outputAudioTranscription** - Transcribes agent speech to text
+
+## Set Up the API Key
+
+```shell
+export GOOGLE_GENAI_API_KEY={YOUR-KEY}
+```
+
+## Go to Tutorial Directory
+
+```shell
+cd tutorials/live-audio-single-agent
+```
+
+## Running the Agent
+
+Start the server:
+
+```shell
+mvn exec:java
+```
+
+This starts the ADK web server with a single weather agent (`weather_agent`) that supports live audio using the `gemini-2.0-flash-live-001` model.
+
+## Usage
+
+Once running, you can interact with the agent through:
+- **Web interface:** `http://localhost:8080`
+- **Agent name:** `weather_agent`
+- **Try asking:** "What's the weather in Tokyo?" or "How's the weather in New York?"
+
+### Testing with Live Audio
+
+1. Open the web interface at `http://localhost:8080`
+2. Enable your microphone
+3. Speak to the agent: "What's the weather in Tokyo?"
+4. The agent will:
+   - Automatically transcribe your speech to text (inputAudioTranscription)
+   - Process the request and call the `getWeather` tool
+   - Respond with audio (automatically transcribed via outputAudioTranscription)
+
+## Learn More
+
+See https://google.github.io/adk-docs/get-started/quickstart/#java for more information.
diff --git a/tutorials/live-audio-single-agent/pom.xml b/tutorials/live-audio-single-agent/pom.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2025 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>com.google.adk</groupId>
+    <artifactId>google-adk-parent</artifactId>
+    <version>0.3.1-SNAPSHOT</version><!-- {x-version-update:google-adk:current} -->
+    <relativePath>../../pom.xml</relativePath>
+  </parent>
+
+  <artifactId>google-adk-tutorials-live-audio-single-agent</artifactId>
+  <name>Agent Development Kit - Tutorial: Live Audio Single-Agent</name>
+
+  <properties>
+    <exec.mainClass>com.google.adk.tutorials.LiveAudioSingleAgent</exec.mainClass>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.google.adk</groupId>
+      <artifactId>google-adk-dev</artifactId>
+      <version>${project.version}</version>
+      <exclusions>
+        <exclusion>
+          <groupId>ch.qos.logback</groupId>
+          <artifactId>logback-classic</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+    </dependency>
+  </dependencies>
+</project>
+
diff --git a/tutorials/live-audio-single-agent/src/main/java/com/google/adk/tutorials/LiveAudioSingleAgent.java b/tutorials/live-audio-single-agent/src/main/java/com/google/adk/tutorials/LiveAudioSingleAgent.java
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2025 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.adk.tutorials;
+
+import com.google.adk.agents.BaseAgent;
+import com.google.adk.agents.LlmAgent;
+import com.google.adk.tools.Annotations.Schema;
+import com.google.adk.tools.FunctionTool;
+import com.google.adk.web.AdkWebServer;
+import java.util.Map;
+
+/** Tutorial app serving a single live-audio weather agent via the ADK web server. */
+public class LiveAudioSingleAgent {
+
+  /** Canned weather keyed by lower-case city name; built once instead of on every tool call. */
+  private static final Map<String, Map<String, String>> WEATHER_DATA =
+      Map.of(
+          "new york", weatherEntry("New York", "72°F (22°C)", "Partly cloudy"),
+          "london", weatherEntry("London", "59°F (15°C)", "Rainy"),
+          "tokyo", weatherEntry("Tokyo", "68°F (20°C)", "Clear"),
+          "sydney", weatherEntry("Sydney", "77°F (25°C)", "Sunny"));
+
+  /** The tutorial's only agent: no sub-agents are configured, and its one tool is getWeather. */
+  public static final BaseAgent WEATHER_AGENT =
+      LlmAgent.builder()
+          .name("weather_agent")
+          .model("gemini-2.0-flash-live-001")
+          .description("A helpful weather assistant that provides weather information.")
+          .instruction(
+              "You are a friendly weather assistant. When users ask about weather, "
+                  + "you MUST call the getWeather tool with the location name. "
+                  + "Extract the location from the user's question. "
+                  + "ALWAYS use the getWeather tool to get accurate information - never make up weather data. "
+                  + "After getting the tool result, provide a friendly and descriptive response. "
+                  + "For general conversation or greetings, respond naturally and helpfully. "
+                  + "Do NOT use code execution for anything.")
+          .tools(FunctionTool.create(LiveAudioSingleAgent.class, "getWeather"))
+          .build();
+
+  /**
+   * Looks up the canned weather report for a city.
+   *
+   * <p>Matching ignores case and surrounding whitespace. Lower-casing uses {@code Locale.ROOT} so
+   * the lookup does not depend on the JVM's default locale, and a {@code null} location is
+   * reported as unavailable instead of throwing.
+   *
+   * @param location the city name supplied by the model; may be {@code null}
+   * @return a map with {@code status}, {@code temperature}, {@code condition} and {@code report}
+   *     for a known city, or a map with {@code status=error} and a {@code report} otherwise
+   */
+  public static Map<String, String> getWeather(
+      @Schema(name = "location", description = "The location for which to retrieve weather")
+          String location) {
+    if (location == null) {
+      return unavailable("");
+    }
+    String normalizedLocation = location.toLowerCase(java.util.Locale.ROOT).trim();
+    Map<String, String> known = WEATHER_DATA.get(normalizedLocation);
+    return known != null ? known : unavailable(location);
+  }
+
+  /** Builds the success payload for one city; the report sentence lower-cases the condition. */
+  private static Map<String, String> weatherEntry(
+      String city, String temperature, String condition) {
+    return Map.of(
+        "status",
+        "success",
+        "temperature",
+        temperature,
+        "condition",
+        condition,
+        "report",
+        String.format(
+            "The weather in %s is %s with a temperature of %s.",
+            city, condition.toLowerCase(java.util.Locale.ROOT), temperature));
+  }
+
+  /** Builds the error payload returned when no canned data exists for {@code location}. */
+  private static Map<String, String> unavailable(String location) {
+    return Map.of(
+        "status",
+        "error",
+        "report",
+        String.format(
+            "Weather information for '%s' is not available. Try New York, London, Tokyo, or"
+                + " Sydney.",
+            location));
+  }
+
+  /** Starts the ADK web server with {@link #WEATHER_AGENT} as the served agent. */
+  public static void main(String[] args) {
+    AdkWebServer.start(WEATHER_AGENT);
+  }
+}