Update streaming_transcription.py to allow longer audio input per stream (GoogleCloudPlatform#13891)

deliaqi · web-flow · commit 26a3c4b5f8c7 · 2026-03-26T10:47:15.000-07:00
* Update streaming_transcription.py to allow longer audio input per stream

* Configure output_multiple_utterances=true for long audio streams

* Update comments per Gemini suggestion.

* Update lint.

* Disable output_multiple_utterances by default.

* Update lint and revert license header change.
diff --git a/dialogflow/detect_intent_texts_with_location.py b/dialogflow/detect_intent_texts_with_location.py
@@ -58,7 +58,7 @@ def detect_intent_texts_with_location(
         print("=" * 20)
         print(f"Query text: {response.query_result.query_text}")
         print(
-            f"Detected intent: {response.query_result.intent.display_name} (confidence: {response.query_result.intent_detection_confidence,})\n"
+            f"Detected intent: {response.query_result.intent.display_name} (confidence: {response.query_result.intent_detection_confidence})\n"
         )
         print(f"Fulfillment text: {response.query_result.fulfillment_text}\n")
 
diff --git a/dialogflow/participant_management.py b/dialogflow/participant_management.py
@@ -196,6 +196,7 @@ def analyze_content_audio_stream(
     timeout: int,
     language_code: str,
     single_utterance=False,
+    output_multiple_utterances=False,
 ):
     import google.auth
     from google.cloud import dialogflow_v2beta1 as dialogflow
@@ -231,7 +232,9 @@ def gen_requests(participant_name, audio_config, stream):
         """Generates requests for streaming."""
         audio_generator = stream.generator()
         yield dialogflow.types.participant.StreamingAnalyzeContentRequest(
-            participant=participant_name, audio_config=audio_config
+            participant=participant_name,
+            audio_config=audio_config,
+            output_multiple_utterances=output_multiple_utterances
         )
         for content in audio_generator:
             yield dialogflow.types.participant.StreamingAnalyzeContentRequest(
diff --git a/dialogflow/requirements.txt b/dialogflow/requirements.txt
@@ -1,4 +1,4 @@
-google-cloud-dialogflow==2.36.0
+google-cloud-dialogflow==2.46.0
 Flask==3.0.3
 pyaudio==0.2.14
 termcolor==3.0.0
diff --git a/dialogflow/streaming_transcription.py b/dialogflow/streaming_transcription.py
@@ -34,7 +34,7 @@
 import re
 import sys
 
-from google.api_core.exceptions import DeadlineExceeded
+from google.api_core.exceptions import DeadlineExceeded, OutOfRange
 
 import pyaudio
 
@@ -51,6 +51,7 @@
 CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms
 RESTART_TIMEOUT = 160  # seconds
 MAX_LOOKBACK = 3  # seconds
+HALF_CLOSE_DURATION_MS = 90 * 1000  # milliseconds
 
 YELLOW = "\033[0;33m"
 
@@ -198,6 +199,9 @@ def main():
                         timeout=RESTART_TIMEOUT,
                         language_code="en-US",
                         single_utterance=False,
+                        # Uncomment to process multiple utterances detected in the audio stream
+                        # individually instead of stitching together to form a single utterance.
+                        # output_multiple_utterances=True,
                     )
 
                     # Now, print the final transcription responses to user.
@@ -213,8 +217,10 @@ def main():
                                 offset.seconds * 1000 + offset.microseconds / 1000
                             )
                             transcript = response.recognition_result.transcript
-                            # Half-close the stream with gRPC (in Python just stop yielding requests)
-                            stream.is_final = True
+                            # Half-close on a final result once past HALF_CLOSE_DURATION_MS
+                            # (in Python just stop yielding requests)
+                            if stream.is_final_offset > HALF_CLOSE_DURATION_MS:
+                                stream.is_final = True
                             # Exit recognition if any of the transcribed phrase could be
                             # one of our keywords.
                             if re.search(r"\b(exit|quit)\b", transcript, re.I):
@@ -223,6 +229,8 @@ def main():
                                 terminate = True
                                 stream.closed = True
                                 break
+                except OutOfRange:
+                    print("Maximum audio duration exceeded in the stream, restarting.")
                 except DeadlineExceeded:
                     print("Deadline Exceeded, restarting.")
 

Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,7 @@ def detect_intent_texts_with_location(`
`58`	`58`	`print("=" * 20)`
`59`	`59`	`print(f"Query text: {response.query_result.query_text}")`
`60`	`60`	`print(`
`61`		`- f"Detected intent: {response.query_result.intent.display_name} (confidence: {response.query_result.intent_detection_confidence,})\n"`
	`61`	`+ f"Detected intent: {response.query_result.intent.display_name} (confidence: {response.query_result.intent_detection_confidence})\n"`
`62`	`62`	`)`
`63`	`63`	`print(f"Fulfillment text: {response.query_result.fulfillment_text}\n")`
`64`	`64`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-google-cloud-dialogflow==2.36.0`
	`1`	`+google-cloud-dialogflow==2.46.0`
`2`	`2`	`Flask==3.0.3`
`3`	`3`	`pyaudio==0.2.14`
`4`	`4`	`termcolor==3.0.0`