@@ -306,19 +306,24 @@ message RecognitionConfig {
306306 // *Optional* If 'true', enables speaker detection for each recognized word in
307307 // the top alternative of the recognition result using a speaker_tag provided
308308 // in the WordInfo.
309- // Note: When this is true, we send all the words from the beginning of the
309+ // Note: Use diarization_config instead.
310+ bool enable_speaker_diarization = 16 [deprecated = true ];
311+
312+ // *Optional*
313+ // If set, specifies the estimated number of speakers in the conversation.
314+ // Defaults to '2'. Ignored unless enable_speaker_diarization is set to true.
315+ // Note: Use diarization_config instead.
316+ int32 diarization_speaker_count = 17 [deprecated = true ];
317+
318+ // *Optional* Config to enable speaker diarization and set additional
319+ // parameters to make diarization better suited for your application.
320+ // Note: When this is enabled, we send all the words from the beginning of the
310321 // audio for the top alternative in every consecutive STREAMING response.
311322 // This is done in order to improve our speaker tags as our models learn to
312323 // identify the speakers in the conversation over time.
313324 // For non-streaming requests, the diarization results will be provided only
314325 // in the top alternative of the FINAL SpeechRecognitionResult.
315- bool enable_speaker_diarization = 16 ;
316-
317- // *Optional*
318- // If set, specifies the estimated number of speakers in the conversation.
319- // If not set, defaults to '2'.
320- // Ignored unless enable_speaker_diarization is set to true."
321- int32 diarization_speaker_count = 17 ;
326+ SpeakerDiarizationConfig diarization_config = 19 ;
322327
323328 // *Optional* Metadata regarding this request.
324329 RecognitionMetadata metadata = 9 ;
@@ -368,6 +373,29 @@ message RecognitionConfig {
368373 bool use_enhanced = 14 ;
369374}
370375
376+ // Config to enable speaker diarization and to bound the number of speakers.
377+ message SpeakerDiarizationConfig {
378+ // *Optional* If 'true', enables speaker detection for each recognized word in
379+ // the top alternative of the recognition result using a speaker_tag provided
380+ // in the WordInfo.
381+ bool enable_speaker_diarization = 1 ;
382+
383+ // Note: Set min_speaker_count equal to max_speaker_count to fix the number of
384+ // speakers to be detected in the audio.
385+
386+ // *Optional*
387+ // Minimum number of speakers in the conversation. Together with
388+ // max_speaker_count, this forms a range within which the system automatically
389+ // determines the number of speakers. If not set, the default value is 2.
390+ int32 min_speaker_count = 2 ;
391+
392+ // *Optional*
393+ // Maximum number of speakers in the conversation. Together with
394+ // min_speaker_count, this forms a range within which the system automatically
395+ // determines the number of speakers. If not set, the default value is 6.
396+ int32 max_speaker_count = 3 ;
397+ }
398+
371399// Description of audio data to be recognized.
372400message RecognitionMetadata {
373401 // Use case categories that the audio recognition request can be described
0 commit comments