@@ -276,6 +276,16 @@ message RecognitionConfig {
276276 // premium feature.
277277 bool enable_automatic_punctuation = 11 ;
278278
279+ // *Optional* Config to enable speaker diarization and set additional
280+ // parameters to make diarization better suited for your application.
281+ // Note: When this is enabled, we send all the words from the beginning of the
282+ // audio for the top alternative in every consecutive STREAMING response.
283+ // This is done in order to improve our speaker tags as our models learn to
284+ // identify the speakers in the conversation over time.
285+ // For non-streaming requests, the diarization results will be provided only
286+ // in the top alternative of the FINAL SpeechRecognitionResult.
287+ SpeakerDiarizationConfig diarization_config = 19 ;
288+
279289 // *Optional* Metadata regarding this request.
280290 RecognitionMetadata metadata = 9 ;
281291
@@ -324,6 +334,36 @@ message RecognitionConfig {
324334 bool use_enhanced = 14 ;
325335}
326336
// *Optional* Config to enable speaker diarization and to bound the number of
// speakers the system should look for.
message SpeakerDiarizationConfig {
  // *Optional* If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  bool enable_speaker_diarization = 1;

  // *Optional*
  // Minimum number of speakers in the conversation. Together with
  // max_speaker_count this forms a range that gives you more flexibility by
  // allowing the system to automatically determine the correct number of
  // speakers. Set min_speaker_count = max_speaker_count to fix the number of
  // speakers to be detected in the audio. If not set, the default value is 2.
  int32 min_speaker_count = 2;

  // *Optional*
  // Maximum number of speakers in the conversation. Together with
  // min_speaker_count this forms a range that gives you more flexibility by
  // allowing the system to automatically determine the correct number of
  // speakers. Set max_speaker_count = min_speaker_count to fix the number of
  // speakers to be detected in the audio. If not set, the default value is 6.
  int32 max_speaker_count = 3;

  // NOTE(review): speaker_tag is described as output-only, per-word data, yet
  // it is declared in this request config message, and its documented upper
  // bound `diarization_speaker_count` is not a field of this message. Confirm
  // against the rest of the file whether this field should carry
  // `[deprecated = true]` and point to the per-word tag instead. Field number
  // 4 is unused here; confirm whether it needs a `reserved` entry.

  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Value ranges from '1' to diarization_speaker_count.
  // speaker_tag is set if enable_speaker_diarization = 'true' and only in the
  // top alternative.
  int32 speaker_tag = 5;
}
366+
327367// Description of audio data to be recognized.
328368message RecognitionMetadata {
329369 // Use case categories that the audio recognition request can be described
0 commit comments