Skip to content

Commit 42e6ed1

Browse files
tseaver authored and busunkim96 committed
Add speaker diarization configuration support (via synth). (#9202)
Also, exclude 'noxfile.py' from synth (to preserve 'samples'). Supersedes #9196.
1 parent 39a2114 commit 42e6ed1

4 files changed

Lines changed: 256 additions & 45 deletions

File tree

packages/google-cloud-python-speech/google/cloud/speech_v1/proto/cloud_speech.proto

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,16 @@ message RecognitionConfig {
276276
// premium feature.
277277
bool enable_automatic_punctuation = 11;
278278

279+
// *Optional* Config to enable speaker diarization and set additional
280+
// parameters to make diarization better suited for your application.
281+
// Note: When this is enabled, we send all the words from the beginning of the
282+
// audio for the top alternative in every consecutive STREAMING response.
283+
// This is done in order to improve our speaker tags as our models learn to
284+
// identify the speakers in the conversation over time.
285+
// For non-streaming requests, the diarization results will be provided only
286+
// in the top alternative of the FINAL SpeechRecognitionResult.
287+
SpeakerDiarizationConfig diarization_config = 19;
288+
279289
// *Optional* Metadata regarding this request.
280290
RecognitionMetadata metadata = 9;
281291

@@ -324,6 +334,36 @@ message RecognitionConfig {
324334
bool use_enhanced = 14;
325335
}
326336

337+
// Config to enable speaker diarization and to bound the number of speakers
// to detect. Used as the type of RecognitionConfig.diarization_config, which
// is the optional field through which callers supply it.
338+
message SpeakerDiarizationConfig {
339+
// *Optional* If 'true', enables speaker detection for each recognized word in
340+
// the top alternative of the recognition result using a speaker_tag provided
341+
// in the WordInfo. Being a proto3 bool, leaving it unset reads as 'false'.
342+
bool enable_speaker_diarization = 1;
343+
344+
// Note: Set min_speaker_count = max_speaker_count to fix the number of
345+
// speakers to be detected in the audio.
346+
347+
// *Optional*
348+
// Minimum number of speakers in the conversation. Together with
349+
// max_speaker_count this forms a range, which gives you more flexibility by
350+
// allowing the system to automatically determine the correct number of
// speakers. If not set, the default value is 2.
351+
int32 min_speaker_count = 2;
352+
353+
// *Optional*
354+
// Maximum number of speakers in the conversation. Together with
355+
// min_speaker_count this forms a range, which gives you more flexibility by
356+
// allowing the system to automatically determine the correct number of
// speakers. If not set, the default value is 6.
357+
int32 max_speaker_count = 3;
358+
359+
// Output only. A distinct integer value is assigned for every speaker within
360+
// the audio. This field specifies which one of those speakers was detected to
361+
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
362+
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
363+
// top alternative.
// NOTE(review): diarization_speaker_count is not a field of this message as
// shown here -- confirm the intended upper bound (presumably the detected
// speaker count, at most max_speaker_count).
// NOTE(review): this text describes a per-word output value, yet the field
// sits in a request config message -- confirm whether callers are expected
// to set it at all.
// NOTE(review): field number 4 is skipped between max_speaker_count (3) and
// this field (5); if 4 was ever used, it should be declared `reserved`.
364+
int32 speaker_tag = 5;
365+
}
366+
327367
// Description of audio data to be recognized.
328368
message RecognitionMetadata {
329369
// Use case categories that the audio recognition request can be described

0 commit comments

Comments
 (0)