feat: Support MP3, TranscriptNormalization and SpeakerLabels in STT V1 API (googleapis#11967)

gcf-owl-bot[bot] · parthea · web-flow · commit 6a9669185327 · 2023-11-01T21:41:37.000-04:00
- [ ] Regenerate this pull request now.

PiperOrigin-RevId: 578629599

Source-Link: googleapis/googleapis@08facab

Source-Link: https://github.com/googleapis/googleapis-gen/commit/75903e0fe695900f684c72ca8b5b9e6bc160048a
Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiNzU5MDNlMGZlNjk1OTAwZjY4NGM3MmNhOGI1YjllNmJjMTYwMDQ4YSJ9

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Anthonios Partheniou <partheniou@google.com>
diff --git a/packages/google-cloud-speech/CONTRIBUTING.rst b/packages/google-cloud-speech/CONTRIBUTING.rst
@@ -35,21 +35,21 @@ Using a Development Checkout
 You'll have to create a development environment using a Git checkout:
 
 - While logged into your GitHub account, navigate to the
-  ``python-speech`` `repo`_ on GitHub.
+  ``google-cloud-python`` `repo`_ on GitHub.
 
-- Fork and clone the ``python-speech`` repository to your GitHub account by
+- Fork and clone the ``google-cloud-python`` repository to your GitHub account by
   clicking the "Fork" button.
 
-- Clone your fork of ``python-speech`` from your GitHub account to your local
+- Clone your fork of ``google-cloud-python`` from your GitHub account to your local
   computer, substituting your account username and specifying the destination
-  as ``hack-on-python-speech``.  E.g.::
+  as ``hack-on-google-cloud-python``.  E.g.::
 
    $ cd ${HOME}
-   $ git clone git@github.com:USERNAME/python-speech.git hack-on-python-speech
-   $ cd hack-on-python-speech
-   # Configure remotes such that you can pull changes from the googleapis/python-speech
+   $ git clone git@github.com:USERNAME/google-cloud-python.git hack-on-google-cloud-python
+   $ cd hack-on-google-cloud-python
+   # Configure remotes such that you can pull changes from the googleapis/google-cloud-python
    # repository into your local repository.
-   $ git remote add upstream git@github.com:googleapis/python-speech.git
+   $ git remote add upstream git@github.com:googleapis/google-cloud-python.git
    # fetch and merge changes from upstream into main
    $ git fetch upstream
    $ git merge upstream/main
@@ -60,7 +60,7 @@ repo, from which you can submit a pull request.
 To work on the codebase and run the tests, we recommend using ``nox``,
 but you can also use a ``virtualenv`` of your own creation.
 
-.. _repo: https://github.com/googleapis/python-speech
+.. _repo: https://github.com/googleapis/google-cloud-python
 
 Using ``nox``
 =============
@@ -113,7 +113,7 @@ Coding Style
    export GOOGLE_CLOUD_TESTING_BRANCH="main"
 
   By doing this, you are specifying the location of the most up-to-date
-  version of ``python-speech``. The
+  version of ``google-cloud-python``. The
   remote name ``upstream`` should point to the official ``googleapis``
   checkout and the branch should be the default branch on that remote (``main``).
 
@@ -209,7 +209,7 @@ The `description on PyPI`_ for the project comes directly from the
 ``README``. Due to the reStructuredText (``rst``) parser used by
 PyPI, relative links which will work on GitHub (e.g. ``CONTRIBUTING.rst``
 instead of
-``https://github.com/googleapis/python-speech/blob/main/CONTRIBUTING.rst``)
+``https://github.com/googleapis/google-cloud-python/blob/main/CONTRIBUTING.rst``)
 may cause problems creating links or rendering the description.
 
 .. _description on PyPI: https://pypi.org/project/google-cloud-speech
@@ -236,7 +236,7 @@ We support:
 
 Supported versions can be found in our ``noxfile.py`` `config`_.
 
-.. _config: https://github.com/googleapis/python-speech/blob/main/packages/google-cloud-speech/noxfile.py
+.. _config: https://github.com/googleapis/google-cloud-python/blob/main/packages/google-cloud-speech/noxfile.py
 
 
 **********
diff --git a/packages/google-cloud-speech/docs/conf.py b/packages/google-cloud-speech/docs/conf.py
@@ -156,7 +156,7 @@
 html_theme_options = {
     "description": "Google Cloud Client Libraries for google-cloud-speech",
     "github_user": "googleapis",
-    "github_repo": "python-speech",
+    "github_repo": "google-cloud-python",
     "github_banner": True,
     "font_family": "'Roboto', Georgia, sans",
     "head_font_family": "'Roboto', Georgia, serif",
diff --git a/packages/google-cloud-speech/google/cloud/speech/__init__.py b/packages/google-cloud-speech/google/cloud/speech/__init__.py
@@ -63,6 +63,7 @@
     CustomClass,
     PhraseSet,
     SpeechAdaptation,
+    TranscriptNormalization,
 )
 
 __all__ = (
@@ -104,4 +105,5 @@
     "CustomClass",
     "PhraseSet",
     "SpeechAdaptation",
+    "TranscriptNormalization",
 )
diff --git a/packages/google-cloud-speech/google/cloud/speech_v1/__init__.py b/packages/google-cloud-speech/google/cloud/speech_v1/__init__.py
@@ -55,7 +55,12 @@
     UpdateCustomClassRequest,
     UpdatePhraseSetRequest,
 )
-from .types.resource import CustomClass, PhraseSet, SpeechAdaptation
+from .types.resource import (
+    CustomClass,
+    PhraseSet,
+    SpeechAdaptation,
+    TranscriptNormalization,
+)
 
 from google.cloud.speech_v1.helpers import SpeechHelpers
 
@@ -99,6 +104,7 @@ class SpeechClient(SpeechHelpers, SpeechClient):
     "StreamingRecognitionResult",
     "StreamingRecognizeRequest",
     "StreamingRecognizeResponse",
+    "TranscriptNormalization",
     "TranscriptOutputConfig",
     "UpdateCustomClassRequest",
     "UpdatePhraseSetRequest",
diff --git a/packages/google-cloud-speech/google/cloud/speech_v1/types/__init__.py b/packages/google-cloud-speech/google/cloud/speech_v1/types/__init__.py
@@ -48,7 +48,7 @@
     UpdateCustomClassRequest,
     UpdatePhraseSetRequest,
 )
-from .resource import CustomClass, PhraseSet, SpeechAdaptation
+from .resource import CustomClass, PhraseSet, SpeechAdaptation, TranscriptNormalization
 
 __all__ = (
     "LongRunningRecognizeMetadata",
@@ -85,4 +85,5 @@
     "CustomClass",
     "PhraseSet",
     "SpeechAdaptation",
+    "TranscriptNormalization",
 )
diff --git a/packages/google-cloud-speech/google/cloud/speech_v1/types/cloud_speech.py b/packages/google-cloud-speech/google/cloud/speech_v1/types/cloud_speech.py
@@ -359,6 +359,13 @@ class RecognitionConfig(proto.Message):
             adaptation <https://cloud.google.com/speech-to-text/docs/adaptation>`__
             documentation. When speech adaptation is set it supersedes
             the ``speech_contexts`` field.
+        transcript_normalization (google.cloud.speech_v1.types.TranscriptNormalization):
+            Optional. Use transcription normalization to
+            automatically replace parts of the transcript
+            with phrases of your choosing. For
+            StreamingRecognize, this normalization only
+            applies to stable partial transcripts (stability
+            > 0.8) and final transcripts.
         speech_contexts (MutableSequence[google.cloud.speech_v1.types.SpeechContext]):
             Array of
             [SpeechContext][google.cloud.speech.v1.SpeechContext]. A
@@ -551,6 +558,12 @@ class AudioEncoding(proto.Enum):
                 5574. In other words, each RTP header is replaced with a
                 single byte containing the block length. Only Speex wideband
                 is supported. ``sample_rate_hertz`` must be 16000.
+            MP3 (8):
+                MP3 audio. MP3 encoding is a Beta feature and only available
+                in v1p1beta1. Supports all standard MP3 bitrates (which range
+                from 32-320 kbps). When using this encoding,
+                ``sample_rate_hertz`` has to match the sample rate of the
+                file being used.
             WEBM_OPUS (9):
                 Opus encoded audio frames in WebM container
                 (`OggOpus <https://wiki.xiph.org/OggOpus>`__).
@@ -565,6 +578,7 @@ class AudioEncoding(proto.Enum):
         AMR_WB = 5
         OGG_OPUS = 6
         SPEEX_WITH_HEADER_BYTE = 7
+        MP3 = 8
         WEBM_OPUS = 9
 
     encoding: AudioEncoding = proto.Field(
@@ -605,6 +619,11 @@ class AudioEncoding(proto.Enum):
         number=20,
         message=resource.SpeechAdaptation,
     )
+    transcript_normalization: resource.TranscriptNormalization = proto.Field(
+        proto.MESSAGE,
+        number=24,
+        message=resource.TranscriptNormalization,
+    )
     speech_contexts: MutableSequence["SpeechContext"] = proto.RepeatedField(
         proto.MESSAGE,
         number=6,
@@ -659,7 +678,7 @@ class SpeakerDiarizationConfig(proto.Message):
         enable_speaker_diarization (bool):
             If 'true', enables speaker detection for each recognized
             word in the top alternative of the recognition result using
-            a speaker_tag provided in the WordInfo.
+            a speaker_label provided in the WordInfo.
         min_speaker_count (int):
             Minimum number of speakers in the
             conversation. This range gives you more
@@ -1469,8 +1488,17 @@ class WordInfo(proto.Message):
             speaker within the audio. This field specifies which one of
             those speakers was detected to have spoken this word. Value
             ranges from '1' to diarization_speaker_count. speaker_tag is
-            set if enable_speaker_diarization = 'true' and only in the
-            top alternative.
+            set if enable_speaker_diarization = 'true' and only for the
+            top alternative. Note: Use speaker_label instead.
+        speaker_label (str):
+            Output only. A label value assigned for every unique speaker
+            within the audio. This field specifies which speaker was
+            detected to have spoken this word. For some models, like
+            medical_conversation, this can be the actual speaker role, for
+            example "patient" or "provider", but generally this would be
+            a number identifying a speaker. This field is only set if
+            enable_speaker_diarization = 'true' and only for the top
+            alternative.
     """
 
     start_time: duration_pb2.Duration = proto.Field(
@@ -1495,6 +1523,10 @@ class WordInfo(proto.Message):
         proto.INT32,
         number=5,
     )
+    speaker_label: str = proto.Field(
+        proto.STRING,
+        number=6,
+    )
 
 
 class SpeechAdaptationInfo(proto.Message):
diff --git a/packages/google-cloud-speech/google/cloud/speech_v1/types/resource.py b/packages/google-cloud-speech/google/cloud/speech_v1/types/resource.py
@@ -25,6 +25,7 @@
         "CustomClass",
         "PhraseSet",
         "SpeechAdaptation",
+        "TranscriptNormalization",
     },
 )
 
@@ -228,4 +229,54 @@ class ABNFGrammar(proto.Message):
     )
 
 
+class TranscriptNormalization(proto.Message):
+    r"""Transcription normalization configuration. Use transcription
+    normalization to automatically replace parts of the transcript
+    with phrases of your choosing. For StreamingRecognize, this
+    normalization only applies to stable partial transcripts
+    (stability > 0.8) and final transcripts.
+
+    Attributes:
+        entries (MutableSequence[google.cloud.speech_v1.types.TranscriptNormalization.Entry]):
+            A list of replacement entries. We will perform replacement
+            with one entry at a time. For example, the second entry in
+            ["cat" => "dog", "mountain cat" => "mountain dog"] will
+            never be applied because we will always process the first
+            entry before it. At most 100 entries.
+    """
+
+    class Entry(proto.Message):
+        r"""A single replacement configuration.
+
+        Attributes:
+            search (str):
+                What to replace. Max length is 100
+                characters.
+            replace (str):
+                What to replace with. Max length is 100
+                characters.
+            case_sensitive (bool):
+                Whether the search is case sensitive.
+        """
+
+        search: str = proto.Field(
+            proto.STRING,
+            number=1,
+        )
+        replace: str = proto.Field(
+            proto.STRING,
+            number=2,
+        )
+        case_sensitive: bool = proto.Field(
+            proto.BOOL,
+            number=3,
+        )
+
+    entries: MutableSequence[Entry] = proto.RepeatedField(
+        proto.MESSAGE,
+        number=1,
+        message=Entry,
+    )
+
+
 __all__ = tuple(sorted(__protobuf__.manifest))
diff --git a/scripts/client-post-processing/integrate-isolated-handwritten-code.yaml b/scripts/client-post-processing/integrate-isolated-handwritten-code.yaml
@@ -102,10 +102,12 @@ replacements:
       packages/google-cloud-speech/google/cloud/speech_v1/__init__.py,
     ]    
     before: |
-      from .types.resource import CustomClass, PhraseSet, SpeechAdaptation\n
+      \)
+
       __all__ = \(
     after: |
-      from .types.resource import CustomClass, PhraseSet, SpeechAdaptation\n
+      )
+
       from google.cloud.speech_v1.helpers import SpeechHelpers\n\n
       class SpeechClient(SpeechHelpers, SpeechClient):
           __doc__ = SpeechClient.__doc__\n\n

Original file line number	Diff line number	Diff line change
`@@ -63,6 +63,7 @@`
`63`	`63`	`CustomClass,`
`64`	`64`	`PhraseSet,`
`65`	`65`	`SpeechAdaptation,`
	`66`	`+ TranscriptNormalization,`
`66`	`67`	`)`
`67`	`68`
`68`	`69`	`__all__ = (`
`@@ -104,4 +105,5 @@`
`104`	`105`	`"CustomClass",`
`105`	`106`	`"PhraseSet",`
`106`	`107`	`"SpeechAdaptation",`
	`108`	`+ "TranscriptNormalization",`
`107`	`109`	`)`
Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@`
`48`	`48`	`UpdateCustomClassRequest,`
`49`	`49`	`UpdatePhraseSetRequest,`
`50`	`50`	`)`
`51`		`-from .resource import CustomClass, PhraseSet, SpeechAdaptation`
	`51`	`+from .resource import CustomClass, PhraseSet, SpeechAdaptation, TranscriptNormalization`
`52`	`52`
`53`	`53`	`__all__ = (`
`54`	`54`	`"LongRunningRecognizeMetadata",`
`@@ -85,4 +85,5 @@`
`85`	`85`	`"CustomClass",`
`86`	`86`	`"PhraseSet",`
`87`	`87`	`"SpeechAdaptation",`
	`88`	`+ "TranscriptNormalization",`
`88`	`89`	`)`