feat: add transcript normalization + m4a audio format support (#11937)

gcf-owl-bot[bot] · web-flow · commit 8536b2077654 · 2023-10-30T18:53:35.000-04:00
- [ ] Regenerate this pull request now. BEGIN_COMMIT_OVERRIDE feat: add transcript normalization + m4a audio format support docs: clarify alternatives for deprecated fields docs: deprecate `BatchRecognizeFileResult.uri` in favor of `cloud_storage_result.native_format_uri` docs: deprecate `BatchRecognizeFileResult.transcript` in favor of `inline_result.transcript` END_COMMIT_OVERRIDE PiperOrigin-RevId: 577926708 Source-Link: googleapis/googleapis@37e816b Source-Link: googleapis/googleapis-gen@e12bd7b Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiZTEyYmQ3YmRiYmI5ZDJlNDE4YTkyMjA3NWQyM2Y3N2E4YzFlNzQ4NSJ9 --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/packages/google-cloud-speech/google/cloud/speech_v2/__init__.py b/packages/google-cloud-speech/google/cloud/speech_v2/__init__.py
@@ -28,6 +28,7 @@
     BatchRecognizeResponse,
     BatchRecognizeResults,
     BatchRecognizeTranscriptionMetadata,
+    CloudStorageResult,
     Config,
     CreateCustomClassRequest,
     CreatePhraseSetRequest,
@@ -43,6 +44,7 @@
     GetPhraseSetRequest,
     GetRecognizerRequest,
     InlineOutputConfig,
+    InlineResult,
     ListCustomClassesRequest,
     ListCustomClassesResponse,
     ListPhraseSetsRequest,
@@ -67,6 +69,7 @@
     StreamingRecognitionResult,
     StreamingRecognizeRequest,
     StreamingRecognizeResponse,
+    TranscriptNormalization,
     UndeleteCustomClassRequest,
     UndeletePhraseSetRequest,
     UndeleteRecognizerRequest,
@@ -87,6 +90,7 @@
     "BatchRecognizeResponse",
     "BatchRecognizeResults",
     "BatchRecognizeTranscriptionMetadata",
+    "CloudStorageResult",
     "Config",
     "CreateCustomClassRequest",
     "CreatePhraseSetRequest",
@@ -102,6 +106,7 @@
     "GetPhraseSetRequest",
     "GetRecognizerRequest",
     "InlineOutputConfig",
+    "InlineResult",
     "ListCustomClassesRequest",
     "ListCustomClassesResponse",
     "ListPhraseSetsRequest",
@@ -127,6 +132,7 @@
     "StreamingRecognitionResult",
     "StreamingRecognizeRequest",
     "StreamingRecognizeResponse",
+    "TranscriptNormalization",
     "UndeleteCustomClassRequest",
     "UndeletePhraseSetRequest",
     "UndeleteRecognizerRequest",
diff --git a/packages/google-cloud-speech/google/cloud/speech_v2/types/__init__.py b/packages/google-cloud-speech/google/cloud/speech_v2/types/__init__.py
@@ -22,6 +22,7 @@
     BatchRecognizeResponse,
     BatchRecognizeResults,
     BatchRecognizeTranscriptionMetadata,
+    CloudStorageResult,
     Config,
     CreateCustomClassRequest,
     CreatePhraseSetRequest,
@@ -37,6 +38,7 @@
     GetPhraseSetRequest,
     GetRecognizerRequest,
     InlineOutputConfig,
+    InlineResult,
     ListCustomClassesRequest,
     ListCustomClassesResponse,
     ListPhraseSetsRequest,
@@ -61,6 +63,7 @@
     StreamingRecognitionResult,
     StreamingRecognizeRequest,
     StreamingRecognizeResponse,
+    TranscriptNormalization,
     UndeleteCustomClassRequest,
     UndeletePhraseSetRequest,
     UndeleteRecognizerRequest,
@@ -80,6 +83,7 @@
     "BatchRecognizeResponse",
     "BatchRecognizeResults",
     "BatchRecognizeTranscriptionMetadata",
+    "CloudStorageResult",
     "Config",
     "CreateCustomClassRequest",
     "CreatePhraseSetRequest",
@@ -95,6 +99,7 @@
     "GetPhraseSetRequest",
     "GetRecognizerRequest",
     "InlineOutputConfig",
+    "InlineResult",
     "ListCustomClassesRequest",
     "ListCustomClassesResponse",
     "ListPhraseSetsRequest",
@@ -119,6 +124,7 @@
     "StreamingRecognitionResult",
     "StreamingRecognizeRequest",
     "StreamingRecognizeResponse",
+    "TranscriptNormalization",
     "UndeleteCustomClassRequest",
     "UndeletePhraseSetRequest",
     "UndeleteRecognizerRequest",
diff --git a/packages/google-cloud-speech/google/cloud/speech_v2/types/cloud_speech.py b/packages/google-cloud-speech/google/cloud/speech_v2/types/cloud_speech.py
@@ -39,6 +39,7 @@
         "ExplicitDecodingConfig",
         "SpeakerDiarizationConfig",
         "RecognitionFeatures",
+        "TranscriptNormalization",
         "SpeechAdaptation",
         "RecognitionConfig",
         "RecognizeRequest",
@@ -56,6 +57,8 @@
         "RecognitionOutputConfig",
         "BatchRecognizeResponse",
         "BatchRecognizeResults",
+        "CloudStorageResult",
+        "InlineResult",
         "BatchRecognizeFileResult",
         "BatchRecognizeTranscriptionMetadata",
         "BatchRecognizeMetadata",
@@ -587,9 +590,14 @@ class Recognizer(proto.Message):
             User-settable, human-readable name for the
             Recognizer. Must be 63 characters or less.
         model (str):
-            Optional. Which model to use for recognition requests.
-            Select the model best suited to your domain to get best
-            results.
+            Optional. This field is now deprecated. Prefer the
+            [``model``][google.cloud.speech.v2.RecognitionConfig.model]
+            field in the
+            [``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
+            message.
+
+            Which model to use for recognition requests. Select the
+            model best suited to your domain to get best results.
 
             Guidance for choosing which model to use can be found in the
             `Transcription Models
@@ -598,7 +606,13 @@ class Recognizer(proto.Message):
             `Table Of Supported
             Models <https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages>`__.
         language_codes (MutableSequence[str]):
-            Optional. The language of the supplied audio as a
+            Optional. This field is now deprecated. Prefer the
+            [``language_codes``][google.cloud.speech.v2.RecognitionConfig.language_codes]
+            field in the
+            [``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
+            message.
+
+            The language of the supplied audio as a
             `BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
             language tag.
 
@@ -772,6 +786,8 @@ class AutoDetectDecodingConfig(proto.Message):
 
     -  WEBM_OPUS: Opus audio frames in a WebM container.
 
+    -  M4A: M4A audio format.
+
     """
 
 
@@ -991,6 +1007,56 @@ class MultiChannelMode(proto.Enum):
     )
 
 
+class TranscriptNormalization(proto.Message):
+    r"""Transcription normalization configuration. Use transcription
+    normalization to automatically replace parts of the transcript
+    with phrases of your choosing. For StreamingRecognize, this
+    normalization only applies to stable partial transcripts
+    (stability > 0.8) and final transcripts.
+
+    Attributes:
+        entries (MutableSequence[google.cloud.speech_v2.types.TranscriptNormalization.Entry]):
+            A list of replacement entries. We will perform replacement
+            with one entry at a time. For example, the second entry in
+            ["cat" => "dog", "mountain cat" => "mountain dog"] will
+            never be applied because we will always process the first
+            entry before it. At most 100 entries.
+    """
+
+    class Entry(proto.Message):
+        r"""A single replacement configuration.
+
+        Attributes:
+            search (str):
+                What to replace. Max length is 100
+                characters.
+            replace (str):
+                What to replace with. Max length is 100
+                characters.
+            case_sensitive (bool):
+                Whether the search is case sensitive.
+        """
+
+        search: str = proto.Field(
+            proto.STRING,
+            number=1,
+        )
+        replace: str = proto.Field(
+            proto.STRING,
+            number=2,
+        )
+        case_sensitive: bool = proto.Field(
+            proto.BOOL,
+            number=3,
+        )
+
+    entries: MutableSequence[Entry] = proto.RepeatedField(
+        proto.MESSAGE,
+        number=1,
+        message=Entry,
+    )
+
+
 class SpeechAdaptation(proto.Message):
     r"""Provides "hints" to the speech recognizer to favor specific
     words and phrases in the results. PhraseSets can be specified as
@@ -1109,6 +1175,13 @@ class RecognitionConfig(proto.Message):
             Speech adaptation context that weights
             recognizer predictions for specific words and
             phrases.
+        transcript_normalization (google.cloud.speech_v2.types.TranscriptNormalization):
+            Optional. Use transcription normalization to
+            automatically replace parts of the transcript
+            with phrases of your choosing. For
+            StreamingRecognize, this normalization only
+            applies to stable partial transcripts (stability
+            > 0.8) and final transcripts.
     """
 
     auto_decoding_config: "AutoDetectDecodingConfig" = proto.Field(
@@ -1141,6 +1214,11 @@ class RecognitionConfig(proto.Message):
         number=6,
         message="SpeechAdaptation",
     )
+    transcript_normalization: "TranscriptNormalization" = proto.Field(
+        proto.MESSAGE,
+        number=11,
+        message="TranscriptNormalization",
+    )
 
 
 class RecognizeRequest(proto.Message):
@@ -1820,29 +1898,73 @@ class BatchRecognizeResults(proto.Message):
     )
 
 
-class BatchRecognizeFileResult(proto.Message):
-    r"""Final results for a single file.
+class CloudStorageResult(proto.Message):
+    r"""Final results written to Cloud Storage.
 
     Attributes:
         uri (str):
             The Cloud Storage URI to which recognition
             results were written.
+    """
+
+    uri: str = proto.Field(
+        proto.STRING,
+        number=1,
+    )
+
+
+class InlineResult(proto.Message):
+    r"""Final results returned inline in the recognition response.
+
+    Attributes:
+        transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
+            The transcript for the audio file.
+    """
+
+    transcript: "BatchRecognizeResults" = proto.Field(
+        proto.MESSAGE,
+        number=1,
+        message="BatchRecognizeResults",
+    )
+
+
+class BatchRecognizeFileResult(proto.Message):
+    r"""Final results for a single file.
+
+    This message has `oneof`_ fields (mutually exclusive fields).
+    For each oneof, at most one member field can be set at the same time.
+    Setting any member of the oneof automatically clears all other
+    members.
+
+    .. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields
+
+    Attributes:
         error (google.rpc.status_pb2.Status):
             Error if one was encountered.
         metadata (google.cloud.speech_v2.types.RecognitionResponseMetadata):
 
-        transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
-            The transcript for the audio file. This is populated only
-            when
+        cloud_storage_result (google.cloud.speech_v2.types.CloudStorageResult):
+            Recognition results written to Cloud Storage. This is
+            populated only when
+            [GcsOutputConfig][google.cloud.speech.v2.GcsOutputConfig] is
+            set in the
+            [RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
+
+            This field is a member of `oneof`_ ``result``.
+        inline_result (google.cloud.speech_v2.types.InlineResult):
+            Recognition results. This is populated only when
             [InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig]
             is set in the
             [RecognitionOutputConfig][[google.cloud.speech.v2.RecognitionOutputConfig].
+
+            This field is a member of `oneof`_ ``result``.
+        uri (str):
+            Deprecated. Use ``cloud_storage_result.uri``
+            instead.
+        transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
+            Deprecated. Use ``inline_result.transcript`` instead.
     """
 
-    uri: str = proto.Field(
-        proto.STRING,
-        number=1,
-    )
     error: status_pb2.Status = proto.Field(
         proto.MESSAGE,
         number=2,
@@ -1853,6 +1975,22 @@ class BatchRecognizeFileResult(proto.Message):
         number=3,
         message="RecognitionResponseMetadata",
     )
+    cloud_storage_result: "CloudStorageResult" = proto.Field(
+        proto.MESSAGE,
+        number=5,
+        oneof="result",
+        message="CloudStorageResult",
+    )
+    inline_result: "InlineResult" = proto.Field(
+        proto.MESSAGE,
+        number=6,
+        oneof="result",
+        message="InlineResult",
+    )
+    uri: str = proto.Field(
+        proto.STRING,
+        number=1,
+    )
     transcript: "BatchRecognizeResults" = proto.Field(
         proto.MESSAGE,
         number=4,
diff --git a/packages/google-cloud-speech/noxfile.py b/packages/google-cloud-speech/noxfile.py
@@ -29,8 +29,10 @@
 
 BLACK_VERSION = "black[jupyter]==23.7.0"
 ISORT_VERSION = "isort==5.11.0"
+
 LINT_PATHS = ["docs", "google", "tests", "noxfile.py", "setup.py"]
 
+
 DEFAULT_PYTHON_VERSION = "3.9"
 
 UNIT_TEST_PYTHON_VERSIONS: List[str] = ["3.7", "3.8", "3.9", "3.10", "3.11"]
@@ -89,6 +91,7 @@ def lint(session):
         "--check",
         *LINT_PATHS,
     )
+
     session.run("flake8", "google", "tests")
 
 
diff --git a/packages/google-cloud-speech/tests/unit/gapic/speech_v2/test_speech.py b/packages/google-cloud-speech/tests/unit/gapic/speech_v2/test_speech.py
@@ -6871,6 +6871,15 @@ def test_create_recognizer_rest(request_type):
                     }
                 ],
             },
+            "transcript_normalization": {
+                "entries": [
+                    {
+                        "search": "search_value",
+                        "replace": "replace_value",
+                        "case_sensitive": True,
+                    }
+                ]
+            },
         },
         "annotations": {},
         "state": 2,
@@ -7941,6 +7950,15 @@ def test_update_recognizer_rest(request_type):
                     }
                 ],
             },
+            "transcript_normalization": {
+                "entries": [
+                    {
+                        "search": "search_value",
+                        "replace": "replace_value",
+                        "case_sensitive": True,
+                    }
+                ]
+            },
         },
         "annotations": {},
         "state": 2,