Skip to content

Commit 8536b20

Browse files
feat: add transcript normalization + m4a audio format support (#11937)
- [ ] Regenerate this pull request now. BEGIN_COMMIT_OVERRIDE feat: add transcript normalization + m4a audio format support docs: clarify alternatives for deprecated fields docs: deprecate `BatchRecognizeFileResult.uri` in favor of `cloud_storage_result.native_format_uri` docs: deprecate `BatchRecognizeFileResult.transcript` in favor of `inline_result.transcript` END_COMMIT_OVERRIDE PiperOrigin-RevId: 577926708 Source-Link: googleapis/googleapis@37e816b Source-Link: googleapis/googleapis-gen@e12bd7b Copy-Tag: eyJwIjoicGFja2FnZXMvZ29vZ2xlLWNsb3VkLXNwZWVjaC8uT3dsQm90LnlhbWwiLCJoIjoiZTEyYmQ3YmRiYmI5ZDJlNDE4YTkyMjA3NWQyM2Y3N2E4YzFlNzQ4NSJ9 --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 80b7a92 commit 8536b20

File tree

5 files changed

+184
-13
lines changed

5 files changed

+184
-13
lines changed

packages/google-cloud-speech/google/cloud/speech_v2/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
BatchRecognizeResponse,
2929
BatchRecognizeResults,
3030
BatchRecognizeTranscriptionMetadata,
31+
CloudStorageResult,
3132
Config,
3233
CreateCustomClassRequest,
3334
CreatePhraseSetRequest,
@@ -43,6 +44,7 @@
4344
GetPhraseSetRequest,
4445
GetRecognizerRequest,
4546
InlineOutputConfig,
47+
InlineResult,
4648
ListCustomClassesRequest,
4749
ListCustomClassesResponse,
4850
ListPhraseSetsRequest,
@@ -67,6 +69,7 @@
6769
StreamingRecognitionResult,
6870
StreamingRecognizeRequest,
6971
StreamingRecognizeResponse,
72+
TranscriptNormalization,
7073
UndeleteCustomClassRequest,
7174
UndeletePhraseSetRequest,
7275
UndeleteRecognizerRequest,
@@ -87,6 +90,7 @@
8790
"BatchRecognizeResponse",
8891
"BatchRecognizeResults",
8992
"BatchRecognizeTranscriptionMetadata",
93+
"CloudStorageResult",
9094
"Config",
9195
"CreateCustomClassRequest",
9296
"CreatePhraseSetRequest",
@@ -102,6 +106,7 @@
102106
"GetPhraseSetRequest",
103107
"GetRecognizerRequest",
104108
"InlineOutputConfig",
109+
"InlineResult",
105110
"ListCustomClassesRequest",
106111
"ListCustomClassesResponse",
107112
"ListPhraseSetsRequest",
@@ -127,6 +132,7 @@
127132
"StreamingRecognitionResult",
128133
"StreamingRecognizeRequest",
129134
"StreamingRecognizeResponse",
135+
"TranscriptNormalization",
130136
"UndeleteCustomClassRequest",
131137
"UndeletePhraseSetRequest",
132138
"UndeleteRecognizerRequest",

packages/google-cloud-speech/google/cloud/speech_v2/types/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
BatchRecognizeResponse,
2323
BatchRecognizeResults,
2424
BatchRecognizeTranscriptionMetadata,
25+
CloudStorageResult,
2526
Config,
2627
CreateCustomClassRequest,
2728
CreatePhraseSetRequest,
@@ -37,6 +38,7 @@
3738
GetPhraseSetRequest,
3839
GetRecognizerRequest,
3940
InlineOutputConfig,
41+
InlineResult,
4042
ListCustomClassesRequest,
4143
ListCustomClassesResponse,
4244
ListPhraseSetsRequest,
@@ -61,6 +63,7 @@
6163
StreamingRecognitionResult,
6264
StreamingRecognizeRequest,
6365
StreamingRecognizeResponse,
66+
TranscriptNormalization,
6467
UndeleteCustomClassRequest,
6568
UndeletePhraseSetRequest,
6669
UndeleteRecognizerRequest,
@@ -80,6 +83,7 @@
8083
"BatchRecognizeResponse",
8184
"BatchRecognizeResults",
8285
"BatchRecognizeTranscriptionMetadata",
86+
"CloudStorageResult",
8387
"Config",
8488
"CreateCustomClassRequest",
8589
"CreatePhraseSetRequest",
@@ -95,6 +99,7 @@
9599
"GetPhraseSetRequest",
96100
"GetRecognizerRequest",
97101
"InlineOutputConfig",
102+
"InlineResult",
98103
"ListCustomClassesRequest",
99104
"ListCustomClassesResponse",
100105
"ListPhraseSetsRequest",
@@ -119,6 +124,7 @@
119124
"StreamingRecognitionResult",
120125
"StreamingRecognizeRequest",
121126
"StreamingRecognizeResponse",
127+
"TranscriptNormalization",
122128
"UndeleteCustomClassRequest",
123129
"UndeletePhraseSetRequest",
124130
"UndeleteRecognizerRequest",

packages/google-cloud-speech/google/cloud/speech_v2/types/cloud_speech.py

Lines changed: 151 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
"ExplicitDecodingConfig",
4040
"SpeakerDiarizationConfig",
4141
"RecognitionFeatures",
42+
"TranscriptNormalization",
4243
"SpeechAdaptation",
4344
"RecognitionConfig",
4445
"RecognizeRequest",
@@ -56,6 +57,8 @@
5657
"RecognitionOutputConfig",
5758
"BatchRecognizeResponse",
5859
"BatchRecognizeResults",
60+
"CloudStorageResult",
61+
"InlineResult",
5962
"BatchRecognizeFileResult",
6063
"BatchRecognizeTranscriptionMetadata",
6164
"BatchRecognizeMetadata",
@@ -587,9 +590,14 @@ class Recognizer(proto.Message):
587590
User-settable, human-readable name for the
588591
Recognizer. Must be 63 characters or less.
589592
model (str):
590-
Optional. Which model to use for recognition requests.
591-
Select the model best suited to your domain to get best
592-
results.
593+
Optional. This field is now deprecated. Prefer the
594+
[``model``][google.cloud.speech.v2.RecognitionConfig.model]
595+
field in the
596+
[``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
597+
message.
598+
599+
Which model to use for recognition requests. Select the
600+
model best suited to your domain to get best results.
593601
594602
Guidance for choosing which model to use can be found in the
595603
`Transcription Models
@@ -598,7 +606,13 @@ class Recognizer(proto.Message):
598606
`Table Of Supported
599607
Models <https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages>`__.
600608
language_codes (MutableSequence[str]):
601-
Optional. The language of the supplied audio as a
609+
Optional. This field is now deprecated. Prefer the
610+
[``language_codes``][google.cloud.speech.v2.RecognitionConfig.language_codes]
611+
field in the
612+
[``RecognitionConfig``][google.cloud.speech.v2.RecognitionConfig]
613+
message.
614+
615+
The language of the supplied audio as a
602616
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
603617
language tag.
604618
@@ -772,6 +786,8 @@ class AutoDetectDecodingConfig(proto.Message):
772786
773787
- WEBM_OPUS: Opus audio frames in a WebM container.
774788
789+
- M4A: M4A audio format.
790+
775791
"""
776792

777793

@@ -991,6 +1007,56 @@ class MultiChannelMode(proto.Enum):
9911007
)
9921008

9931009

1010+
class TranscriptNormalization(proto.Message):
1011+
r"""Transcription normalization configuration. Use transcription
1012+
normalization to automatically replace parts of the transcript
1013+
with phrases of your choosing. For StreamingRecognize, this
1014+
normalization only applies to stable partial transcripts
1015+
(stability > 0.8) and final transcripts.
1016+
1017+
Attributes:
1018+
entries (MutableSequence[google.cloud.speech_v2.types.TranscriptNormalization.Entry]):
1019+
A list of replacement entries. We will perform replacement
1020+
with one entry at a time. For example, the second entry in
1021+
["cat" => "dog", "mountain cat" => "mountain dog"] will
1022+
never be applied because we will always process the first
1023+
entry before it. At most 100 entries.
1024+
"""
1025+
1026+
class Entry(proto.Message):
1027+
r"""A single replacement configuration.
1028+
1029+
Attributes:
1030+
search (str):
1031+
What to replace. Max length is 100
1032+
characters.
1033+
replace (str):
1034+
What to replace with. Max length is 100
1035+
characters.
1036+
case_sensitive (bool):
1037+
Whether the search is case sensitive.
1038+
"""
1039+
1040+
search: str = proto.Field(
1041+
proto.STRING,
1042+
number=1,
1043+
)
1044+
replace: str = proto.Field(
1045+
proto.STRING,
1046+
number=2,
1047+
)
1048+
case_sensitive: bool = proto.Field(
1049+
proto.BOOL,
1050+
number=3,
1051+
)
1052+
1053+
entries: MutableSequence[Entry] = proto.RepeatedField(
1054+
proto.MESSAGE,
1055+
number=1,
1056+
message=Entry,
1057+
)
1058+
1059+
9941060
class SpeechAdaptation(proto.Message):
9951061
r"""Provides "hints" to the speech recognizer to favor specific
9961062
words and phrases in the results. PhraseSets can be specified as
@@ -1109,6 +1175,13 @@ class RecognitionConfig(proto.Message):
11091175
Speech adaptation context that weights
11101176
recognizer predictions for specific words and
11111177
phrases.
1178+
transcript_normalization (google.cloud.speech_v2.types.TranscriptNormalization):
1179+
Optional. Use transcription normalization to
1180+
automatically replace parts of the transcript
1181+
with phrases of your choosing. For
1182+
StreamingRecognize, this normalization only
1183+
applies to stable partial transcripts (stability
1184+
> 0.8) and final transcripts.
11121185
"""
11131186

11141187
auto_decoding_config: "AutoDetectDecodingConfig" = proto.Field(
@@ -1141,6 +1214,11 @@ class RecognitionConfig(proto.Message):
11411214
number=6,
11421215
message="SpeechAdaptation",
11431216
)
1217+
transcript_normalization: "TranscriptNormalization" = proto.Field(
1218+
proto.MESSAGE,
1219+
number=11,
1220+
message="TranscriptNormalization",
1221+
)
11441222

11451223

11461224
class RecognizeRequest(proto.Message):
@@ -1820,29 +1898,73 @@ class BatchRecognizeResults(proto.Message):
18201898
)
18211899

18221900

1823-
class BatchRecognizeFileResult(proto.Message):
1824-
r"""Final results for a single file.
1901+
class CloudStorageResult(proto.Message):
1902+
r"""Final results written to Cloud Storage.
18251903
18261904
Attributes:
18271905
uri (str):
18281906
The Cloud Storage URI to which recognition
18291907
results were written.
1908+
"""
1909+
1910+
uri: str = proto.Field(
1911+
proto.STRING,
1912+
number=1,
1913+
)
1914+
1915+
1916+
class InlineResult(proto.Message):
1917+
r"""Final results returned inline in the recognition response.
1918+
1919+
Attributes:
1920+
transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
1921+
The transcript for the audio file.
1922+
"""
1923+
1924+
transcript: "BatchRecognizeResults" = proto.Field(
1925+
proto.MESSAGE,
1926+
number=1,
1927+
message="BatchRecognizeResults",
1928+
)
1929+
1930+
1931+
class BatchRecognizeFileResult(proto.Message):
1932+
r"""Final results for a single file.
1933+
1934+
This message has `oneof`_ fields (mutually exclusive fields).
1935+
For each oneof, at most one member field can be set at the same time.
1936+
Setting any member of the oneof automatically clears all other
1937+
members.
1938+
1939+
.. _oneof: https://proto-plus-python.readthedocs.io/en/stable/fields.html#oneofs-mutually-exclusive-fields
1940+
1941+
Attributes:
18301942
error (google.rpc.status_pb2.Status):
18311943
Error if one was encountered.
18321944
metadata (google.cloud.speech_v2.types.RecognitionResponseMetadata):
18331945
1834-
transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
1835-
The transcript for the audio file. This is populated only
1836-
when
1946+
cloud_storage_result (google.cloud.speech_v2.types.CloudStorageResult):
1947+
Recognition results written to Cloud Storage. This is
1948+
populated only when
1949+
[GcsOutputConfig][google.cloud.speech.v2.GcsOutputConfig] is
1950+
set in the
1951+
[RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
1952+
1953+
This field is a member of `oneof`_ ``result``.
1954+
inline_result (google.cloud.speech_v2.types.InlineResult):
1955+
Recognition results. This is populated only when
18371956
[InlineOutputConfig][google.cloud.speech.v2.InlineOutputConfig]
18381957
is set in the
18391958
[RecognitionOutputConfig][google.cloud.speech.v2.RecognitionOutputConfig].
1959+
1960+
This field is a member of `oneof`_ ``result``.
1961+
uri (str):
1962+
Deprecated. Use ``cloud_storage_result.uri``
1963+
instead.
1964+
transcript (google.cloud.speech_v2.types.BatchRecognizeResults):
1965+
Deprecated. Use ``inline_result.transcript`` instead.
18401966
"""
18411967

1842-
uri: str = proto.Field(
1843-
proto.STRING,
1844-
number=1,
1845-
)
18461968
error: status_pb2.Status = proto.Field(
18471969
proto.MESSAGE,
18481970
number=2,
@@ -1853,6 +1975,22 @@ class BatchRecognizeFileResult(proto.Message):
18531975
number=3,
18541976
message="RecognitionResponseMetadata",
18551977
)
1978+
cloud_storage_result: "CloudStorageResult" = proto.Field(
1979+
proto.MESSAGE,
1980+
number=5,
1981+
oneof="result",
1982+
message="CloudStorageResult",
1983+
)
1984+
inline_result: "InlineResult" = proto.Field(
1985+
proto.MESSAGE,
1986+
number=6,
1987+
oneof="result",
1988+
message="InlineResult",
1989+
)
1990+
uri: str = proto.Field(
1991+
proto.STRING,
1992+
number=1,
1993+
)
18561994
transcript: "BatchRecognizeResults" = proto.Field(
18571995
proto.MESSAGE,
18581996
number=4,

packages/google-cloud-speech/noxfile.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929

3030
BLACK_VERSION = "black[jupyter]==23.7.0"
3131
ISORT_VERSION = "isort==5.11.0"
32+
3233
LINT_PATHS = ["docs", "google", "tests", "noxfile.py", "setup.py"]
3334

35+
3436
DEFAULT_PYTHON_VERSION = "3.9"
3537

3638
UNIT_TEST_PYTHON_VERSIONS: List[str] = ["3.7", "3.8", "3.9", "3.10", "3.11"]
@@ -89,6 +91,7 @@ def lint(session):
8991
"--check",
9092
*LINT_PATHS,
9193
)
94+
9295
session.run("flake8", "google", "tests")
9396

9497

packages/google-cloud-speech/tests/unit/gapic/speech_v2/test_speech.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6871,6 +6871,15 @@ def test_create_recognizer_rest(request_type):
68716871
}
68726872
],
68736873
},
6874+
"transcript_normalization": {
6875+
"entries": [
6876+
{
6877+
"search": "search_value",
6878+
"replace": "replace_value",
6879+
"case_sensitive": True,
6880+
}
6881+
]
6882+
},
68746883
},
68756884
"annotations": {},
68766885
"state": 2,
@@ -7941,6 +7950,15 @@ def test_update_recognizer_rest(request_type):
79417950
}
79427951
],
79437952
},
7953+
"transcript_normalization": {
7954+
"entries": [
7955+
{
7956+
"search": "search_value",
7957+
"replace": "replace_value",
7958+
"case_sensitive": True,
7959+
}
7960+
]
7961+
},
79447962
},
79457963
"annotations": {},
79467964
"state": 2,

0 commit comments

Comments
 (0)