1- // Copyright 2019 Google LLC.
1+ // Copyright 2022 Google LLC
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
1111// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212// See the License for the specific language governing permissions and
1313// limitations under the License.
14- //
1514
1615syntax = "proto3" ;
1716
@@ -68,7 +67,7 @@ service LanguageService {
6867 }
6968
7069 // Analyzes the syntax of the text and provides sentence boundaries and
71- // tokenization along with part-of- speech tags, dependency trees, and other
70+ // tokenization along with part of speech tags, dependency trees, and other
7271 // properties.
7372 rpc AnalyzeSyntax (AnalyzeSyntaxRequest ) returns (AnalyzeSyntaxResponse ) {
7473 option (google.api.http ) = {
@@ -100,7 +99,7 @@ service LanguageService {
10099 }
101100}
102101
103-
102+ // #################################################################
104103//
105104// Represents the input to API methods.
106105message Document {
@@ -116,6 +115,19 @@ message Document {
116115 HTML = 2 ;
117116 }
118117
118+ // Ways of handling boilerplate detected in the document.
119+ enum BoilerplateHandling {
120+ // Default value: the boilerplate handling is not specified.
121+ BOILERPLATE_HANDLING_UNSPECIFIED = 0 ;
122+
123+ // Do not analyze detected boilerplate. A reference web URI is required
124+ // for boilerplate to be detected.
125+ SKIP_BOILERPLATE = 1 ;
126+
127+ // Treat detected boilerplate the same as regular content.
128+ KEEP_BOILERPLATE = 2 ;
129+ }
130+
119131 // Required. If the type is not set or is `TYPE_UNSPECIFIED`,
120132 // returns an `INVALID_ARGUMENT` error.
121133 Type type = 1 ;
@@ -143,6 +155,15 @@ message Document {
143155 // specified by the caller or automatically detected) is not supported by the
144156 // called API method, an `INVALID_ARGUMENT` error is returned.
145157 string language = 4 ;
158+
159+ // The web URI where the document comes from. This URI is not used for
160+ // fetching the content, but as a hint for analyzing the document.
161+ string reference_web_uri = 5 ;
162+
163+ // Indicates how detected boilerplate (e.g. advertisements, copyright
164+ // declarations, banners) should be handled for this document. If not
165+ // specified, boilerplate will be treated the same as content.
166+ BoilerplateHandling boilerplate_handling = 6 ;
146167}
147168
148169// Represents a sentence in the input document.
@@ -156,6 +177,32 @@ message Sentence {
156177 Sentiment sentiment = 2 ;
157178}
158179
180+ // Represents the text encoding that the caller uses to process the output.
181+ // Providing an `EncodingType` is recommended: the API reports the beginning
182+ // offsets for various outputs, such as tokens and mentions, and languages
183+ // that natively use different text encodings may access those offsets
184+ // differently.
185+ enum EncodingType {
186+ // If `EncodingType` is not specified, encoding-dependent information (such as
187+ // `begin_offset`) is set to `-1`.
188+ NONE = 0 ;
189+
190+ // Encoding-dependent information (such as `begin_offset`) is calculated from
191+ // the UTF-8 encoding of the input. C++ and Go are examples of languages
192+ // that use this encoding natively.
193+ UTF8 = 1 ;
194+
195+ // Encoding-dependent information (such as `begin_offset`) is calculated from
196+ // the UTF-16 encoding of the input. Java and JavaScript are examples of
197+ // languages that use this encoding natively.
198+ UTF16 = 2 ;
199+
200+ // Encoding-dependent information (such as `begin_offset`) is calculated from
201+ // the UTF-32 encoding of the input. Python is an example of a language
202+ // that uses this encoding natively.
203+ UTF32 = 3 ;
204+ }
205+
159206// Represents a phrase in the text that is a known entity, such as
160207// a person, an organization, or location. The API associates information, such
161208// as salience and mentions, with entities.
@@ -286,32 +333,6 @@ message Token {
286333 string lemma = 4 ;
287334}
288335
289- // Represents the text encoding that the caller uses to process the output.
290- // Providing an `EncodingType` is recommended because the API provides the
291- // beginning offsets for various outputs, such as tokens and mentions, and
292- // languages that natively use different text encodings may access offsets
293- // differently.
294- enum EncodingType {
295- // If `EncodingType` is not specified, encoding-dependent information (such as
296- // `begin_offset`) will be set at `-1`.
297- NONE = 0 ;
298-
299- // Encoding-dependent information (such as `begin_offset`) is calculated based
300- // on the UTF-8 encoding of the input. C++ and Go are examples of languages
301- // that use this encoding natively.
302- UTF8 = 1 ;
303-
304- // Encoding-dependent information (such as `begin_offset`) is calculated based
305- // on the UTF-16 encoding of the input. Java and JavaScript are examples of
306- // languages that use this encoding natively.
307- UTF16 = 2 ;
308-
309- // Encoding-dependent information (such as `begin_offset`) is calculated based
310- // on the UTF-32 encoding of the input. Python is an example of a language
311- // that uses this encoding natively.
312- UTF32 = 3 ;
313- }
314-
315336// Represents the feeling associated with the entire text or entities in
316337// the text.
317338// Next ID: 6
@@ -968,6 +989,45 @@ message ClassificationCategory {
968989 float confidence = 2 ;
969990}
970991
992+ // Model options available for classification requests.
993+ message ClassificationModelOptions {
994+ // Options for the V1 model.
995+ message V1Model {
996+ // No options are defined for the V1 model.
997+ }
998+
999+ // Options for the V2 model.
1000+ message V2Model {
1001+ // Versions of the content categories that can be used for classification.
1002+ enum ContentCategoriesVersion {
1003+ // If `ContentCategoriesVersion` is not specified, this option will
1004+ // default to `V1`.
1005+ CONTENT_CATEGORIES_VERSION_UNSPECIFIED = 0 ;
1006+
1007+ // Legacy content categories from the initial launch in 2017.
1008+ V1 = 1 ;
1009+
1010+ // Updated content categories in 2022.
1011+ V2 = 2 ;
1012+ }
1013+
1014+ // The version of the content categories to use for classification.
1015+ ContentCategoriesVersion content_categories_version = 1 ;
1016+ }
1017+
1018+ // If this field is not set, then the `v1_model` will be used by default.
1019+ oneof model_type {
1020+ // Setting this field will use the V1 model and V1 content categories
1021+ // version. The V1 model is a legacy model; support for this will be
1022+ // discontinued in the future.
1023+ V1Model v1_model = 1 ;
1024+
1025+ // Setting this field will use the V2 model with the appropriate content
1026+ // categories version. The V2 model is a better-performing model.
1027+ V2Model v2_model = 2 ;
1028+ }
1029+ }
1030+
9711031// The sentiment analysis request message.
9721032message AnalyzeSentimentRequest {
9731033 // Required. Input document.
@@ -1059,6 +1119,10 @@ message AnalyzeSyntaxResponse {
10591119message ClassifyTextRequest {
10601120 // Required. Input document.
10611121 Document document = 1 [(google.api.field_behavior ) = REQUIRED ];
1122+
1123+ // Model options to use for classification. If not specified, the v1
1124+ // options are used.
1125+ ClassificationModelOptions classification_model_options = 3 ;
10621126}
10631127
10641128// The document classification response message.
@@ -1072,7 +1136,7 @@ message ClassifyTextResponse {
10721136message AnnotateTextRequest {
10731137 // All available features for sentiment, syntax, and semantic analysis.
10741138 // Setting each one to true will enable that specific analysis for the input.
1075- // Next ID: 10
1139+ // Next ID: 11
10761140 message Features {
10771141 // Extract syntax information.
10781142 bool extract_syntax = 1 ;
@@ -1091,6 +1155,10 @@ message AnnotateTextRequest {
10911155 // [predefined
10921156 // taxonomy](https://cloud.google.com/natural-language/docs/categories).
10931157 bool classify_text = 6 ;
1158+
1159+ // The model options to use for classification. Defaults to v1 options
1160+ // if not specified. Only used if `classify_text` is set to true.
1161+ ClassificationModelOptions classification_model_options = 10 ;
10941162 }
10951163
10961164 // Required. Input document.
0 commit comments