feat: Add support for V1 and V2 classification models for the V1Beta2 API

Google APIs · copybara-github · commit 044a15c14b1a · 2022-09-20T11:16:52.000-07:00
PiperOrigin-RevId: 475604619
diff --git a/google/cloud/language/v1beta2/BUILD.bazel b/google/cloud/language/v1beta2/BUILD.bazel
@@ -27,7 +27,6 @@ proto_library(
         "//google/api:annotations_proto",
         "//google/api:client_proto",
         "//google/api:field_behavior_proto",
-        "@com_google_protobuf//:timestamp_proto",
     ],
 )
 
@@ -65,13 +64,16 @@ java_grpc_library(
 java_gapic_library(
     name = "language_java_gapic",
     srcs = [":language_proto_with_info"],
+    gapic_yaml = None,
     grpc_service_config = "language_grpc_service_config.json",
+    service_yaml = "language_v1beta2.yaml",
     test_deps = [
         ":language_java_grpc",
     ],
     transport = "grpc+rest",
     deps = [
         ":language_java_proto",
+        "//google/api:api_java_proto",
     ],
 )
 
@@ -164,6 +166,9 @@ py_gapic_library(
     srcs = [":language_proto"],
     grpc_service_config = "language_grpc_service_config.json",
     transport = "grpc",
+    service_yaml = "language_v1beta2.yaml",
+    deps = [
+    ],
 )
 
 py_test(
@@ -290,6 +295,7 @@ ruby_cloud_gapic_library(
     grpc_service_config = "language_grpc_service_config.json",
     ruby_cloud_description = "Provides natural language understanding technologies, such as sentiment analysis, entity recognition, entity sentiment analysis, and other text annotations.",
     ruby_cloud_title = "Natural Language V1beta2",
+    service_yaml = "language_v1beta2.yaml",
     deps = [
         ":language_ruby_grpc",
         ":language_ruby_proto",
@@ -353,4 +359,20 @@ csharp_gapic_assembly_pkg(
 ##############################################################################
 # C++
 ##############################################################################
-# Put your C++ rules here
+load(
+    "@com_google_googleapis_imports//:imports.bzl",
+    "cc_grpc_library",
+    "cc_proto_library",
+)
+
+cc_proto_library(
+    name = "language_cc_proto",
+    deps = [":language_proto"],
+)
+
+cc_grpc_library(
+    name = "language_cc_grpc",
+    srcs = [":language_proto"],
+    grpc_only = True,
+    deps = [":language_cc_proto"],
+)
diff --git a/google/cloud/language/v1beta2/language_service.proto b/google/cloud/language/v1beta2/language_service.proto
@@ -1,4 +1,4 @@
-// Copyright 2019 Google LLC.
+// Copyright 2022 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -11,7 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-//
 
 syntax = "proto3";
 
@@ -68,7 +67,7 @@ service LanguageService {
   }
 
   // Analyzes the syntax of the text and provides sentence boundaries and
-  // tokenization along with part-of-speech tags, dependency trees, and other
+  // tokenization along with part of speech tags, dependency trees, and other
   // properties.
   rpc AnalyzeSyntax(AnalyzeSyntaxRequest) returns (AnalyzeSyntaxResponse) {
     option (google.api.http) = {
@@ -100,7 +99,7 @@ service LanguageService {
   }
 }
 
-
+// ################################################################ #
 //
 // Represents the input to API methods.
 message Document {
@@ -116,6 +115,19 @@ message Document {
     HTML = 2;
   }
 
+  // Ways of handling boilerplate detected in the document.
+  enum BoilerplateHandling {
+    // The boilerplate handling is not specified.
+    BOILERPLATE_HANDLING_UNSPECIFIED = 0;
+
+    // Do not analyze detected boilerplate. Reference web URI is required for
+    // detecting boilerplate.
+    SKIP_BOILERPLATE = 1;
+
+    // Treat boilerplate the same as content.
+    KEEP_BOILERPLATE = 2;
+  }
+
   // Required. If the type is not set or is `TYPE_UNSPECIFIED`,
   // returns an `INVALID_ARGUMENT` error.
   Type type = 1;
@@ -143,6 +155,15 @@ message Document {
   // specified by the caller or automatically detected) is not supported by the
   // called API method, an `INVALID_ARGUMENT` error is returned.
   string language = 4;
+
+  // The web URI where the document comes from. This URI is not used for
+  // fetching the content, but as a hint for analyzing the document.
+  string reference_web_uri = 5;
+
+  // Indicates how detected boilerplate (e.g. advertisements, copyright
+  // declarations, banners) should be handled for this document. If not
+  // specified, boilerplate will be treated the same as content.
+  BoilerplateHandling boilerplate_handling = 6;
 }
 
 // Represents a sentence in the input document.
@@ -156,6 +177,32 @@ message Sentence {
   Sentiment sentiment = 2;
 }
 
+// Represents the text encoding that the caller uses to process the output.
+// Providing an `EncodingType` is recommended because the API provides the
+// beginning offsets for various outputs, such as tokens and mentions, and
+// languages that natively use different text encodings may access offsets
+// differently.
+enum EncodingType {
+  // If `EncodingType` is not specified, encoding-dependent information (such as
+  // `begin_offset`) will be set at `-1`.
+  NONE = 0;
+
+  // Encoding-dependent information (such as `begin_offset`) is calculated based
+  // on the UTF-8 encoding of the input. C++ and Go are examples of languages
+  // that use this encoding natively.
+  UTF8 = 1;
+
+  // Encoding-dependent information (such as `begin_offset`) is calculated based
+  // on the UTF-16 encoding of the input. Java and JavaScript are examples of
+  // languages that use this encoding natively.
+  UTF16 = 2;
+
+  // Encoding-dependent information (such as `begin_offset`) is calculated based
+  // on the UTF-32 encoding of the input. Python is an example of a language
+  // that uses this encoding natively.
+  UTF32 = 3;
+}
+
 // Represents a phrase in the text that is a known entity, such as
 // a person, an organization, or location. The API associates information, such
 // as salience and mentions, with entities.
@@ -286,32 +333,6 @@ message Token {
   string lemma = 4;
 }
 
-// Represents the text encoding that the caller uses to process the output.
-// Providing an `EncodingType` is recommended because the API provides the
-// beginning offsets for various outputs, such as tokens and mentions, and
-// languages that natively use different text encodings may access offsets
-// differently.
-enum EncodingType {
-  // If `EncodingType` is not specified, encoding-dependent information (such as
-  // `begin_offset`) will be set at `-1`.
-  NONE = 0;
-
-  // Encoding-dependent information (such as `begin_offset`) is calculated based
-  // on the UTF-8 encoding of the input. C++ and Go are examples of languages
-  // that use this encoding natively.
-  UTF8 = 1;
-
-  // Encoding-dependent information (such as `begin_offset`) is calculated based
-  // on the UTF-16 encoding of the input. Java and JavaScript are examples of
-  // languages that use this encoding natively.
-  UTF16 = 2;
-
-  // Encoding-dependent information (such as `begin_offset`) is calculated based
-  // on the UTF-32 encoding of the input. Python is an example of a language
-  // that uses this encoding natively.
-  UTF32 = 3;
-}
-
 // Represents the feeling associated with the entire text or entities in
 // the text.
 // Next ID: 6
@@ -968,6 +989,45 @@ message ClassificationCategory {
   float confidence = 2;
 }
 
+// Model options available for classification requests.
+message ClassificationModelOptions {
+  // Options for the V1 model.
+  message V1Model {
+
+  }
+
+  // Options for the V2 model.
+  message V2Model {
+    // Versions of the content categories available for classification.
+    enum ContentCategoriesVersion {
+      // If `ContentCategoriesVersion` is not specified, this option will
+      // default to `V1`.
+      CONTENT_CATEGORIES_VERSION_UNSPECIFIED = 0;
+
+      // Legacy content categories of our initial launch in 2017.
+      V1 = 1;
+
+      // Updated content categories in 2022.
+      V2 = 2;
+    }
+
+    // The version of the content categories to use for classification.
+    ContentCategoriesVersion content_categories_version = 1;
+  }
+
+  // If this field is not set, then the `v1_model` will be used by default.
+  oneof model_type {
+    // Setting this field will use the V1 model and V1 content categories
+    // version. The V1 model is a legacy model; support for this will be
+    // discontinued in the future.
+    V1Model v1_model = 1;
+
+    // Setting this field will use the V2 model with the appropriate content
+    // categories version. The V2 model is a better performing model.
+    V2Model v2_model = 2;
+  }
+}
+
 // The sentiment analysis request message.
 message AnalyzeSentimentRequest {
   // Required. Input document.
@@ -1059,6 +1119,10 @@ message AnalyzeSyntaxResponse {
 message ClassifyTextRequest {
   // Required. Input document.
   Document document = 1 [(google.api.field_behavior) = REQUIRED];
+
+  // Model options to use for classification. Defaults to v1 options if not
+  // specified.
+  ClassificationModelOptions classification_model_options = 3;
 }
 
 // The document classification response message.
@@ -1072,7 +1136,7 @@ message ClassifyTextResponse {
 message AnnotateTextRequest {
   // All available features for sentiment, syntax, and semantic analysis.
   // Setting each one to true will enable that specific analysis for the input.
-  // Next ID: 10
+  // Next ID: 11
   message Features {
     // Extract syntax information.
     bool extract_syntax = 1;
@@ -1091,6 +1155,10 @@ message AnnotateTextRequest {
     // [predefined
     // taxonomy](https://cloud.google.com/natural-language/docs/categories).
     bool classify_text = 6;
+
+    // The model options to use for classification. Defaults to v1 options
+    // if not specified. Only used if `classify_text` is set to true.
+    ClassificationModelOptions classification_model_options = 10;
   }
 
   // Required. Input document.
diff --git a/google/cloud/language/v1beta2/language_v1beta2.yaml b/google/cloud/language/v1beta2/language_v1beta2.yaml
@@ -1,19 +1,21 @@
 type: google.api.Service
-config_version: 2
+config_version: 3
 name: language.googleapis.com
-title: Google Cloud Natural Language API
+title: Cloud Natural Language API
 
 apis:
 - name: google.cloud.language.v1beta2.LanguageService
 
 documentation:
-  summary:
-    'Google Cloud Natural Language API provides natural language understanding
-    technologies to developers. Examples include sentiment analysis, entity
-    recognition, and text annotations.'
+  summary: |-
+    Provides natural language understanding technologies, such as sentiment
+    analysis, entity recognition, entity sentiment analysis, and other text
+    annotations, to developers.
 
 authentication:
   rules:
-  - selector: '*'
+  - selector: 'google.cloud.language.v1beta2.LanguageService.*'
     oauth:
-      canonical_scopes: https://www.googleapis.com/auth/cloud-platform
+      canonical_scopes: |-
+        https://www.googleapis.com/auth/cloud-language,
+        https://www.googleapis.com/auth/cloud-platform