From 83711841f91ab0b81f5b66f9a34542318770f360 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Wed, 17 Nov 2021 23:24:04 -0800 Subject: [PATCH] Remove refs to tensorflow_metadata Signed-off-by: Achal Shah --- Makefile | 2 - .../tensorflow_metadata/proto/v0/path.proto | 44 -- .../tensorflow_metadata/proto/v0/schema.proto | 673 ------------------ .../proto/v0/statistics.proto | 427 ----------- sdk/python/setup.py | 1 - 5 files changed, 1147 deletions(-) delete mode 100644 protos/tensorflow_metadata/proto/v0/path.proto delete mode 100644 protos/tensorflow_metadata/proto/v0/schema.proto delete mode 100644 protos/tensorflow_metadata/proto/v0/statistics.proto diff --git a/Makefile b/Makefile index 2daad95ccb..fa51919619 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,6 @@ package-protos: compile-protos-python: @$(foreach dir,$(PROTO_TYPE_SUBDIRS),cd ${ROOT_DIR}/protos; python -m grpc_tools.protoc -I. --grpc_python_out=../sdk/python/feast/protos/ --python_out=../sdk/python/feast/protos/ --mypy_out=../sdk/python/feast/protos/ feast/$(dir)/*.proto;) @$(foreach dir,$(PROTO_TYPE_SUBDIRS),grep -rli 'from feast.$(dir)' sdk/python/feast/protos | xargs -I@ sed -i.bak 's/from feast.$(dir)/from feast.protos.feast.$(dir)/g' @;) - cd ${ROOT_DIR}/protos; python -m grpc_tools.protoc -I. --python_out=../sdk/python/ --mypy_out=../sdk/python/ tensorflow_metadata/proto/v0/*.proto install-python: python -m pip install -e sdk/python -U --use-deprecated=legacy-resolver @@ -114,7 +113,6 @@ install-go-ci-dependencies: go get -u golang.org/x/lint/golint compile-protos-go: - cd ${ROOT_DIR}/protos; protoc -I/usr/local/include -I. --go_out=plugins=grpc,paths=source_relative:../sdk/go/protos/ tensorflow_metadata/proto/v0/*.proto $(foreach dir,types serving core storage,cd ${ROOT_DIR}/protos; protoc -I/usr/local/include -I. --go_out=plugins=grpc,paths=source_relative:../sdk/go/protos feast/$(dir)/*.proto;) test-go: diff --git a/protos/tensorflow_metadata/proto/v0/path.proto b/protos/tensorflow_metadata/proto/v0/path.proto deleted file mode 100644 index 3a4e41bad9..0000000000 --- a/protos/tensorflow_metadata/proto/v0/path.proto +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ============================================================================= - -syntax = "proto2"; -option cc_enable_arenas = true; - -package tensorflow.metadata.v0; - -option java_package = "org.tensorflow.metadata.v0"; -option java_multiple_files = true; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/tensorflow_metadata/proto/v0"; - -// A path is a more general substitute for the name of a field or feature that -// can be used for flat examples as well as structured data. For example, if -// we had data in a protocol buffer: -// message Person { -// int age = 1; -// optional string gender = 2; -// repeated Person parent = 3; -// } -// Thus, here the path {step:["parent", "age"]} in statistics would refer to the -// age of a parent, and {step:["parent", "parent", "age"]} would refer to the -// age of a grandparent. This allows us to distinguish between the statistics -// of parents' ages and grandparents' ages. In general, repeated messages are -// to be preferred to linked lists of arbitrary length. -// For SequenceExample, if we have a feature list "foo", this is represented -// by {step:["##SEQUENCE##", "foo"]}. -message Path { - // Any string is a valid step. - // However, whenever possible have a step be [A-Za-z0-9_]+. - repeated string step = 1; -} diff --git a/protos/tensorflow_metadata/proto/v0/schema.proto b/protos/tensorflow_metadata/proto/v0/schema.proto deleted file mode 100644 index 00005ee913..0000000000 --- a/protos/tensorflow_metadata/proto/v0/schema.proto +++ /dev/null @@ -1,673 +0,0 @@ -// Copyright 2017 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ============================================================================= - -syntax = "proto2"; - -package tensorflow.metadata.v0; - -import "google/protobuf/any.proto"; -import "tensorflow_metadata/proto/v0/path.proto"; - -option cc_enable_arenas = true; -option java_package = "org.tensorflow.metadata.v0"; -option java_multiple_files = true; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/tensorflow_metadata/proto/v0"; - -// LifecycleStage. Only UNKNOWN_STAGE, BETA, and PRODUCTION features are -// actually validated. -// PLANNED, ALPHA, and DEBUG are treated as DEPRECATED. -enum LifecycleStage { - UNKNOWN_STAGE = 0; // Unknown stage. - PLANNED = 1; // Planned feature, may not be created yet. - ALPHA = 2; // Prototype feature, not used in experiments yet. - BETA = 3; // Used in user-facing experiments. - PRODUCTION = 4; // Used in a significant fraction of user traffic. - DEPRECATED = 5; // No longer supported: do not use in new models. - DEBUG_ONLY = 6; // Only exists for debugging purposes. -} - -// -// Message to represent schema information. -// NextID: 14 -message Schema { - // Features described in this schema. - repeated Feature feature = 1; - - // Sparse features described in this schema. - repeated SparseFeature sparse_feature = 6; - - // Weighted features described in this schema. - repeated WeightedFeature weighted_feature = 12; - - // Use StructDomain instead. - // Sequences described in this schema. A sequence may be described in terms of - // several features. Any features appearing within a sequence must *not* be - // declared as top-level features in . -// GOOGLE-LEGACY repeated Sequence sequence = 2; - - // declared as top-level features in . - // String domains referenced in the features. - repeated StringDomain string_domain = 4; - - // top level float domains that can be reused by features - repeated FloatDomain float_domain = 9; - - // top level int domains that can be reused by features - repeated IntDomain int_domain = 10; - - // Default environments for each feature. - // An environment represents both a type of location (e.g. a server or phone) - // and a time (e.g. right before model X is run). In the standard scenario, - // 99% of the features should be in the default environments TRAINING, - // SERVING, and the LABEL (or labels) AND WEIGHT is only available at TRAINING - // (not at serving). - // Other possible variations: - // 1. There may be TRAINING_MOBILE, SERVING_MOBILE, TRAINING_SERVICE, - // and SERVING_SERVICE. - // 2. If one is ensembling three models, where the predictions of the first - // three models are available for the ensemble model, there may be - // TRAINING, SERVING_INITIAL, SERVING_ENSEMBLE. - // See FeatureProto::not_in_environment and FeatureProto::in_environment. - repeated string default_environment = 5; - - /* BEGIN GOOGLE-LEGACY - // TODO(b/73109633): Change default to false, before removing this field. - optional bool generate_legacy_feature_spec = 7 [default = true]; - END GOOGLE-LEGACY */ - - // Additional information about the schema as a whole. Features may also - // be annotated individually. - optional Annotation annotation = 8; - - // Dataset-level constraints. This is currently used for specifying - // information about changes in num_examples. - optional DatasetConstraints dataset_constraints = 11; - - // TensorRepresentation groups. The keys are the names of the groups. - // Key "" (empty string) denotes the "default" group, which is what should - // be used when a group name is not provided. - // See the documentation at TensorRepresentationGroup for more info. - // Under development. DO NOT USE. - map tensor_representation_group = 13; -} - -// Describes schema-level information about a specific feature. -// NextID: 31 -message Feature { - // The name of the feature. - optional string name = 1; // required - - // This field is no longer supported. Instead, use: - // lifecycle_stage: DEPRECATED - // TODO(b/111450258): remove this. - optional bool deprecated = 2 [deprecated = true]; - - // Comment field for a human readable description of the field. - // TODO(b/123518108): remove this. -// GOOGLE-LEGACY optional string comment = 3 [deprecated = true]; - - oneof presence_constraints { - // Constraints on the presence of this feature in the examples. - FeaturePresence presence = 14; - // Only used in the context of a "group" context, e.g., inside a sequence. - FeaturePresenceWithinGroup group_presence = 17; - } - - // The shape of the feature which governs the number of values that appear in - // each example. - oneof shape_type { - // The feature has a fixed shape corresponding to a multi-dimensional - // tensor. - FixedShape shape = 23; - // The feature doesn't have a well defined shape. All we know are limits on - // the minimum and maximum number of values. - ValueCount value_count = 5; - } - - // Physical type of the feature's values. - // Note that you can have: - // type: BYTES - // int_domain: { - // min: 0 - // max: 3 - // } - // This would be a field that is syntactically BYTES (i.e. strings), but - // semantically an int, i.e. it would be "0", "1", "2", or "3". - optional FeatureType type = 6; - - // Domain for the values of the feature. - oneof domain_info { - // Reference to a domain defined at the schema level. - string domain = 7; - // Inline definitions of domains. - IntDomain int_domain = 9; - FloatDomain float_domain = 10; - StringDomain string_domain = 11; - BoolDomain bool_domain = 13; - StructDomain struct_domain = 29; - // Supported semantic domains. - NaturalLanguageDomain natural_language_domain = 24; - ImageDomain image_domain = 25; - MIDDomain mid_domain = 26; - URLDomain url_domain = 27; - TimeDomain time_domain = 28; - TimeOfDayDomain time_of_day_domain = 30; - } - - // Constraints on the distribution of the feature values. - // Currently only supported for StringDomains. - // TODO(b/69473628): Extend functionality to other domain types. - optional DistributionConstraints distribution_constraints = 15; - - // Additional information about the feature for documentation purpose. - optional Annotation annotation = 16; - - // Tests comparing the distribution to the associated serving data. - optional FeatureComparator skew_comparator = 18; - - // Tests comparing the distribution between two consecutive spans (e.g. days). - optional FeatureComparator drift_comparator = 21; - - // List of environments this feature is present in. - // Should be disjoint from not_in_environment. - // This feature is in environment "foo" if: - // ("foo" is in in_environment or default_environments) AND - // "foo" is not in not_in_environment. - // See Schema::default_environments. - repeated string in_environment = 20; - - // List of environments this feature is not present in. - // Should be disjoint from of in_environment. - // See Schema::default_environments and in_environment. - repeated string not_in_environment = 19; - - // The lifecycle stage of a feature. It can also apply to its descendants. - // i.e., if a struct is DEPRECATED, its children are implicitly deprecated. - optional LifecycleStage lifecycle_stage = 22; -} - -// Additional information about the schema or about a feature. -message Annotation { - // Tags can be used to mark features. For example, tag on user_age feature can - // be `user_feature`, tag on user_country feature can be `location_feature`, - // `user_feature`. - repeated string tag = 1; - // Free-text comments. This can be used as a description of the feature, - // developer notes etc. - repeated string comment = 2; - // Application-specific metadata may be attached here. - repeated .google.protobuf.Any extra_metadata = 3; -} - -// Checks that the ratio of the current value to the previous value is not below -// the min_fraction_threshold or above the max_fraction_threshold. That is, -// previous value * min_fraction_threshold <= current value <= -// previous value * max_fraction_threshold. -// To specify that the value cannot change, set both min_fraction_threshold and -// max_fraction_threshold to 1.0. -message NumericValueComparator { - optional double min_fraction_threshold = 1; - optional double max_fraction_threshold = 2; -} - -// Constraints on the entire dataset. -message DatasetConstraints { - // Tests differences in number of examples between the current data and the - // previous span. - optional NumericValueComparator num_examples_drift_comparator = 1; - // Tests comparisions in number of examples between the current data and the - // previous version of that data. - optional NumericValueComparator num_examples_version_comparator = 2; - // Minimum number of examples in the dataset. - optional int64 min_examples_count = 3; -} - -// Specifies a fixed shape for the feature's values. The immediate implication -// is that each feature has a fixed number of values. Moreover, these values -// can be parsed in a multi-dimensional tensor using the specified axis sizes. -// The FixedShape defines a lexicographical ordering of the data. For instance, -// if there is a FixedShape { -// dim {size:3} dim {size:2} -// } -// then tensor[0][0]=field[0] -// then tensor[0][1]=field[1] -// then tensor[1][0]=field[2] -// then tensor[1][1]=field[3] -// then tensor[2][0]=field[4] -// then tensor[2][1]=field[5] -// -// The FixedShape message is identical with the TensorFlow TensorShape proto -// message. -message FixedShape { - // The dimensions that define the shape. The total number of values in each - // example is the product of sizes of each dimension. - repeated Dim dim = 2; - - // An axis in a multi-dimensional feature representation. - message Dim { - optional int64 size = 1; - - // Optional name of the tensor dimension. - optional string name = 2; - } -} - -// Limits on maximum and minimum number of values in a -// single example (when the feature is present). Use this when the minimum -// value count can be different than the maximum value count. Otherwise prefer -// FixedShape. -message ValueCount { - optional int64 min = 1; - optional int64 max = 2; -} - -/* BEGIN GOOGLE-LEGACY -// Constraint on the number of elements in a sequence. -message LengthConstraint { - optional int64 min = 1; - optional int64 max = 2; -} - -// A sequence is a logical feature that comprises several "raw" features that -// encode values at different "steps" within the sequence. -// TODO(b/110490010): Delete this. This is a special case of StructDomain. -message Sequence { - // An optional name for this sequence. Used mostly for debugging and - // presentation. - optional string name = 1; - - // Features that comprise the sequence. These features are "zipped" together - // to form the values for the sequence at different steps. - // - Use group_presence within each feature to encode presence constraints - // within the sequence. - // - If all features have the same value-count constraints then - // declare this once using the shape_constraint below. - repeated Feature feature = 2; - - // Constraints on the presence of the sequence across all examples in the - // dataset. The sequence is assumed to be present if at least one of its - // features is present. - optional FeaturePresence presence = 3; - - // Shape constraints that apply on all the features that comprise the - // sequence. If this is set then the value_count in 'feature' is - // ignored. - // TODO(martinz): delete: there is no reason to believe the shape of the - // fields in a sequence will be the same. Use the fields in Feature instead. - oneof shape_constraint { - ValueCount value_count = 4; - FixedShape fixed_shape = 5; - } - - // Constraint on the number of elements in a sequence. - optional LengthConstraint length_constraint = 6; -} -END GOOGLE-LEGACY */ - -// Represents a weighted feature that is encoded as a combination of raw base -// features. The `weight_feature` should be a float feature with identical -// shape as the `feature`. This is useful for representing weights associated -// with categorical tokens (e.g. a TFIDF weight associated with each token). -// TODO(b/142122960): Handle WeightedCategorical end to end in TFX (validation, -// TFX Unit Testing, etc) -message WeightedFeature { - // Name for the weighted feature. This should not clash with other features in - // the same schema. - optional string name = 1; // required - // Path of a base feature to be weighted. Required. - optional Path feature = 2; - // Path of weight feature to associate with the base feature. Must be same - // shape as feature. Required. - optional Path weight_feature = 3; - // The lifecycle_stage determines where a feature is expected to be used, - // and therefore how important issues with it are. - optional LifecycleStage lifecycle_stage = 4; -} - -// A sparse feature represents a sparse tensor that is encoded with a -// combination of raw features, namely index features and a value feature. Each -// index feature defines a list of indices in a different dimension. -message SparseFeature { - reserved 11; - // Name for the sparse feature. This should not clash with other features in - // the same schema. - optional string name = 1; // required - - // This field is no longer supported. Instead, use: - // lifecycle_stage: DEPRECATED - // TODO(b/111450258): remove this. - optional bool deprecated = 2 [deprecated = true]; - - // The lifecycle_stage determines where a feature is expected to be used, - // and therefore how important issues with it are. - optional LifecycleStage lifecycle_stage = 7; - - // Comment field for a human readable description of the field. - // TODO(martinz): delete, convert to annotation. -// GOOGLE-LEGACY optional string comment = 3 [deprecated = true]; - - // Constraints on the presence of this feature in examples. - // Deprecated, this is inferred by the referred features. - optional FeaturePresence presence = 4 [deprecated = true]; - - // Shape of the sparse tensor that this SparseFeature represents. - // Currently not supported. - // TODO(b/109669962): Consider deriving this from the referred features. - optional FixedShape dense_shape = 5; - - // Features that represent indexes. Should be integers >= 0. - repeated IndexFeature index_feature = 6; // at least one - message IndexFeature { - // Name of the index-feature. This should be a reference to an existing - // feature in the schema. - optional string name = 1; - } - - // If true then the index values are already sorted lexicographically. - optional bool is_sorted = 8; - - optional ValueFeature value_feature = 9; // required - message ValueFeature { - // Name of the value-feature. This should be a reference to an existing - // feature in the schema. - optional string name = 1; - } - - // Type of value feature. - // Deprecated, this is inferred by the referred features. - optional FeatureType type = 10 [deprecated = true]; -} - -// Models constraints on the distribution of a feature's values. -// TODO(martinz): replace min_domain_mass with max_off_domain (but slowly). -message DistributionConstraints { - // The minimum fraction (in [0,1]) of values across all examples that - // should come from the feature's domain, e.g.: - // 1.0 => All values must come from the domain. - // .9 => At least 90% of the values must come from the domain. - optional double min_domain_mass = 1 [default = 1.0]; -} - -// Encodes information for domains of integer values. -// Note that FeatureType could be either INT or BYTES. -message IntDomain { - // Id of the domain. Required if the domain is defined at the schema level. If - // so, then the name must be unique within the schema. - optional string name = 1; - - // Min and max values for the domain. - optional int64 min = 3; - optional int64 max = 4; - - // If true then the domain encodes categorical values (i.e., ids) rather than - // ordinal values. - optional bool is_categorical = 5; -} - -// Encodes information for domains of float values. -// Note that FeatureType could be either INT or BYTES. -message FloatDomain { - // Id of the domain. Required if the domain is defined at the schema level. If - // so, then the name must be unique within the schema. - optional string name = 1; - - // Min and max values of the domain. - optional float min = 3; - optional float max = 4; -} - -// Domain for a recursive struct. -// NOTE: If a feature with a StructDomain is deprecated, then all the -// child features (features and sparse_features of the StructDomain) are also -// considered to be deprecated. Similarly child features can only be in -// environments of the parent feature. -message StructDomain { - repeated Feature feature = 1; - - repeated SparseFeature sparse_feature = 2; -} - -// Encodes information for domains of string values. -message StringDomain { - // Id of the domain. Required if the domain is defined at the schema level. If - // so, then the name must be unique within the schema. - optional string name = 1; - - // The values appearing in the domain. - repeated string value = 2; -} - -// Encodes information about the domain of a boolean attribute that encodes its -// TRUE/FALSE values as strings, or 0=false, 1=true. -// Note that FeatureType could be either INT or BYTES. -message BoolDomain { - // Id of the domain. Required if the domain is defined at the schema level. If - // so, then the name must be unique within the schema. - optional string name = 1; - - // Strings values for TRUE/FALSE. - optional string true_value = 2; - optional string false_value = 3; -} - -// BEGIN SEMANTIC-TYPES-PROTOS -// Semantic domains are specialized feature domains. For example a string -// Feature might represent a Time of a specific format. -// Semantic domains are defined as protocol buffers to allow further sub-types / -// specialization, e.g: NaturalLanguageDomain can provide information on the -// language of the text. - -// Natural language text. -message NaturalLanguageDomain {} - -// Image data. -message ImageDomain {} - -// Knowledge graph ID, see: https://www.wikidata.org/wiki/Property:P646 -message MIDDomain {} - -// A URL, see: https://en.wikipedia.org/wiki/URL -message URLDomain {} - -// Time or date representation. -message TimeDomain { - enum IntegerTimeFormat { - FORMAT_UNKNOWN = 0; - UNIX_DAYS = 5; // Number of days since 1970-01-01. - UNIX_SECONDS = 1; - UNIX_MILLISECONDS = 2; - UNIX_MICROSECONDS = 3; - UNIX_NANOSECONDS = 4; - } - - oneof format { - // Expected format that contains a combination of regular characters and - // special format specifiers. Format specifiers are a subset of the - // strptime standard. - string string_format = 1; - - // Expected format of integer times. - IntegerTimeFormat integer_format = 2; - } -} - -// Time of day, without a particular date. -message TimeOfDayDomain { - enum IntegerTimeOfDayFormat { - FORMAT_UNKNOWN = 0; - // Time values, containing hour/minute/second/nanos, encoded into 8-byte - // bit fields following the ZetaSQL convention: - // 6 5 4 3 2 1 - // MSB 3210987654321098765432109876543210987654321098765432109876543210 LSB - // | H || M || S ||---------- nanos -----------| - PACKED_64_NANOS = 1; - } - - oneof format { - // Expected format that contains a combination of regular characters and - // special format specifiers. Format specifiers are a subset of the - // strptime standard. - string string_format = 1; - - // Expected format of integer times. - IntegerTimeOfDayFormat integer_format = 2; - } -} -// END SEMANTIC-TYPES-PROTOS - -// Describes the physical representation of a feature. -// It may be different than the logical representation, which -// is represented as a Domain. -enum FeatureType { - TYPE_UNKNOWN = 0; - BYTES = 1; - INT = 2; - FLOAT = 3; - STRUCT = 4; -} - -// Describes constraints on the presence of the feature in the data. -message FeaturePresence { - // Minimum fraction of examples that have this feature. - optional double min_fraction = 1; - // Minimum number of examples that have this feature. - optional int64 min_count = 2; -} - -// Records constraints on the presence of a feature inside a "group" context -// (e.g., .presence inside a group of features that define a sequence). -message FeaturePresenceWithinGroup { - optional bool required = 1; -} - -// Checks that the L-infinity norm is below a certain threshold between the -// two discrete distributions. Since this is applied to a FeatureNameStatistics, -// it only considers the top k. -// L_infty(p,q) = max_i |p_i-q_i| -message InfinityNorm { - // The InfinityNorm is in the interval [0.0, 1.0] so sensible bounds should - // be in the interval [0.0, 1.0). - optional double threshold = 1; -} - -message FeatureComparator { - optional InfinityNorm infinity_norm = 1; -} - -// A TensorRepresentation captures the intent for converting columns in a -// dataset to TensorFlow Tensors (or more generally, tf.CompositeTensors). -// Note that one tf.CompositeTensor may consist of data from multiple columns, -// for example, a N-dimensional tf.SparseTensor may need N + 1 columns to -// provide the sparse indices and values. -// Note that the "column name" that a TensorRepresentation needs is a -// string, not a Path -- it means that the column name identifies a top-level -// Feature in the schema (i.e. you cannot specify a Feature nested in a STRUCT -// Feature). -message TensorRepresentation { - message DefaultValue { - oneof kind { - double float_value = 1; - // Note that the data column might be of a shorter integral type. It's the - // user's responsitiblity to make sure the default value fits that type. - int64 int_value = 2; - bytes bytes_value = 3; - // uint_value should only be used if the default value can't fit in a - // int64 (`int_value`). - uint64 uint_value = 4; - } - } - - // A tf.Tensor - message DenseTensor { - // Identifies the column in the dataset that provides the values of this - // Tensor. - optional string column_name = 1; - // The shape of each row of the data (i.e. does not include the batch - // dimension) - optional FixedShape shape = 2; - // If this column is missing values in a row, the default_value will be - // used to fill that row. - optional DefaultValue default_value = 3; - } - - // A ragged tf.SparseTensor that models nested lists. - message VarLenSparseTensor { - // Identifies the column in the dataset that should be converted to the - // VarLenSparseTensor. - optional string column_name = 1; - } - - // A tf.SparseTensor whose indices and values come from separate data columns. - // This will replace Schema.sparse_feature eventually. - // The index columns must be of INT type, and all the columns must co-occur - // and have the same valency at the same row. - message SparseTensor { - // The dense shape of the resulting SparseTensor (does not include the batch - // dimension). - optional FixedShape dense_shape = 1; - // The columns constitute the coordinates of the values. - // indices_column[i][j] contains the coordinate of the i-th dimension of the - // j-th value. - repeated string index_column_names = 2; - // The column that contains the values. - optional string value_column_name = 3; - } - - oneof kind { - DenseTensor dense_tensor = 1; - VarLenSparseTensor varlen_sparse_tensor = 2; - SparseTensor sparse_tensor = 3; - } -} - -// A TensorRepresentationGroup is a collection of TensorRepresentations with -// names. These names may serve as identifiers when converting the dataset -// to a collection of Tensors or tf.CompositeTensors. -// For example, given the following group: -// { -// key: "dense_tensor" -// tensor_representation { -// dense_tensor { -// column_name: "univalent_feature" -// shape { -// dim { -// size: 1 -// } -// } -// default_value { -// float_value: 0 -// } -// } -// } -// } -// { -// key: "varlen_sparse_tensor" -// tensor_representation { -// varlen_sparse_tensor { -// column_name: "multivalent_feature" -// } -// } -// } -// -// Then the schema is expected to have feature "univalent_feature" and -// "multivalent_feature", and when a batch of data is converted to Tensors using -// this TensorRepresentationGroup, the result may be the following dict: -// { -// "dense_tensor": tf.Tensor(...), -// "varlen_sparse_tensor": tf.SparseTensor(...), -// } -message TensorRepresentationGroup { - map tensor_representation = 1; -} diff --git a/protos/tensorflow_metadata/proto/v0/statistics.proto b/protos/tensorflow_metadata/proto/v0/statistics.proto deleted file mode 100644 index 3123dad874..0000000000 --- a/protos/tensorflow_metadata/proto/v0/statistics.proto +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright 2017 The TensorFlow Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// ============================================================================= - -// Definitions for aggregated feature statistics for datasets. -// TODO(b/80075690): make a Javascript build rule for this. -// TODO(b/80075691): migrate Facets to use this. -syntax = "proto3"; -option cc_enable_arenas = true; - -package tensorflow.metadata.v0; - -option java_package = "org.tensorflow.metadata.v0"; -option java_multiple_files = true; -option go_package = "github.com/feast-dev/feast/sdk/go/protos/tensorflow_metadata/proto/v0"; - -import "tensorflow_metadata/proto/v0/path.proto"; - -// Copied from Facets feature_statistics.proto -// Must be kept binary-compatible with the original, until all usages -// are updated to use this version, or we write a proto-to-proto converter. - -// A list of features statistics for different datasets. If you wish to compare -// different datasets using this list, then the DatasetFeatureStatistics -// entries should all contain the same list of features. -message DatasetFeatureStatisticsList { - repeated DatasetFeatureStatistics datasets = 1; -} - -// The feature statistics for a single dataset. -message DatasetFeatureStatistics { - // The name of the dataset. - string name = 1; - // The number of examples in the dataset. - uint64 num_examples = 2; - - // Only valid if the weight feature was specified. - // Treats a missing weighted feature as zero. - double weighted_num_examples = 4; - // The feature statistics for the dataset. - repeated FeatureNameStatistics features = 3; - - // Cross feature statistics for the dataset. - repeated CrossFeatureStatistics cross_features = 5; -} - -message CrossFeatureStatistics { - // The path of feature x. - Path path_x = 1; - // The path of feature y. - Path path_y = 2; - - // Number of occurrences of this feature cross in the data. If any of - // the features in the cross is missing, the example is ignored. - uint64 count = 3; - - oneof cross_stats { - NumericCrossStatistics num_cross_stats = 4; - CategoricalCrossStatistics categorical_cross_stats = 5; - } -} - -message NumericCrossStatistics { - // Pearson product-moment correlation coefficient. - float correlation = 1; - // Standard covariance. E[(X-E[X])*(Y-E[Y])] - float covariance = 2; -} - -message CategoricalCrossStatistics { - LiftStatistics lift = 1; -} - -message LiftStatistics { - // Lift information for each value of path_y. Lift is defined for each pair of - // values (x,y) as P(path_y=y|path_x=x)/P(path_y=y). - repeated LiftSeries lift_series = 1; - // Weighted lift information for each value of path_y. Weighted lift is - // defined for each pair of values (x,y) as P(path_y=y|path_x=x)/P(path_y=y) - // where probabilities are computed over weighted example space. - repeated LiftSeries weighted_lift_series = 2; -} - -// Container for lift information for a specific y-value. -message LiftSeries { - // A bucket for referring to binned numeric features. - message Bucket { - // The low value of the bucket, inclusive. - double low_value = 1; - // The high value of the bucket, exclusive (unless the high_value is - // positive infinity). - double high_value = 2; - } - - // The particular value of path_y corresponding to this LiftSeries. Each - // element in lift_values corresponds to the lift a different x_value and - // this specific y_value. - oneof y_value { - int32 y_int = 1; - string y_string = 2; - Bucket y_bucket = 3; - } - - // The number of examples in which y_value appears. - oneof y_count_value { - uint64 y_count = 4; - double weighted_y_count = 5; - } - - // A container for lift information about a specific value of path_x. - message LiftValue { - oneof x_value { - int32 x_int = 1; - string x_string = 2; - } - // P(path_y=y|path_x=x) / P(path_y=y) for x_value and the enclosing y_value. - // In terms of concrete fields, this number represents: - // (x_and_y_count / x_count) / (y_count / num_examples) - double lift = 3; - // The number of examples in which x_value appears. - oneof x_count_value { - uint64 x_count = 4; - double weighted_x_count = 5; - } - // The number of examples in which x_value appears and y_value appears. - oneof x_and_y_count_value { - uint64 x_and_y_count = 6; - double weighted_x_and_y_count = 7; - } - } - - // The lifts for a each path_x value and this y_value. - repeated LiftValue lift_values = 6; -} - -// The complete set of statistics for a given feature name for a dataset. -message FeatureNameStatistics { - // The types supported by the feature statistics. When aggregating - // tf.Examples, if the bytelist contains a string, it is recommended to encode - // it here as STRING instead of BYTES in order to calculate string-specific - // statistical measures. - enum Type { - INT = 0; - FLOAT = 1; - STRING = 2; - BYTES = 3; - STRUCT = 4; - } - - // One can identify a field either by the name (for simple fields), or by - // a path (for structured fields). Note that: - // name: "foo" - // is equivalent to: - // path: {step:"foo"} - // Note: this oneof must be consistently either name or path across all - // FeatureNameStatistics in one DatasetFeatureStatistics. - oneof field_id { - // The feature name - string name = 1; - - // The path of the feature. - Path path = 8; - } - - // The data type of the feature - Type type = 2; - - // The statistics of the values of the feature. - oneof stats { - NumericStatistics num_stats = 3; - StringStatistics string_stats = 4; - BytesStatistics bytes_stats = 5; - StructStatistics struct_stats = 7; - } - - // Any custom statistics can be stored in this list. - repeated CustomStatistic custom_stats = 6; -} - -// Common weighted statistics for all feature types. Statistics counting number -// of values (i.e., avg_num_values and tot_num_values) include NaNs. -// If the weighted column is missing, then this counts as a weight of 1 -// for that example. -message WeightedCommonStatistics { - // Weighted number of examples not missing. - double num_non_missing = 1; - // Weighted number of examples missing. - // Note that if the weighted column is zero, this does not count - // as missing. - double num_missing = 2; - // average number of values, weighted by the number of examples. - double avg_num_values = 3; - // tot_num_values = avg_num_values * num_non_missing. - // This is calculated directly, so should have less numerical error. - double tot_num_values = 4; -} - -// Stores the name and value of any custom statistic. The value can be a string, -// double, or histogram. -message CustomStatistic { - string name = 1; - oneof val { - double num = 2; - string str = 3; - Histogram histogram = 4; - RankHistogram rank_histogram = 5; - } -} - -// Statistics for a numeric feature in a dataset. -message NumericStatistics { - CommonStatistics common_stats = 1; - // The mean of the values - double mean = 2; - // The standard deviation of the values - double std_dev = 3; - // The number of values that equal 0 - uint64 num_zeros = 4; - // The minimum value - double min = 5; - // The median value - double median = 6; - // The maximum value - double max = 7; - // The histogram(s) of the feature values. - repeated Histogram histograms = 8; - - // Weighted statistics for the feature, if the values have weights. - WeightedNumericStatistics weighted_numeric_stats = 9; -} - -// Statistics for a string feature in a dataset. -message StringStatistics { - CommonStatistics common_stats = 1; - // The number of unique values - uint64 unique = 2; - - message FreqAndValue { - string value = 2; - - // The number of times the value occurs. Stored as a double to be able to - // handle weighted features. - double frequency = 3; - - // Deleted fields. - reserved 1; - } - // A sorted list of the most-frequent values and their frequencies, with - // the most-frequent being first. - repeated FreqAndValue top_values = 3; - - // The average length of the values - float avg_length = 4; - - // The rank histogram for the values of the feature. - // The rank is used to measure of how commonly the value is found in the - // dataset. The most common value would have a rank of 1, with the second-most - // common value having a rank of 2, and so on. - RankHistogram rank_histogram = 5; - - // Weighted statistics for the feature, if the values have weights. - WeightedStringStatistics weighted_string_stats = 6; - - // A vocabulary file, used for vocabularies too large to store in the proto - // itself. Note that the file may be relative to some context-dependent - // directory. E.g. in TFX the feature statistics will live in a PPP and - // vocabulary file names will be relative to this PPP. - string vocabulary_file = 7; -} - -// Statistics for a weighted numeric feature in a dataset. -message WeightedNumericStatistics { - // The weighted mean of the values - double mean = 1; - // The weighted standard deviation of the values - double std_dev = 2; - // The weighted median of the values - double median = 3; - - // The histogram(s) of the weighted feature values. - repeated Histogram histograms = 4; -} - -// Statistics for a weighted string feature in a dataset. -message WeightedStringStatistics { - // A sorted list of the most-frequent values and their weighted frequencies, - // with the most-frequent being first. - repeated StringStatistics.FreqAndValue top_values = 1; - - // The rank histogram for the weighted values of the feature. - RankHistogram rank_histogram = 2; -} - -// Statistics for a bytes feature in a dataset. -message BytesStatistics { - CommonStatistics common_stats = 1; - // The number of unique values - uint64 unique = 2; - - // The average number of bytes in a value - float avg_num_bytes = 3; - // The minimum number of bytes in a value - float min_num_bytes = 4; - // The maximum number of bytes in a value - float max_num_bytes = 5; -} - -message StructStatistics { - CommonStatistics common_stats = 1; -} - -// Common statistics for all feature types. Statistics counting number of values -// (i.e., min_num_values, max_num_values, avg_num_values, and tot_num_values) -// include NaNs. -message CommonStatistics { - // The number of examples with at least one value for this feature. - uint64 num_non_missing = 1; - // The number of examples with no values for this feature. - uint64 num_missing = 2; - // The minimum number of values in a single example for this feature. - uint64 min_num_values = 3; - // The maximum number of values in a single example for this feature. - uint64 max_num_values = 4; - // The average number of values in a single example for this feature. - float avg_num_values = 5; - // tot_num_values = avg_num_values * num_non_missing. - // This is calculated directly, so should have less numerical error. - uint64 tot_num_values = 8; - // The quantiles histogram for the number of values in this feature. - Histogram num_values_histogram = 6; - WeightedCommonStatistics weighted_common_stats = 7; - // The histogram for the number of features in the feature list (only set if - // this feature is a non-context feature from a tf.SequenceExample). - // This is different from num_values_histogram, as num_values_histogram tracks - // the count of all values for a feature in an example, whereas this tracks - // the length of the feature list for this feature in an example (where each - // feature list can contain multiple values). - Histogram feature_list_length_histogram = 9; -} - -// The data used to create a histogram of a numeric feature for a dataset. -message Histogram { - // Each bucket defines its low and high values along with its count. The - // low and high values must be a real number or positive or negative - // infinity. They cannot be NaN or undefined. Counts of those special values - // can be found in the numNaN and numUndefined fields. - message Bucket { - // The low value of the bucket, inclusive. - double low_value = 1; - // The high value of the bucket, exclusive (unless the highValue is - // positive infinity). - double high_value = 2; - - // The number of items in the bucket. Stored as a double to be able to - // handle weighted histograms. - double sample_count = 4; - - // Deleted fields. - reserved 3; - } - - // The number of NaN values in the dataset. - uint64 num_nan = 1; - // The number of undefined values in the dataset. - uint64 num_undefined = 2; - - // A list of buckets in the histogram, sorted from lowest bucket to highest - // bucket. - repeated Bucket buckets = 3; - - // The type of the histogram. A standard histogram has equal-width buckets. - // The quantiles type is used for when the histogram message is used to store - // quantile information (by using equal-count buckets with variable widths). - enum HistogramType { - STANDARD = 0; - QUANTILES = 1; - } - - // The type of the histogram. - HistogramType type = 4; - - // An optional descriptive name of the histogram, to be used for labeling. - string name = 5; -} - -// The data used to create a rank histogram of a non-numeric feature of a -// dataset. The rank of a value in a feature can be used as a measure of how -// commonly the value is found in the entire dataset. With bucket sizes of one, -// this becomes a distribution function of all feature values. -message RankHistogram { - // Each bucket defines its start and end ranks along with its count. - message Bucket { - // The low rank of the bucket, inclusive. - uint64 low_rank = 1; - // The high rank of the bucket, exclusive. - uint64 high_rank = 2; - - // The label for the bucket. Can be used to list or summarize the values in - // this rank bucket. - string label = 4; - - // The number of items in the bucket. Stored as a double to be able to - // handle weighted histograms. - double sample_count = 5; - - // Deleted fields. - reserved 3; - } - - // A list of buckets in the histogram, sorted from lowest-ranked bucket to - // highest-ranked bucket. - repeated Bucket buckets = 1; - - // An optional descriptive name of the histogram, to be used for labeling. - string name = 2; -} \ No newline at end of file diff --git a/sdk/python/setup.py b/sdk/python/setup.py index 29a018ca51..a9fdfa8293 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -236,7 +236,6 @@ def run(self): "": [ "protos/feast/**/*.proto", "protos/feast/third_party/grpc/health/v1/*.proto", - "protos/tensorflow_metadata/proto/v0/*.proto", "feast/protos/feast/**/*.py", ], },