
Commit 687e31d

tf-transform-team and zoyahav authored and committed
Project import generated by Copybara.
PiperOrigin-RevId: 230419331
1 parent 4fe846a commit 687e31d

28 files changed: 970 additions and 1219 deletions

RELEASE.md

Lines changed: 19 additions & 0 deletions
@@ -4,6 +4,13 @@
 * Performance improvements for vocabulary generation when using top_k.
 * New optimized highly experimental API for analyzing a dataset was added,
   `AnalyzeDatasetWithCache`, which allows reading and writing analyzer cache.
+* Update `DatasetMetadata` to be a wrapper around the
+  `tensorflow_metadata.proto.v0.schema_pb2.Schema` proto. TensorFlow Metadata
+  will be the schema used to define data parsing across TFX. The serialized
+  `DatasetMetadata` is now the `Schema` proto in ascii format, but the previous
+  format can still be read.
+* Change `ApplySavedModel` implementation to use `tf.Session.make_callable`
+  instead of `tf.Session.run` for improved performance.

 ## Bug Fixes and Other Changes
 * `tft.vocabulary` and `tft.compute_and_apply_vocabulary` now support filtering
@@ -21,8 +28,20 @@
   `tf.Session.run` for improved performance.
 * ExampleProtoCoder now also supports non-serialized Example representations.
 * `tft.tfidf` now accepts a scalar Tensor as `vocab_size`.
+* `assertItemsEqual` in unit tests is replaced by `assertCountEqual`.
+* `NumPyCombiner` now outputs TF dtypes in output_tensor_infos instead of numpy
+  dtypes.
+* Adds function `tft.apply_pyfunc` that provides limited support for
+  `tf.py_func`. Note that this is incompatible with serving. See documentation
+  for more details.

 ## Breaking changes
+* `ColumnSchema` and related classes (`Domain`, `Axis` and
+  `ColumnRepresentation` and their subclasses) have been removed. In order to
+  create a schema, use `from_feature_spec`. In order to inspect a schema
+  use the `as_feature_spec` and `domains` methods of `Schema`. The
+  constructors of these classes are replaced by functions that still work when
+  creating a `Schema` but this usage is deprecated.

 ## Deprecations


docs/api_docs/python/tft/NumPyCombiner.md

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ Combines the PCollection only on the 0th dimension using nparray.
 __init__(
     fn,
     output_dtypes,
-    output_shapes=None
+    output_shapes
 )
 ```
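The diff above makes `output_shapes` a required constructor argument rather than one defaulting to `None`. As a rough plain-Python sketch of the combiner contract this `__init__` belongs to (the `SketchCombiner` name and method bodies are illustrative, not the real tf.transform implementation):

``` python
class SketchCombiner(object):
    """Illustrative combiner following the create/add/merge/extract contract."""

    def __init__(self, fn, output_dtypes, output_shapes):
        # output_shapes no longer defaults to None; callers must pass it
        # explicitly so output tensor infos can always be constructed.
        self._fn = fn
        self._output_dtypes = output_dtypes
        self._output_shapes = output_shapes

    def create_accumulator(self):
        return []

    def add_input(self, accumulator, batch_values):
        accumulator.append(batch_values)
        return accumulator

    def merge_accumulators(self, accumulators):
        merged = []
        for acc in accumulators:
            merged.extend(acc)
        return merged

    def extract_output(self, accumulator):
        # Reduce the collected inputs position-wise with fn.
        return [self._fn(values) for values in zip(*accumulator)]


combiner = SketchCombiner(fn=sum, output_dtypes=['int64'], output_shapes=[()])
acc = combiner.create_accumulator()
acc = combiner.add_input(acc, [1])
acc = combiner.add_input(acc, [2])
print(combiner.extract_output(acc))  # -> [3]
```

Making the shapes explicit at construction time avoids guessing output shapes later from accumulated data.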

docs/api_docs/python/tft/TFTransformOutput.md

Lines changed: 49 additions & 0 deletions
@@ -1,16 +1,23 @@
 <div itemscope itemtype="http://developers.google.com/ReferenceObject">
 <meta itemprop="name" content="tft.TFTransformOutput" />
 <meta itemprop="path" content="Stable" />
+<meta itemprop="property" content="post_transform_statistics_path"/>
+<meta itemprop="property" content="pre_transform_statistics_path"/>
+<meta itemprop="property" content="raw_metadata"/>
 <meta itemprop="property" content="transform_savedmodel_dir"/>
 <meta itemprop="property" content="transformed_metadata"/>
 <meta itemprop="property" content="__init__"/>
 <meta itemprop="property" content="load_transform_graph"/>
 <meta itemprop="property" content="num_buckets_for_transformed_feature"/>
+<meta itemprop="property" content="raw_feature_spec"/>
 <meta itemprop="property" content="transform_raw_features"/>
 <meta itemprop="property" content="transformed_feature_spec"/>
 <meta itemprop="property" content="vocabulary_by_name"/>
 <meta itemprop="property" content="vocabulary_file_by_name"/>
 <meta itemprop="property" content="vocabulary_size_by_name"/>
+<meta itemprop="property" content="POST_TRANSFORM_FEATURE_STATS_PATH"/>
+<meta itemprop="property" content="PRE_TRANSFORM_FEATURE_STATS_PATH"/>
+<meta itemprop="property" content="RAW_METADATA_DIR"/>
 <meta itemprop="property" content="TRANSFORMED_METADATA_DIR"/>
 <meta itemprop="property" content="TRANSFORM_FN_DIR"/>
 </div>
@@ -39,6 +46,30 @@ __init__(transform_output_dir)

 ## Properties

+<h3 id="post_transform_statistics_path"><code>post_transform_statistics_path</code></h3>
+
+Returns the path to the post-transform datum statistics.
+
+Note: post_transform_statistics is not guaranteed to exist in the output of
+tf.transform and hence using this could fail, if post_transform statistics
+is not present in TFTransformOutput.
+
+<h3 id="pre_transform_statistics_path"><code>pre_transform_statistics_path</code></h3>
+
+Returns the path to the pre-transform datum statistics.
+
+Note: pre_transform_statistics is not guaranteed to exist in the output of
+tf.transform and hence using this could fail, if pre_transform statistics is
+not present in TFTransformOutput.
+
+<h3 id="raw_metadata"><code>raw_metadata</code></h3>
+
+A DatasetMetadata.
+
+Note: raw_metadata is not guaranteed to exist in the output of tf.transform
+and hence using this could fail, if raw_metadata is not present in
+TFTransformOutput.
+
 <h3 id="transform_savedmodel_dir"><code>transform_savedmodel_dir</code></h3>

 A python str.
@@ -71,6 +102,18 @@ num_buckets_for_transformed_feature(name)

 Returns the number of buckets for an integerized transformed feature.

+<h3 id="raw_feature_spec"><code>raw_feature_spec</code></h3>
+
+``` python
+raw_feature_spec()
+```
+
+Returns a feature_spec for the raw features.
+
+#### Returns:
+
+A dict from feature names to FixedLenFeature/SparseFeature/VarLenFeature.
+
 <h3 id="transform_raw_features"><code>transform_raw_features</code></h3>

 ``` python
@@ -142,6 +185,12 @@ Like vocabulary_file_by_name, but returns the size of vocabulary.

 ## Class Members

+<h3 id="POST_TRANSFORM_FEATURE_STATS_PATH"><code>POST_TRANSFORM_FEATURE_STATS_PATH</code></h3>
+
+<h3 id="PRE_TRANSFORM_FEATURE_STATS_PATH"><code>PRE_TRANSFORM_FEATURE_STATS_PATH</code></h3>
+
+<h3 id="RAW_METADATA_DIR"><code>RAW_METADATA_DIR</code></h3>
+
<h3 id="TRANSFORMED_METADATA_DIR"><code>TRANSFORMED_METADATA_DIR</code></h3>

 <h3 id="TRANSFORM_FN_DIR"><code>TRANSFORM_FN_DIR</code></h3>
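The new statistics-path properties are derived from class-level directory constants under the transform output directory. A hedged sketch of that pattern (class name and constant values here are illustrative assumptions, not the actual tf.transform values):

``` python
import os


class TransformOutputPaths(object):
    """Illustrative stand-in for TFTransformOutput's path properties."""

    # Hypothetical relative locations; the real constants live on
    # TFTransformOutput and may differ.
    PRE_TRANSFORM_FEATURE_STATS_PATH = os.path.join(
        'pre_transform_feature_stats', 'FeatureStats.pb')
    POST_TRANSFORM_FEATURE_STATS_PATH = os.path.join(
        'post_transform_feature_stats', 'FeatureStats.pb')

    def __init__(self, transform_output_dir):
        self._transform_output_dir = transform_output_dir

    @property
    def pre_transform_statistics_path(self):
        # The file is not guaranteed to exist; callers should handle absence.
        return os.path.join(self._transform_output_dir,
                            self.PRE_TRANSFORM_FEATURE_STATS_PATH)

    @property
    def post_transform_statistics_path(self):
        return os.path.join(self._transform_output_dir,
                            self.POST_TRANSFORM_FEATURE_STATS_PATH)


paths = TransformOutputPaths('/tmp/transform_output')
print(paths.pre_transform_statistics_path)
```

As the doc notes above stress, these paths point at files that may not have been produced, so existence checks belong on the caller side.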

tensorflow_transform/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -20,4 +20,5 @@
 from tensorflow_transform.mappers import *
 from tensorflow_transform.output_wrapper import TFTransformOutput
 from tensorflow_transform.pretrained_models import *
+from tensorflow_transform.py_func.api import apply_pyfunc
 # pylint: enable=wildcard-import

tensorflow_transform/analyzer_nodes.py

Lines changed: 114 additions & 35 deletions
@@ -160,6 +160,8 @@ def accumulator_coder(self):
 class CacheCoder(object):
   """A coder interface for encoding and decoding cache items."""

+  __metaclass__ = abc.ABCMeta
+
   def __repr__(self):
     return '<{}>'.format(self.__class__.__name__)

@@ -327,32 +329,92 @@ def output_tensor_infos(self):
 ] + self.combiner.output_tensor_infos()


-class Vocabulary(
-    collections.namedtuple(
-        'Vocabulary',
-        [
-            'top_k',
-            'frequency_threshold',
-            'vocab_filename',
-            'store_frequency',
-            'vocab_ordering_type',
-            'use_adjusted_mutual_info',
-            'min_diff_from_avg',
-            'coverage_top_k',
-            'coverage_frequency_threshold',
-            'key_fn',
-            'label'
-        ]),
-    AnalyzerDef):
-  """OperationDef for computing a vocabulary of unique values.
+class VocabularyAccumulate(
+    collections.namedtuple('VocabularyAccumulate',
+                           ['vocab_ordering_type', 'label']),
+    nodes.OperationDef):
+  """An operation that accumulates unique words with their frequency or weight.

-  This analyzer computes a vocabulary composed of the unique values present in
-  the input elements. It selects a subset of the unique elements based on the
-  provided parameters. It may also accept a label and weight as input
-  depending on the parameters.
+  This operation is implemented by
+  `tensorflow_transform.beam.analyzer_impls.VocabularyAccumulateImpl`.
+  """

-  This analyzer is implemented by
-  `tensorflow_transform.beam.analyzer_impls.VocabularyImpl`.
+  def __new__(cls, vocab_ordering_type, label=None):
+    if label is None:
+      scope = tf.get_default_graph().get_name_scope()
+      label = '{}[{}]'.format(cls.__name__, scope)
+    return super(VocabularyAccumulate, cls).__new__(
+        cls, vocab_ordering_type=vocab_ordering_type, label=label)
+
+  @property
+  def num_outputs(self):
+    return 1
+
+  @property
+  def is_partitionable(self):
+    return True
+
+  @property
+  def cache_coder(self):
+    return _VocabularyAccumulatorCoder()
+
+
+class _VocabularyAccumulatorCoder(CacheCoder):
+  """Coder for vocabulary accumulators."""
+
+  def encode_cache(self, accumulator):
+    # Need to wrap in np.array and call tolist to make it JSON serializable.
+    word, count = accumulator
+    accumulator = (word.decode('utf-8'), count)
+    return tf.compat.as_bytes(
+        json.dumps(np.array(accumulator, dtype=object).tolist()))
+
+  def decode_cache(self, encoded_accumulator):
+    return np.array(json.loads(encoded_accumulator), dtype=object)
+
+
+class VocabularyMerge(
+    collections.namedtuple('VocabularyMerge', [
+        'vocab_ordering_type', 'use_adjusted_mutual_info', 'min_diff_from_avg',
+        'label'
+    ]), nodes.OperationDef):
+  """An operation that merges the accumulators produced by VocabularyAccumulate.
+
+  This operation operates on the output of VocabularyAccumulate and is
+  implemented by `tensorflow_transform.beam.analyzer_impls.VocabularyMergeImpl`.
+
+  See `tft.vocabulary` for a description of the parameters.
+  """
+
+  def __new__(cls,
+              vocab_ordering_type,
+              use_adjusted_mutual_info,
+              min_diff_from_avg,
+              label=None):
+    if label is None:
+      scope = tf.get_default_graph().get_name_scope()
+      label = '{}[{}]'.format(cls.__name__, scope)
+    return super(VocabularyMerge, cls).__new__(
+        cls,
+        vocab_ordering_type=vocab_ordering_type,
+        use_adjusted_mutual_info=use_adjusted_mutual_info,
+        min_diff_from_avg=min_diff_from_avg,
+        label=label)
+
+  @property
+  def num_outputs(self):
+    return 1
+
+
+class VocabularyOrderAndFilter(
+    collections.namedtuple('VocabularyOrderAndFilter', [
+        'top_k', 'frequency_threshold', 'coverage_top_k',
+        'coverage_frequency_threshold', 'key_fn', 'label'
+    ]), nodes.OperationDef):
+  """An operation that filters and orders a computed vocabulary.
+
+  This operation operates on the output of VocabularyMerge and is implemented by
+  `tensorflow_transform.beam.analyzer_impls.VocabularyOrderAndFilterImpl`.

   See `tft.vocabulary` for a description of the parameters.
   """
@@ -361,32 +423,49 @@ def __new__(
       cls,
       top_k,
       frequency_threshold,
-      vocab_filename,
-      store_frequency,
-      vocab_ordering_type,
-      use_adjusted_mutual_info,
-      min_diff_from_avg,
       coverage_top_k,
       coverage_frequency_threshold,
       key_fn,
       label=None):
     if label is None:
       scope = tf.get_default_graph().get_name_scope()
       label = '{}[{}]'.format(cls.__name__, scope)
-    return super(Vocabulary, cls).__new__(
+    return super(VocabularyOrderAndFilter, cls).__new__(
         cls,
         top_k=top_k,
         frequency_threshold=frequency_threshold,
-        vocab_filename=vocab_filename,
-        store_frequency=store_frequency,
-        vocab_ordering_type=vocab_ordering_type,
-        use_adjusted_mutual_info=use_adjusted_mutual_info,
-        min_diff_from_avg=min_diff_from_avg,
         coverage_top_k=coverage_top_k,
         coverage_frequency_threshold=coverage_frequency_threshold,
         key_fn=key_fn,
         label=label)

+  @property
+  def num_outputs(self):
+    return 1
+
+
+class VocabularyWrite(
+    collections.namedtuple('VocabularyWrite',
+                           ['vocab_filename', 'store_frequency', 'label']),
+    AnalyzerDef):
+  """An analyzer that writes vocabulary files from an accumulator.
+
+  This operation operates on the output of VocabularyOrderAndFilter and is
+  implemented by `tensorflow_transform.beam.analyzer_impls.VocabularyWriteImpl`.
+
+  See `tft.vocabulary` for a description of the parameters.
+  """
+
+  def __new__(cls, vocab_filename, store_frequency, label=None):
+    if label is None:
+      scope = tf.get_default_graph().get_name_scope()
+      label = '{}[{}]'.format(cls.__name__, scope)
+    return super(VocabularyWrite, cls).__new__(
+        cls,
+        vocab_filename=vocab_filename,
+        store_frequency=store_frequency,
+        label=label)
+
   @property
   def output_tensor_infos(self):
     return [TensorInfo(tf.string, [], True)]
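The cache round trip in `_VocabularyAccumulatorCoder` above can be sketched in standalone form, with the numpy and `tf.compat` helpers replaced by plain-Python equivalents for illustration:

``` python
import json


def encode_cache(accumulator):
    """Encode a (word, count) accumulator as JSON bytes."""
    word, count = accumulator
    # Bytes are not JSON serializable, so decode the word to text first.
    return json.dumps([word.decode('utf-8'), count]).encode('utf-8')


def decode_cache(encoded_accumulator):
    """Decode JSON bytes back into a (word, count) pair."""
    word, count = json.loads(encoded_accumulator.decode('utf-8'))
    return (word, count)


encoded = encode_cache((b'hello', 7))
print(decode_cache(encoded))  # -> ('hello', 7)
```

The key property, which the real coder preserves via `np.array(...).tolist()` and `tf.compat.as_bytes`, is that encode followed by decode recovers the accumulator.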

tensorflow_transform/analyzers.py

Lines changed: 24 additions & 8 deletions
@@ -142,7 +142,7 @@ class NumPyCombiner(analyzer_nodes.Combiner):
     output_shapes: The shapes of the outputs.
   """

-  def __init__(self, fn, output_dtypes, output_shapes=None):
+  def __init__(self, fn, output_dtypes, output_shapes):
     self._fn = fn
     self._output_dtypes = output_dtypes
     self._output_shapes = output_shapes
@@ -186,7 +186,7 @@ def extract_output(self, accumulator):

   def output_tensor_infos(self):
     return [
-        analyzer_nodes.TensorInfo(dtype, shape, False)
+        analyzer_nodes.TensorInfo(tf.as_dtype(dtype), shape, False)
         for dtype, shape in zip(self._output_dtypes, self._output_shapes)
     ]

@@ -820,19 +820,35 @@ def vocabulary(
   assert none_counts is None
   analyzer_inputs = [unique_inputs]

-  (vocab_filename,) = apply_analyzer(
-      analyzer_nodes.Vocabulary,
-      *analyzer_inputs,
+  input_values_node = analyzer_nodes.get_input_tensors_value_nodes(
+      analyzer_inputs)
+
+  accumulate_output_value_node = nodes.apply_operation(
+      analyzer_nodes.VocabularyAccumulate, input_values_node,
+      vocab_ordering_type=vocab_ordering_type)
+
+  merge_output_value_node = nodes.apply_operation(
+      analyzer_nodes.VocabularyMerge, accumulate_output_value_node,
       use_adjusted_mutual_info=use_adjusted_mutual_info,
       min_diff_from_avg=min_diff_from_avg,
+      vocab_ordering_type=vocab_ordering_type)
+
+  filtered_value_node = nodes.apply_operation(
+      analyzer_nodes.VocabularyOrderAndFilter,
+      merge_output_value_node,
       coverage_top_k=coverage_top_k,
       coverage_frequency_threshold=coverage_frequency_threshold,
       key_fn=key_fn,
       top_k=top_k,
-      frequency_threshold=frequency_threshold,
+      frequency_threshold=frequency_threshold)
+
+  vocab_filename_node = nodes.apply_operation(
+      analyzer_nodes.VocabularyWrite,
+      filtered_value_node,
       vocab_filename=vocab_filename,
-      store_frequency=store_frequency,
-      vocab_ordering_type=vocab_ordering_type)
+      store_frequency=store_frequency)
+
+  vocab_filename = analyzer_nodes.wrap_as_tensor(vocab_filename_node)
   return vocab_filename
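The rewritten `tft.vocabulary` above splits the old monolithic `Vocabulary` analyzer into four chained graph operations. The data flow they implement can be sketched in plain Python, with the Beam graph nodes replaced by ordinary functions for illustration (only the stage names come from the diff; the bodies here are simplified assumptions):

``` python
from collections import Counter


def accumulate(batch):
    # VocabularyAccumulate: per-shard unique-value counts (partitionable,
    # which is what makes the accumulator cacheable).
    return Counter(batch)


def merge(accumulators):
    # VocabularyMerge: combine shard accumulators into one global count.
    merged = Counter()
    for acc in accumulators:
        merged.update(acc)
    return merged


def order_and_filter(counts, top_k=None, frequency_threshold=0):
    # VocabularyOrderAndFilter: drop rare values, keep the most frequent,
    # ordered by descending frequency (ties broken alphabetically here).
    kept = [(w, c) for w, c in counts.items() if c >= frequency_threshold]
    kept.sort(key=lambda wc: (-wc[1], wc[0]))
    return kept[:top_k] if top_k is not None else kept


def write(entries, store_frequency=False):
    # VocabularyWrite: render one vocabulary entry per line.
    if store_frequency:
        return '\n'.join('{} {}'.format(c, w) for w, c in entries)
    return '\n'.join(w for w, _ in entries)


shards = [['a', 'b', 'a'], ['b', 'a', 'c']]
vocab = write(order_and_filter(merge(accumulate(s) for s in shards),
                               top_k=2, frequency_threshold=2))
print(vocab)  # -> "a\nb"
```

Splitting accumulation from merging is what enables the `AnalyzeDatasetWithCache` feature mentioned in RELEASE.md: the per-shard accumulate outputs can be cached and reused across runs before the merge stage.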
