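Add Modality.targets_weights_fn

Let the target modality supply the weights function used for loss and eval
metrics. The modality loss() methods no longer take a weights_fn argument,
metrics.py no longer infers the weights function from the problem name, and
tpu_trainer_lib.py no longer hard-codes weights_nonzero.

Also remove the per-modality top_dimensionality overrides (the image problems
now pass 256 as the modality size instead of None), replace
"image:identity_no_pad" with "image:identity", and replace "image:no_loss"
with a "zero_loss" identity modality registered for the generic, audio,
image, symbol, class_label and real modality types.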

Ryan Sepassi · Ryan Sepassi · commit 50f5515b1779 · 2017-11-29T13:34:35.000-08:00
PiperOrigin-RevId: 175722118
diff --git a/tensor2tensor/data_generators/image.py b/tensor2tensor/data_generators/image.py
@@ -112,8 +112,8 @@ def preprocess_example(self, example, unused_mode, unused_hparams):
 
   def hparams(self, defaults, unused_model_hparams):
     p = defaults
-    p.input_modality = {"inputs": ("image:identity_no_pad", None)}
-    p.target_modality = ("image:identity_no_pad", None)
+    p.input_modality = {"inputs": ("image:identity", 256)}
+    p.target_modality = ("image:identity", 256)
     p.batch_size_multiplier = 256
     p.max_expected_batch_size_per_shard = 4
     p.input_space_id = 1
@@ -236,7 +236,7 @@ def feature_encoders(self, data_dir):
 
   def hparams(self, defaults, unused_model_hparams):
     p = defaults
-    p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)}
+    p.input_modality = {"inputs": (registry.Modalities.IMAGE, 256)}
     vocab_size = self._encoders["targets"].vocab_size
     p.target_modality = (registry.Modalities.SYMBOL, vocab_size)
     p.batch_size_multiplier = 256
@@ -286,7 +286,7 @@ def generator(self, data_dir, tmp_dir, is_training):
 
   def hparams(self, defaults, unused_model_hparams):
     p = defaults
-    p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)}
+    p.input_modality = {"inputs": (registry.Modalities.IMAGE, 256)}
     p.target_modality = (registry.Modalities.CLASS_LABEL,
                          self.num_classes)
     p.batch_size_multiplier = 4 if self.is_small else 256
@@ -432,8 +432,8 @@ def preprocess_example(self, example, unused_mode, unused_hparams):
 
   def hparams(self, defaults, unused_model_hparams):
     p = defaults
-    p.input_modality = {"inputs": ("image:identity_no_pad", None)}
-    p.target_modality = ("image:identity_no_pad", None)
+    p.input_modality = {"inputs": ("image:identity", 256)}
+    p.target_modality = ("image:identity", 256)
     p.batch_size_multiplier = 256
     p.max_expected_batch_size_per_shard = 4
     p.input_space_id = 1
@@ -718,8 +718,8 @@ def preprocess_example(self, example, unused_mode, unused_hparams):
 
   def hparams(self, defaults, unused_model_hparams):
     p = defaults
-    p.input_modality = {"inputs": ("image:identity_no_pad", None)}
-    p.target_modality = ("image:identity_no_pad", None)
+    p.input_modality = {"inputs": ("image:identity", 256)}
+    p.target_modality = ("image:identity", 256)
     p.batch_size_multiplier = 256
     p.max_expected_batch_size_per_shard = 4
     p.input_space_id = 1
@@ -863,7 +863,7 @@ def feature_encoders(self, data_dir):
 
   def hparams(self, defaults, unused_model_hparams):
     p = defaults
-    p.input_modality = {"inputs": (registry.Modalities.IMAGE, None)}
+    p.input_modality = {"inputs": (registry.Modalities.IMAGE, 256)}
     encoder = self._encoders["targets"]
     p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size)
     p.batch_size_multiplier = 256
diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
@@ -45,14 +45,22 @@ class SymbolModality(modality.Modality):
   def name(self):
     return "symbol_modality_%d_%d" % (self._vocab_size, self._body_input_depth)
 
-  @property
-  def top_dimensionality(self):
-    return self._vocab_size
-
   @property
   def top_is_pointwise(self):
     return True
 
+  @property
+  def weights_fn(self):
+    weights_fn = common_layers.weights_nonzero
+
+    hp = self._model_hparams
+    if hp and hp.prepend_mode != "none":
+      assert (hp.prepend_mode == "prepend_inputs_masked_attention" or
+              hp.prepend_mode == "prepend_inputs_full_attention")
+      weights_fn = common_layers.weights_prepend_inputs_to_targets
+
+    return weights_fn
+
   def _get_weights(self, hidden_dim=None):
     """Create or get concatenated embedding or softmax variable.
 
@@ -151,7 +159,7 @@ def top(self, body_output, _):
 class CTCSymbolModality(SymbolModality):
   """SymbolModality that uses CTC loss."""
 
-  def loss(self, logits, targets, weights_fn=common_layers.weights_nonzero):
+  def loss(self, logits, targets):
     """Compute the CTC loss."""
     with tf.name_scope("ctc_loss", [logits, targets]):
       # For CTC we assume targets are 1d, [batch, length, 1, 1] here.
@@ -172,21 +180,14 @@ def loss(self, logits, targets, weights_fn=common_layers.weights_nonzero):
           time_major=False,
           preprocess_collapse_repeated=False,
           ctc_merge_repeated=False)
-      weights = weights_fn(targets)
+      weights = self.targets_weights_fn(targets)
       return tf.reduce_sum(xent), tf.reduce_sum(weights)
 
 
 @registry.register_image_modality("default")
 class ImageModality(modality.Modality):
   """Modality for images."""
-
-  def __init__(self, model_hparams, vocab_size):
-    super(ImageModality, self).__init__(model_hparams, vocab_size)
-    self._channels = 3
-
-  @property
-  def top_dimensionality(self):
-    return 256
+  NUM_CHANNELS = 3
 
   def bottom(self, inputs):
     with tf.variable_scope(self.name):
@@ -217,7 +218,7 @@ def top(self, body_output, _):
           common_layers.shape_dim(body_output, i) for i in range(3)
       ]
       dim = body_output.get_shape().as_list()[-1] // 3
-      reshape_shape.extend([self._channels, dim])
+      reshape_shape.extend([self.NUM_CHANNELS, dim])
 
       out = tf.reshape(body_output, reshape_shape)
       res = tf.layers.dense(out, self.top_dimensionality)
@@ -226,21 +227,11 @@ def top(self, body_output, _):
         tf.summary.image("result", res_argmax, max_outputs=1)
       return res
 
-  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
-    # Call the default implementation, but weight 1.0 on 0s by default.
-    # (Since we're processing images and so have no padding and some pixel 0s.)
-    return super(ImageModality, self).loss(
-        top_out, targets, weights_fn=weights_fn)
-
 
 @registry.register_image_modality("image_identity_compress")
 class ImageIdentityCompressModality(modality.Modality):
   """Modality for images used in generation."""
 
-  @property
-  def top_dimensionality(self):
-    return 256
-
   def bottom_compress(self, inputs, name="bottom"):
     """Transform input from data space to model space.
 
@@ -296,12 +287,6 @@ def top(self, body_output, _):
                          channels, self.top_dimensionality])
       return x
 
-  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
-    # Call the default implementation, but weight 1.0 on 0s by default.
-    # (Since we're processing images and so have no padding and some pixel 0s.)
-    return super(ImageIdentityCompressModality, self).loss(
-        top_out, targets, weights_fn=weights_fn)
-
 
 @registry.register_audio_modality("default")
 class AudioModality(modality.Modality):
@@ -399,10 +384,6 @@ def name(self):
     return "class_label_modality_%d_%d" % (self._vocab_size,
                                            self._body_input_depth)
 
-  @property
-  def top_dimensionality(self):
-    return self._vocab_size
-
   def bottom(self, x):
     with tf.variable_scope(self.name):
       return common_layers.embedding(
@@ -434,12 +415,6 @@ def top(self, body_output, _):
       res = tf.layers.dense(x, self._vocab_size)
       return tf.expand_dims(res, 3)
 
-  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
-    # Call the default implementation, but weight 1.0 on 0s by default.
-    # (Since we're processing images and so have no padding and some pixel 0s.)
-    return super(ClassLabelModality, self).loss(
-        top_out, targets, weights_fn=weights_fn)
-
 
 @registry.register_generic_modality("default")
 @registry.register_audio_modality("identity")
@@ -450,10 +425,6 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
 class IdentityModality(modality.Modality):
   """Does nothing."""
 
-  @property
-  def targets_dimensionality(self):
-    return self._vocab_size
-
   def bottom(self, x):
     return tf.to_float(x)
 
@@ -476,7 +447,7 @@ def top(self, body_output, _):
     with tf.variable_scope("real"):
       return tf.layers.dense(body_output, self._vocab_size)
 
-  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
+  def loss(self, top_out, targets):
     raise NotImplementedError()
 
 
@@ -485,70 +456,35 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
 class RealL2LossModality(RealModality):
   """Modality for real (i.e. float) vectors with L2 (Gaussian) loss."""
 
-  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
+  def loss(self, top_out, targets):
     predictions = top_out
     with tf.name_scope("l2"):
-      weights = weights_fn(targets)
+      weights = self.targets_weights_fn(targets)
       l2 = tf.pow(predictions - targets, 2)
       return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights)
 
 
 @registry.register_real_modality("log_poisson_loss")
-class RealLogPoissonLossModality(RealL2LossModality):
-  """Modality for real (i.e. float) vectors with log Poisson regression loss.
-  """
-
-  def bottom(self, x):
-    return x
+class RealLogPoissonLossModality(RealModality):
+  """Modality for real (i.e. float) vectors with log Poisson regression loss."""
 
-  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
+  def loss(self, top_out, targets):
     predictions = top_out
     with tf.name_scope("log_possion"):
-      weights = weights_fn(targets)
+      weights = self.targets_weights_fn(targets)
 
       lp_loss = tf.nn.log_poisson_loss(targets, predictions)
       return tf.reduce_sum(lp_loss * weights), tf.reduce_sum(weights)
 
 
-@registry.register_image_modality("identity_no_pad")
-class IdentityModalityNoPad(modality.Modality):
-  """Does nothing except making sure that there is no padding in cross-ent."""
-
-  @property
-  def top_dimensionality(self):
-    return 256
-
-  @property
-  def targets_dimensionality(self):
-    return self._vocab_size
-
-  def bottom(self, x):
-    return tf.to_float(x)
-
-  def top(self, body_output, _):
-    return body_output
-
-  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
-    # Call the default implementation, but weight 1.0 on 0s by default.
-    # (Since we're processing images and so have no padding and some pixel 0s.)
-    return super(IdentityModalityNoPad, self).loss(
-        top_out, targets, weights_fn=weights_fn)
-
-
-@registry.register_image_modality("no_loss")
-class NoLossModality(modality.Modality):
-  """Does nothing to the input and returns no loss."""
-
-  @property
-  def targets_dimensionality(self):
-    return self._vocab_size
-
-  def bottom(self, x):
-    return tf.to_float(x)
-
-  def top(self, body_output, _):
-    return body_output
+@registry.register_generic_modality("zero_loss")
+@registry.register_audio_modality("zero_loss")
+@registry.register_image_modality("zero_loss")
+@registry.register_symbol_modality("zero_loss")
+@registry.register_class_label_modality("zero_loss")
+@registry.register_real_modality("zero_loss")
+class IdentityZeroLossModality(IdentityModality):
+  """Identity with 0 loss."""
 
-  def loss_sharded(self, sharded_top_out, sharded_targets, data_parallelism):
-    """Return nothing."""
-    return tf.constant(0.0, tf.float32)
+  def loss(self, top_out, targets):
+    return tf.constant(0., tf.float32), tf.constant(0., tf.float32)
diff --git a/tensor2tensor/models/vanilla_gan.py b/tensor2tensor/models/vanilla_gan.py
@@ -146,8 +146,8 @@ def vanilla_gan():
 
   hparams = common_hparams.basic_params1()
 
-  hparams.input_modalities = "image:no_loss"
-  hparams.target_modality = "image:no_loss"
+  hparams.input_modalities = "inputs:image:zero_loss"
+  hparams.target_modality = "image:zero_loss"
 
   hparams.batch_size = 2048  # 3136
   hparams.label_smoothing = 0.0
diff --git a/tensor2tensor/tpu/tpu_trainer_lib.py b/tensor2tensor/tpu/tpu_trainer_lib.py
@@ -25,7 +25,6 @@
 
 import six
 
-from tensor2tensor.layers import common_layers
 from tensor2tensor.utils import data_reader
 from tensor2tensor.utils import metrics
 from tensor2tensor.utils import optimize
@@ -192,7 +191,7 @@ def model_fn(features, labels, mode, params, config):
       problem = hp.problem_instances[0]
 
       if use_tpu:
-        eval_metrics_fn = create_eval_metrics_fn(problem)
+        eval_metrics_fn = create_eval_metrics_fn(problem, hparams)
         _remove_summaries()
         return tf.contrib.tpu.TPUEstimatorSpec(
             mode,
@@ -245,14 +244,18 @@ def model_fn(features, labels, mode, params, config):
 ])
 
 
-def create_eval_metrics_fn(problem):
+def create_eval_metrics_fn(problem, hparams):
   """Create the metrics_fn that TPUEstimatorSpec expects."""
 
+  tm = problem.get_hparams().target_modality
+  if isinstance(tm, tuple):
+    tm = registry.create_modality(tm, hparams)
+  weights_fn = tm.weights_fn
+
   def make_metric_fn(metric_fn):
 
     def wrapped_metric_fn(logits, labels):
-      num, den = metric_fn(
-          logits, labels, weights_fn=common_layers.weights_nonzero)
+      num, den = metric_fn(logits, labels, weights_fn=weights_fn)
       return tf.metrics.mean(num, den)
 
     return wrapped_metric_fn
diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py
@@ -24,6 +24,7 @@
 
 from tensor2tensor.layers import common_layers
 from tensor2tensor.utils import bleu_hook
+from tensor2tensor.utils import registry
 from tensor2tensor.utils import rouge
 
 import tensorflow as tf
@@ -284,7 +285,7 @@ def problem_metric_fn(predictions, features):
       # "features".
       kwargs = {}
       args, _, keywords, _ = inspect.getargspec(metric_fn)
-      if "features" in args or keywords:
+      if ("features" in args) or keywords:
         kwargs["features"] = features
 
       def wrapped_metric_fn():
@@ -308,28 +309,21 @@ def wrapped_metric_fn():
                                                            metrics,
                                                            METRICS_FNS.keys()))
 
-    class_output = "image" in problem_name and "coco" not in problem_name
-    real_output = "gene_expression" in problem_name
-    if model_hparams.prepend_mode != "none":
-      assert (model_hparams.prepend_mode == "prepend_inputs_masked_attention" or
-              model_hparams.prepend_mode == "prepend_inputs_full_attention")
-      assert not class_output
-      weights_fn = common_layers.weights_prepend_inputs_to_targets
-    elif class_output or real_output:
-      weights_fn = common_layers.weights_all
-    else:
-      weights_fn = common_layers.weights_nonzero
-
     def image_wrapped_metric_fn(predictions,
                                 labels,
                                 weights_fn=common_layers.weights_nonzero):
       _, _ = labels, weights_fn
       return metric_fn(predictions, model_hparams)
 
+    tm = problem_instance.get_hparams().target_modality
+    if isinstance(tm, tuple):
+      tm = registry.create_modality(tm, model_hparams)
+    weights_fn = tm.weights_fn
+
     for metric in metrics:
       metric_fn = METRICS_FNS[metric]
       metric_name = "metrics-%s/%s" % (problem_name, metric)
-      if "image" in metric:
+      if metric == Metrics.IMAGE_SUMMARY:
         eval_metrics[metric_name] = image_wrapped_metric_fn
       else:
         problem_metric_fn = make_problem_specific_metric_fn(
diff --git a/tensor2tensor/utils/modality.py b/tensor2tensor/utils/modality.py