Update documentation for adding new Problems

Ryan Sepassi · Ryan Sepassi · commit 04b4f4722b69 · 2017-07-18T17:51:30.000-07:00
PiperOrigin-RevId: 162242293
diff --git a/README.md b/README.md
@@ -153,7 +153,7 @@ python -c "from tensor2tensor.models.transformer import Transformer"
   specification.
 * Support for multi-GPU machines and synchronous (1 master, many workers) and
   asynchrounous (independent workers synchronizing through a parameter server)
-  distributed training.
+  [distributed training](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/docs/distributed_training.md).
 * Easily swap amongst datasets and models by command-line flag with the data
   generation script `t2t-datagen` and the training script `t2t-trainer`.
 
@@ -173,8 +173,10 @@ and many common sequence datasets are already available for generation and use.
 
 **Problems** define training-time hyperparameters for the dataset and task,
 mainly by setting input and output **modalities** (e.g. symbol, image, audio,
-label) and vocabularies, if applicable. All problems are defined in
-[`problem_hparams.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem_hparams.py).
+label) and vocabularies, if applicable. All problems are either defined in
+[`problem_hparams.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem_hparams.py)
+or registered with `@registry.register_problem` (run `t2t-datagen` to see
+the list of all available problems).
 **Modalities**, defined in
 [`modality.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/modality.py),
 abstract away the input and output data types so that **models** may deal with
@@ -222,7 +224,7 @@ enables easily adding new ones and easily swapping amongst them by command-line
 flag. You can add your own components without editing the T2T codebase by
 specifying the `--t2t_usr_dir` flag in `t2t-trainer`.
 
-You can currently do so for models, hyperparameter sets, and modalities. Please
+You can do so for models, hyperparameter sets, modalities, and problems. Please
 do submit a pull request if your component might be useful to others.
 
 Here's an example with a new hyperparameter set:
@@ -253,9 +255,18 @@ You'll see under the registered HParams your
 `transformer_my_very_own_hparams_set`, which you can directly use on the command
 line with the `--hparams_set` flag.
 
+`t2t-datagen` also supports the `--t2t_usr_dir` flag for `Problem`
+registrations.
+
 ## Adding a dataset
 
-See the [data generators
+To add a new dataset, subclass
+[`Problem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
+and register it with `@registry.register_problem`. See
+[`WMTEnDeTokens8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+for an example.
+
+Also see the [data generators
 README](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/README.md).
 
 ---
diff --git a/docs/distributed_training.md b/docs/distributed_training.md
diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen
@@ -48,7 +48,6 @@ from tensor2tensor.data_generators import wiki
 from tensor2tensor.data_generators import wmt
 from tensor2tensor.data_generators import wsj_parsing
 from tensor2tensor.utils import registry
-from tensor2tensor.utils import usr_dir
 
 import tensorflow as tf
 
@@ -65,13 +64,6 @@ flags.DEFINE_integer("max_cases", 0,
                      "Maximum number of cases to generate (unbounded if 0).")
 flags.DEFINE_integer("random_seed", 429459, "Random seed to use.")
 
-flags.DEFINE_string("t2t_usr_dir", "",
-                    "Path to a Python module that will be imported. The "
-                    "__init__.py file should include the necessary imports. "
-                    "The imported files should contain registrations, "
-                    "e.g. @registry.register_model calls, that will then be "
-                    "available to the t2t-datagen.")
-
 # Mapping from problems that we can generate data for to their generators.
 # pylint: disable=g-long-lambda
 _SUPPORTED_PROBLEM_GENERATORS = {
@@ -281,7 +273,6 @@ def set_random_seed():
 
 def main(_):
   tf.logging.set_verbosity(tf.logging.INFO)
-  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
 
   # Calculate the list of problems to generate.
   problems = sorted(
@@ -365,7 +356,7 @@ def generate_data_for_problem(problem):
 
 def generate_data_for_registered_problem(problem_name):
   problem = registry.problem(problem_name)
-  problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir, FLAGS.num_shards)
+  problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir)
 
 
 if __name__ == "__main__":
diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer
@@ -36,7 +36,7 @@ import sys
 # Dependency imports
 
 from tensor2tensor.utils import trainer_utils as utils
-from tensor2tensor.utils import usr_dir
+
 import tensorflow as tf
 
 flags = tf.flags
@@ -49,9 +49,25 @@ flags.DEFINE_string("t2t_usr_dir", "",
                     "e.g. @registry.register_model calls, that will then be "
                     "available to the t2t-trainer.")
 
+
+def import_usr_dir():
+  """Import module at FLAGS.t2t_usr_dir, if provided."""
+  if not FLAGS.t2t_usr_dir:
+    return
+  dir_path = os.path.expanduser(FLAGS.t2t_usr_dir)
+  if dir_path[-1] == "/":
+    dir_path = dir_path[:-1]
+  containing_dir, module_name = os.path.split(dir_path)
+  tf.logging.info("Importing user module %s from path %s", module_name,
+                  containing_dir)
+  sys.path.insert(0, containing_dir)
+  importlib.import_module(module_name)
+  sys.path.pop(0)
+
+
 def main(_):
   tf.logging.set_verbosity(tf.logging.INFO)
-  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)
+  import_usr_dir()
   utils.log_registry()
   utils.validate_flags()
   utils.run(
diff --git a/tensor2tensor/data_generators/README.md b/tensor2tensor/data_generators/README.md
@@ -1,7 +1,7 @@
-# Data generators for T2T models.
+# T2T Problems.
 
-This directory contains data generators for a number of problems. We use a
-naming scheme for the problems, they have names of the form
+This directory contains `Problem` specifications for a number of problems. We
+use a naming scheme for the problems: they have names of the form
 `[task-family]_[task]_[specifics]`.  Data for all currently supported problems
 can be generated by calling the main generator binary (`t2t-datagen`). For
 example:
@@ -20,53 +20,51 @@ All tasks produce TFRecord files of `tensorflow.Example` protocol buffers.
 
 ## Adding a new problem
 
-1. Implement and register a Python generator for the dataset
-1. Add a problem specification to `problem_hparams.py` specifying input and
-   output modalities
+To add a new problem, subclass
+[`Problem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
+and register it with `@registry.register_problem`. See
+[`WMTEnDeTokens8k`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+for an example.
 
-To add a new problem, you first need to create python generators for training
-and development data for the problem. The python generators should yield
-dictionaries with string keys and values being lists of {int, float, str}.
-Here is a very simple generator for a data-set where inputs are lists of 1s with
-length upto 100 and targets are lists of length 1 with an integer denoting the
-length of the input list.
+`Problem`s support data generation, training, and decoding.
+
+Data generation is handled by `Problem.generate_data` which should produce 2
+datasets, training and dev, which should be named according to
+`Problem.training_filepaths` and `Problem.dev_filepaths`.
+`Problem.generate_data` should also produce any other files that may be required
+for training/decoding, e.g. a vocabulary file.
+
+A particularly easy way to implement `Problem.generate_data` for your dataset is
+to create 2 Python generators, one for the training data and another for the
+dev data, and pass them to `generator_utils.generate_dataset_and_shuffle`. See
+[`WMTEnDeTokens8k.generate_data`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+for an example of usage.
+
+The generators should yield dictionaries with string keys and values being lists
+of {int, float, str}.  Here is a very simple generator for a data-set where
+inputs are lists of 2s with length up to 100 and targets are lists of length 1
+with an integer denoting the length of the input list.
 
 ```
 def length_generator(nbr_cases):
   for _ in xrange(nbr_cases):
     length = np.random.randint(100) + 1
-    yield {"inputs": [1] * length, "targets": [length]}
+    yield {"inputs": [2] * length, "targets": [length]}
 ```
 
-Note that our data reader uses 0 for padding, so it is a good idea to never
-generate 0s, except if all your examples have the same size (in which case
-they'll never be padded anyway) or if you're doing padding on your own (in which
-case please use 0s for padding). When adding the python generator function,
-please also add unit tests to check if the code runs.
+Note that our data reader uses 0 for padding and other parts of the code assume
+end-of-string (EOS) is 1, so it is a good idea to never generate 0s or 1s,
+except if all your examples have the same size (in which case they'll never be
+padded anyway) or if you're doing padding on your own (in which case please use
+0s for padding). When adding the python generator function, please also add unit
+tests to check if the code runs.
 
 The generator can do arbitrary setup before beginning to yield examples - for
 example, downloading data, generating vocabulary files, etc.
 
 Some examples:
 
-*   [Algorithmic generators](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/algorithmic.py)
+*   [Algorithmic problems](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/algorithmic.py)
     and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/algorithmic_test.py)
-*   [WMT generators](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
+*   [WMT problems](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt.py)
     and their [unit tests](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wmt_test.py)
-
-When your python generator is ready and tested, add it to the
-`_SUPPORTED_PROBLEM_GENERATORS` dictionary in the
-[data
-generator](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/t2t-datagen).
-The keys are problem names, and the values are pairs of (training-set-generator
-function, dev-set-generator function). For the generator above, one could add
-the following lines:
-
-```
-  "algorithmic_length_upto100":
-  (lambda: algorithmic.length_generator(10000),
-   lambda: algorithmic.length_generator(1000)),
-```
-
-Note the lambdas above: we don't want to call the generators too early.
-
diff --git a/tensor2tensor/data_generators/algorithmic.py b/tensor2tensor/data_generators/algorithmic.py
@@ -36,10 +36,10 @@ class AlgorithmicIdentityBinary40(problem.Problem):
   def num_symbols(self):
     return 2
 
-  def generate_data(self, data_dir, _, num_shards=100):
+  def generate_data(self, data_dir, _):
     utils.generate_dataset_and_shuffle(
         identity_generator(self.num_symbols, 40, 100000),
-        self.training_filepaths(data_dir, num_shards, shuffled=True),
+        self.training_filepaths(data_dir, 100, shuffled=True),
         identity_generator(self.num_symbols, 400, 10000),
         self.dev_filepaths(data_dir, 1, shuffled=True),
         shuffle=False)
diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
@@ -244,11 +244,6 @@ def gunzip_file(gz_path, new_path):
         "http://www.statmt.org/wmt13/training-parallel-un.tgz",
         ["un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr"]
     ],
-    # Macedonian-English
-    [
-        "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",  # pylint: disable=line-too-long
-        ["train.mk", "train.en"]
-    ],
 ]
 
 
@@ -329,19 +324,18 @@ def get_or_generate_tabbed_vocab(tmp_dir, source_filename,
     return vocab
 
   # Use Tokenizer to count the word occurrences.
-  token_counts = defaultdict(int)
   filepath = os.path.join(tmp_dir, source_filename)
   with tf.gfile.GFile(filepath, mode="r") as source_file:
     for line in source_file:
       line = line.strip()
       if line and "\t" in line:
         parts = line.split("\t", maxsplit=1)
         part = parts[index].strip()
-        for tok in tokenizer.encode(text_encoder.native_to_unicode(part)):
-          token_counts[tok] += 1
+        _ = tokenizer.encode(text_encoder.native_to_unicode(part))
 
   vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
-      vocab_size, token_counts, 1, 1e3)
+      vocab_size, tokenizer.token_counts, 1,
+      min(1e3, vocab_size + text_encoder.NUM_RESERVED_TOKENS))
   vocab.store_to_file(vocab_filepath)
   return vocab
 
diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py
@@ -67,8 +67,6 @@ class SpaceID(object):
   ICE_TOK = 18
   # Icelandic parse tokens
   ICE_PARSE_TOK = 19
-  # Macedonian tokens
-  MK_TOK = 20
 
 
 class Problem(object):
@@ -113,7 +111,7 @@ class Problem(object):
   # BEGIN SUBCLASS INTERFACE
   # ============================================================================
 
-  def generate_data(self, data_dir, tmp_dir, num_shards=100):
+  def generate_data(self, data_dir, tmp_dir):
     raise NotImplementedError()
 
   def hparams(self, defaults, model_hparams):
diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
@@ -24,7 +24,6 @@
 from __future__ import print_function
 
 from collections import defaultdict
-import re
 
 # Dependency imports
 
@@ -226,7 +225,6 @@ class SubwordTextEncoder(TextEncoder):
 
   def __init__(self, filename=None):
     """Initialize and read from a file, if provided."""
-    self._alphabet = set()
     if filename is not None:
       self._load_from_file(filename)
     super(SubwordTextEncoder, self).__init__(num_reserved_ids=None)
@@ -505,12 +503,6 @@ def _escape_token(self, token):
         ret += u"\\%d;" % ord(c)
     return ret
 
-  # Regular expression for unescaping token strings
-  # '\u' is converted to '_'
-  # '\\' is converted to '\'
-  # '\213;' is converted to unichr(213)
-  _UNESCAPE_REGEX = re.compile(u'|'.join([r"\\u", r"\\\\", r"\\([0-9]+);"]))
-
   def _unescape_token(self, escaped_token):
     """Inverse of _escape_token().
 
@@ -519,14 +511,32 @@ def _unescape_token(self, escaped_token):
     Returns:
       token: a unicode string
     """
-    def match(m):
-      if m.group(1) is not None:
-        # Convert '\213;' to unichr(213)
-        try:
-          return unichr(int(m.group(1)))
-        except (ValueError, OverflowError) as _:
-          return ""
-      # Convert '\u' to '_' and '\\' to '\'
-      return u"_" if m.group(0) == u"\\u" else u"\\"
-    # Cut off the trailing underscore and apply the regex substitution
-    return self._UNESCAPE_REGEX.sub(match, escaped_token[:-1])
+    ret = u""
+    escaped_token = escaped_token[:-1]
+    pos = 0
+    while pos < len(escaped_token):
+      c = escaped_token[pos]
+      if c == "\\":
+        pos += 1
+        if pos >= len(escaped_token):
+          break
+        c = escaped_token[pos]
+        if c == u"u":
+          ret += u"_"
+          pos += 1
+        elif c == "\\":
+          ret += u"\\"
+          pos += 1
+        else:
+          semicolon_pos = escaped_token.find(u";", pos)
+          if semicolon_pos == -1:
+            continue
+          try:
+            ret += unichr(int(escaped_token[pos:semicolon_pos]))
+            pos = semicolon_pos + 1
+          except (ValueError, OverflowError) as _:
+            pass
+      else:
+        ret += c
+        pos += 1
+    return ret
diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# coding=utf-8
 """Tests for tensor2tensor.data_generators.tokenizer."""
 
 from __future__ import absolute_import
diff --git a/tensor2tensor/data_generators/wmt.py b/tensor2tensor/data_generators/wmt.py
diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py
diff --git a/tensor2tensor/utils/usr_dir.py b/tensor2tensor/utils/usr_dir.py