tensor2tensor/tensor2tensor/data_generators/desc2code.py at master · feiga/tensor2tensor

312 lines (250 loc) · 9.64 KB
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data generators for the Description2Code OpenAI data-set."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import random
import zipfile
# Dependency imports
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.utils import registry
import tensorflow as tf
# End-of-sentence marker.
EOS = text_encoder.EOS_ID
_DATASET_URL = "https://drive.google.com/uc?export=download&id=0Bz3fihKG133ceWNFQTQ5S0xhZUk"
_DATASET_FILENAME = "description2code_current.zip"
_DATASET_PB_PATH = "description2code_current/"
_DESC_DIR_NAME = "description"
_VOCAB_EN_FILENAME = "vocab.endefr"
_RE_CPP_INLINE_COMMENT = re.compile("//.*?\n")  # Compiled once
# Constant defined for a language problem
CodingPbConstants = collections.namedtuple("CodingPbConstants", [
    "code_dir_name",
    "vocab_filename",
    "filter_patterns",
    "target_space",
PB_PY = CodingPbConstants(
    code_dir_name="solutions_python",
    vocab_filename="vocab.py",
    filter_patterns=["#include", "# include", "import java."],
    target_space=problem.SpaceID.PY_TOK,
PB_CPP = CodingPbConstants(
    code_dir_name="solutions_c++",
    vocab_filename="vocab.cpp",
    filter_patterns=["import java."],
    target_space=problem.SpaceID.CPP_TOK,
# Struct containing a coding problem (contains the paths to the descriptions
# and code files)
CodingPbInfo = collections.namedtuple("CodingPbInfo", "desc_file, code_files")
class Desc2CodeProblem(problem.Text2TextProblem):
  """Base class for Description2Code problems."""
  @property
  def is_character_level(self):
    return False
  @property
  def num_shards(self):
    return 10
  @property
  def use_subword_tokenizer(self):
    return True
  @property
  def input_space_id(self):
    return problem.SpaceID.EN_TOK
  @property
  def target_space_id(self):
    return self.pb_constants.target_space
  @property
  def input_vocab_size(self):
    return 2**15  # 32k
  @property
  def target_vocab_size(self):
    return 2**12  # 4k
  @property
  def vocab_input_filename(self):
    return "{}.{}".format(_VOCAB_EN_FILENAME, self.input_vocab_size)
  @property
  def vocab_target_filename(self):
    return "{}.{}".format(
        self.pb_constants.vocab_filename, self.target_vocab_size)
  def preprocess_target(self, target):
    """Apply some preprocessing to the target.
    For instance, remove space/tabs.
      target (str): code source content
    Returns:
      the pre-processed string content
    return target
  def feature_encoders(self, data_dir):
    source_vocab_filename = os.path.join(data_dir, self.vocab_input_filename)
    target_vocab_filename = os.path.join(data_dir, self.vocab_target_filename)
    source_token = text_encoder.SubwordTextEncoder(source_vocab_filename)
    target_token = text_encoder.SubwordTextEncoder(target_vocab_filename)
    return {
        "inputs": source_token,
        "targets": target_token,
  def generator(self, data_dir, tmp_dir, train):
    # Called twice: for train and test
    # Get the list of the training samples (coding challenge samples)
    samples = list(generator_samples(tmp_dir, self.pb_constants))
    # Split between train and dev
    # Suffle to get problems from diverse sources (CodeChef and CodeForces) and
    # dificulties in each set.
    # Need to sort the samples first before shuffling (as walk() isn't
    # deterministic)
    samples.sort(key=lambda x: x.desc_file)  # in-place
    rng = random.Random(7531)  # Local fixed seed
    rng.shuffle(samples)  # in-place
    # Train: 5019/5228 problems
    # Dev: 209/5228 problems
    len_samples = len(samples)
    split = len_samples // 25
    samples = samples[split:] if train else samples[:split]
    tf.logging.info("Number of samples for {}: {}/{}".format(
        "train" if train else "dev",
        len(samples),
        len_samples
    def generator_samples_content(get_source, get_target):
      source, target = None, None
      # Iterate over the coding samples
      for sample in samples:
        if get_source:
          with tf.gfile.GFile(sample.desc_file, mode="r") as source_file:
            source = source_file.read()
        if get_target:
          # Each challenge can have multiple implementations (or none)
          for code_file in sample.code_files:
            with tf.gfile.GFile(code_file, mode="r") as target_file:
              target = target_file.read()
              target = self.preprocess_target(target)
            yield source, target
        elif sample.code_files:  # Only take the source if a target exists
          yield source, target
    def generator_target():
      for _, target in generator_samples_content(False, True):
        yield target.strip()
    # Generate vocab for both source and target
    source_vocab = generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_input_filename, self.input_vocab_size)
    target_vocab = generator_utils.get_or_generate_vocab_inner(
        data_dir=data_dir,
        vocab_filename=self.vocab_target_filename,
        vocab_size=self.target_vocab_size,
        generator=generator_target(),)
    # Yield the training and testing samples
    eos_list = [EOS]
    for source, target in generator_samples_content(True, True):
      source_ints = source_vocab.encode(source.strip()) + eos_list
      target_ints = target_vocab.encode(target.strip()) + eos_list
      yield {
          "inputs": source_ints,
          "targets": target_ints,
@registry.register_problem
class ProgrammingDesc2codePy(Desc2CodeProblem):
  """Description2Code for python problem."""
  @property
  def pb_constants(self):
    return PB_PY
  def preprocess_target(self, target):
    """Simple tab to space replacement."""
    return target.replace("\t", "    ")
@registry.register_problem
class ProgrammingDesc2codeCpp(Desc2CodeProblem):
  """Description2Code for C++ problem."""
  @property
  def pb_constants(self):
    return PB_CPP
  def preprocess_target(self, target):
    """Pre-process Cpp files."""
    target = re.sub(_RE_CPP_INLINE_COMMENT, " ", target)  # Remove comments
    # The regex rule is quite simple, So will fail if a // is inside a string,
    # and don't remove /* */ comments
    target = " ".join(target.split())  # Normalize all spaces
    return target
# Utils functions
def generator_samples(tmp_dir, pb_cst):
  """Generator for the dataset samples.
  If not present, download and extract the dataset.
    tmp_dir: path to the directory where to download the dataset.
    pb_cst: CodingPbConstants object defining paths
    A CodingPbInfo object containing the next challenge informations.
  # Step1: Download dataset (eventually)
  data_zip_path = generator_utils.maybe_download_from_drive(
      directory=tmp_dir,
      filename=_DATASET_FILENAME,
      url=_DATASET_URL,
  tf.logging.info("Data downloaded in: {}".format(data_zip_path))
  # Step2: Extract dataset
  # We could deduce _DATASET_PB_PATH from the zip file (instead of
  # hardcoded path)
  data_rootdir = os.path.join(tmp_dir, _DATASET_PB_PATH)
  if not tf.gfile.Exists(data_rootdir):
    with zipfile.ZipFile(data_zip_path, "r") as corpus_zip:
      corpus_zip.extractall(tmp_dir)
    # We could remove the extracted __MACOSX folder
    tf.logging.info("Data extracted in: {}".format(tmp_dir))
    tf.logging.info("Data already extracted in: {}".format(tmp_dir))
  # Step3: Extract the problems list on the extracted folder
  def contains_samples(subdir, dirs, files):  # pylint: disable=unused-argument
    """Check that the folder contains a problem."""
    return (
        _DESC_DIR_NAME in dirs and
        pb_cst.code_dir_name in dirs
  def next_sample(subdir, dirs, files):  # pylint: disable=unused-argument
    """Return the filenames of the problem."""
    # More could be extracted (like the expected inputs/outputs
    # pairs, the problem difficulty, the names of the algorithmic techniques
    # needed)
    desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt")
    code_files = []
    # As the dataset is noisy, the program deduce the language from the file
    # content.
    code_pattern = os.path.join(subdir, pb_cst.code_dir_name, "*.txt")
    for f in tf.gfile.Glob(code_pattern):
      with tf.gfile.GFile(f, mode="r") as target_file:
        # Hack to filter C++/Java files. In theory some python comments could
        # make the file be concidered as C++ but in practice the chance of
        # getting a false negative is low.
        content = target_file.read()
        if not any(p in content for p in pb_cst.filter_patterns):
          code_files.append(f)
    return CodingPbInfo(
        desc_file=desc_file,
        code_files=code_files
  # The dataset contains problem from two different sources (CodeChef
  # and CodeForces). Due to the limited number of samples, all problems from
  # both sources are merged
  for w in tf.gfile.Walk(data_rootdir):
    if contains_samples(*w):
      yield next_sample(*w)
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

desc2code.py

Latest commit

History

desc2code.py

File metadata and controls