tensor2tensor/tensor2tensor/models/attention_lm.py at master · MultiPath/tensor2tensor

169 lines (141 loc) · 5.6 KB
# Copyright 2017 Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Self-attention based language model.
Like transformer.py, but no encoder.

decoder: [Self-Attention, Feed-forward] x n
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
# Dependency imports
from six.moves import xrange  # pylint: disable=redefined-builtin
from tensor2tensor.models import common_attention
from tensor2tensor.models import common_hparams
from tensor2tensor.models import common_layers
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model
import tensorflow as tf
@registry.register_model
class AttentionLM(t2t_model.T2TModel):
  """Language model built from a decoder-only self-attention stack."""

  def model_fn_body(self, features, train):
    """Build the model body for one shard.

    Args:
      features: a dict; only the "targets" entry is read here.
      train: truthy while training; when falsy every dropout rate used by
        this model is set to zero.

    Returns:
      The output of the decoder stack with a size-1 axis inserted at
      position 2.
    """
    # Work on a shallow copy; the dropout overrides below are set on the copy.
    hparams = copy.copy(self._hparams)
    if not train:
      for rate_name in ("attention_dropout", "relu_dropout",
                        "residual_dropout"):
        setattr(hparams, rate_name, 0.)

    # Axis 2 of the targets is squeezed away here and re-added on the output.
    targets = tf.squeeze(features["targets"], 2)
    decoder_input, self_attention_bias = attention_lm_prepare_decoder(
        targets, hparams)

    def residual_fn(layer_input, layer_output):
      """Apply dropout to the sublayer output, add the input, layer-norm."""
      dropped = tf.nn.dropout(layer_output, 1.0 - hparams.residual_dropout)
      return common_layers.layer_norm(layer_input + dropped)

    # The second argument of tf.nn.dropout is a keep probability here.
    decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout)
    stack_output = attention_lm_decoder(
        decoder_input, residual_fn, self_attention_bias, hparams)
    return tf.expand_dims(stack_output, 2)
def attention_lm_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
    to implement masked attention and possibly biases for diagonal alignments
  """
  # The bias is sized from dimension 1 of `targets`.
  # NOTE(review): presumably that is the sequence-length axis -- confirm.
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  # NOTE(review): shift_left_3d presumably offsets the targets by one position
  # so each step is predicted from earlier tokens only -- confirm against
  # common_layers.
  decoder_input = common_layers.shift_left_3d(targets)
  # Only the "timing" setting adds a positional signal; any other value of
  # hparams.pos leaves the input untouched.
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
def attention_lm_decoder(decoder_input,
                         residual_fn,
                         decoder_self_attention_bias,
                         hparams,
                         name="decoder"):
  """A stack of attention_lm layers.

  Each of the hparams.num_hidden_layers layers applies self-attention and then
  a feed-forward block, combining each sublayer's output with its input via
  residual_fn.

  Args:
    decoder_input: a Tensor
    residual_fn: a function from (layer_input, layer_output) -> combined_output
    decoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensor
  """
  x = decoder_input
  # Summaries don't work in multi-problem setting yet.
  summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
  with tf.variable_scope(name):
    for layer in xrange(hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        # Self-attention sublayer: the layer input is the query and the
        # memory argument is None. residual_fn takes (layer_input,
        # layer_output), so x is passed as its first argument.
        x = residual_fn(
            x,
            common_attention.multihead_attention(
                x,
                None,
                decoder_self_attention_bias,
                # A channel setting of 0 falls back to hidden_size.
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                summaries=summaries,
                name="decoder_self_attention"))
        # Feed-forward sublayer applied to the attention output.
        x = residual_fn(x,
                        common_layers.conv_hidden_relu(
                            x,
                            hparams.filter_size,
                            hparams.hidden_size,
                            dropout=hparams.relu_dropout))
  # Callers (e.g. AttentionLM.model_fn_body) consume the stack output.
  return x
@registry.register_hparams
def attention_lm_base():
  """Base hyperparameter set for the attention language model.

  Returns:
    The object from common_hparams.basic_params1() with the values below
    assigned and the model-specific hyperparameters added.
  """
  hparams = common_hparams.basic_params1()

  # Values assigned directly as attributes, in order.
  assigned = (
      ("hidden_size", 1024),
      ("batch_size", 8192),
      ("max_length", 256),
      ("dropout", 0.0),
      ("clip_grad_norm", 0.),  # i.e. no gradient clipping
      ("optimizer_adam_epsilon", 1e-9),
      ("learning_rate_decay_scheme", "noam"),
      ("learning_rate", 1.0),
      ("learning_rate_warmup_steps", 1000),
      ("initializer_gain", 1.0),
      ("num_hidden_layers", 6),
      ("initializer", "uniform_unit_scaling"),
      ("weight_decay", 0.0),
      ("optimizer_adam_beta1", 0.9),
      ("optimizer_adam_beta2", 0.98),
      ("num_sampled_classes", 0),
      ("label_smoothing", 0.1),
      ("shared_embedding_and_softmax_weights", int(False)),
  )
  for key, value in assigned:
    setattr(hparams, key, value)

  # Hyperparameters registered through add_hparam, in order.
  added = (
      ("filter_size", 4096),
      # attention-related flags
      ("num_heads", 8),
      ("attention_key_channels", 0),
      ("attention_value_channels", 0),
      ("attention_dropout", 0.0),
      ("relu_dropout", 0.0),
      ("pos", "timing"),  # timing, none
      ("residual_dropout", 0.1),
  )
  for key, value in added:
    hparams.add_hparam(key, value)

  return hparams
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

attention_lm.py

Latest commit

History

attention_lm.py

File metadata and controls