diff --git a/PyTorch/LanguageModeling/Transformer-XL/README.md b/PyTorch/LanguageModeling/Transformer-XL/README.md index 1ab0a50d7..0ebf3a63a 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/README.md +++ b/PyTorch/LanguageModeling/Transformer-XL/README.md @@ -1113,7 +1113,11 @@ perplexity on the test dataset. ## Performance -The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference). +The performance measurements in this document were conducted at the time of +publication and may not reflect the performance achieved from NVIDIA’s latest +software release. For the most up-to-date performance measurements, go to +[NVIDIA Data Center Deep Learning Product +Performance](https://developer.nvidia.com/deep-learning-performance-training-inference). ### Benchmarking diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/mem_transformer_jit.py b/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/mem_transformer_jit.py index 4de68c6ff..aeaddfe74 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/mem_transformer_jit.py +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/mem_transformer_jit.py @@ -122,7 +122,7 @@ def forward(self, h, attn_mask=None, mems=None): head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) # [bsz x n_head x qlen x klen] - attn_score = torch.einsum('ibnd,jbnd->bnij', (head_q, head_k)) + attn_score = torch.einsum('ibnd,jbnd->bnij', head_q, head_k) attn_score.mul_(self.scale) if attn_mask is not None: if attn_mask.dim() == 2: @@ -135,7 +135,7 @@ def forward(self, h, attn_mask=None, mems=None): attn_prob = self.dropatt(attn_prob) # [bsz x n_head x qlen x klen] * [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] - attn_vec = torch.einsum('bnij,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, head_v) attn_vec = attn_vec.contiguous().view( attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) @@ -262,13 +262,13 @@ def forward(self, w, r, r_w_bias, r_r_bias, attn_mask, # compute attention score rw_head_q = w_head_q + r_w_bias # qlen x bsz x n_head x d_head - # AC = torch.einsum('ibnd,jbnd->bnij', (rw_head_q, w_head_k)) # bsz x n_head x qlen x klen + # AC = torch.einsum('ibnd,jbnd->bnij', rw_head_q, w_head_k) # bsz x n_head x qlen x klen rw_head_q = rw_head_q.view(qlen, bsz * self.n_head, self.d_head).permute(1, 0, 2) w_head_k = w_head_k.reshape(klen, bsz * self.n_head, self.d_head).permute(1, 2, 0) AC = torch.bmm(rw_head_q, w_head_k).view(bsz, self.n_head, qlen, klen) rr_head_q = w_head_q + r_r_bias - # BD = torch.einsum('ibnd,jnd->bnij', (rr_head_q, r_head_k)) # bsz x n_head x qlen x klen + # BD = torch.einsum('ibnd,jnd->bnij', rr_head_q, r_head_k) # bsz x n_head x qlen x klen rr_head_q = rr_head_q.permute(2, 1, 0, 3).reshape(self.n_head, bsz * qlen, self.d_head) r_head_k = r_head_k.permute(1, 2, 0).view(self.n_head, self.d_head, klen) BD = torch.bmm(rr_head_q, r_head_k).view(self.n_head, bsz, qlen, klen).permute(1, 0, 2, 3) @@ -290,7 +290,7 @@ def forward(self, w, r, r_w_bias, r_r_bias, attn_mask, attn_prob = self.dropatt(attn_prob) # compute attention vector - # attn_vec = torch.einsum('bnij,jbnd->ibnd', (attn_prob, w_head_v)) + # attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, w_head_v) attn_prob = attn_prob.view(bsz * self.n_head, qlen, klen) w_head_v = w_head_v.permute(1, 2, 0, 3).reshape(bsz * self.n_head, klen, self.d_head) attn_vec = torch.bmm(attn_prob, w_head_v).permute(1, 0, 2).view(qlen, bsz, self.n_head, self.d_head) @@ -358,11 +358,11 @@ def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): r_bias = r_bias.t() # compute attention score - rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head - AC = torch.einsum('ibnd,jbnd->bnij', (rw_head_q, w_head_k)) # bsz x n_head x qlen x klen - B_ = torch.einsum('ibnd,jnd->bnij', (w_head_q, r_emb)) # bsz x n_head x qlen x klen - D_ = r_bias[None, :, None, :] # 1 x n_head x 1 x klen + AC = torch.einsum('ibnd,jbnd->bnij', rw_head_q, w_head_k) # bsz x n_head x qlen x klen + B_ = torch.einsum('ibnd,jnd->bnij', w_head_q, r_emb) # bsz x n_head x qlen x klen + D_ = r_bias[None, :, None, :] # 1 x n_head x 1 x klen BD = self._rel_shift(B_ + D_) # [bsz x qlen x klen x n_head] @@ -381,7 +381,7 @@ def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): attn_prob = self.dropatt(attn_prob) # compute attention vector - attn_vec = torch.einsum('bnij,jbnd->ibnd', (attn_prob, w_head_v)) + attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec = attn_vec.contiguous().view( diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/proj_adaptive_softmax_jit.py b/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/proj_adaptive_softmax_jit.py index e50b2f0ba..5eb63482e 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/proj_adaptive_softmax_jit.py +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/inference/proj_adaptive_softmax_jit.py @@ -146,7 +146,7 @@ def _compute_logit(self, hidden, weight, bias, proj: Optional[torch.Tensor]): if proj is None: logit = F.linear(hidden, weight, bias=bias) else: - logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + logit = torch.einsum('bd,de,ev->bv', hidden, proj, weight.t()) if bias is not None: logit = logit + bias return logit diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/mem_transformer.py b/PyTorch/LanguageModeling/Transformer-XL/pytorch/mem_transformer.py index 5b8145ca6..39a25977c 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/mem_transformer.py +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/mem_transformer.py @@ -125,7 +125,7 @@ def forward(self, h, attn_mask=None, mems=None): head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) # [bsz x n_head x qlen x klen] - attn_score = torch.einsum('ibnd,jbnd->bnij', (head_q, head_k)) + attn_score = torch.einsum('ibnd,jbnd->bnij', head_q, head_k) attn_score.mul_(self.scale) if attn_mask is not None: if attn_mask.dim() == 2: @@ -138,7 +138,7 @@ def forward(self, h, attn_mask=None, mems=None): attn_prob = self.dropatt(attn_prob) # [bsz x n_head x qlen x klen] * [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] - attn_vec = torch.einsum('bnij,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, head_v) attn_vec = attn_vec.contiguous().view( attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) @@ -264,10 +264,10 @@ def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): # compute attention score rw_head_q = w_head_q + r_w_bias # qlen x bsz x n_head x d_head - AC = torch.einsum('ibnd,jbnd->bnij', (rw_head_q, w_head_k)) # bsz x n_head x qlen x klen + AC = torch.einsum('ibnd,jbnd->bnij', rw_head_q, w_head_k) # bsz x n_head x qlen x klen rr_head_q = w_head_q + r_r_bias - BD = torch.einsum('ibnd,jnd->bnij', (rr_head_q, r_head_k)) # bsz x n_head x qlen x klen + BD = torch.einsum('ibnd,jnd->bnij', rr_head_q, r_head_k) # bsz x n_head x qlen x klen BD = self._rel_shift(BD) # [bsz x n_head x qlen x klen] @@ -285,7 +285,7 @@ def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): attn_prob = self.dropatt(attn_prob) # compute attention vector - attn_vec = torch.einsum('bnij,jbnd->ibnd', (attn_prob, w_head_v)) + attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec = attn_vec.contiguous().view( @@ -350,11 +350,11 @@ def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): r_bias = r_bias.t() # compute attention score - rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head - AC = torch.einsum('ibnd,jbnd->bnij', (rw_head_q, w_head_k)) # bsz x n_head x qlen x klen - B_ = torch.einsum('ibnd,jnd->bnij', (w_head_q, r_emb)) # bsz x n_head x qlen x klen - D_ = r_bias[None, :, None, :] # 1 x n_head x 1 x klen + AC = torch.einsum('ibnd,jbnd->bnij', rw_head_q, w_head_k) # bsz x n_head x qlen x klen + B_ = torch.einsum('ibnd,jnd->bnij', w_head_q, r_emb) # bsz x n_head x qlen x klen + D_ = r_bias[None, :, None, :] # 1 x n_head x 1 x klen BD = self._rel_shift(B_ + D_) # [bsz x qlen x klen x n_head] @@ -372,7 +372,7 @@ def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): attn_prob = self.dropatt(attn_prob) # compute attention vector - attn_vec = torch.einsum('bnij,jbnd->ibnd', (attn_prob, w_head_v)) + attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec = attn_vec.contiguous().view( diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_base.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_base.sh index db542a835..b6f3a9d6b 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_base.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_base.sh @@ -1,5 +1,7 @@ #!/bin/bash +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]]; then echo 'Run training...' python train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_large.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_large.sh index 5db67bf76..539e3fae0 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_large.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_enwik8_large.sh @@ -1,5 +1,7 @@ #!/bin/bash +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]]; then echo 'Run training...' python train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_base.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_base.sh index e4aebeff1..b0a988d56 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_base.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_base.sh @@ -1,5 +1,7 @@ #!/bin/bash +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]]; then echo 'Run training...' python train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_large.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_large.sh index f8b330ae6..7ac4db2c7 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_large.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_lm1b_large.sh @@ -1,5 +1,7 @@ #!/bin/bash +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]]; then echo 'Run training...' python train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_multinode_wt103_large.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_multinode_wt103_large.sh index 35a531353..782d1a95e 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_multinode_wt103_large.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_multinode_wt103_large.sh @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]] || [[ $1 == 'all' ]]; then echo 'Run training...' python train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_base.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_base.sh index 7058f77bb..6a38fb66a 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_base.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_base.sh @@ -1,5 +1,7 @@ #!/bin/bash +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]]; then echo 'Run training...' python train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_large.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_large.sh index cfc84df1c..8309409d4 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_large.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_text8_large.sh @@ -1,5 +1,7 @@ #!/bin/bash +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]]; then echo 'Run training...' python train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_base.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_base.sh index 275da37d6..f68408db7 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_base.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_base.sh @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +export OMP_NUM_THREADS=1 + if [[ "$1" == 'train' ]]; then echo 'Run training...' python -m torch.distributed.launch --nproc_per_node="$2" train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_large.sh b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_large.sh index bead4587e..c0350cc48 100755 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_large.sh +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/run_wt103_large.sh @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +export OMP_NUM_THREADS=1 + if [[ $1 == 'train' ]]; then echo 'Run training...' python -m torch.distributed.launch --nproc_per_node="$2" train.py \ diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py b/PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py index 6dc994514..b86176fae 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py @@ -285,6 +285,13 @@ def parse_args(): if args.batch_size % args.batch_chunk != 0: raise RuntimeError('Batch size needs to be divisible by batch chunk') + if ( + args.local_batch_size is not None + and args.local_batch_size % args.batch_chunk != 0 + ): + raise RuntimeError('Local batch size needs to be divisible by ' + 'batch chunk') + if args.fp16 and args.amp == 'apex' and 'apex' not in sys.modules: raise RuntimeError( 'APEX AMP unavailable, install APEX or switch to pytorch AMP' @@ -444,8 +451,10 @@ def evaluate(eval_iter, model, args): for i, (data, target, seq_len, warm) in enumerate(eval_iter): if args.eval_max_steps > 0 and i >= args.eval_max_steps: break - loss, mems = model(data, target, mems) - loss = loss.float().mean() + enable_autocast = args.fp16 and args.amp == 'pytorch' + with torch.cuda.amp.autocast(enable_autocast): + loss, mems = model(data, target, mems) + loss = loss.float().mean().type_as(loss) if warm: # assert (mems is None) or mems.size(1) == model.mem_len total_loss += seq_len * loss.item() @@ -735,6 +744,9 @@ def main(): args.batch_size = world_size * args.local_batch_size logging.info(f'--local_batch_size was set, adjusting global batch size' f' to {args.batch_size} (local_batch_size * world_size)') + if args.batch_size % args.batch_chunk != 0: + raise RuntimeError('Batch size needs to be divisible by ' + 'batch chunk') if args.profile: try: diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/log_uniform_sampler.py b/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/log_uniform_sampler.py index 38a778072..6394b16e6 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/log_uniform_sampler.py +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/log_uniform_sampler.py @@ -70,9 +70,9 @@ def sample_logits(embedding, bias, labels, inputs, sampler): hit = (labels[:, :, None] == neg_samples).detach() true_logits = torch.einsum('ijk,ijk->ij', - [true_w, inputs]) + true_b - true_log_probs + true_w, inputs) + true_b - true_log_probs sample_logits = torch.einsum('lk,ijk->ijl', - [sample_w, inputs]) + sample_b - samp_log_probs + sample_w, inputs) + sample_b - samp_log_probs sample_logits.masked_fill_(hit, -1e30) logits = torch.cat([true_logits[:, :, None], sample_logits], -1) diff --git a/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/proj_adaptive_softmax.py b/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/proj_adaptive_softmax.py index e890a0686..f064bcbf5 100644 --- a/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/proj_adaptive_softmax.py +++ b/PyTorch/LanguageModeling/Transformer-XL/pytorch/utils/proj_adaptive_softmax.py @@ -112,7 +112,7 @@ def _compute_logit(self, hidden, weight, bias, proj): if proj is None: logit = F.linear(hidden, weight, bias=bias) else: - logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) + logit = torch.einsum('bd,de,ev->bv', hidden, proj, weight.t()) if bias is not None: logit = logit + bias return logit