diff --git a/PyTorch/SpeechRecognition/Jasper/.gitignore b/PyTorch/SpeechRecognition/Jasper/.gitignore
index 9dc2decd3..bb051c475 100755
--- a/PyTorch/SpeechRecognition/Jasper/.gitignore
+++ b/PyTorch/SpeechRecognition/Jasper/.gitignore
@@ -3,3 +3,7 @@ __pycache__
 results/
 datasets/
 checkpoints/
+
+*.swp
+*.swo
+*.swn
diff --git a/PyTorch/SpeechRecognition/Jasper/Dockerfile b/PyTorch/SpeechRecognition/Jasper/Dockerfile
index d8b491e58..18948e19f 100755
--- a/PyTorch/SpeechRecognition/Jasper/Dockerfile
+++ b/PyTorch/SpeechRecognition/Jasper/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3  
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
 FROM ${FROM_IMAGE_NAME}
 
 
diff --git a/PyTorch/SpeechRecognition/Jasper/README.md b/PyTorch/SpeechRecognition/Jasper/README.md
index 77fc475f1..9b2d08101 100644
--- a/PyTorch/SpeechRecognition/Jasper/README.md
+++ b/PyTorch/SpeechRecognition/Jasper/README.md
@@ -7,9 +7,10 @@ This repository provides scripts to train the Jasper model to achieve near state
    * [Model architecture](#model-architecture)
    * [Default configuration](#default-configuration)
    * [Feature support matrix](#feature-support-matrix)
- 	   * [Features](#features)
+       * [Features](#features)
    * [Mixed precision training](#mixed-precision-training)
- 	   * [Enabling mixed precision](#enabling-mixed-precision)
+       * [Enabling mixed precision](#enabling-mixed-precision)
+       * [Enabling TF32](#enabling-tf32)
    * [Glossary](#glossary)
 - [Setup](#setup)
    * [Requirements](#requirements)
@@ -23,31 +24,33 @@ This repository provides scripts to train the Jasper model to achieve near state
    * [Training process](#training-process)
    * [Inference process](#inference-process)
    * [Evaluation process](#evaluation-process)
-   * [Inference process with TensorRT](#inference-process-with-tensorrt)
-   * [Inference process with TensorRT Inference Server](#inference-process-with-tensorrt-inference-server)
+   * [Deploying Jasper using TensorRT](#deploying-jasper-using-tensorrt)
+   * [Deploying Jasper using Triton Inference Server](#deploying-jasper-using-triton-inference-server)
 - [Performance](#performance)
    * [Benchmarking](#benchmarking)
        * [Training performance benchmark](#training-performance-benchmark)
        * [Inference performance benchmark](#inference-performance-benchmark)
    * [Results](#results)
        * [Training accuracy results](#training-accuracy-results)
-           * [Training accuracy: NVIDIA DGX-1 (8x V100 32G)](#training-accuracy-nvidia-dgx-1-8x-v100-32G)
+           * [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
+           * [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
            * [Training stability test](#training-stability-test)
        * [Training performance results](#training-performance-results)
-         * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16G)
-         * [Training performance: NVIDIA DGX-1 (8x V100 32G)](#training-performance-nvidia-dgx-1-8x-v100-32G)
-         * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32G)
+         * [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
+         * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
+         * [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
+         * [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
        * [Inference performance results](#inference-performance-results)
-           * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16G)
-           * [Inference performance: NVIDIA DGX-1 (1x V100 32G)](#inference-performance-nvidia-dgx-1-1x-v100-32G)
-           * [Inference performance: NVIDIA DGX-2 (1x V100 32G)](#inference-performance-nvidia-dgx-2-1x-v100-32G)
+           * [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-gpu-1x-a100-40gb)
+           * [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
+           * [Inference performance: NVIDIA DGX-1 (1x V100 32GB)](#inference-performance-nvidia-dgx-1-1x-v100-32gb)
+           * [Inference performance: NVIDIA DGX-2 (1x V100 32GB)](#inference-performance-nvidia-dgx-2-1x-v100-32gb)
            * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
 - [Release notes](#release-notes)
    * [Changelog](#changelog)
    * [Known issues](#known-issues)
 
 ## Model overview
-
 This repository provides an implementation of the Jasper model in PyTorch from the paper `Jasper: An End-to-End Convolutional Neural Acoustic Model` [https://arxiv.org/pdf/1904.03288.pdf](https://arxiv.org/pdf/1904.03288.pdf).
 The Jasper model is an end-to-end neural acoustic model for automatic speech recognition (ASR) that provides near state-of-the-art results on LibriSpeech among end-to-end ASR models without any external data. The Jasper architecture of convolutional layers was designed to facilitate fast GPU inference, by allowing whole sub-blocks to be fused into a single GPU kernel. This is important for meeting strict real-time requirements of ASR systems in deployment.
 
@@ -60,9 +63,9 @@ This repository provides a simple implementation of Jasper with scripts for trai
 This includes data preparation scripts, training and inference scripts.
 Both training and inference scripts offer the option to use Automatic Mixed Precision (AMP) to benefit from Tensor Cores for better performance.
 
-In addition to providing the hyperparameters for training a model checkpoint, we publish a thorough inference analysis across different NVIDIA GPU platforms, for example, DGX-1, DGX-2 and T4.
+In addition to providing the hyperparameters for training a model checkpoint, we publish a thorough inference analysis across different NVIDIA GPU platforms, for example, DGX A100, DGX-1, DGX-2 and T4.
 
-This model is trained with mixed precision using Tensor Cores on NVIDIA Volta GPUs and evaluated on Volta and Turing GPUs. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
 
 The original paper takes the output of the Jasper acoustic model and shows results for 3 different decoding variations: greedy decoding, beam search with a 6-gram language model and beam search with further rescoring of the best ranked hypotheses with Transformer XL, which is a neural language model. Beam search and the rescoring with the neural language model scores are run on CPU and result in better word error rates compared to greedy decoding.
 This repository provides instructions to reproduce greedy decoding results. To run beam search or rescoring with TransformerXL, use the following scripts from the [openseq2seq](https://github.com/NVIDIA/OpenSeq2Seq) repository:
@@ -70,7 +73,6 @@ https://github.com/NVIDIA/OpenSeq2Seq/blob/master/scripts/decode.py
 https://github.com/NVIDIA/OpenSeq2Seq/tree/master/external_lm_rescore
 
 ### Model architecture
-
 Details on the model architecture can be found in the paper [Jasper: An End-to-End Convolutional Neural Acoustic Model](https://arxiv.org/pdf/1904.03288.pdf).
 
 |<img src="images/jasper_model.png" width="100%" height="40%"> | <img src="images/jasper_dense_residual.png" width="100%" height="40%">|
@@ -94,34 +96,30 @@ in time in order to process a shorter time sequence for efficiency. The Epilogue
 The paper achieves best results with Jasper 10x5 with dense residual connections, which is also the focus of this repository and is in the following referred to as Jasper Large.
 
 ### Default configuration
-
 The following features were implemented in this model:
 
 * GPU-supported feature extraction with data augmentation options [SpecAugment](https://arxiv.org/abs/1904.08779) and [Cutout](https://arxiv.org/pdf/1708.04552.pdf)
 * offline and online [Speed Perturbation](https://www.danielpovey.com/files/2015_interspeech_augmentation.pdf)
 * data-parallel multi-GPU training and evaluation
 * AMP with dynamic loss scaling for Tensor Core training
-* FP16 inference with AMP
-
+* FP16 inference
 
 Competitive training results and analysis is provided for the following Jasper model configuration
 
-| **Model** | **Number of Blocks**  | **Number of Subblocks**  | **Max sequence length**   | **Number of Parameters**  |
-|---    |---    |---    |---    |---    |
-| Jasper Large  | 10    | 5 | 16.7s | 333M  |
+|    **Model** | **Number of Blocks** | **Number of Subblocks** | **Max sequence length** | **Number of Parameters** |
+|--------------|----------------------|-------------------------|-------------------------|--------------------------|
+| Jasper Large |                   10 |                       5 |                  16.7 s |                    333 M |
 
 
 ### Feature support matrix
-
 The following features are supported by this model.
 
 | **Feature**   | **Jasper**    |
-|---    |---    |
+|---------------|---------------|
 |[Apex AMP](https://nvidia.github.io/apex/amp.html) | Yes |
 |[Apex DistributedDataParallel](https://nvidia.github.io/apex/parallel.html#apex.parallel.DistributedDataParallel) | Yes |
 
 #### Features
-
 [Apex AMP](https://nvidia.github.io/apex/amp.html) - a tool that enables Tensor Core-accelerated training. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
 
 [Apex
@@ -136,15 +134,13 @@ smaller gradient transfers to reduce the total number of transfers required.
 
 
 ### Mixed precision training
-
-*Mixed precision* is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+*Mixed precision* is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
 
 1. Porting the model to use the FP16 data type where appropriate.
 2. Adding loss scaling to preserve small gradient values.
 
 The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
 
-
 For information about:
 * How to train using mixed precision, see the[Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
 * Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
@@ -152,10 +148,7 @@ For information about:
 
 
 #### Enabling mixed precision
-
-For training, mixed precision can be enabled by setting the flag: `train.py --fp16`. You can change this behavior and execute the training in
-single precision by removing the `--fp16` flag for the `train.py` training
-script. For example, in the bash scripts `scripts/train.sh`, `scripts/inference.sh`, etc. the precision can be specified with the variable `PRECISION` by setting it to either `PRECISION=’fp16’` or  `PRECISION=’fp32’`.
+For training, mixed precision can be enabled by setting the flag: `train.py --amp`. When using the bash helper scripts (`scripts/train.sh`, `scripts/inference.sh`, etc.), mixed precision can be enabled with the environment variable `AMP=true`.
 
 Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
 (AMP) library from [APEX](https://github.com/NVIDIA/apex) that casts variables
@@ -176,12 +169,12 @@ leverage Tensor Cores performance.
 The following steps were needed to enable mixed precision training in Jasper:
 
 * Import AMP from APEX (file: `train.py`):
-```
+```python
 from apex import amp
 ```
 
 * Initialize AMP and wrap the model and the optimizer
-```
+```python
    model, optimizer = amp.initialize(
      min_loss_scale=1.0,
      models=model,
@@ -191,46 +184,53 @@ from apex import amp
 ```
 
 * Apply `scale_loss` context manager
-```
+```python
 with amp.scale_loss(loss, optimizer) as scaled_loss:
     scaled_loss.backward()
 ```
 
+#### Enabling TF32
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
 
-### Glossary
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
 
-Acoustic model:
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
+
+
+### Glossary
+**Acoustic model**
 Assigns a probability distribution over a vocabulary of characters given an audio frame.
 
-Language Model:
+**Language Model**
 Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence.
 
-Pre-training:
+**Pre-training**
 Training a model on vast amounts of data on the same (or different) task to build general understandings.
 
-Automatic Speech Recognition (ASR):
+**Automatic Speech Recognition (ASR)**
 Uses both acoustic model and language model to output the transcript of an input audio signal.
 
 
 ## Setup
-
 The following section lists the requirements in order to start training and evaluating the Jasper model.
 
 ### Requirements
-
-This repository contains a `Dockerfile` which extends the PyTorch 19.10-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+This repository contains a `Dockerfile` which extends the PyTorch 20.06-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
-* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
+* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* Supported GPUs:
+    - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+    - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
+    - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
 
 Further required python packages are listed in `requirements.txt`, which are automatically installed with the Docker container built. To manually install them, run
 ```bash
 pip install -r requirements.txt
 ```
 
-
-
 For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
 
 * [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
@@ -242,7 +242,7 @@ For those unable to use the PyTorch NGC container, to set up the required enviro
 
 ## Quick Start Guide
 
-To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the Jasper model on the Librispeech dataset. For details concerning training and inference, see [Advanced](#Advanced) section.
+To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the Jasper model on the Librispeech dataset. For details concerning training and inference, see [Advanced](#Advanced) section.
 
 1. Clone the repository.
 ```bash
@@ -325,7 +325,7 @@ Make sure the downloaded and preprocessed dataset is located at `<DATA_DIR>/Libr
 ```bash
 bash scripts/train.sh [OPTIONS]
 ```
-By default, this will use automatic mixed precision, a batch size of 64 and run on a total of 8 GPUs. The hyperparameters are tuned for DGX-1 32GB 8x V100 GPUs and will require adjustment for 16GB GPUs (e.g. by using more gradient accumulation steps)
+By default, automatic mixed precision is disabled, batch size is 64 over two gradient accumulation steps, and the recipe is run on a total of 8 GPUs. The hyperparameters are tuned for a GPU with at least 32GB of memory and will require adjustment for 16GB GPUs (e.g., by lowering batch size and using more gradient accumulation steps).
 
 More details on available [OPTIONS] can be found in [Parameters](#parameters) and [Training process](#training-process).
 
@@ -345,12 +345,12 @@ More details on available [OPTIONS] can be found in [Parameters](#parameters) an
 
 Inside the container, use the following script to run inference.
  Make sure the downloaded and preprocessed dataset is located at `<DATA_DIR>/LibriSpeech` on the host (see Step 3), which corresponds to `/datasets/LibriSpeech` inside the container.
-A pretrained model checkpoint can be downloaded from `NGC model repository`[https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16].
+A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16).
 
 ```bash
 bash scripts/inference.sh [OPTIONS]
 ```
-By default this will use full precision, a batch size of 64 and run on a single GPU.
+By default this will use single precision, a batch size of 64 and run on a single GPU.
 
 More details on available [OPTIONS] can be found in [Parameters](#parameters) and [Inference process](#inference-process).
 
@@ -360,7 +360,6 @@ More details on available [OPTIONS] can be found in [Parameters](#parameters) an
 The following sections provide greater details of the dataset, running training and inference, and getting training and inference results.
 
 
-
 ### Scripts and sample code
 In the `root` directory, the most important files are:
 * `train.py` - Serves as entry point for training
@@ -384,29 +383,30 @@ The `scripts/` folder encapsulates all the one-click scripts required for runnin
 
 
 Other folders included in the `root` directory are:
-* `notebooks/` - Contains Jupyter notebooks and example audio files
-* `configs/` - Model configurations
-* `utils/` - Contains the necessary files for data download and  processing
-* `parts/` - Contains the necessary files for data pre-processing
+* `notebooks/` - Jupyter notebooks and example audio files
+* `configs/`   - model configurations
+* `utils/`     - data downloading and common routines
+* `parts/`     - data pre-processing
 
 ### Parameters
 
+Parameters can be set as environment variables or passed as positional arguments.
+
 The complete list of available parameters for `scripts/train.sh` script contains:
 ```bash
  DATA_DIR: directory of dataset. (default: '/datasets/LibriSpeech')
  MODEL_CONFIG: relative path to model configuration. (default: 'configs/jasper10x5dr_sp_offline_specaugment.toml')
  RESULT_DIR: directory for results, logs, and created checkpoints. (default: '/results')
-CHECKPOINT: model checkpoint to continue training from. Model checkpoint is a dictionary object that contains apart from the model weights the optimizer state as well as the epoch number. If CHECKPOINT is "none" , training starts from scratch. (default: "none")
- CREATE_LOGFILE: boolean that indicates whether to create a training log that will be stored in `$RESULT_DIR`. (default: "true")
- CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: 'true')
+ CHECKPOINT: model checkpoint to continue training from. Model checkpoint is a dictionary object that contains apart from the model weights the optimizer state as well as the epoch number. If CHECKPOINT is not set, training starts from scratch. (default: "")
+ CREATE_LOGFILE: boolean that indicates whether to create a training log that will be stored in `$RESULT_DIR`. (default: true)
+ CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: true)
  NUM_GPUS: number of GPUs to use. (default: 8)
- PRECISION: options are fp32 and fp16 with AMP. (default: 'fp16')
+ AMP: if set to `true`, enables automatic mixed precision (default: false)
  EPOCHS: number of training epochs. (default: 400)
  SEED: seed for random number generator and used for ensuring reproducibility. (default: 6)
  BATCH_SIZE: data batch size. (default: 64)
  LEARNING_RATE: Initial learning rate. (default: 0.015)
- GRADIENT_ACCUMULATION_STEPS: number of gradient accumulation steps until optimizer updates weights. (default: 1)
- LAUNCH_OPT: additional launch options. (default: "none")
+ GRADIENT_ACCUMULATION_STEPS: number of gradient accumulation steps until optimizer updates weights. (default: 2)
 ```
 
 The complete list of available parameters for `scripts/inference.sh` script contains:
@@ -416,13 +416,13 @@ DATASET: name of dataset to use. (default: 'dev-clean')
 MODEL_CONFIG: model configuration. (default: 'configs/jasper10x5dr_sp_offline_specaugment.toml')
 RESULT_DIR: directory for results and logs. (default: '/results')
 CHECKPOINT: model checkpoint path. (required)
-CREATE_LOGFILE: boolean that indicates whether to create a log file that will be stored in `$RESULT_DIR`. (default: "true")
-CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: 'false')
-PRECISION: options are fp32 and fp16 with AMP. (default: 'fp32')
+CREATE_LOGFILE: boolean that indicates whether to create a log file that will be stored in `$RESULT_DIR`. (default: true)
+CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: false)
+AMP: if set to `true`, enables FP16 inference with AMP (default: false)
 NUM_STEPS: number of inference steps. If -1 runs inference on entire dataset. (default: -1)
 SEED: seed for random number generator and useful for ensuring reproducibility. (default: 6)
 BATCH_SIZE: data batch size.(default: 64)
-MODELOUTPUT_FILE: destination path for serialized model output with binary protocol. If 'none' does not save model output. (default: 'none')
+LOGITS_FILE: destination path for serialized model output with binary protocol. If 'none' does not save model output. (default: 'none')
 PREDICTION_FILE: destination path for saving predictions. If 'none' does not save predictions. (default: '${RESULT_DIR}/${DATASET}.predictions)
 ```
 
@@ -433,10 +433,10 @@ DATASET: name of dataset to use.(default: 'dev-clean')
 MODEL_CONFIG: model configuration.(default: 'configs/jasper10x5dr_sp_offline_specaugment.toml')
 RESULT_DIR: directory for results and logs. (default: '/results')
 CHECKPOINT: model checkpoint path. (required)
-CREATE_LOGFILE: boolean that indicates whether to create a log file that will be stored in `$RESULT_DIR`. (default: 'true')
-CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: 'false')
+CREATE_LOGFILE: boolean that indicates whether to create a log file that will be stored in `$RESULT_DIR`. (default: true)
+CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: false)
 NUM_GPUS: number of GPUs to run evaluation on (default: 1)
-PRECISION: options are fp32 and fp16 with AMP.(default: 'fp32')
+AMP: if set to `true`, enables FP16 with AMP (default: false)
 NUM_STEPS: number of inference steps per GPU. If -1 runs inference on entire dataset (default: -1)
 SEED: seed for random number generator and useful for ensuring reproducibility. (default: 0)
 BATCH_SIZE: data batch size.(default: 64)
@@ -454,22 +454,20 @@ The complete list of available parameters for `scripts/train_benchmark.sh` scrip
 DATA_DIR: directory of dataset.(default: '/datasets/LibriSpeech')
 MODEL_CONFIG: model configuration. (default: 'configs/jasper10x5dr_sp_offline_specaugment.toml')
 RESULT_DIR: directory for results and logs. (default: '/results')
-CREATE_LOGFILE: boolean that indicates whether to create a log file that will be stored in `$RESULT_DIR`. (default: 'true')
-CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: 'true')
+CREATE_LOGFILE: boolean that indicates whether to create a log file that will be stored in `$RESULT_DIR`. (default: true)
+CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: true)
 NUM_GPUS: number of GPUs to use. (default: 8)
-PRECISION: options are fp32 and fp16 with AMP. (default: 'fp16')
+AMP: if set to `true`, enables automatic mixed precision with AMP (default: false)
 NUM_STEPS: number of training iterations. If -1 runs full training for  400 epochs. (default: -1)
 MAX_DURATION: filters out input audio data that exceed a maximum number of seconds. This ensures that when all filtered audio samples are padded to maximum length that length will stay under this specified threshold (default: 16.7)
 SEED: seed for random number generator and useful for ensuring reproducibility. (default: 0)
-BATCH_SIZE: data batch size.(default: 64)
+BATCH_SIZE: data batch size.(default: 32)
 LEARNING_RATE: Initial learning rate. (default: 0.015)
 GRADIENT_ACCUMULATION_STEPS: number of gradient accumulation steps until optimizer updates weights. (default: 1)
 PRINT_FREQUENCY: number of iterations after which training progress is printed. (default: 1)
 ```
 
-
 ### Command-line options
-
 To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option with the Python file, for example:
 
 ```bash
@@ -478,15 +476,13 @@ python inference.py --help
 ```
 
 ### Getting the data
-
 The Jasper model was trained on LibriSpeech dataset. We use the concatenation of `train-clean-100`, `train-clean-360` and `train-other-500` for training and `dev-clean` for validation.
 
 This repository contains the `scripts/download_librispeech.sh` and `scripts/preprocess_librispeech.sh` scripts which will automatically download and preprocess the training, test and development datasets. By default, data will be downloaded to the `/datasets/LibriSpeech` directory, a minimum of 500GB free space is required for download and preprocessing, the final preprocessed dataset is 320GB.
 
 
 #### Dataset guidelines
-
-The `scripts/preprocess_librispeech.sh` script converts the input audio files to WAV format with a sample rate of 16kHz, target transcripts are striped from whitespace characters, then lower-cased. For `train-clean-100`, `train-clean-360` and `train-other-500` it also creates speed perturbed versions with rates of 0.9 and 1.1 for data augmentation.
+The `scripts/preprocess_librispeech.sh` script converts the input audio files to WAV format with a sample rate of 16kHz, target transcripts are stripped from whitespace characters, then lower-cased. For `train-clean-100`, `train-clean-360` and `train-other-500` it also creates speed perturbed versions with rates of 0.9 and 1.1 for data augmentation.
 
 After preprocessing, the script creates JSON files with output file paths, sample rate, target transcript and other metadata. These JSON files are used by the training script to identify training and validation datasets.
 
@@ -499,11 +495,11 @@ The training is performed using `train.py` script along with parameters defined
 The `scripts/train.sh` script runs a job on a single node that trains the Jasper model from scratch using LibriSpeech as training data. To make training more efficient, we discard audio samples longer than 16.7 seconds from the training dataset, the total number of these samples is less than 1%. Such filtering does not degrade accuracy, but it allows us to decrease the number of time steps in a batch, which requires less GPU memory and increases training speed.
 Apart from the default arguments as listed in the [Parameters](#parameters) section, by default the training script:
 
-* Runs on 8 32GB V100 GPUs with training and evaluation batch size 64
-* Uses FP16 precision with AMP optimization level O1 (default)
-* Enables cudnn benchmark to make mixed precision training faster
+* Runs on 8 GPUs with at least 32GB of memory and training/evaluation batch size 64, split over two gradient accumulation steps
+* Uses TF32 precision (A100 GPU) or FP32 (other GPUs)
 * Trains on the concatenation of all 3 LibriSpeech training datasets and evaluates on the LibriSpeech dev-clean dataset
-* Uses a seed of 6
+* Maintains an exponential moving average of parameters for evaluation
+* Has cudnn benchmark enabled
 * Runs for 400 epochs
 * Uses an initial learning rate of 0.015 and polynomial (quadratic) learning rate decay
 * Saves a checkpoint every 10 epochs
@@ -516,97 +512,89 @@ Apart from the default arguments as listed in the [Parameters](#parameters) sect
 * Pads each sequence in a batch to the same length (smallest multiple of 16 that is at least the length of the longest sequence in the batch)
 * Uses masked convolutions and dense residuals as described in the paper
 * Uses weight decay of 0.001
-* Uses 1 gradient accumulation step
 * Uses [Novograd](https://arxiv.org/pdf/1905.11286.pdf) as optimizer with betas=(0.95, 0)
 
-
-These parameters will match the greedy WER [Results](#results) of the Jasper paper on a DGX1 with 32GB V100 GPUs.
+Enabling AMP permits batch size 64 with one gradient accumulation step. Such a setup will match the greedy WER [Results](#results) of the Jasper paper on a DGX-1 with 32GB V100 GPUs.
 
 ### Inference process
-
 Inference is performed using the `inference.py` script along with parameters defined in `scripts/inference.sh`.
 The `scripts/inference.sh` script runs the job on a single GPU, taking a pre-trained Jasper model checkpoint and running it on the specified dataset.
 Apart from the default arguments as listed in the [Parameters](#parameters) section by default the inference script:
 
 * Evaluates on the LibriSpeech dev-clean dataset
-* Uses full precision
 * Uses a batch size of 64
 * Runs for 1 epoch and prints out the final word error rate
 * Creates a log file with progress and results which will be stored in the results folder
-* Pads each sequence in a batch to the same length (smallest multiple of 16 that is at least the length of the longest sequence in the batch)
+* Pads each sequence in a batch to the same length (smallest multiple of 16 that is at least the length of the longest sequence in the batch)
 * Does not use data augmentation
 * Does greedy decoding and saves the transcription in the results folder
 * Has the option to save the model output tensors for more complex decoding, for example, beam search
 * Has cudnn benchmark disabled
 
 ### Evaluation process
-
 Evaluation is performed using the `inference.py` script along with parameters defined in `scripts/evaluation.sh`.
 The `scripts/evaluation.sh` script runs a job on a single GPU, taking a pre-trained Jasper model checkpoint and running it on the specified dataset.
 Apart from the default arguments as listed in the [Parameters](#parameters) section, by default the evaluation script:
 
 * Uses a batch size of 64
 * Evaluates the LibriSpeech dev-clean dataset
-* Uses full precision
 * Runs for 1 epoch and prints out the final word error rate
 * Creates a log file with progress and results which is saved in the results folder
 * Pads each sequence in a batch to the same length (smallest multiple of 16 that is at least the length of the longest sequence in the batch)
 * Does not use data augmentation
 * Has cudnn benchmark disabled
 
-
-### Inference Process with TensorRT
+### Deploying Jasper using TensorRT
 NVIDIA TensorRT is a platform for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high-throughput for deep learning inference applications. Jasper’s architecture, which is of deep convolutional nature, is designed to facilitate fast GPU inference. After optimizing the compute-intensive acoustic model with NVIDIA TensorRT, inference throughput increased by up to 1.8x over native PyTorch. 
 More information on how to perform inference using TensorRT and speed up comparison between TensorRT and native PyTorch can be found in the subfolder [./trt/README.md](trt/README.md)
 
-### Inference Process with TensorRT Inference Server
-The NVIDIA TensorRT Inference Server provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
+### Deploying Jasper using Triton Inference Server
+The NVIDIA Triton Inference Server provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
 More information on how to perform inference using TensorRT Inference Server with different model backends can be found in the subfolder [./trtis/README.md](trtis/README.md)
 
 
 ## Performance
 
 ### Benchmarking
-
 The following section shows how to run benchmarks measuring the model performance in training and inference modes.
 
 #### Training performance benchmark
-
-To benchmark the training performance on a specific batch size and audio length, run:
+To benchmark the training performance on a specific batch size and audio length for `NUM_STEPS` steps, run:
 
 ```bash
-bash scripts/train_benchmark.sh <DATA_DIR> <MODEL_CONFIG> <RESULT_DIR> <CREATE_LOGFILE> <CUDNN_BENCHMARK> <NUM_GPUS> <PRECISION> <NUM_STEPS> <MAX_DURATION> <SEED> <BATCH_SIZE>
-<LEARNING_RATE> <GRADIENT_ACCUMULATION_STEPS> <PRINT_FREQUENCY>
+export NUM_STEPS=<NUM_STEPS>
+export MAX_DURATION=<DURATION>
+export BATCH_SIZE=<BATCH_SIZE>
+bash scripts/train_benchmark.sh
 ```
 
-By default, this script runs 400 epochs on the configuration `configs/jasper10x5dr_sp_offline_specaugment.toml` using full precision
-and batch size 64 on a single node with 8x 32GB V100 GPUs cards.
-By default, `NUM_STEPS=-1` means training is run for 400 EPOCHS. If `$NUM_STEPS > 0` is specified, training is only run for a user-defined number of iterations. Audio samples longer than `MAX_DURATION` are filtered out, the remaining ones are padded to this duration such that all batches have the same length. At the end of training the script saves the model checkpoint to the results folder, runs evaluation on LibriSpeech dev-clean dataset, and prints out information such as average training latency performance in seconds, average training throughput in sequences per second, final training loss, final training WER, evaluation loss and evaluation WER.
-
+By default, this script runs 400 epochs on the configuration `configs/jasper10x5dr_sp_offline_specaugment.toml`
+using batch size 32 on a single node with 8x GPUs with at least 32GB of memory.
+By default, `NUM_STEPS=-1` means training is run for 400 EPOCHS. If `$NUM_STEPS > 0` is specified, training is only run for a user-defined number of iterations. Audio samples longer than `MAX_DURATION` are filtered out, the remaining ones are padded to this duration such that all batches have the same length. At the end of training the script saves the model checkpoint to the results folder, runs evaluation on LibriSpeech dev-clean dataset, and prints out information such as average training latency performance in seconds, average training throughput in sequences per second, final training loss, final training WER, evaluation loss and evaluation WER.
 
 
 #### Inference performance benchmark
-
 To benchmark the inference performance on a specific batch size and audio length, run:
 
 ```bash
-bash scripts/inference_benchmark.sh <DATA_DIR> <DATASET> <MODEL_CONFIG> <RESULT_DIR> <CHECKPOINT> <CREATE_LOGFILE> <CUDNN_BENCHMARK> <PRECISION> <NUM_GPUS> <MAX_DURATION>
-<SEED> <BATCH_SIZE>
+bash scripts/inference_benchmark.sh
 ```
-By default, the script runs on a single GPU and evaluates on the entire dataset using the model configuration `configs/jasper10x5dr_sp_offline_specaugment.toml`, full precision, cudnn benchmark for faster fp16 inference and batch size 64.
+By default, the script runs on a single GPU and evaluates on the entire dataset using the model configuration `configs/jasper10x5dr_sp_offline_specaugment.toml` and batch size 32.
 By default, `MAX_DURATION` is set to 36 seconds, which covers the maximum audio length. All audio samples are padded to this length. The script prints out `MAX_DURATION`, `BATCH_SIZE` and latency performance in milliseconds per batch.
 
-
+Adjustments can be made with environment variables, for example:
+```bash
+export SEED=42
+export BATCH_SIZE=1
+bash scripts/inference_benchmark.sh
+```
 
 ### Results
-
 The following sections provide details on how we achieved our performance and accuracy in training and inference.
 All results are trained on 960 hours of LibriSpeech with a maximum audio length of 16.7s. The training is evaluated
 on LibriSpeech dev-clean, dev-other, test-clean, test-other.
 The results for Jasper Large's word error rate from the original paper after greedy decoding are shown below:
 
-
-
 | **Number of GPUs**    |  **dev-clean WER** | **dev-other WER**| **test-clean WER**| **test-other WER**
 |---    |---    |---    |---    |---    |
 |8  |   3.64|   11.89| 3.86 | 11.95
@@ -614,204 +602,241 @@ The results for Jasper Large's word error rate from the original paper after gre
 
 #### Training accuracy results
 
-##### Training accuracy: NVIDIA DGX-1 (8x V100 32G)
+##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 20.06-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs.
 
-Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.10-py3 NGC container with NVIDIA DGX-1 with (8x V100 32G) GPUs.
-The following tables report the word error rate(WER) of the acoustic model with greedy decoding on all LibriSpeech dev and test datasets for mixed precision training.
+| **Number of GPUs** | **Batch size per GPU** | **Precision** | **dev-clean WER** | **dev-other WER** | **test-clean WER** | **test-other WER** | **Time to train** | **Time to train speedup (TF32 to mixed precision)** |
+|-----|-----|-------|-------|-------|------|-------|-------|-----|
+|   8 |  64 | mixed |  3.53 | 11.11 | 3.75 | 11.07 | 60 h  | 1.9 |
+|   8 |  64 |  TF32 |  3.55 | 11.30 | 3.81 | 11.17 | 115 h |  -  |
 
-FP16 (seed #6)
+For each precision, we show the best of 8 runs chosen based on dev-clean WER. For TF32, two gradient accumulation steps have been used.
 
-| **Number of GPUs**    | **Batch size per GPU**    | **dev-clean WER** | **dev-other WER**| **test-clean WER**| **test-other WER**| **Total time to train with FP16 (Hrs)** |
-|---    |---    |---    |---    |---    |---    |---    |
-|8 |64| 3.51|11.14|3.74|11.06|100
-
-FP32 training matches the results of mixed precision training and takes approximately 330 hours.
+##### Training accuracy: NVIDIA DGX-1 (8x V100 32GB)
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 20.06-py3 NGC container on NVIDIA DGX-1 (8x V100 32GB) GPUs.
+The following tables report the word error rate (WER) of the acoustic model with greedy decoding on all LibriSpeech dev and test datasets for mixed precision training.
 
+| **Number of GPUs** | **Batch size per GPU** | **Precision** | **dev-clean WER** | **dev-other WER** | **test-clean WER** | **test-other WER** | **Time to train** | **Time to train speedup (FP32 to mixed precision)** |
+|-----|-----|-------|-------|-------|------|-------|-------|-----|
+|   8 |  64 | mixed |  3.49 | 11.22 | 3.74 | 10.94 | 105 h | 3.1 |
+|   8 |  64 |  FP32 |  3.65 | 11.47 | 3.86 | 11.30 | 330 h |  -  |
 
+We show the best of 5 runs (mixed precision) and 2 runs (FP32) chosen based on dev-clean WER. For FP32, two gradient accumulation steps have been used.
 
 ##### Training stability test
-
 The following table compares greedy decoding word error rates across 8 different training runs with different seeds for mixed precision training.
 
-| **FP16, 8x GPUs** | **seed #1** | **seed #2** | **seed #3** | **seed #4** | **seed #5** | **seed #6** | **seed #7** | **seed #8** | **mean** | **std** |
-|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
-|dev-clean|3.74|3.75|3.77|3.68|3.75|3.51|3.71|3.58|3.69|0.09
-|dev-other|11.56|11.62|11.5|11.36|11.62|11.14|11.8|11.3|11.49|0.21
-|test-clean|3.9|3.95|3.88|3.79|3.95|3.74|4.03|3.85|3.89|0.09
-|test-other|11.47|11.54|11.51|11.29|11.54|11.06|11.68|11.29|11.42|0.20
-
-
+| **DGX A100, FP16, 8x GPU**   |   **Seed #1** |   **Seed #2** |   **Seed #3** |   **Seed #4** |   **Seed #5** |   **Seed #6** |   **Seed #7** |   **Seed #8** |   **Mean** |   **Std** |
+|-----------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|-----:|
+| dev-clean  |  3.69 |  3.71 |  3.64 |  3.53 |  3.71 |  3.66 |  3.77 |  3.70 |  3.68 | 0.07 |
+| dev-other  | 11.39 | 11.65 | 11.46 | 11.11 | 11.23 | 11.18 | 11.43 | 11.60 | 11.38 | 0.19 |
+| test-clean |  3.97 |  3.96 |  3.81 |  3.75 |  3.90 |  3.82 |  3.93 |  3.82 |  3.87 | 0.08 |
+| test-other | 11.27 | 11.34 | 11.40 | 11.07 | 11.24 | 11.29 | 11.58 | 11.58 | 11.35 | 0.17 |
+
+| **DGX A100, TF32, 8x GPU**   |   **Seed #1** |   **Seed #2** |   **Seed #3** |   **Seed #4** |   **Seed #5** |   **Seed #6** |   **Seed #7** |   **Seed #8** |   **Mean** |   **Std** |
+|-----------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|-----:|
+| dev-clean  |  3.56 |  3.60 |  3.60 |  3.55 |  3.65 |  3.57 |  3.89 |  3.67 |  3.64 | 0.11 |
+| dev-other  | 11.27 | 11.41 | 11.65 | 11.30 | 11.51 | 11.11 | 12.18 | 11.50 | 11.49 | 0.32 |
+| test-clean |  3.80 |  3.79 |  3.88 |  3.81 |  3.94 |  3.82 |  4.13 |  3.85 |  3.88 | 0.11 |
+| test-other | 11.40 | 11.26 | 11.47 | 11.17 | 11.36 | 11.16 | 12.15 | 11.46 | 11.43 | 0.32 |
+
+| **DGX-1 32GB, FP16, 8x GPU**   |   **Seed #1** |   **Seed #2** |   **Seed #3** |   **Seed #4** |   **Seed #5** |   **Mean** |   **Std** |
+|-----------:|------:|------:|------:|------:|------:|------:|-----:|
+| dev-clean  |  3.69 |  3.75 |  3.63 |  3.86 |  3.49 |  3.68 | 0.14 |
+| dev-other  | 11.35 | 11.63 | 11.60 | 11.68 | 11.22 | 11.50 | 0.20 |
+| test-clean |  3.90 |  3.84 |  3.94 |  3.96 |  3.74 |  3.88 | 0.09 |
+| test-other | 11.17 | 11.45 | 11.31 | 11.60 | 10.94 | 11.29 | 0.26 |
 
 #### Training performance results
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 20.06-py3 NGC container. Performance (in sequences per second) is the steady-state throughput.
 
-Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 19.10-py3 NGC container. Performance (in sequences per second) is the steady-state throughput.
+##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
+| **GPUs** | **Batch size / GPU** | **Throughput - TF32** | **Throughput - mixed precision** | **Throughput speedup (TF32 to mixed precision)** | **Weak scaling - TF32** | **Weak scaling - mixed precision** |
+|--:|---:|------:|-------:|-----:|-----:|-----:|
+| 1 | 32 |  36.09 |  69.33 | 1.92 | 1.00 | 1.00 |
+| 4 | 32 | 143.05 | 264.91 | 1.85 | 3.96 | 3.82 |
+| 8 | 32 | 285.25 | 524.33 | 1.84 | 7.90 | 7.56 |
 
-##### Training performance: NVIDIA DGX-1 (8x V100 16G)
+| **GPUs** | **Batch size / GPU** | **Throughput - TF32** | **Throughput - mixed precision** | **Throughput speedup (TF32 to mixed precision)** | **Weak scaling - TF32** | **Weak scaling - mixed precision** |
+|--:|---:|------:|-------:|-----:|-----:|-----:|
+| 1 | 64 |      - |  77.79 |    - |    - | 1.00 |
+| 4 | 64 |      - | 304.32 |    - |    - | 3.91 |
+| 8 | 64 |      - | 602.88 |    - |    - | 7.75 |
 
-| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
-|---|---|-----|------|----|----|----|
-| 1 | 16 | 10| 29.63| 2.96| 1.00| 1.00|
-| 4 | 16 | 38.79| 106.67| 2.75| 3.88| 3.60|
-| 8 | 16 | 76.64| 209.84| 2.74| 7.66| 7.08|
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
 
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
+##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
 | **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
-|---|---|-----|------|----|----|----|
-| 1 | 32 | - | 35.16 | - | - | 1.00 |
-| 4 | 32 | - | 134.74 | - | - | 3.83 |
-| 8 | 32 | - | 263.92 | - | - | 7.51 |
-
+|--:|---:|------:|-------:|-----:|-----:|-----:|
+| 1 | 16 | 11.12 |  28.87 | 2.60 | 1.00 | 1.00 |
+| 4 | 16 | 42.39 | 109.40 | 2.58 | 3.81 | 3.79 |
+| 8 | 16 | 84.45 | 194.30 | 2.30 | 7.59 | 6.73 |
 
+| **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
+|--:|---:|------:|-------:|-----:|-----:|-----:|
+| 1 | 32 |     - |  37.57 |    - |    - | 1.00 |
+| 4 | 32 |     - | 134.80 |    - |    - | 3.59 |
+| 8 | 32 |     - | 276.14 |    - |    - | 7.35 |
 
-Note: The respective values for FP32 runs that use a batch size of 32 are not available due to out of memory errors that arise. Batch size of 32 is only available when using FP16.
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
 ##### Training performance: NVIDIA DGX-1 (8x V100 32GB)
-
 | **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
-|---|---|-----|------|----|----|----|
-| 1 | 32 | 12.26| 34.04| 2.78| 1.00| 1.00|
-| 4 | 32 | 48.67| 131.96| 2.71| 3.97| 3.88|
-| 8 | 32 | 95.88| 253.47| 2.64| 7.82| 7.45|
-
+|--:|---:|------:|-------:|-----:|-----:|-----:|
+| 1 | 32 | 13.15 |  35.63 | 2.71 | 1.00 | 1.00 |
+| 4 | 32 | 51.21 | 134.01 | 2.62 | 3.90 | 3.76 |
+| 8 | 32 | 99.88 | 247.97 | 2.48 | 7.60 | 6.96 |
 
 | **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
-|---|---|-----|------|----|----|----|
-| 1 | 64 | - | 41.03 | - | - | 1.00 |
-| 4 | 64 | - | 159.01 | - | - | 3.88 |
-| 8 | 64 | - | 312.20 | - | - | 7.61 |
+|--:|---:|------:|-------:|-----:|-----:|-----:|
+| 1 | 64 |     - |  41.74 |    - |    - | 1.00 |
+| 4 | 64 |     - | 158.44 |    - |    - | 3.80 |
+| 8 | 64 |     - | 312.22 |    - |    - | 7.48 |
 
-
-Note: The respective values for FP32 runs that use a batch size of 64 are not available due to out of memory errors that arise. Batch size of 64 is only available when using FP16.
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
 ##### Training performance: NVIDIA DGX-2 (16x V100 32GB)
-
 | **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
-|---|---|-----|------|----|----|----|
-| 1 | 32 | 8.12| 24.24| 2.98| 1.00| 1.00|
-| 4 | 32 | 32.16| 92.09| 2.86| 3.96| 3.80|
-| 8 | 32 | 63.68| 181.56| 2.85| 7.84| 7.49|
-|16 | 32 | 124.88| 275.67| 2.20| 15.38| 11.35|
-
+|---:|---:|-------:|-------:|-----:|------:|------:|
+|  1 | 32 |  14.13 |  41.05 | 2.90 |  1.00 |  1.00 |
+|  4 | 32 |  54.32 | 156.47 | 2.88 |  3.84 |  3.81 |
+|  8 | 32 | 110.26 | 307.13 | 2.79 |  7.80 |  7.48 |
+| 16 | 32 | 218.14 | 561.85 | 2.58 | 15.44 | 13.69 |
 
 | **GPUs** | **Batch size / GPU** | **Throughput - FP32** | **Throughput - mixed precision** | **Throughput speedup (FP32 to mixed precision)** | **Weak scaling - FP32** | **Weak scaling - mixed precision** |
-|---|---|-----|------|----|----|----|
-| 1 | 64 | - | 29.22 | - | - | 1.00 |
-| 4 | 64 | - | 114.29 | - | - | 3.91 |
-| 8 | 64 | - | 222.61 | - | - | 7.62 |
-|16 | 64 | - | 414.57 | - | - | 14.19 |
-
+|---:|---:|-------:|-------:|-----:|------:|------:|
+|  1 | 64 |      - |  46.41 |    - |     - |  1.00 |
+|  4 | 64 |      - | 147.90 |    - |     - |  3.19 |
+|  8 | 64 |      - | 359.15 |    - |     - |  7.74 |
+| 16 | 64 |      - | 703.13 |    - |     - | 15.15 |
 
-Note: The respective values for FP32 runs that use a batch size of 64 are not available due to out of memory errors that arise. Batch size of 64 is only available when using FP16.
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
 
 #### Inference performance results
-
-Our results were obtained by running the `scripts/inference_benchmark.sh` script in the PyTorch 19.10-py3 NGC container on NVIDIA DGX-1, DGX-2 and T4 on a single GPU. Performance numbers (latency in milliseconds per batch) were averaged over 1000 iterations.
-
-##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
-
-|   |       |FP16 Latency (ms) Percentiles  | | |       |FP32 Latency (ms) Percentiles  | | | | FP16/FP32 speed up|
-|---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |
-|BS |Sequence Length (in seconds) |90%    |95%    |99%    |Avg   |90%    |95%    |99%    |Avg   |Avg   |
-|1|2|62.16|64.71|67.29|61.31|69.37|69.75|75.38|68.95|1.12
-|2|2|60.94|63.60|68.03|59.57|82.18|83.12|84.26|75.33|1.26
-|4|2|68.38|69.55|75.85|64.82|85.74|86.85|93.78|82.55|1.27
-|8|2|68.80|71.54|73.28|62.83|104.22|106.58|109.41|95.77|1.52
-|16|2|72.33|72.85|74.55|64.69|127.11|129.34|131.46|109.80|1.70
-|1|7|59.06|60.51|62.83|58.10|75.41|75.72|78.64|74.70|1.29
-|2|7|61.68|67.73|68.58|59.53|97.85|98.59|98.99|91.60|1.54
-|4|7|60.88|62.13|65.23|60.38|119.08|119.80|121.28|118.67|1.97
-|8|7|70.71|71.82|74.23|70.16|181.48|185.00|186.20|177.98|2.54
-|16|7|93.75|94.70|100.58|92.96|219.72|220.25|221.28|215.09|2.31
-|1|16.7|68.87|69.48|71.75|63.63|101.03|101.66|104.00|100.32|1.58
-|2|16.7|73.00|73.76|75.58|66.44|145.64|146.64|152.41|143.69|2.16
-|4|16.7|77.71|78.75|79.90|77.34|224.62|225.43|226.43|223.96|2.90
-|8|16.7|96.34|97.07|104.46|95.94|318.52|319.13|320.74|316.14|3.30
-|16|16.7|154.63|156.81|159.25|151.05|375.67|377.00|381.79|371.83|2.46
+Our results were obtained by running the `scripts/inference_benchmark.sh` script in the PyTorch 20.06-py3 NGC container on NVIDIA DGX A100, DGX-1, DGX-2 and T4 on a single GPU. Performance numbers (latency in milliseconds per batch) were averaged over 1000 iterations.
+
+##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
+|    | |FP16 Latency (ms) Percentiles | | | | TF32 Latency (ms) Percentiles | | | | FP16/TF32 speed up |
+|---:|-------------:|------:|------:|------:|------:|-------:|-------:|-------:|-------:|-----:|
+| BS | Duration (s) |   90% |   95% |   99% |   Avg |    90% |    95% |    99% |    Avg |  Avg |
+|  1 |            2 | 36.31 | 36.85 | 43.18 | 35.96 |  41.16 |  41.63 |  47.90 |  40.89 | 1.14 |
+|  2 |            2 | 37.56 | 43.32 | 45.23 | 37.11 |  42.53 |  47.79 |  49.62 |  42.07 | 1.13 |
+|  4 |            2 | 43.10 | 44.85 | 47.22 | 41.43 |  47.88 |  49.75 |  51.55 |  43.25 | 1.04 |
+|  8 |            2 | 44.02 | 44.30 | 45.21 | 39.51 |  50.14 |  50.47 |  51.50 |  45.63 | 1.16 |
+| 16 |            2 | 48.04 | 48.38 | 49.12 | 42.76 |  70.90 |  71.22 |  72.50 |  60.78 | 1.42 |
+|  1 |            7 | 37.74 | 37.88 | 38.92 | 37.02 |  41.53 |  42.17 |  44.75 |  40.79 | 1.10 |
+|  2 |            7 | 40.91 | 41.11 | 42.35 | 40.02 |  46.44 |  46.80 |  49.67 |  45.67 | 1.14 |
+|  4 |            7 | 43.94 | 44.32 | 46.71 | 43.00 |  54.39 |  54.80 |  56.63 |  53.53 | 1.24 |
+|  8 |            7 | 50.01 | 50.19 | 52.92 | 48.62 |  68.55 |  69.25 |  72.28 |  67.61 | 1.39 |
+| 16 |            7 | 60.38 | 60.76 | 62.44 | 57.92 |  93.17 |  94.15 |  98.84 |  92.21 | 1.59 |
+|  1 |         16.7 | 41.39 | 41.75 | 43.62 | 40.73 |  45.79 |  46.10 |  47.76 |  45.21 | 1.11 |
+|  2 |         16.7 | 46.43 | 46.76 | 47.72 | 45.81 |  52.53 |  53.13 |  55.60 |  51.71 | 1.13 |
+|  4 |         16.7 | 50.88 | 51.68 | 54.74 | 50.11 |  66.29 |  66.96 |  70.45 |  65.00 | 1.30 |
+|  8 |         16.7 | 62.09 | 62.76 | 65.08 | 61.40 |  94.16 |  94.67 |  97.46 |  93.00 | 1.51 |
+| 16 |         16.7 | 75.22 | 76.86 | 80.76 | 73.99 | 139.51 | 140.88 | 144.10 | 137.94 | 1.86 |
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
+|    | |FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|---:|-------------:|------:|------:|------:|------:|-------:|-------:|-------:|-------:|-----:|
+| BS | Duration (s) |   90% |   95% |   99% |   Avg |    90% |    95% |    99% |    Avg |  Avg |
+|  1 |    2 |  52.26 |  59.93 |  66.62 |  50.34 |  70.90 |  76.47 |  79.84 |  68.61 | 1.36 |
+|  2 |    2 |  62.04 |  67.68 |  70.91 |  58.65 |  75.72 |  80.15 |  83.50 |  71.33 | 1.22 |
+|  4 |    2 |  75.12 |  77.12 |  82.80 |  66.55 |  80.88 |  82.60 |  86.63 |  73.65 | 1.11 |
+|  8 |    2 |  71.62 |  72.99 |  81.10 |  66.39 |  99.57 | 101.43 | 107.16 |  92.34 | 1.39 |
+| 16 |    2 |  78.51 |  80.33 |  87.31 |  72.91 | 104.79 | 107.22 | 114.21 |  96.18 | 1.32 |
+|  1 |    7 |  52.67 |  54.40 |  64.27 |  50.47 |  73.86 |  75.61 |  84.93 |  72.08 | 1.43 |
+|  2 |    7 |  60.49 |  62.41 |  72.87 |  58.45 |  93.07 |  94.51 | 102.40 |  91.55 | 1.57 |
+|  4 |    7 |  70.55 |  72.95 |  82.59 |  68.43 | 131.48 | 137.60 | 149.06 | 129.23 | 1.89 |
+|  8 |    7 |  83.91 |  85.28 |  93.08 |  76.40 | 152.49 | 157.92 | 166.80 | 150.49 | 1.97 |
+| 16 |    7 | 100.21 | 103.12 | 109.00 |  96.31 | 178.45 | 181.46 | 187.20 | 174.33 | 1.81 |
+|  1 | 16.7 |  56.84 |  60.05 |  66.54 |  54.69 | 109.55 | 111.19 | 120.40 | 102.25 | 1.87 |
+|  2 | 16.7 |  69.39 |  70.97 |  75.34 |  67.39 | 149.93 | 150.79 | 154.06 | 147.45 | 2.19 |
+|  4 | 16.7 |  87.48 |  93.96 | 102.73 |  85.09 | 211.78 | 219.66 | 232.99 | 208.38 | 2.45 |
+|  8 | 16.7 | 106.91 | 111.92 | 116.55 | 104.13 | 246.92 | 250.94 | 268.44 | 243.34 | 2.34 |
+| 16 | 16.7 | 149.08 | 153.86 | 166.17 | 146.28 | 292.84 | 298.02 | 313.04 | 288.54 | 1.97 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
-
-##### Inference performance: NVIDIA DGX-1 (1x V100 32G)
-
-|   |       |FP16 Latency (ms) Percentiles  | | |       |FP32 Latency (ms) Percentiles  | | | | FP16/FP32 speed up|
-|---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |
-|BS |Sequence Length (in seconds) |90%    |95%    |99%    |Avg   |90%    |95%    |99%    |Avg   |Avg   |
-|1|2|61.60|62.81|69.62|60.71|82.32|83.03|85.72|77.48|1.28
-|2|2|68.82|70.10|72.08|61.91|77.99|81.99|85.13|76.93|1.24
-|4|2|70.06|70.69|72.58|74.76|88.36|89.67|95.61|94.50|1.26
-|8|2|69.98|71.51|74.20|64.20|105.82|107.16|110.04|98.02|1.53
-|16|2|72.05|74.16|75.51|65.46|130.49|130.97|132.83|112.74|1.72
-|1|7|61.40|61.78|65.53|60.93|75.72|75.83|76.55|75.35|1.24
-|2|7|60.50|60.63|61.77|60.15|91.05|91.16|92.39|90.75|1.51
-|4|7|64.67|71.41|72.10|64.19|123.77|123.99|124.92|123.38|1.92
-|8|7|67.96|68.04|69.38|67.60|176.43|176.65|177.25|175.39|2.59
-|16|7|95.41|95.80|100.94|93.86|213.04|213.38|215.52|212.05|2.26
-|1|16.7|61.28|61.67|62.52|60.63|104.37|104.56|105.22|103.83|1.71
-|2|16.7|66.88|67.31|68.09|66.40|151.08|151.61|152.26|146.73|2.21
-|4|16.7|80.51|80.79|81.95|80.12|226.75|227.07|228.76|225.82|2.82
-|8|16.7|95.66|95.89|98.86|95.62|314.74|316.74|318.66|312.10|3.26
-|16|16.7|156.60|157.07|160.15|151.13|366.70|367.41|370.98|364.05|2.41
+##### Inference performance: NVIDIA DGX-1 (1x V100 32GB)
+|    | |FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|---:|-------------:|------:|------:|------:|------:|-------:|-------:|-------:|-------:|-----:|
+| BS | Duration (s) |   90% |   95% |   99% |   Avg |    90% |    95% |    99% |    Avg |  Avg |
+|  1 |    2 |  64.60 |  67.34 |  79.87 |  60.73 |  84.69 |  86.78 |  96.02 |  79.32 | 1.31 |
+|  2 |    2 |  71.52 |  73.32 |  82.00 |  63.93 |  85.33 |  87.65 |  96.34 |  78.09 | 1.22 |
+|  4 |    2 |  80.38 |  84.62 |  93.09 |  74.95 |  90.29 |  97.59 | 100.61 |  84.44 | 1.13 |
+|  8 |    2 |  83.43 |  85.51 |  91.17 |  74.09 | 107.28 | 111.89 | 115.19 |  98.76 | 1.33 |
+| 16 |    2 |  90.01 |  90.81 |  96.48 |  79.85 | 115.39 | 116.95 | 123.71 | 103.26 | 1.29 |
+|  1 |    7 |  53.74 |  54.09 |  56.67 |  53.07 |  86.07 |  86.55 |  91.59 |  78.79 | 1.48 |
+|  2 |    7 |  63.34 |  63.67 |  66.08 |  62.62 |  96.25 |  96.82 |  99.72 |  95.44 | 1.52 |
+|  4 |    7 |  80.35 |  80.86 |  83.80 |  73.41 | 132.19 | 132.94 | 135.59 | 131.46 | 1.79 |
+|  8 |    7 |  77.68 |  78.11 |  86.71 |  75.72 | 156.30 | 157.72 | 165.55 | 154.87 | 2.05 |
+| 16 |    7 | 103.52 | 106.66 | 111.93 |  98.15 | 180.71 | 182.82 | 191.12 | 178.61 | 1.82 |
+|  1 | 16.7 |  57.58 |  57.79 |  59.75 |  56.58 | 104.51 | 104.87 | 108.01 | 104.04 | 1.84 |
+|  2 | 16.7 |  69.19 |  69.58 |  71.49 |  68.58 | 151.25 | 152.07 | 155.21 | 149.30 | 2.18 |
+|  4 | 16.7 |  87.17 |  88.53 |  97.41 |  86.56 | 211.28 | 212.41 | 214.97 | 208.54 | 2.41 |
+|  8 | 16.7 | 116.25 | 116.90 | 120.14 | 109.21 | 247.63 | 248.93 | 254.77 | 245.19 | 2.25 |
+| 16 | 16.7 | 151.99 | 154.79 | 163.36 | 149.80 | 293.99 | 296.05 | 303.04 | 291.00 | 1.94 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
-
-
-##### Inference performance: NVIDIA DGX-2 (1x V100 32G)
-
-|   |       |FP16 Latency (ms) Percentiles  | | |       |FP32 Latency (ms) Percentiles  | | | | FP16/FP32 speed up|
-|---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |
-|BS |Sequence Length (in seconds) |90%    |95%    |99%    |Avg   |90%    |95%    |99%    |Avg   |Avg   |
-|1|2|56.11|56.76|62.18|51.77|67.75|68.91|73.80|64.96|1.25
-|2|2|55.56|56.96|61.72|50.63|65.84|69.88|74.05|63.57|1.26
-|4|2|54.84|57.69|61.16|60.74|74.00|76.58|81.62|81.01|1.33
-|8|2|57.15|57.92|60.80|52.47|90.56|91.83|93.79|84.58|1.61
-|16|2|58.27|58.54|60.24|53.26|113.25|113.55|115.41|98.56|1.85
-|1|7|49.16|49.39|50.82|48.31|64.53|64.84|65.79|63.90|1.32
-|2|7|53.54|54.07|55.28|49.11|78.64|79.46|81.25|78.17|1.59
-|4|7|50.87|51.15|53.36|50.07|109.33|110.61|114.00|108.17|2.16
-|8|7|63.57|64.18|65.55|60.64|163.95|164.19|165.75|163.49|2.70
-|16|7|82.15|83.66|87.01|81.46|196.15|197.18|202.09|195.36|2.40
-|1|16.7|49.68|50.00|51.39|48.76|89.10|89.42|90.41|88.57|1.82
-|2|16.7|52.47|52.91|54.27|51.51|128.58|129.09|130.34|127.36|2.47
-|4|16.7|66.60|67.52|68.88|65.88|220.50|221.50|223.14|219.42|3.33
-|8|16.7|85.42|86.03|88.37|85.11|293.80|294.39|296.21|290.58|3.41
-|16|16.7|140.76|141.74|147.25|137.31|345.26|346.29|351.15|342.64|2.50
-
+##### Inference performance: NVIDIA DGX-2 (1x V100 32GB)
+|    | |FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|---:|-------------:|------:|------:|------:|------:|-------:|-------:|-------:|-------:|-----:|
+| BS | Duration (s) |   90% |   95% |   99% |   Avg |    90% |    95% |    99% |    Avg |  Avg |
+|  1 |    2 |  47.25 |  48.24 |  50.28 |  41.53 |  67.03 |  68.15 |  70.17 |  61.82 | 1.49 |
+|  2 |    2 |  54.11 |  55.20 |  60.44 |  48.82 |  69.11 |  70.38 |  75.93 |  64.45 | 1.32 |
+|  4 |    2 |  63.82 |  67.64 |  71.58 |  61.47 |  71.51 |  74.55 |  79.31 |  67.85 | 1.10 |
+|  8 |    2 |  64.78 |  65.86 |  67.68 |  59.07 |  90.84 |  91.99 |  94.10 |  84.28 | 1.43 |
+| 16 |    2 |  70.59 |  71.49 |  73.58 |  63.85 |  96.92 |  97.58 |  99.98 |  87.73 | 1.37 |
+|  1 |    7 |  42.35 |  42.55 |  43.50 |  41.08 |  63.87 |  64.02 |  64.73 |  62.54 | 1.52 |
+|  2 |    7 |  47.82 |  48.04 |  49.43 |  46.79 |  81.17 |  81.43 |  82.28 |  80.02 | 1.71 |
+|  4 |    7 |  58.27 |  58.54 |  59.69 |  56.96 | 116.00 | 116.46 | 118.79 | 114.82 | 2.02 |
+|  8 |    7 |  62.88 |  63.62 |  67.16 |  61.47 | 143.90 | 144.34 | 147.36 | 139.54 | 2.27 |
+| 16 |    7 |  88.04 |  88.57 |  90.96 |  82.84 | 163.04 | 164.04 | 167.30 | 161.36 | 1.95 |
+|  1 | 16.7 |  44.54 |  44.86 |  45.86 |  43.53 |  88.10 |  88.41 |  89.37 |  87.21 | 2.00 |
+|  2 | 16.7 |  55.21 |  55.55 |  56.92 |  54.33 | 134.99 | 135.69 | 137.87 | 132.97 | 2.45 |
+|  4 | 16.7 |  72.93 |  73.58 |  74.95 |  72.02 | 193.50 | 194.21 | 196.04 | 191.24 | 2.66 |
+|  8 | 16.7 |  96.94 |  97.66 |  99.58 |  92.73 | 227.70 | 228.74 | 231.59 | 225.35 | 2.43 |
+| 16 | 16.7 | 138.25 | 139.75 | 143.71 | 133.82 | 273.69 | 274.53 | 279.50 | 269.13 | 2.01 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
 ##### Inference performance: NVIDIA T4
-|   |       |FP16 Latency (ms) Percentiles  | | |       |FP32 Latency (ms) Percentiles  | | | | FP16/FP32 speed up|
-|---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |---    |
-|BS |Sequence Length (in seconds) |90%    |95%    |99%    |Avg   |90%    |95%    |99%    |Avg   |Avg   |
-|1|2|57.30|57.50|74.62|56.74|73.71|73.98|88.79|72.95|1.29
-|2|2|53.68|69.69|76.08|52.63|82.83|93.38|97.67|78.23|1.49
-|4|2|72.26|76.49|83.92|57.60|116.06|121.25|125.98|104.17|1.81
-|8|2|70.52|71.85|76.26|58.16|159.92|161.22|164.76|148.34|2.55
-|16|2|78.29|79.04|82.86|66.97|251.96|252.67|253.64|206.41|3.08
-|1|7|54.83|54.94|55.50|54.58|85.57|89.11|89.71|84.08|1.54
-|2|7|55.17|55.38|67.09|54.87|134.28|135.76|138.23|131.01|2.39
-|4|7|74.24|78.09|79.51|73.75|214.77|215.65|217.28|211.66|2.87
-|8|7|99.99|100.34|104.26|98.84|379.67|380.96|382.70|375.12|3.80
-|16|7|167.48|168.07|177.29|166.53|623.36|624.11|625.89|619.34|3.72
-|1|16.7|72.23|72.65|80.13|67.77|155.76|157.11|160.05|151.85|2.24
-|2|16.7|75.43|76.04|80.41|74.65|259.56|261.23|266.09|252.80|3.39
-|4|16.7|131.71|132.45|134.92|129.63|481.40|484.17|486.88|469.05|3.62
-|8|16.7|197.10|197.94|200.15|193.88|806.76|812.73|822.27|792.85|4.09
-|16|16.7|364.22|365.22|372.17|358.62|1165.78|1167.11|1171.02|1150.44|3.21
+|    | |FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|---:|-------------:|------:|------:|------:|------:|-------:|-------:|-------:|-------:|-----:|
+| BS | Duration (s) |   90% |   95% |   99% |   Avg |    90% |    95% |    99% |    Avg |  Avg |
+|  1 |    2 |  64.13 |  65.25 |  76.11 |  59.08 |  94.69 |  98.23 | 109.86 |  89.00 | 1.51 |
+|  2 |    2 |  67.59 |  70.77 |  84.06 |  57.47 | 103.88 | 105.37 | 114.59 |  93.30 | 1.62 |
+|  4 |    2 |  75.19 |  81.05 |  87.01 |  65.79 | 120.73 | 128.29 | 146.83 | 112.96 | 1.72 |
+|  8 |    2 |  74.15 |  77.69 |  84.96 |  62.77 | 161.97 | 163.46 | 170.25 | 153.07 | 2.44 |
+| 16 |    2 | 100.62 | 105.08 | 113.00 |  82.06 | 216.18 | 217.92 | 222.46 | 188.57 | 2.30 |
+|  1 |    7 |  77.88 |  79.61 |  81.90 |  70.22 | 110.37 | 113.93 | 121.39 | 107.17 | 1.53 |
+|  2 |    7 |  81.09 |  83.94 |  87.28 |  78.06 | 148.30 | 151.21 | 158.55 | 141.26 | 1.81 |
+|  4 |    7 |  99.85 | 100.83 | 104.24 |  96.81 | 229.94 | 232.34 | 238.11 | 225.43 | 2.33 |
+|  8 |    7 | 147.38 | 150.37 | 153.66 | 142.64 | 394.26 | 396.35 | 398.89 | 390.77 | 2.74 |
+| 16 |    7 | 280.32 | 281.37 | 282.74 | 278.01 | 484.20 | 485.74 | 499.89 | 482.67 | 1.74 |
+|  1 | 16.7 |  76.97 |  79.78 |  81.61 |  75.55 | 171.45 | 176.90 | 179.18 | 167.95 | 2.22 |
+|  2 | 16.7 |  96.48 |  99.42 | 101.21 |  92.74 | 276.12 | 278.67 | 282.06 | 270.05 | 2.91 |
+|  4 | 16.7 | 129.63 | 131.67 | 134.42 | 124.55 | 522.23 | 524.79 | 527.32 | 509.75 | 4.09 |
+|  8 | 16.7 | 209.64 | 211.36 | 214.66 | 204.83 | 706.84 | 709.21 | 715.57 | 697.97 | 3.41 |
+| 16 | 16.7 | 342.23 | 344.62 | 350.84 | 337.42 | 848.02 | 849.83 | 858.22 | 834.38 | 2.47 |
 
 To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
 
 ## Release notes
 
 ### Changelog
+June 2020
+* Updated performance tables to include A100 results
 
 December 2019
 * Inference support for TRT 6 with dynamic shapes
@@ -828,7 +853,5 @@ September 2019
 August 2019
 * Initial release
 
-
 ### Known issues
-
 There are no known issues in this release.
diff --git a/PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server b/PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server
deleted file mode 160000
index 71f0771cb..000000000
--- a/PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 71f0771cb8cb2a2eb1c6a9433f9a56dd1f206c96
diff --git a/PyTorch/SpeechRecognition/Jasper/helpers.py b/PyTorch/SpeechRecognition/Jasper/helpers.py
index d637ecb77..236064537 100644
--- a/PyTorch/SpeechRecognition/Jasper/helpers.py
+++ b/PyTorch/SpeechRecognition/Jasper/helpers.py
@@ -19,11 +19,6 @@
 from metrics import word_error_rate
 
 
-
-
-
-AmpOptimizations = ["O0", "O1", "O2", "O3"]
-
 def print_once(msg):
     if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
         print(msg)
diff --git a/PyTorch/SpeechRecognition/Jasper/inference.py b/PyTorch/SpeechRecognition/Jasper/inference.py
index 8de9149a3..ed5d7a7ea 100644
--- a/PyTorch/SpeechRecognition/Jasper/inference.py
+++ b/PyTorch/SpeechRecognition/Jasper/inference.py
@@ -19,7 +19,7 @@
 import math
 import toml
 from dataset import AudioToTextDataLayer
-from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, AmpOptimizations, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
+from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
 from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder
 from parts.features import audio_from_file
 import torch
@@ -46,21 +46,21 @@ def parse_args():
     parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint')
     parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file')
     parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. If > 0 pad batch to next multiple of value")
-    parser.add_argument("--fp16", action='store_true', help='use half precision')
-    parser.add_argument("--pyt_fp16", action='store_true', help='use half precision')
+    parser.add_argument("--amp", "--fp16", action='store_true', help='use half precision')
     parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark")
     parser.add_argument("--save_prediction", type=str, default=None, help="if specified saves predictions in text form at this location")
     parser.add_argument("--logits_save_to", default=None, type=str, help="if specified will save logits to path")
     parser.add_argument("--seed", default=42, type=int, help='seed')
-    parser.add_argument("--masked_fill", type="bool", help="Overrides the masked_fill option for the Encoder")
     parser.add_argument("--output_dir", default="results/", type=str, help="Output directory to store exported models. Only used if --export_model is used")
     parser.add_argument("--export_model", action='store_true', help="Exports the audio_featurizer, encoder and decoder using torch.jit to the output_dir")
     parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)')
+    parser.add_argument("--cpu", action="store_true", help="Run inference on CPU")
+    parser.add_argument("--ema", action="store_true", help="If available, load EMA model weights")
     return parser.parse_args()
 
-def calc_wer(data_layer, audio_processor, 
-             encoderdecoder, greedy_decoder, 
-             labels, args):
+def calc_wer(data_layer, audio_processor,
+             encoderdecoder, greedy_decoder,
+             labels, args, device):
 
     encoderdecoder = encoderdecoder.module if hasattr(encoderdecoder, 'module') else encoderdecoder
     with torch.no_grad():
@@ -74,16 +74,14 @@ def calc_wer(data_layer, audio_processor,
         # Evaluation mini-batch for loop
         for it, data in enumerate(tqdm(data_layer.data_iterator)):
 
-            tensors = []
-            for d in data:
-                tensors.append(d.cuda())
+            tensors = [t.to(device) for t in data]
     
             t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
-    
-            t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e) 
+
+            t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e)
             t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
             t_predictions_e = greedy_decoder(t_log_probs_e)
-    
+
             values_dict = dict(
                 predictions=[t_predictions_e],
                 transcript=[t_transcript_e],
@@ -92,7 +90,7 @@ def calc_wer(data_layer, audio_processor,
             )
             # values_dict will contain results from all workers
             process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
-    
+
             if args.steps is not None and it + 1 >= args.steps:
                 break
 
@@ -102,18 +100,13 @@ def calc_wer(data_layer, audio_processor,
         return wer, _global_var_dict
 
 
-def jit_export(
-         audio, audio_len,
-         audio_processor,
-         encoderdecoder,
-         greedy_decoder,
-         args):
+def jit_export(audio, audio_len, audio_processor, encoderdecoder, greedy_decoder, args):
 
                 print("##############")
 
-                module_name = "{}_{}".format(os.path.basename(args.model_toml), "fp16" if args.fp16 else "fp32")
+                module_name = "{}_{}".format(os.path.basename(args.model_toml), "fp16" if args.amp else "fp32")
 
-                if args.masked_fill is not None and args.masked_fill == False:
+                if args.use_conv_mask:
                     module_name = module_name + "_noMaskConv"
 
                 # Export just the featurizer
@@ -137,12 +130,18 @@ def jit_export(
 
                 return traced_module_feat, traced_module_acoustic, traced_module_decode
 
-def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels):
-            features = audio_processor(audio, audio_len)
-            torch.cuda.synchronize()
+def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels, device):
+            features, lens = audio_processor(audio, audio_len)
+            if not device.type == 'cpu':
+                torch.cuda.synchronize()
             t0 = time.perf_counter()
-            t_log_probs_e = encoderdecoder(features[0])
-            torch.cuda.synchronize()
+            # TorchScripted model does not support (features, lengths)
+            if isinstance(encoderdecoder, torch.jit.TracedModule):
+                t_log_probs_e = encoderdecoder(features)
+            else:
+                t_log_probs_e, _ = encoderdecoder.infer((features, lens))
+            if not device.type == 'cpu':
+                torch.cuda.synchronize()
             t1 = time.perf_counter()
             t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
             hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
@@ -157,6 +156,7 @@ def eval(
          greedy_decoder,
          labels,
          multi_gpu,
+         device,
          args):
     """performs inference / evaluation
     Args:
@@ -169,21 +169,19 @@ def eval(
         args: script input arguments
     """
     logits_save_to=args.logits_save_to
-    
+
     with torch.no_grad():
         if args.wav:
-            audio, audio_len = audio_from_file(args.wav)
+            audio, audio_len = audio_from_file(args.wav, device=device)
-            run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels)
+            run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels, device)
             if args.export_model:
-                jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(audio, audio_len, audio_processor,
-                                                                                         encoderdecoder,
-                                                                                         greedy_decoder,args)
-            run_once(jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder, audio, audio_len, labels)
+                jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(audio, audio_len, audio_processor, encoderdecoder,greedy_decoder,args)
+                run_once(jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder, audio, audio_len, labels, device)
             return
-        wer, _global_var_dict = calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args)
+        wer, _global_var_dict = calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args, device)
         if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
             print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
-      
+
             if args.save_prediction is not None:
                 with open(args.save_prediction, 'w') as fp:
                     fp.write('\n'.join(_global_var_dict['predictions']))
@@ -203,26 +201,29 @@ def eval(
             #     print("===>>>Diff      : {0} %".format((wer_after - wer_before) * 100.0 / wer_before))
             #     print("")
 
-                
+
 def main(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    torch.backends.cudnn.benchmark = args.cudnn_benchmark
-    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
-    assert(torch.cuda.is_available())
 
-    if args.local_rank is not None:
-        torch.cuda.set_device(args.local_rank)
-        torch.distributed.init_process_group(backend='nccl', init_method='env://')
     multi_gpu = args.local_rank is not None
-    if multi_gpu:
-        print("DISTRIBUTED with ", torch.distributed.get_world_size())
 
-    if args.fp16:
-        optim_level = 3
+    if args.cpu:
+        assert(not multi_gpu)
+        device = torch.device('cpu')
     else:
-        optim_level = 0
+        assert(torch.cuda.is_available())
+        device = torch.device('cuda')
+        torch.backends.cudnn.benchmark = args.cudnn_benchmark
+        print("CUDNN BENCHMARK ", args.cudnn_benchmark)
+
+        if multi_gpu:
+            torch.cuda.set_device(args.local_rank)
+            torch.distributed.init_process_group(backend='nccl', init_method='env://')
+            print("DISTRIBUTED with ", torch.distributed.get_world_size())
+
+    optim_level = 3 if args.amp else 0
 
     jasper_model_definition = toml.load(args.model_toml)
     dataset_vocab = jasper_model_definition['labels']['labels']
@@ -231,21 +232,21 @@ def main(args):
     val_manifest = args.val_manifest
     featurizer_config = jasper_model_definition['input_eval']
     featurizer_config["optimization_level"] = optim_level
-    featurizer_config["fp16"] = args.fp16
-    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
+    featurizer_config["fp16"] = args.amp
 
-    if args.masked_fill is not None:
-        print("{} masked_fill".format("Enabling" if args.masked_fill else "Disabling"))
-        jasper_model_definition["encoder"]["conv_mask"] = args.masked_fill
+    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
+    if args.use_conv_mask and args.export_model:
+        print('WARNING: Masked convs currently not supported for TorchScript. Disabling.')
+        jasper_model_definition['encoder']['convmask'] = False
 
     if args.max_duration is not None:
         featurizer_config['max_duration'] = args.max_duration
     if args.pad_to is not None:
-        featurizer_config['pad_to'] = args.pad_to 
+        featurizer_config['pad_to'] = args.pad_to
 
     if featurizer_config['pad_to'] == "max":
         featurizer_config['pad_to'] = -1
-        
+
     print('=== model_config ===')
     print_dict(jasper_model_definition)
     print()
@@ -253,10 +254,10 @@ def main(args):
     print_dict(featurizer_config)
     print()
     data_layer = None
-    
+
     if args.wav is None:
         data_layer = AudioToTextDataLayer(
-            dataset_dir=args.dataset_dir, 
+            dataset_dir=args.dataset_dir,
             featurizer_config=featurizer_config,
             manifest_filepath=val_manifest,
             labels=dataset_vocab,
@@ -274,10 +275,16 @@ def main(args):
             exit(0)
         else:
             checkpoint = torch.load(args.ckpt, map_location="cpu")
+            if args.ema and 'ema_state_dict' in checkpoint:
+                print('Loading EMA state dict')
+                sd = 'ema_state_dict'
+            else:
+                sd = 'state_dict'
+
             for k in audio_preprocessor.state_dict().keys():
-                checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." + k)
-            audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False)
-            encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False)
+                checkpoint[sd][k] = checkpoint[sd].pop("audio_preprocessor." + k)
+            audio_preprocessor.load_state_dict(checkpoint[sd], strict=False)
+            encoderdecoder.load_state_dict(checkpoint[sd], strict=False)
 
     greedy_decoder = GreedyCTCDecoder()
 
@@ -298,17 +305,19 @@ def main(args):
             print('-----------------')
 
     print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)
-    audio_preprocessor.cuda()
-    encoderdecoder.cuda()
-    if args.fp16:
-        encoderdecoder = amp.initialize( models=encoderdecoder,
-                                         opt_level=AmpOptimizations[optim_level])
+
+    audio_preprocessor.to(device)
+    encoderdecoder.to(device)
+
+    if args.amp:
+        encoderdecoder = amp.initialize(models=encoderdecoder,
+                                        opt_level='O'+str(optim_level))
 
     encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
     audio_preprocessor.eval()
     encoderdecoder.eval()
     greedy_decoder.eval()
-    
+
     eval(
         data_layer=data_layer,
         audio_processor=audio_preprocessor,
@@ -316,6 +325,7 @@ def main(args):
         greedy_decoder=greedy_decoder,
         labels=ctc_vocab,
         args=args,
+        device=device,
         multi_gpu=multi_gpu)
 
 if __name__=="__main__":
diff --git a/PyTorch/SpeechRecognition/Jasper/inference_benchmark.py b/PyTorch/SpeechRecognition/Jasper/inference_benchmark.py
index 8bd9eb69b..c0be36cd4 100644
--- a/PyTorch/SpeechRecognition/Jasper/inference_benchmark.py
+++ b/PyTorch/SpeechRecognition/Jasper/inference_benchmark.py
@@ -26,8 +26,9 @@
 import torch
 from apex import amp
 from dataset import AudioToTextDataLayer
-from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, AmpOptimizations, print_dict
+from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, print_dict
 from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder
+from parts.features import audio_from_file
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Jasper')
@@ -40,16 +41,53 @@ def parse_args():
     parser.add_argument("--val_manifest", type=str, help='relative path to evaluation dataset manifest file')
     parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark")
     parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint')
-    parser.add_argument("--fp16", action='store_true', help='use half precision')
+    parser.add_argument("--amp", "--fp16", action='store_true', help='use half precision')
     parser.add_argument("--seed", default=42, type=int, help='seed')
+    parser.add_argument("--cpu", action='store_true', help='run inference on CPU')
+    parser.add_argument("--torch_script", action='store_true', help='export model')
+    parser.add_argument("--sample_audio", default="/datasets/LibriSpeech/dev-clean-wav/1272/128104/1272-128104-0000.wav", type=str, help='audio sample path for torchscript, points to one of the files in /datasets/LibriSpeech/dev-clean-wav/ if not defined')
     return parser.parse_args()
 
+def jit_export(
+         audio,
+         audio_len,
+         audio_processor,
+         encoderdecoder,
+         greedy_decoder,
+         args):
+    """applies torchscript
+    Args:
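+        audio: sample audio tensor used as example input when tracing
+        audio_len: length tensor that accompanies the sample audio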
+        audio_processor: data processing module
+        encoderdecoder: acoustic model
+        greedy_decoder: greedy decoder
+        args: script input arguments
+    """
+    # Export just the featurizer
+    print("torchscripting featurizer ...")
+    traced_module_feat = torch.jit.script(audio_processor)
+
+    # Export just the acoustic model
+    print("torchscripting acoustic model ...")
+    inp_postFeat, _ = audio_processor(audio, audio_len)
+    traced_module_acoustic = torch.jit.trace(encoderdecoder, inp_postFeat)
+
+    # Export just the decoder
+    print("torchscripting decoder ...")
+    inp_postAcoustic = encoderdecoder(inp_postFeat)
+    traced_module_decode = torch.jit.script(greedy_decoder, inp_postAcoustic)
+    print("JIT process complete")
+
+    return traced_module_feat, traced_module_acoustic, traced_module_decode
+
 def eval(
         data_layer,
         audio_processor,
         encoderdecoder,
         greedy_decoder,
         labels,
+        device,
         args):
     """performs evaluation and prints performance statistics
     Args:
@@ -64,6 +102,12 @@ def eval(
     steps=args.steps
     audio_processor.eval()
     encoderdecoder.eval()
+    greedy_decoder.eval()
+
+    if args.torch_script:
+        audio, audio_len = audio_from_file(args.sample_audio, device=device)
+        audio_processor, encoderdecoder, greedy_decoder = jit_export(audio, audio_len, audio_processor, encoderdecoder, greedy_decoder, args)
+
     with torch.no_grad():
         _global_var_dict = {
             'predictions': [],
@@ -78,29 +122,32 @@ def eval(
         durations_dnn = []
         durations_dnn_and_prep = []
         seq_lens = []
+
+        sync = lambda: torch.cuda.synchronize() if device.type == 'cuda' else None
+
         while True:
             ep += 1
             for data in tqdm(data_layer.data_iterator):
                 it += 1
                 if it > steps:
                     break
-                tensors = []
-                dl_device = torch.device("cuda")
-                for d in data:
-                    tensors.append(d.to(dl_device))
+                tensors = [t.to(device) for t in data]
      
                 t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
-                torch.cuda.synchronize()
+
+                sync()
                 t0 = time.perf_counter()
-                t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e)
-                torch.cuda.synchronize()
+                features, lens = audio_processor(t_audio_signal_e, t_a_sig_length_e)
+
+                sync()
                 t1 = time.perf_counter()
-                
-                t_log_probs_e, _  = encoderdecoder.infer(t_processed_signal)
+                if isinstance(encoderdecoder, torch.jit.TracedModule):
+                    t_log_probs_e = encoderdecoder(features)
+                else:
+                    t_log_probs_e, _ = encoderdecoder.infer((features, lens))
 
-                torch.cuda.synchronize()
+                sync()
                 stop_time = time.perf_counter()
-
                 time_prep_and_dnn = stop_time - t0
                 time_dnn = stop_time - t1
                 t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
@@ -113,7 +160,7 @@ def eval(
                 process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
                 durations_dnn.append(time_dnn)
                 durations_dnn_and_prep.append(time_prep_and_dnn)
-                seq_lens.append(t_processed_signal[0].shape[-1])
+                seq_lens.append(features[0].shape[-1])
 
             if it >= steps:
 
@@ -151,15 +198,17 @@ def main(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    torch.backends.cudnn.benchmark = args.cudnn_benchmark
     assert(args.steps is None or args.steps > 5)
-    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
-    assert(torch.cuda.is_available())
 
-    if args.fp16:
-        optim_level = 3
+    if args.cpu:
+        device = torch.device('cpu')
     else:
-        optim_level = 0
+        assert(torch.cuda.is_available())
+        device = torch.device('cuda')
+        torch.backends.cudnn.benchmark = args.cudnn_benchmark
+        print("CUDNN BENCHMARK ", args.cudnn_benchmark)
+
+    optim_level = 3 if args.amp else 0
     batch_size = args.batch_size
 
     jasper_model_definition = toml.load(args.model_toml)
@@ -180,6 +229,11 @@ def main(args):
     if featurizer_config['pad_to'] == "max":
         featurizer_config['pad_to'] = -1
 
+    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
+    if args.use_conv_mask and args.torch_script:
+        print('WARNING: Masked convs currently not supported for TorchScript. Disabling.')
+        jasper_model_definition['encoder']['convmask'] = False
+
     print('model_config')
     print_dict(jasper_model_definition)
     print('feature_config')
@@ -217,18 +271,18 @@ def main(args):
     print('-----------------')
     if args.steps is None:
         print('Have {0} examples to eval on.'.format(N))
-        print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
+        print('Have {0} steps / (epoch).'.format(step_per_epoch))
     else:
         print('Have {0} examples to eval on.'.format(args.steps * args.batch_size))
-        print('Have {0} steps / (gpu * epoch).'.format(args.steps))
+        print('Have {0} steps / (epoch).'.format(args.steps))
     print('-----------------')
 
-    audio_preprocessor.cuda()
-    encoderdecoder.cuda()
-    if args.fp16:
+    audio_preprocessor.to(device)
+    encoderdecoder.to(device)
+
+    if args.amp:
         encoderdecoder = amp.initialize(
-            models=encoderdecoder,
-            opt_level=AmpOptimizations[optim_level])
+            models=encoderdecoder, opt_level='O'+str(optim_level))
 
     eval(
         data_layer=data_layer,
@@ -236,6 +290,7 @@ def main(args):
         encoderdecoder=encoderdecoder,
         greedy_decoder=greedy_decoder,
         labels=ctc_vocab,
+        device=device,
         args=args)
 
 if __name__=="__main__":
diff --git a/PyTorch/SpeechRecognition/Jasper/parts/features.py b/PyTorch/SpeechRecognition/Jasper/parts/features.py
index c075903f7..1d0611f21 100644
--- a/PyTorch/SpeechRecognition/Jasper/parts/features.py
+++ b/PyTorch/SpeechRecognition/Jasper/parts/features.py
@@ -21,13 +21,14 @@
 from apex import amp
 
 
-def audio_from_file(file_path, offset=0, duration=0, trim=False, target_sr=16000):
+def audio_from_file(file_path, offset=0, duration=0, trim=False, target_sr=16000,
+                    device=torch.device('cuda')):
     audio = AudioSegment.from_file(file_path,
                                    target_sr=target_sr,
                                    int_values=False,
                                    offset=offset, duration=duration, trim=trim)
-    samples=torch.tensor(audio.samples, dtype=torch.float).cuda()
-    num_samples = torch.tensor(samples.shape[0]).int().cuda()
+    samples=torch.tensor(audio.samples, dtype=torch.float, device=device)
+    num_samples = torch.tensor(samples.shape[0], device=device).int()
     return (samples.unsqueeze(0), num_samples.unsqueeze(0))
 
 class WaveformFeaturizer(object):
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX1-16GB_Jasper_AMP_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-16GB_Jasper_AMP_8GPU.sh
new file mode 100644
index 000000000..4ac61b677
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-16GB_Jasper_AMP_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 AMP=true BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=2 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX1-16GB_Jasper_FP32_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-16GB_Jasper_FP32_8GPU.sh
new file mode 100644
index 000000000..0317b41a6
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-16GB_Jasper_FP32_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=4 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX1-32GB_Jasper_AMP_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-32GB_Jasper_AMP_8GPU.sh
new file mode 100644
index 000000000..8953566d4
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-32GB_Jasper_AMP_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 AMP=true BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=1 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX1-32GB_Jasper_FP32_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-32GB_Jasper_FP32_8GPU.sh
new file mode 100644
index 000000000..eed6a1273
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX1-32GB_Jasper_FP32_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=2 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_AMP_16GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_AMP_16GPU.sh
new file mode 100644
index 000000000..a0ccca4a1
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_AMP_16GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=16 AMP=true BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=1 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_AMP_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_AMP_8GPU.sh
new file mode 100644
index 000000000..8953566d4
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_AMP_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 AMP=true BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=1 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_FP32_16GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_FP32_16GPU.sh
new file mode 100644
index 000000000..873fb92f1
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_FP32_16GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=16 BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=1 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_FP32_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_FP32_8GPU.sh
new file mode 100644
index 000000000..eed6a1273
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGX2_Jasper_FP32_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=2 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGXA100_Jasper_AMP_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGXA100_Jasper_AMP_8GPU.sh
new file mode 100644
index 000000000..8953566d4
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGXA100_Jasper_AMP_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 AMP=true BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=1 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/platform/DGXA100_Jasper_TF32_8GPU.sh b/PyTorch/SpeechRecognition/Jasper/platform/DGXA100_Jasper_TF32_8GPU.sh
new file mode 100644
index 000000000..eed6a1273
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/platform/DGXA100_Jasper_TF32_8GPU.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+NUM_GPUS=8 BATCH_SIZE=64 GRADIENT_ACCUMULATION_STEPS=2 bash scripts/train.sh "$@"
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/docker/launch.sh b/PyTorch/SpeechRecognition/Jasper/scripts/docker/launch.sh
index 2501d8a23..0ec7b9905 100755
--- a/PyTorch/SpeechRecognition/Jasper/scripts/docker/launch.sh
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/docker/launch.sh
@@ -4,27 +4,16 @@ JASPER_REPO=${JASPER_REPO:-"${SCRIPT_DIR}/../.."}
 
 # Launch TRT JASPER container.
 
-DATA_DIR=${DATA_DIR:-"/datasets"}
-CHECKPOINT_DIR=${CHECKPOINT_DIR:-"/checkpoints"}
-RESULT_DIR=${RESULT_DIR:-"/results"}
+DATA_DIR=${1:-${DATA_DIR:-"/datasets"}}   # precedence: positional arg, then non-empty env var, then default
+CHECKPOINT_DIR=${2:-${CHECKPOINT_DIR:-"/checkpoints"}}
+RESULT_DIR=${3:-${RESULT_DIR:-"/results"}}
 PROGRAM_PATH=${PROGRAM_PATH}
-    
 
 MOUNTS=""
-if [ ! -z "$DATA_DIR" ]; 
-then
-    MOUNTS="$MOUNTS -v $DATA_DIR:/datasets "
-fi
-
-if [ ! -z "$CHECKPOINT_DIR" ]; 
-then
-    MOUNTS="$MOUNTS -v $CHECKPOINT_DIR:/checkpoints "
-fi
-
-if [ ! -z "$RESULT_DIR" ]; 
-then
-    MOUNTS="$MOUNTS -v $RESULT_DIR:/results "
-fi
+MOUNTS+=" -v $DATA_DIR:/datasets"
+MOUNTS+=" -v $CHECKPOINT_DIR:/checkpoints"
+MOUNTS+=" -v $RESULT_DIR:/results"
+MOUNTS+=" -v ${JASPER_REPO}:/jasper"
 
 echo $MOUNTS
 nvidia-docker run -it --rm \
@@ -33,6 +22,5 @@ nvidia-docker run -it --rm \
   --ulimit memlock=-1 \
   --ulimit stack=67108864 \
   ${MOUNTS} \
-  -v ${JASPER_REPO}:/jasper \
   ${EXTRA_JASPER_ENV} \
   jasper:latest bash $PROGRAM_PATH
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/download_librispeech.sh b/PyTorch/SpeechRecognition/Jasper/scripts/download_librispeech.sh
index ee322fe30..07a07b866 100755
--- a/PyTorch/SpeechRecognition/Jasper/scripts/download_librispeech.sh
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/download_librispeech.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,16 +15,18 @@
 # limitations under the License.
 
 
-#!/usr/bin/env bash
-
 DATA_SET="LibriSpeech"
 DATA_ROOT_DIR="/datasets"
 DATA_DIR="${DATA_ROOT_DIR}/${DATA_SET}"
+
 if [ ! -d "$DATA_DIR" ]
 then
-    mkdir $DATA_DIR
-    chmod go+rx $DATA_DIR
-    python utils/download_librispeech.py utils/librispeech.csv $DATA_DIR -e ${DATA_ROOT_DIR}/
+   mkdir --mode 755 $DATA_DIR
+
+   python utils/download_librispeech.py \
+      utils/librispeech.csv \
+      $DATA_DIR \
+      -e ${DATA_ROOT_DIR}/
 else
-    echo "Directory $DATA_DIR already exists."
+   echo "Directory $DATA_DIR already exists."
 fi
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/evaluation.sh b/PyTorch/SpeechRecognition/Jasper/scripts/evaluation.sh
index fcd472fd9..6c5790dd1 100755
--- a/PyTorch/SpeechRecognition/Jasper/scripts/evaluation.sh
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/evaluation.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,54 +15,22 @@
 # limitations under the License.
 
 
-#!/bin/bash
-echo "Container nvidia build = " $NVIDIA_BUILD_ID
-
-DATA_DIR=${1:-"/datasets/LibriSpeech"}
-DATASET=${2:-"dev-clean"}
-MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
-RESULT_DIR=${4:-"/results"}
-CHECKPOINT=$5
-CREATE_LOGFILE=${6:-"true"}
-CUDNN_BENCHMARK=${7:-"false"}
-NUM_GPUS=${8:-1}
-PRECISION=${9:-"fp32"}
-NUM_STEPS=${10:-"-1"}
-SEED=${11:-0}
-BATCH_SIZE=${12:-64}
-
-
-if [ "$CREATE_LOGFILE" = "true" ] ; then
-    export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS)
-    printf -v TAG "jasper_evaluation_${DATASET}_%s_gbs%d" "$PRECISION" $GBS
-    DATESTAMP=`date +'%y%m%d%H%M%S'`
-    LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
-    printf "Logs written to %s\n" "$LOGFILE"
-fi
-
-
+echo "NVIDIA container build: ${NVIDIA_BUILD_ID}"
 
-PREC=""
-if [ "$PRECISION" = "fp16" ] ; then
-    PREC="--fp16"
-elif [ "$PRECISION" = "fp32" ] ; then
-    PREC=""
-else
-    echo "Unknown <precision> argument"
-    exit -2
-fi
-
-STEPS=""
-if [ "$NUM_STEPS" -gt 0 ] ; then
-    STEPS=" --steps $NUM_STEPS"
-fi
-
-if [ "$CUDNN_BENCHMARK" = "true" ] ; then
-    CUDNN_BENCHMARK=" --cudnn_benchmark"
-else
-    CUDNN_BENCHMARK=""
-fi
+DATA_DIR=${1:-${DATA_DIR:-"/datasets/LibriSpeech"}}
+DATASET=${2:-${DATASET:-"dev-clean"}}
+MODEL_CONFIG=${3:-${MODEL_CONFIG:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}}
+RESULT_DIR=${4:-${RESULT_DIR:-"/results"}}
+CHECKPOINT=${5:-${CHECKPOINT:-"/checkpoints/jasper_fp16.pt"}}
+CREATE_LOGFILE=${6:-${CREATE_LOGFILE:-"true"}}
+CUDNN_BENCHMARK=${7:-${CUDNN_BENCHMARK:-"false"}}
+NUM_GPUS=${8:-${NUM_GPUS:-1}}
+AMP=${9:-${AMP:-"false"}}
+NUM_STEPS=${10:-${NUM_STEPS:-"-1"}}
+SEED=${11:-${SEED:-0}}
+BATCH_SIZE=${12:-${BATCH_SIZE:-64}}
 
+mkdir -p "$RESULT_DIR"
 
 CMD=" inference.py "
 CMD+=" --batch_size $BATCH_SIZE "
@@ -69,18 +39,27 @@ CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json "
 CMD+=" --model_toml $MODEL_CONFIG  "
 CMD+=" --seed $SEED "
 CMD+=" --ckpt $CHECKPOINT "
-CMD+=" $CUDNN_BENCHMARK"
-CMD+=" $PREC "
-CMD+=" $STEPS "
+[ "$AMP" == "true" ] && \
+CMD+=" --amp"
+[ "$NUM_STEPS" -gt 0 ] && \
+CMD+=" --steps $NUM_STEPS"
+[ "$CUDNN_BENCHMARK" = "true" ] && \
+CMD+=" --cudnn"
 
+if [ "$CREATE_LOGFILE" = "true" ] ; then   # log name: script, dataset, AMP flag, global batch size, timestamp
+   export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS)
+   printf -v TAG "jasper_evaluation_%s_amp-%s_gbs%d" "$DATASET" "$AMP" "$GBS"
+   DATESTAMP=`date +'%y%m%d%H%M%S'`
+   LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
+   printf "Logs written to %s\n" "$LOGFILE"
+fi
 
 if [ "$NUM_GPUS" -gt 1  ] ; then
-    CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD"
+   CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD"
 else
-    CMD="python3  $CMD"
+   CMD="python3  $CMD"
 fi
 
-
 set -x
 if [ -z "$LOGFILE" ] ; then
    $CMD
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/inference.sh b/PyTorch/SpeechRecognition/Jasper/scripts/inference.sh
index a11c467b5..ac9fcb945 100755
--- a/PyTorch/SpeechRecognition/Jasper/scripts/inference.sh
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/inference.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,83 +15,56 @@
 # limitations under the License.
 
 
-#!/bin/bash
-echo "Container nvidia build = " $NVIDIA_BUILD_ID
-
+echo "NVIDIA container build: ${NVIDIA_BUILD_ID}"
 
-DATA_DIR=${1-"/datasets/LibriSpeech"}
-DATASET=${2:-"dev-clean"}
-MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
-RESULT_DIR=${4:-"/results"}
-CHECKPOINT=${5:-"/checkpoints/jasper_fp16.pt"}
-CREATE_LOGFILE=${6:-"true"}
-CUDNN_BENCHMARK=${7:-"false"}
-PRECISION=${8:-"fp32"}
-NUM_STEPS=${9:-"-1"}
-SEED=${10:-0}
-BATCH_SIZE=${11:-64}
-MODELOUTPUT_FILE=${12:-"none"}
-PREDICTION_FILE=${13:-"$RESULT_DIR/${DATASET}.predictions"}
-
-if [ "$CREATE_LOGFILE" = "true" ] ; then
-    export GBS=$(expr $BATCH_SIZE)
-    printf -v TAG "jasper_inference_${DATASET}_%s_gbs%d" "$PRECISION" $GBS
-    DATESTAMP=`date +'%y%m%d%H%M%S'`
-    LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
-    printf "Logs written to %s\n" "$LOGFILE"
-fi
+DATA_DIR=${1:-${DATA_DIR:-"/datasets/LibriSpeech"}}
+DATASET=${2:-${DATASET:-"dev-clean"}}
+MODEL_CONFIG=${3:-${MODEL_CONFIG:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}}
+RESULT_DIR=${4:-${RESULT_DIR:-"/results"}}
+CHECKPOINT=${5:-${CHECKPOINT:-"/checkpoints/jasper_fp16.pt"}}
+CREATE_LOGFILE=${6:-${CREATE_LOGFILE:-"true"}}
+CUDNN_BENCHMARK=${7:-${CUDNN_BENCHMARK:-"false"}}
+AMP=${8:-${AMP:-"false"}}
+NUM_STEPS=${9:-${NUM_STEPS:-"-1"}}
+SEED=${10:-${SEED:-0}}
+BATCH_SIZE=${11:-${BATCH_SIZE:-64}}
+LOGITS_FILE=${12:-${LOGITS_FILE:-""}}
+PREDICTION_FILE=${13:-${PREDICTION_FILE:-"${RESULT_DIR}/${DATASET}.predictions"}}
+CPU=${14:-${CPU:-"false"}}
+EMA=${15:-${EMA:-"false"}}   # positional slot 15; slot 14 belongs to CPU
 
+mkdir -p "$RESULT_DIR"
 
-
-PREC=""
-if [ "$PRECISION" = "fp16" ] ; then
-    PREC="--fp16"
-elif [ "$PRECISION" = "fp32" ] ; then
-    PREC=""
-else
-    echo "Unknown <precision> argument"
-    exit -2
-fi
-
-PRED=""
-if [ "$PREDICTION_FILE" = "none" ] ; then
-    PRED=""
-else
-    PRED=" --save_prediction $PREDICTION_FILE"
-fi
-
-OUTPUT=""
-if [ "$MODELOUTPUT_FILE" = "none" ] ; then
-    OUTPUT=" "
-else
-    OUTPUT=" --logits_save_to $MODELOUTPUT_FILE"
-fi
-
-
-if [ "$CUDNN_BENCHMARK" = "true" ]; then
-    CUDNN_BENCHMARK=" --cudnn_benchmark"
-else
-    CUDNN_BENCHMARK=""
-fi
-
-STEPS=""
-if [ "$NUM_STEPS" -gt 0 ] ; then
-    STEPS=" --steps $NUM_STEPS"
-fi
-
-CMD=" python inference.py "
+CMD="python inference.py "
 CMD+=" --batch_size $BATCH_SIZE "
 CMD+=" --dataset_dir $DATA_DIR "
 CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json "
 CMD+=" --model_toml $MODEL_CONFIG  "
 CMD+=" --seed $SEED "
-CMD+=" --ckpt $CHECKPOINT "
-CMD+=" $CUDNN_BENCHMARK"
-CMD+=" $PRED "
-CMD+=" $OUTPUT "
-CMD+=" $PREC "
-CMD+=" $STEPS "
+[ "$NUM_STEPS" -gt 0 ] && \
+CMD+=" --steps $NUM_STEPS"
+[ "$CUDNN_BENCHMARK" = "true" ] && \
+CMD+=" --cudnn"
+[ "$AMP" == "true" ] && \
+CMD+=" --amp"
+[ "$CPU" == "true" ] && \
+CMD+=" --cpu"
+[ "$EMA" == "true" ] && \
+CMD+=" --ema"
+[ -n "$CHECKPOINT" ] && \
+CMD+=" --ckpt=${CHECKPOINT}"
+[ -n "$PREDICTION_FILE" ] && \
+CMD+=" --save_prediction $PREDICTION_FILE"
+[ -n "$LOGITS_FILE" ] && \
+CMD+=" --logits_save_to $LOGITS_FILE"
 
+if [ "$CREATE_LOGFILE" = "true" ] ; then   # log name: script, dataset, AMP flag, batch size, timestamp
+   export GBS=$(expr $BATCH_SIZE)
+   printf -v TAG "jasper_inference_%s_amp-%s_gbs%d" "$DATASET" "$AMP" "$GBS"
+   DATESTAMP=`date +'%y%m%d%H%M%S'`
+   LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
+   printf "Logs written to %s\n" "$LOGFILE"
+fi
 
 set -x
 if [ -z "$LOGFILE" ] ; then
@@ -100,5 +75,5 @@ else
    ) |& tee "$LOGFILE"
 fi
 set +x
-echo "MODELOUTPUT_FILE: ${MODELOUTPUT_FILE}"
-echo "PREDICTION_FILE: ${PREDICTION_FILE}"
+[ -z "$PREDICTION_FILE" ] || echo "PREDICTION_FILE: ${PREDICTION_FILE}"
+[ -z "$LOGITS_FILE" ] || echo "LOGITS_FILE: ${LOGITS_FILE}"   # `||` form: an empty value no longer leaves a non-zero status
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark.sh b/PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark.sh
index f3d9fddc5..82d5ead5a 100755
--- a/PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark.sh
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,42 +15,22 @@
 # limitations under the License.
 
 
-#!/bin/bash
-
-echo "Container nvidia build = " $NVIDIA_BUILD_ID
+echo "NVIDIA container build: ${NVIDIA_BUILD_ID}"
 
+DATA_DIR=${1:-${DATA_DIR:-"/datasets/LibriSpeech"}}
+DATASET=${2:-${DATASET:-"dev-clean"}}
+MODEL_CONFIG=${3:-${MODEL_CONFIG:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}}
+RESULT_DIR=${4:-${RESULT_DIR:-"/results"}}
+CHECKPOINT=${5:-${CHECKPOINT:-"/checkpoints/jasper_fp16.pt"}}
+CREATE_LOGFILE=${6:-${CREATE_LOGFILE:-"true"}}
+CUDNN_BENCHMARK=${7:-${CUDNN_BENCHMARK:-"true"}}
+AMP=${8:-${AMP:-"false"}}
+NUM_STEPS=${9:-${NUM_STEPS:-"-1"}}
+MAX_DURATION=${10:-${MAX_DURATION:-"36"}}
+SEED=${11:-${SEED:-0}}
+BATCH_SIZE=${12:-${BATCH_SIZE:-64}}
 
-DATA_DIR=${1:-"/datasets/LibriSpeech"}
-DATASET=${2:-"dev-clean"}
-MODEL_CONFIG=${3:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
-RESULT_DIR=${4:-"/results"}
-CHECKPOINT=${5:-"/checkpoints/jasper_fp16.pt"}
-CREATE_LOGFILE=${6:-"true"}
-CUDNN_BENCHMARK=${7:-"true"}
-PRECISION=${8:-"fp32"}
-NUM_STEPS=${9:-"-1"}
-MAX_DURATION=${10:-"36"}
-SEED=${11:-0}
-BATCH_SIZE=${12:-64}
-
-PREC=""
-if [ "$PRECISION" = "fp16" ] ; then
-   PREC="--fp16"
-elif [ "$PRECISION" = "fp32" ] ; then
-   PREC=""
-else
-   echo "Unknown <precision> argument"
-   exit -2
-fi
-STEPS=""
-if [ "$NUM_STEPS" -gt 0 ] ; then
-   STEPS=" --steps $NUM_STEPS"
-fi
-if [ "$CUDNN_BENCHMARK" = "true" ] ; then
-    CUDNN_BENCHMARK=" --cudnn_benchmark"
-else
-    CUDNN_BENCHMARK=""
-fi
+mkdir -p "$RESULT_DIR"
 
 CMD=" python inference_benchmark.py"
 CMD+=" --batch_size=$BATCH_SIZE"
@@ -59,17 +41,19 @@ CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json "
 CMD+=" --ckpt=$CHECKPOINT"
 CMD+=" --max_duration=$MAX_DURATION"
 CMD+=" --pad_to=-1"
-CMD+=" $CUDNN_BENCHMARK"
-CMD+=" $PREC"
-CMD+=" $STEPS"
-
+[ "$AMP" == "true" ] && \
+CMD+=" --amp"
+[ "$NUM_STEPS" -gt 0 ] && \
+CMD+=" --steps $NUM_STEPS"
+[ "$CUDNN_BENCHMARK" = "true" ] && \
+CMD+=" --cudnn"
 
 if [ "$CREATE_LOGFILE" = "true" ] ; then
-  export GBS=$(expr $BATCH_SIZE )
-  printf -v TAG "jasper_inference_benchmark_%s_gbs%d" "$PRECISION" $GBS
-  DATESTAMP=`date +'%y%m%d%H%M%S'`
-  LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
-  printf "Logs written to %s\n" "$LOGFILE"
+   export GBS=$(expr $BATCH_SIZE )   # single process, so global batch size equals BATCH_SIZE
+   printf -v TAG "jasper_inference_benchmark_amp-%s_gbs%d" "$AMP" "$GBS"
+   DATESTAMP=`date +'%y%m%d%H%M%S'`
+   LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
+   printf "Logs written to %s\n" "$LOGFILE"
 fi
 
 set -x
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark_cpu.sh b/PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark_cpu.sh
new file mode 100755
index 000000000..598108749
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/inference_benchmark_cpu.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+echo "NVIDIA container build: ${NVIDIA_BUILD_ID}"
+
+export CUDA_VISIBLE_DEVICES=""   # exported so the python process launched below inherits the empty device list
+DATA_DIR=${1:-${DATA_DIR:-"/datasets/LibriSpeech"}}
+DATASET=${2:-${DATASET:-"dev-clean"}}
+MODEL_CONFIG=${3:-${MODEL_CONFIG:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}}
+RESULT_DIR=${4:-${RESULT_DIR:-"/results"}}
+CHECKPOINT=${5:-${CHECKPOINT:-"/checkpoints/jasper_fp16.pt"}}
+CREATE_LOGFILE=${6:-${CREATE_LOGFILE:-"true"}}
+NUM_STEPS=${7:-${NUM_STEPS:-"-1"}}
+MAX_DURATION=${8:-${MAX_DURATION:-"36"}}
+SEED=${9:-${SEED:-0}}
+BATCH_SIZE=${10:-${BATCH_SIZE:-32}}
+SAMPLE_AUDIO=${11:-${SAMPLE_AUDIO:-"/datasets/LibriSpeech/dev-clean-wav/1272/128104/1272-128104-0000.wav"}}
+
+mkdir -p "$RESULT_DIR"
+
+CMD=" python inference_benchmark.py"
+CMD+=" --cpu"
+CMD+=" --batch_size=$BATCH_SIZE"
+CMD+=" --model_toml=$MODEL_CONFIG"
+CMD+=" --seed=$SEED"
+CMD+=" --dataset_dir=$DATA_DIR"
+CMD+=" --val_manifest $DATA_DIR/librispeech-${DATASET}-wav.json "
+CMD+=" --ckpt=$CHECKPOINT"
+CMD+=" --max_duration=$MAX_DURATION"
+CMD+=" --pad_to=-1"
+CMD+=" --sample_audio=$SAMPLE_AUDIO"
+[ "$NUM_STEPS" -gt 0 ] && \
+CMD+=" --steps $NUM_STEPS"
+
+if [ "$CREATE_LOGFILE" = "true" ] ; then   # log name: script, batch size, timestamp (this script defines no AMP setting)
+   export GBS=$(expr $BATCH_SIZE )
+   printf -v TAG "jasper_inference_benchmark_cpu_gbs%d" "$GBS"
+   DATESTAMP=`date +'%y%m%d%H%M%S'`
+   LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
+   printf "Logs written to %s\n" "$LOGFILE"
+fi
+
+set -x
+if [ -z "$LOGFILE" ] ; then
+   $CMD
+else
+   (
+     $CMD
+   ) |& tee "$LOGFILE"
+   grep 'latency' "$LOGFILE"
+fi
+set +x
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/train.sh b/PyTorch/SpeechRecognition/Jasper/scripts/train.sh
index 17c1042fb..07fbd5bbf 100755
--- a/PyTorch/SpeechRecognition/Jasper/scripts/train.sh
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/train.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,90 +15,61 @@
 # limitations under the License.
 
 
-#!/bin/bash
-echo "Container nvidia build = " $NVIDIA_BUILD_ID
-
-
-DATA_DIR=${1:-"/datasets/LibriSpeech"}
-MODEL_CONFIG=${2:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
-RESULT_DIR=${3:-"/results"}
-CHECKPOINT=${4:-"none"}
-CREATE_LOGFILE=${5:-"true"}
-CUDNN_BENCHMARK=${6:-"true"}
-NUM_GPUS=${7:-8}
-PRECISION=${8:-"fp16"}
-EPOCHS=${9:-400}
-SEED=${10:-6}
-BATCH_SIZE=${11:-64}
-LEARNING_RATE=${12:-"0.015"}
-GRADIENT_ACCUMULATION_STEPS=${13:-1}
-LAUNCH_OPT=${LAUNCH_OPT:-"none"}
-
-
-PREC=""
-if [ "$PRECISION" = "fp16" ] ; then
-   PREC="--fp16"
-elif [ "$PRECISION" = "fp32" ] ; then
-   PREC=""
-else
-   echo "Unknown <precision> argument"
-   exit -2
-fi
-
-CUDNN=""
-if [ "$CUDNN_BENCHMARK" = "true" ] && [ "$PRECISION" = "fp16" ]; then
-   CUDNN=" --cudnn"
-else
-   CUDNN=""
-fi
-
-
-
-if [ "$CHECKPOINT" = "none" ] ; then
-   CHECKPOINT=""
-else
-   CHECKPOINT=" --ckpt=${CHECKPOINT}"
-fi
-
-
-CMD=" train.py"
+echo "NVIDIA container build: ${NVIDIA_BUILD_ID}"
+
+DATA_DIR=${1:-${DATA_DIR:-"/datasets/LibriSpeech"}}
+MODEL_CONFIG=${2:-${MODEL_CONFIG:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}}
+RESULT_DIR=${3:-${RESULT_DIR:-"/results"}}
+CHECKPOINT=${4:-${CHECKPOINT:-""}}
+CREATE_LOGFILE=${5:-${CREATE_LOGFILE:-"true"}}
+CUDNN_BENCHMARK=${6:-${CUDNN_BENCHMARK:-"true"}}
+NUM_GPUS=${7:-${NUM_GPUS:-8}}
+AMP=${8:-${AMP:-"false"}}
+EPOCHS=${9:-${EPOCHS:-400}}
+SEED=${10:-${SEED:-6}}
+BATCH_SIZE=${11:-${BATCH_SIZE:-64}}
+LEARNING_RATE=${12:-${LEARNING_RATE:-"0.015"}}
+GRADIENT_ACCUMULATION_STEPS=${13:-${GRADIENT_ACCUMULATION_STEPS:-2}}
+EMA=${EMA:-0.999}
+SAVE_FREQUENCY=${SAVE_FREQUENCY:-10}
+
+mkdir -p "$RESULT_DIR"
+
+CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS"
+CMD+=" train.py"
 CMD+=" --batch_size=$BATCH_SIZE"
 CMD+=" --num_epochs=$EPOCHS"
 CMD+=" --output_dir=$RESULT_DIR"
 CMD+=" --model_toml=$MODEL_CONFIG"
 CMD+=" --lr=$LEARNING_RATE"
+CMD+=" --ema=$EMA"
 CMD+=" --seed=$SEED"
 CMD+=" --optimizer=novograd"
 CMD+=" --dataset_dir=$DATA_DIR"
 CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json"
-CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json"
+CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,"
+CMD+="$DATA_DIR/librispeech-train-clean-360-wav.json,"
+CMD+="$DATA_DIR/librispeech-train-other-500-wav.json"
 CMD+=" --weight_decay=1e-3"
-CMD+=" --save_freq=10"
+CMD+=" --save_freq=$SAVE_FREQUENCY"
 CMD+=" --eval_freq=100"
-CMD+=" --train_freq=25"
+CMD+=" --train_freq=1"
 CMD+=" --lr_decay"
 CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS "
-CMD+=" $CHECKPOINT"
-CMD+=" $PREC"
-CMD+=" $CUDNN"
-
-
-if [ "${LAUNCH_OPT}" != "none" ]; then
-   CMD="python -m $LAUNCH_OPT $CMD"
-elif [ "$NUM_GPUS" -gt 1  ] ; then
-   CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD"
-else
-   CMD="python3  $CMD"
-fi
-
 
+[ "$AMP" == "true" ] && \
+CMD+=" --amp"
+[ "$CUDNN_BENCHMARK" = "true" ] && \
+CMD+=" --cudnn"
+[ -n "$CHECKPOINT" ] && \
+CMD+=" --ckpt=${CHECKPOINT}"
 
 if [ "$CREATE_LOGFILE" = "true" ] ; then
-  export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS)
-  printf -v TAG "jasper_train_%s_gbs%d" "$PRECISION" $GBS
-  DATESTAMP=`date +'%y%m%d%H%M%S'`
-  LOGFILE=$RESULT_DIR/$TAG.$DATESTAMP.log
-  printf "Logs written to %s\n" "$LOGFILE"
+   export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS)   # per-GPU batch size times number of GPUs
+   printf -v TAG "jasper_train_amp-%s_gbs%d" "$AMP" "$GBS"
+   DATESTAMP=`date +'%y%m%d%H%M%S'`
+   LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
+   printf "Logs written to %s\n" "$LOGFILE"
 fi
 
 set -x
diff --git a/PyTorch/SpeechRecognition/Jasper/scripts/train_benchmark.sh b/PyTorch/SpeechRecognition/Jasper/scripts/train_benchmark.sh
index a75cb2fd1..c74f704b3 100755
--- a/PyTorch/SpeechRecognition/Jasper/scripts/train_benchmark.sh
+++ b/PyTorch/SpeechRecognition/Jasper/scripts/train_benchmark.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,59 +14,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#!/bin/bash
 
-echo "Container nvidia build = " $NVIDIA_BUILD_ID
+echo "NVIDIA container build: ${NVIDIA_BUILD_ID}"
+
 SCRIPT_DIR=$(cd $(dirname $0); pwd)
 PROJECT_DIR=${SCRIPT_DIR}/..
 
-DATA_DIR=${1:-"/datasets/LibriSpeech"}
-MODEL_CONFIG=${2:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}
-RESULT_DIR=${3:-"/results"}
-CREATE_LOGFILE=${4:-"true"}
-CUDNN_BENCHMARK=${5:-"true"}
-NUM_GPUS=${6:-8}
-PRECISION=${7:-"fp16"}
-NUM_STEPS=${8:-"-1"}
-MAX_DURATION=${9:-16.7}
-SEED=${10:-0}
-BATCH_SIZE=${11:-64}
-LEARNING_RATE=${12:-"0.015"}
-GRADIENT_ACCUMULATION_STEPS=${13:-1}
-PRINT_FREQUENCY=${14:-1}
+DATA_DIR=${1:-${DATA_DIR:-"/datasets/LibriSpeech"}}
+MODEL_CONFIG=${2:-${MODEL_CONFIG:-"configs/jasper10x5dr_sp_offline_specaugment.toml"}}
+RESULT_DIR=${3:-${RESULT_DIR:-"/results"}}
+CREATE_LOGFILE=${4:-${CREATE_LOGFILE:-"true"}}
+CUDNN_BENCHMARK=${5:-${CUDNN_BENCHMARK:-"true"}}
+NUM_GPUS=${6:-${NUM_GPUS:-8}}
+AMP=${7:-${AMP:-"false"}}
+NUM_STEPS=${8:-${NUM_STEPS:-"-1"}}
+MAX_DURATION=${9:-${MAX_DURATION:-16.7}}
+SEED=${10:-${SEED:-0}}
+BATCH_SIZE=${11:-${BATCH_SIZE:-32}}
+LEARNING_RATE=${12:-${LEARNING_RATE:-"0.015"}}
+GRADIENT_ACCUMULATION_STEPS=${13:-${GRADIENT_ACCUMULATION_STEPS:-1}}
+PRINT_FREQUENCY=${14:-${PRINT_FREQUENCY:-1}}
 USE_PROFILER=${USE_PROFILER:-"false"}
 
-PREC=""
-if [ "$PRECISION" = "fp16" ] ; then
-   PREC=" --fp16"
-elif [ "$PRECISION" = "fp32" ] ; then
-   PREC=""
-else
-   echo "Unknown <precision> argument"
-   exit -2
-fi
-
-STEPS=""
-if [ "$NUM_STEPS" -ne "-1" ] ; then
-   STEPS=" --num_steps=$NUM_STEPS"
-elif [ "$NUM_STEPS" = "-1" ] ; then
-   STEPS=""
-else
-   echo "Unknown <precision> argument"
-   exit -2
-fi
+mkdir -p "$RESULT_DIR"
 
-CUDNN=""
-if [ "$CUDNN_BENCHMARK" = "true" ] ; then
-   CUDNN=" --cudnn"
-else
-   CUDNN=""
-fi
+[ "${USE_PROFILER}" = "true" ] && PYTHON_ARGS="-m cProfile -s cumtime"
 
-if [ "${USE_PROFILER}" = "true" ] ; then
-    PYTHON_ARGS+="-m cProfile  -s cumtime"
-fi
-    
 CMD="${PYTHON_ARGS} ${PROJECT_DIR}/train.py"
 CMD+=" --batch_size=$BATCH_SIZE"
 CMD+=" --num_epochs=400"
@@ -76,7 +51,9 @@ CMD+=" --optimizer=novograd"
 CMD+=" --gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS"
 CMD+=" --dataset_dir=$DATA_DIR"
 CMD+=" --val_manifest=$DATA_DIR/librispeech-dev-clean-wav.json"
-CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,$DATA_DIR/librispeech-train-clean-360-wav.json,$DATA_DIR/librispeech-train-other-500-wav.json"
+CMD+=" --train_manifest=$DATA_DIR/librispeech-train-clean-100-wav.json,"
+CMD+="$DATA_DIR/librispeech-train-clean-360-wav.json,"
+CMD+="$DATA_DIR/librispeech-train-other-500-wav.json"
 CMD+=" --weight_decay=1e-3"
 CMD+=" --save_freq=100000"
 CMD+=" --eval_freq=100000"
@@ -84,24 +61,25 @@ CMD+=" --max_duration=$MAX_DURATION"
 CMD+=" --pad_to_max"
 CMD+=" --train_freq=$PRINT_FREQUENCY"
 CMD+=" --lr_decay "
-CMD+=" $CUDNN"
-CMD+=" $PREC"
-CMD+=" $STEPS"
+[ "$AMP" == "true" ] && \
+CMD+=" --amp"
+[ "$CUDNN_BENCHMARK" = "true" ] && \
+CMD+=" --cudnn"
+[ "$NUM_STEPS" -gt 0 ] && \
+CMD+=" --num_steps=$NUM_STEPS"   # any positive step count is forwarded; the default -1 adds no flag
 
 if [ "$NUM_GPUS" -gt 1  ] ; then
    CMD="python3 -m torch.distributed.launch --nproc_per_node=$NUM_GPUS $CMD"
 else
-   CMD="python3  $CMD"
+   CMD="python3 $CMD"
 fi
 
-
 if [ "$CREATE_LOGFILE" = "true" ] ; then
-  export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS)
-  printf -v TAG "jasper_train_benchmark_%s_gbs%d" "$PRECISION" $GBS
-  DATESTAMP=`date +'%y%m%d%H%M%S'`
-  LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
-  printf "Logs written to %s\n" "$LOGFILE"
-
+   export GBS=$(expr $BATCH_SIZE \* $NUM_GPUS)
+   printf -v TAG "jasper_train_benchmark_amp-%s_gbs%d" "$AMP" $GBS
+   DATESTAMP=`date +'%y%m%d%H%M%S'`
+   LOGFILE="${RESULT_DIR}/${TAG}.${DATESTAMP}.log"
+   printf "Logs written to %s\n" "$LOGFILE"
 fi
 
 if [ -z "$LOGFILE" ] ; then
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/Dockerfile b/PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile
similarity index 88%
rename from PyTorch/SpeechRecognition/Jasper/trt/Dockerfile
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile
index f89dd7107..ad31eabce 100644
--- a/PyTorch/SpeechRecognition/Jasper/trt/Dockerfile
+++ b/PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile
@@ -1,8 +1,6 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3 
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
 FROM ${FROM_IMAGE_NAME}
 
-RUN apt-get update && apt-get install -y python3
-
 WORKDIR /tmp/onnx-trt
 COPY trt/onnx-trt.patch .
 RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && git checkout 8716c9b && git submodule update --init --recursive && \
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/README.md b/PyTorch/SpeechRecognition/Jasper/tensorrt/README.md
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/README.md
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/README.md
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/onnx-trt.patch b/PyTorch/SpeechRecognition/Jasper/tensorrt/onnx-trt.patch
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/onnx-trt.patch
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/onnx-trt.patch
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/perf.py b/PyTorch/SpeechRecognition/Jasper/tensorrt/perf.py
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/perf.py
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/perf.py
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/perfprocedures.py b/PyTorch/SpeechRecognition/Jasper/tensorrt/perfprocedures.py
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/perfprocedures.py
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/perfprocedures.py
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/perfutils.py b/PyTorch/SpeechRecognition/Jasper/tensorrt/perfutils.py
similarity index 97%
rename from PyTorch/SpeechRecognition/Jasper/trt/perfutils.py
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/perfutils.py
index 3da69fc73..483c64115 100644
--- a/PyTorch/SpeechRecognition/Jasper/trt/perfutils.py
+++ b/PyTorch/SpeechRecognition/Jasper/tensorrt/perfutils.py
@@ -16,7 +16,7 @@
 
 from model import GreedyCTCDecoder, AudioPreprocessing, JasperEncoderDecoder
 from dataset import AudioToTextDataLayer
-from helpers import AmpOptimizations, process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, norm
+from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, norm
 from apex import amp
 import torch
 import torch.nn as nn
@@ -218,7 +218,7 @@ def get_pytorch_components_and_onnx(args):
     # if we are to produce engine, not run/create ONNX, postpone AMP initialization
     # (ONNX parser cannot handle mixed FP16 ONNX yet)
     if args.pyt_fp16 and args.engine_path is None:
-        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])
+        amp.initialize(models=model, opt_level='O'+str(optim_level))
         
     if args.make_onnx:
         if args.onnx_path is None or args.ckpt_path is None:
@@ -226,7 +226,7 @@ def get_pytorch_components_and_onnx(args):
         onnx_path = get_onnx(args.onnx_path, model, args)
 
     if args.pyt_fp16 and args.engine_path is not None:
-        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])
+        amp.initialize(models=model, opt_level='O'+str(optim_level))
     
     return {'data_layer': data_layer,
             'audio_preprocessor': audio_preprocessor,
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/requirements.txt b/PyTorch/SpeechRecognition/Jasper/tensorrt/requirements.txt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/requirements.txt
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/requirements.txt
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/launch.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/launch.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/download_inference_librispeech.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/download_inference_librispeech.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/scripts/download_inference_librispeech.sh
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/download_inference_librispeech.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/preprocess_inference_librispeech.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/preprocess_inference_librispeech.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/scripts/preprocess_inference_librispeech.sh
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/preprocess_inference_librispeech.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/trt_inference.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference.sh
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/trt_inference.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference_benchmark.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/trt_inference_benchmark.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/scripts/trt_inference_benchmark.sh
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/trt_inference_benchmark.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/walk_benchmark.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/walk_benchmark.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/scripts/walk_benchmark.sh
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/walk_benchmark.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/trtutils.py b/PyTorch/SpeechRecognition/Jasper/tensorrt/trtutils.py
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trt/trtutils.py
rename to PyTorch/SpeechRecognition/Jasper/tensorrt/trtutils.py
diff --git a/PyTorch/SpeechRecognition/Jasper/train.py b/PyTorch/SpeechRecognition/Jasper/train.py
index a3d3cee00..aed09e8e1 100644
--- a/PyTorch/SpeechRecognition/Jasper/train.py
+++ b/PyTorch/SpeechRecognition/Jasper/train.py
@@ -13,21 +13,26 @@
 # limitations under the License.
 
 import argparse
+import copy
 import itertools
+import math
 import os
+import random
 import time
+
 import toml
 import torch
-import apex
-from apex import amp
-import random
 import numpy as np
-import math
+from apex import amp
+
 from dataset import AudioToTextDataLayer
-from helpers import monitor_asr_train_progress, process_evaluation_batch, process_evaluation_epoch,  add_ctc_labels, AmpOptimizations, model_multi_gpu, print_dict, print_once
+from helpers import (add_ctc_labels, model_multi_gpu, monitor_asr_train_progress,
+                     print_dict, print_once, process_evaluation_batch,
+                     process_evaluation_epoch)
 from model import AudioPreprocessing, CTCLossNM, GreedyCTCDecoder, Jasper
 from optimizers import Novograd, AdamW
 
+
 def lr_policy(initial_lr, step, N):
     """
     learning rate decay
@@ -40,37 +45,57 @@ def lr_policy(initial_lr, step, N):
     res = initial_lr * ((N - step) / N) ** 2
     return max(res, min_lr)
 
-def save(model, optimizer, epoch, output_dir):
+
+def save(model, ema_model, optimizer, epoch, output_dir, optim_level):
     """
     Saves model checkpoint
     Args:
         model: model
+        ema_model: model with exponential averages of weights
         optimizer: optimizer
         epoch: epoch of model training
         output_dir: path to save model checkpoint
     """
-    class_name = model.__class__.__name__
-    unix_time = time.time()
-    file_name = "{0}_{1}-epoch-{2}.pt".format(class_name, unix_time, epoch)
-    print_once("Saving module {0} in {1}".format(class_name, os.path.join(output_dir, file_name)))
-    if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-        save_checkpoint={
-                        'epoch': epoch,
-                        'state_dict': model_to_save.state_dict(),
-                        'optimizer': optimizer.state_dict()
-                        }
-
-        torch.save(save_checkpoint, os.path.join(output_dir, file_name))
+    out_fpath = os.path.join(output_dir, f"Jasper_epoch{epoch}_checkpoint.pt")
+    print_once(f"Saving {out_fpath}...")
+
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+        rank = torch.distributed.get_rank()
+    else:
+        rank = 0
+
+    if rank == 0:
+        checkpoint = {
+            'epoch': epoch,
+            'state_dict': getattr(model, 'module', model).state_dict(),
+            'optimizer': optimizer.state_dict(),
+            'amp': amp.state_dict() if optim_level > 0 else None,
+
+        }
+        if ema_model is not None:
+            checkpoint['ema_state_dict'] = getattr(ema_model, 'module', ema_model).state_dict()
+        torch.save(checkpoint, out_fpath)
+
     print_once('Saved.')
 
 
+def apply_ema(model, ema_model, decay):
+    if not decay:
+        return
+    st = model.state_dict()
+    add_module = hasattr(model, 'module') and not hasattr(ema_model, 'module')
+    for k,v in ema_model.state_dict().items():
+        if add_module and not k.startswith('module.'):
+            k = 'module.' + k
+        v.copy_(decay * v + (1 - decay) * st[k])
 
 
 def train(
         data_layer,
         data_layer_eval,
         model,
+        ema_model,
         ctc_loss,
         greedy_decoder,
         optimizer,
@@ -93,7 +118,7 @@ def train(
         args: script input argument list
         fn_lr_policy: learning rate adjustment function
     """
-    def eval():
+    def eval(model, name=''):
         """Evaluates model on evaluation dataset
         """
         with torch.no_grad():
@@ -136,8 +161,11 @@ def eval():
             # final aggregation across all workers and minibatches) and logging of results
             wer, eloss = process_evaluation_epoch(_global_var_dict)
 
-            print_once("==========>>>>>>Evaluation Loss: {0}\n".format(eloss))
-            print_once("==========>>>>>>Evaluation WER: {0}\n".format(wer))
+            if name != '':
+                name = '_' + name
+
+            print_once(f"==========>>>>>>Evaluation{name} Loss: {eloss}\n")
+            print_once(f"==========>>>>>>Evaluation{name} WER: {wer}\n")
 
     print_once("Starting .....")
     start_time = time.time()
@@ -178,7 +206,7 @@ def eval():
             model.train()
             if optim_level == 1:
               with amp.disable_casts():
-                  t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(t_audio_signal_t, t_a_sig_length_t) 
+                  t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(t_audio_signal_t, t_a_sig_length_t)
             else:
               t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(t_audio_signal_t, t_a_sig_length_t)
             t_processed_signal_t = data_spectr_augmentation(t_processed_signal_t)
@@ -186,12 +214,12 @@ def eval():
                 t_log_probs_t, t_encoded_len_t = model.forward((t_processed_signal_t, t_processed_sig_length_t))
             else:
                 t_log_probs_t = model.forward(t_processed_signal_t)
-            
+
             t_loss_t = ctc_loss(log_probs=t_log_probs_t, targets=t_transcript_t, input_length=t_encoded_len_t, target_length=t_transcript_len_t)
             if args.gradient_accumulation_steps > 1:
                     t_loss_t = t_loss_t / args.gradient_accumulation_steps
 
-            if optim_level >=0 and optim_level <=3:
+            if 0 < optim_level <= 3:
                 with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                     scaled_loss.backward()
             else:
@@ -211,7 +239,10 @@ def eval():
                     print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                 if step > 0 and step % args.eval_frequency == 0:
                     print_once("Doing Evaluation ....................... ......  ... .. . .")
-                    eval()
+                    eval(model)
+                    if args.ema > 0:
+                        eval(ema_model, 'EMA')
+
                 step += 1
                 batch_counter = 0
                 average_loss = 0
@@ -223,13 +254,16 @@ def eval():
         print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
         epoch += 1
         if epoch % args.save_frequency == 0 and epoch > 0:
-            save(model, optimizer, epoch, output_dir=args.output_dir)
+            save(model, ema_model, optimizer, epoch, args.output_dir, optim_level)
         if args.num_steps is None and epoch >= args.num_epochs:
             break
     print_once("Done in {0}".format(time.time() - start_time))
     print_once("Final Evaluation ....................... ......  ... .. . .")
-    eval()
-    save(model, optimizer, epoch, output_dir=args.output_dir)
+    eval(model)
+    if args.ema > 0:
+        eval(ema_model, 'EMA')
+    save(model, ema_model, optimizer, epoch, args.output_dir, optim_level)
+
 
 def main(args):
     random.seed(args.seed)
@@ -247,12 +281,9 @@ def main(args):
     multi_gpu = torch.distributed.is_initialized()
     if multi_gpu:
         print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))
-                
+
     # define amp optimiation level
-    if args.fp16:
-        optim_level = 1
-    else:
-        optim_level = 0
+    optim_level = 1 if args.amp else 0
 
     jasper_model_definition = toml.load(args.model_toml)
     dataset_vocab = jasper_model_definition['labels']['labels']
@@ -271,9 +302,9 @@ def main(args):
         assert(args.max_duration > 0)
         featurizer_config['max_duration'] = args.max_duration
         featurizer_config_eval['max_duration'] = args.max_duration
-        featurizer_config['pad_to'] = -1        
+        featurizer_config['pad_to'] = -1
         featurizer_config_eval['pad_to'] = -1
-        
+
     print_once('model_config')
     print_dict(jasper_model_definition)
 
@@ -306,14 +337,6 @@ def main(args):
 
     model = Jasper(feature_config=featurizer_config, jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab))
 
-    if args.ckpt is not None:
-        print_once("loading model from {}".format(args.ckpt))
-        checkpoint = torch.load(args.ckpt, map_location="cpu")
-        model.load_state_dict(checkpoint['state_dict'], strict=True)
-        args.start_epoch = checkpoint['epoch']
-    else:
-        args.start_epoch = 0
-
     ctc_loss = CTCLossNM( num_classes=len(ctc_vocab))
     greedy_decoder = GreedyCTCDecoder()
 
@@ -346,19 +369,53 @@ def main(args):
                         weight_decay=args.weight_decay)
     else:
         raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))
-    if optim_level >= 0 and optim_level <=3:
+
+    if 0 < optim_level <= 3:
         model, optimizer = amp.initialize(
             min_loss_scale=1.0,
             models=model,
             optimizers=optimizer,
-            opt_level=AmpOptimizations[optim_level])
+            opt_level='O' + str(optim_level))
+
+    if args.ema > 0:
+        ema_model = copy.deepcopy(model)
+    else:
+        ema_model = None
+
+    model = model_multi_gpu(model, multi_gpu)
+
     if args.ckpt is not None:
+        print_once("loading model from {}".format(args.ckpt))
+        checkpoint = torch.load(args.ckpt, map_location="cpu")
+        if hasattr(model, 'module'):
+            model.module.load_state_dict(checkpoint['state_dict'], strict=True)
+        else:
+            model.load_state_dict(checkpoint['state_dict'], strict=True)
+
+        if args.ema > 0:
+            if 'ema_state_dict' in checkpoint:
+                if hasattr(ema_model, 'module'):
+                    ema_model.module.load_state_dict(checkpoint['ema_state_dict'], strict=True)
+                else:
+                    ema_model.load_state_dict(checkpoint['ema_state_dict'], strict=True)
+            else:
+                print_once('WARNING: ema_state_dict not found in the checkpoint')
+                print_once('WARNING: initializing EMA model with regular params')
+                if hasattr(ema_model, 'module'):
+                    ema_model.module.load_state_dict(checkpoint['state_dict'], strict=True)
+                else:
+                    ema_model.load_state_dict(checkpoint['state_dict'], strict=True)
+
         optimizer.load_state_dict(checkpoint['optimizer'])
 
-    model = model_multi_gpu(model, multi_gpu)
+        if optim_level > 0:
+            amp.load_state_dict(checkpoint['amp'])
 
+        args.start_epoch = checkpoint['epoch']
+    else:
+        args.start_epoch = 0
 
-    train(data_layer, data_layer_eval, model, \
+    train(data_layer, data_layer_eval, model, ema_model,
           ctc_loss=ctc_loss, \
           greedy_decoder=greedy_decoder, \
           optimizer=optimizer, \
@@ -368,6 +425,7 @@ def main(args):
           fn_lr_policy=fn_lr_policy if args.lr_decay else None, \
           args=args)
 
+
 def parse_args():
     parser = argparse.ArgumentParser(description='Jasper')
     parser.add_argument("--local_rank", default=None, type=int)
@@ -389,10 +447,11 @@ def parse_args():
     parser.add_argument("--dataset_dir", dest="dataset_dir", required=True, type=str, help='root dir of dataset')
     parser.add_argument("--lr_decay", action="store_true", default=False, help='use learning rate decay')
     parser.add_argument("--cudnn", action="store_true", default=False, help="enable cudnn benchmark")
-    parser.add_argument("--fp16", action="store_true", default=False, help="use mixed precision training")
+    parser.add_argument("--amp", "--fp16", action="store_true", default=False, help="use mixed precision training")
     parser.add_argument("--output_dir", type=str, required=True, help='saves results in this directory')
     parser.add_argument("--ckpt", default=None, type=str, help="if specified continues training from given checkpoint. Otherwise starts from beginning")
     parser.add_argument("--seed", default=42, type=int, help='seed')
+    parser.add_argument("--ema", type=float, default=0.0, help='discount factor for exponential averaging of model weights during training')
     args=parser.parse_args()
     return args
 
diff --git a/PyTorch/SpeechRecognition/Jasper/triton/Dockerfile b/PyTorch/SpeechRecognition/Jasper/triton/Dockerfile
new file mode 100644
index 000000000..48ed7700e
--- /dev/null
+++ b/PyTorch/SpeechRecognition/Jasper/triton/Dockerfile
@@ -0,0 +1,22 @@
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
+ARG TRITON_BASE_IMAGE=nvcr.io/nvidia/tritonserver:20.03.1-py3-clientsdk
+FROM ${TRITON_BASE_IMAGE} as triton
+FROM ${FROM_IMAGE_NAME}
+
+ADD requirements.txt .
+RUN pip install -r requirements.txt
+RUN pip install onnxruntime
+
+ADD triton/requirements.txt .
+RUN pip install -r requirements.txt
+
+ADD tensorrt/requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY --from=triton /opt/tritonserver/qa/pkgs/tensorrtserver-1.13.0-py3-none-linux_x86_64.whl ./tensorrtserver-1.13.0-py3-none-linux_x86_64.whl
+
+RUN pip install tensorrtserver-1.13.0-py3-none-linux_x86_64.whl
+
+WORKDIR /workspace/jasper
+COPY . .
+RUN pip install --no-cache-dir -e .
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/README.md b/PyTorch/SpeechRecognition/Jasper/triton/README.md
similarity index 82%
rename from PyTorch/SpeechRecognition/Jasper/trtis/README.md
rename to PyTorch/SpeechRecognition/Jasper/triton/README.md
index a9132262e..143d06ee0 100644
--- a/PyTorch/SpeechRecognition/Jasper/trtis/README.md
+++ b/PyTorch/SpeechRecognition/Jasper/triton/README.md
@@ -1,14 +1,14 @@
-# Jasper Inference Using TensorRT Inference Server
+# Jasper Inference Using Triton Inference Server
 
-This is a subfolder of the Jasper for PyTorch repository that provides scripts to deploy high-performance inference using NVIDIA TensorRT Inference Server offering different options for the inference model pipeline.
+This is a subfolder of the Jasper for PyTorch repository that provides scripts to deploy high-performance inference using NVIDIA Triton Inference Server (formerly NVIDIA TensorRT Inference Server). It offers different options for the inference model pipeline.
 
 
 ## Table Of Contents
 
 - [Model overview](#model-overview)
    * [Model architecture](#model-architecture)
-   * [TensorRT Inference Server Overview](#tensorrt-inference-server-overview)
-   * [Inference Pipeline in TensorRT Inference Server](#inference-pipeline-in-tensorrt-inference-server)
+   * [Triton Inference Server Overview](#triton-inference-server-overview)
+   * [Inference Pipeline in Triton Inference Server](#inference-pipeline-in-triton-inference-server)
 - [Setup](#setup)
   * [Supported Software](#supported-software)
   * [Requirements](#requirements)
@@ -16,9 +16,9 @@ This is a subfolder of the Jasper for PyTorch repository that provides scripts t
 - [Advanced](#advanced)
   * [Scripts and sample code](#scripts-and-sample-code)
 - [Performance](#performance)
-  * [Inference Benchmarking in TensorRT Inference Server](#inference-benchmarking-in-tensorrt-inference-server)
+  * [Inference Benchmarking in Triton Inference Server](#inference-benchmarking-in-triton-inference-server)
   * [Results](#results)
-    * [Performance analysis for TensorRT Inference Server: NVIDIA T4](#performance-analysis-for-tensorrt-inference-server-nvidia-t4)
+    * [Performance analysis for Triton Inference Server: NVIDIA T4](#performance-analysis-for-triton-inference-server-nvidia-t4)
 	* [Maximum Batch Size](#maximum-batch-size)
 	* [Batching techniques: Static versus Dynamic Batching](#batching-techniques-static-versus-dynamic-batching)
     	* [TensorRT/ONNX/PyTorch JIT comparisons](#tensorrt/onnx/pytorch-jit-comparisons)
@@ -42,12 +42,12 @@ More information on the Jasper model architecture can be found in the [Jasper Py
 
 
 
-### TensorRT Inference Server Overview
+### Triton Inference Server Overview
 
-The [NVIDIA TensorRT Inference Server](https://github.com/NVIDIA/tensorrt-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
-This folder contains detailed performance analysis as well as scripts to run Jasper inference using TensorRT Inference Server.
+The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
+This folder contains detailed performance analysis as well as scripts to run Jasper inference using Triton Inference Server.
 
-A typical TensorRT Inference Server pipeline can be broken down into the following steps:
+A typical Triton Inference Server pipeline can be broken down into the following steps:
 
 1. The client serializes the inference request into a message and sends it to the server (Client Send). 
 2. The message travels over the network from the client to the server (Network).
@@ -59,12 +59,12 @@ A typical TensorRT Inference Server pipeline can be broken down into the followi
 8. The completed message is deserialized by the client and processed as a completed inference request (Client Receive).
 
 Generally, for local clients, steps 1-4 and 6-8 will only occupy a small fraction of time, compared to steps 5-6. As backend deep learning systems like Jasper are rarely exposed directly to end users, but instead only interfacing with local front-end servers, for the sake of Jasper, we can consider that all clients are local.
-In this section, we will go over how to launch both the TensorRT Inference Server and the client and get the best performance solution that fits your specific application needs.
+In this section, we will go over how to launch both the Triton Inference Server and the client and get the best performance solution that fits your specific application needs.
 
 Note: The following instructions are run from outside the container and call `docker run` commands as required.
 
 
-## Inference Pipeline in TensorRT Inference Server
+## Inference Pipeline in Triton Inference Server
 
 The Jasper model pipeline consists of 3 components, where each part can be customized to be a different backend: 
 
@@ -88,9 +88,7 @@ To run a model with TensorRT, we first construct the model in PyTorch, which is
 |ONNX|-|x|-|
 |TensorRT|-|x|-|
 
-In order to run inference with TensorRT outside of the inference server, refer to the [Jasper TensorRT README](../trt/README.md).
-
-
+In order to run inference with TensorRT outside of the inference server, refer to the [Jasper TensorRT README](../tensorrt/README.md).
 
 
 ## Setup
@@ -107,15 +105,15 @@ The following software version configuration is supported has been tested.
 |CUDA|10.1.243|
 
 
-The following section lists the requirements in order to start inference with Jasper in TensorRT Inference Server.
+The following section lists the requirements in order to start inference with Jasper in Triton Inference Server.
 
 ### Requirements
 
 The repository contains a folder `./trtis/` with a `Dockerfile` which extends the PyTorch 19.09-py3 NGC container and encapsulates some dependencies. Ensure you have the following components:
 
 * [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
-* [PyTorch 19.09-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
-* [TensorRT Inference Server 19.09 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrtserver)
+* [PyTorch 20.06-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* [Triton Inference Server 20.06 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver)
 * Access to [NVIDIA machine learning repository](https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb) and [NVIDIA cuda repository](https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb) for NVIDIA TensorRT 6
 * [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
 * [Pretrained Jasper Model Checkpoint](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16)
@@ -134,7 +132,7 @@ Running the following scripts will build and launch the container containing all
     cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
     ```
 
-2. Build a container that extends NGC PyTorch 19.09, TensorRT, TensorRT Inference Server, and TensorRT Inference Client:
+2. Build a container that extends NGC PyTorch 19.09, TensorRT, Triton Inference Server, and Triton Inference Client:
 
     ```bash
     bash trtis/scripts/docker/build.sh
@@ -161,11 +159,11 @@ Running the following scripts will build and launch the container containing all
     bash trtis/scripts/docker/trtis.sh
     ```
 
-    The `/datasets`, `/checkpoints`, `/results` directories will be mounted as volumes and mapped to the corresponding directories `<DATA_DIR>`, `<CHECKPOINT_DIR>`, `<RESULT_DIR>` on the host. Note that `<DATA_DIR>`, `<CHECKPOINT_DIR>`, and `<RESULT_DIR>` directly correspond to the same arguments in `scripts/docker/launch.sh` and `trt/scripts/docker/launch.sh` mentioned in the [Jasper PyTorch README](../README.md) and [Jasper TensorRT README](../trt/README.md).
+    The `/datasets`, `/checkpoints`, `/results` directories will be mounted as volumes and mapped to the corresponding directories `<DATA_DIR>`, `<CHECKPOINT_DIR>`, `<RESULT_DIR>` on the host. Note that `<DATA_DIR>`, `<CHECKPOINT_DIR>`, and `<RESULT_DIR>` directly correspond to the same arguments in `scripts/docker/launch.sh` and `tensorrt/scripts/docker/launch.sh` mentioned in the [Jasper PyTorch README](../README.md) and [Jasper TensorRT README](../tensorrt/README.md).
 
-    Briefly, `<DATA_DIR>` should contain, or be prepared to contain a `LibriSpeech` sub-directory (created in [Acquiring Dataset](../trt/README.md)), `<CHECKPOINT_DIR>` should contain a PyTorch model checkpoint (`*.pt`) file obtained through training described in [Jasper PyTorch README](../README.md), and `<RESULT_DIR>` should be prepared to contain timing results and logs. Downloading `LibriSpeech` is not required for Inference in TensorRT Inference Server on a single .wav audio file. To do inference and evaluation on LibriSpeech, download the dataset following the instructions in the [Jasper TensorRT README](../README.md)
+    Briefly, `<DATA_DIR>` should contain, or be prepared to contain a `LibriSpeech` sub-directory (created in [Acquiring Dataset](../tensorrt/README.md)), `<CHECKPOINT_DIR>` should contain a PyTorch model checkpoint (`*.pt`) file obtained through training described in [Jasper PyTorch README](../README.md), and `<RESULT_DIR>` should be prepared to contain timing results and logs. Downloading `LibriSpeech` is not required for Inference in Triton Inference Server on a single .wav audio file. To do inference and evaluation on LibriSpeech, download the dataset following the instructions in the [Jasper TensorRT README](../tensorrt/README.md).
 
-4. Convert pretrained PyTorch model checkpoint into TensorRT Inference Server compatible model backends.
+4. Convert pretrained PyTorch model checkpoint into Triton Inference Server compatible model backends.
 
     From outside the container, run:
 
@@ -195,7 +193,7 @@ Running the following scripts will build and launch the container containing all
     bash trtis/scripts/prepare_model_repository.sh
     ```
 
-6. Launch TensorRT Inference Server.
+6. Launch Triton Inference Server.
 
     Start the server:
     ```bash
@@ -218,7 +216,7 @@ Running the following scripts will build and launch the container containing all
 
     Note: This can take several hours to complete due to the extensiveness of the benchmark. More details about the benchmark are found in the [Advanced](#advanced) section under [Performance](#performance).
 
-8. Run inference using the Client and TensorRT Inference Server.
+8. Run inference using the Client and Triton Inference Server.
 
     8.1 From outside the container, restart the server:
     ```bash
@@ -246,7 +244,7 @@ Running the following scripts will build and launch the container containing all
 
 ## Advanced
 
-The following sections provide greater details about the TensorRT Inference Server pipeline and inference analysis and benchmarking results.
+The following sections provide greater details about the Triton Inference Server pipeline and inference analysis and benchmarking results.
 
 
 ### Scripts and sample code
@@ -260,9 +258,9 @@ The `trtis/scripts/` directory has easy to use scripts to run supported function
 * `./docker/launch.sh`: launches container
 * `execute_all_perf_runs.sh`: runs all benchmarks using TRTIS perfclient calls `generate_perf_results.sh`
 * `export_model.sh`: from pretrained PyTorch checkpoint generates backends for every version of the model inference pipeline, calls `export_model_helper.sh`
-* `prepare_model_repository.sh`: copies model config files from `./model_repo/` to `./deploy/model_rep`o and creates links to generated model backends, setting up the model repository for TensorRT Inference Server
+* `prepare_model_repository.sh`: copies model config files from `./model_repo/` to `./deploy/model_rep`o and creates links to generated model backends, setting up the model repository for Triton Inference Server
 * `generate_perf_results.sh`: runs benchmark with perf-client for specific configuration and calls `run_perf_client.sh`
-* `run_server.sh`: launches TensorRT Inference Server
+* `run_server.sh`: launches Triton Inference Server
 * `run_client.sh`: launches client by using `jasper-client.py` to submit inference requests to server
 
 
@@ -270,69 +268,68 @@ The `trtis/scripts/` directory has easy to use scripts to run supported function
 
 ## Performance
 
-### Inference Benchmarking in TensorRT Inference Server
+### Inference Benchmarking in Triton Inference Server
 
 To benchmark the inference performance on either Volta or Turing GPU, run `bash trtis/scripts/execute_all_perf_runs.sh` according to [Quick-Start-Guide](#quick-start-guide) Step 7 and set `ARCH` according to the underlying hardware (`ARCH=70` for Volta and `ARCH=75` for Turing)
 
-By default, this script measures inference performance for all 3 model pipelines: PyTorch JIT  (‘pyt’) pipeline, ONNX (‘onnx’) pipeline, TensorRT(‘trt’) pipeline, both with fp32 and fp16 precision. Each of these pipelines is measured for different audio input lengths (2sec, 7sec, 16.7sec) and a range of different server batch sizes (up to 64). This takes place in `trtis/scripts/generate_perf_results.sh`. For a specific audio length and batch size static and dynamic batching comparison is performed. For benchmarking we used `MAX_SEQUENCE_LENGTH_FOR_ENGINE=1792` for inference model generation.
+By default, this script measures inference performance for all 3 model pipelines: PyTorch JIT  (‘pyt’) pipeline, ONNX (‘onnx’) pipeline, TensorRT(‘tensorrt’) pipeline, both with fp32 and fp16 precision. Each of these pipelines is measured for different audio input lengths (2sec, 7sec, 16.7sec) and a range of different server batch sizes (up to 64). This takes place in `trtis/scripts/generate_perf_results.sh`. For a specific audio length and batch size static and dynamic batching comparison is performed. For benchmarking we used `MAX_SEQUENCE_LENGTH_FOR_ENGINE=1792` for inference model generation.
 
 
 
 ### Results
 
 
-#### Performance Analysis for TensorRT Inference Server: NVIDIA T4
+#### Performance Analysis for Triton Inference Server: NVIDIA T4
 
 
 
 ### Results
 
 
-#### Performance Analysis for TensorRT Inference Server: NVIDIA T4
+#### Performance Analysis for Triton Inference Server: NVIDIA T4
 
 Based on the figure below, we recommend using the Dynamic Batcher with `max_batch_size=8`, `max_queue_delay_microseconds` as large as possible to fit within your latency window (the values used below are extremely large to exaggerate their effect). The largest improvements to both throughput and latency come from increasing the batch size due to efficiency gains in the GPU with larger batches. The Dynamic Batcher combines the best of both worlds by efficiently batching together a large number of concurrent requests, while also keeping latency down for infrequent requests. 
 
 All results below are obtained using the following configurations:
 * Single T4 16GB GPU on a local server
 * Jasper Large
-* Audio length = 7 seconds 
+* Audio length = 7 seconds
 * FP16 precision
 
 Latencies are indicated by bar plots using the left axis. Throughput is indicated by the blue line plot using the right axis. X-axis indicates the concurrency - the maximum number of inference requests that can be in the pipeline at any given time. For example, when the concurrency is set to 1, the client waits for an inference request to be completed (Step 8) before it sends another to the server (Step 1). A high number of concurrent requests can reduce the impact of network latency on overall throughput.
 
 
-<img src="../images/trtis_throughput_latency_summary.png" width="100%" height="100%"> 
+<img src="../images/triton_throughput_latency_summary.png" width="100%" height="100%">
 
-Figure 1: Latency vs Throughput for Jasper Large, FP16, Audio Length = 7sec using various configurations and all 3 model backends  available in TensorRT Inference Server. TensorRT is denoted as TRT, PyTorch as PyT.
+Figure 1: Latency vs Throughput for Jasper Large, FP16, Audio Length = 7sec using various configurations and all 3 model backends available in Triton Inference Server. TensorRT is denoted as TRT, PyTorch as PyT.
 
 
 ##### Maximum Batch Size
-In general, increasing batch size leads to higher throughput at the cost of higher latency. In the following sections, we analyze the results using the example of the TensorRT-pipeline. 
- 
+In general, increasing batch size leads to higher throughput at the cost of higher latency. In the following sections, we analyze the results using the example of the TensorRT pipeline.
+
 As we can see in Figure 2, the throughput at Batch Size=1, Client Concurrent Requests = 8 is 45 and in Figure 3, the throughput at Batch Size=8, Client Concurrent Requests = 1 is 101, giving a speedup of ~2.24x. 
 Note: We compare Batch Size=1, Client Concurrent Requests = 8 to Batch Size=8, Client Concurrent Requests = 1 to keep the Total Number of Outstanding Requests equal between the two different modes. Where Total Number of Outstanding Requests = Batch Size * Client Concurrent Requests. 
 Increasing the batch size by 8-fold from 1 to 8 results in an increase in compute time by only 2.42x (45ms to 109ms) showing that computation is more efficient at higher batch sizes. Hence, an optimal batch size would be the maximum batch size that can both fit in memory and is within the preferred latency threshold.
 
-<img src="../images/trtis_static_batching_bs1.png" width="80%" height="80%"> 
+<img src="../images/triton_static_batching_bs1.png" width="80%" height="80%"> 
 
-Figure 2: TensorRT pipeline - Latency & Throughput vs Concurrency using Static Batching at Batch size = 1
+Figure 2: TensorRT pipeline - Latency & Throughput vs Concurrency using Static Batching at Batch size = 1
 
-<img src="../images/trtis_static_batching_bs8.png" width="80%" height="80%"> 
+<img src="../images/triton_static_batching_bs8.png" width="80%" height="80%"> 
 
-Figure 3: TensorRT pipeline - Latency & Throughput vs Concurrency using Static Batching at Batch size = 8
+Figure 3: TensorRT pipeline - Latency & Throughput vs Concurrency using Static Batching at Batch size = 8
 
 ##### Batching techniques: Static versus Dynamic Batching
-In the following section, we analyze the results using the example of the TensorRT-pipeline.
+In the following section, we analyze the results using the example of the TensorRT pipeline.
 Static batching is a feature of the inference server that allows inference requests to be served as they are received. It is preferred in scenarios where low latency is desired at the cost of throughput when the GPU is under utilized.
-Dynamic batching is a feature of the inference server that allows inference requests to be combined by the server, so that a batch is created dynamically, resulting in an increased throughput. It is preferred in scenarios where we would like to maximize throughput and GPU utilization at the cost of higher latencies. You can set the Dynamic Batcher parameter `max_queue_delay_microseconds` to indicate the maximum amount of time you are willing to wait and `preferred_batch_size` to indicate your maximum server batch size in the TensorRT Inference Server model config. 
+Dynamic batching is a feature of the inference server that allows inference requests to be combined by the server, so that a batch is created dynamically, resulting in an increased throughput. It is preferred in scenarios where we would like to maximize throughput and GPU utilization at the cost of higher latencies. You can set the Dynamic Batcher parameter `max_queue_delay_microseconds` to indicate the maximum amount of time you are willing to wait and `preferred_batch_size` to indicate your maximum server batch size in the Triton Inference Server model config. 
 Figure 4 emphasizes the increase in overall throughput with dynamic batching. At low numbers of concurrent requests, the increased throughput comes at the cost of increasing latency as the requests are queued up to max_queue_delay_microseconds. The effect of preferred_batchsize for dynamic batching is visually depicted by the dip in Server Queue time at integer multiples of the preferred batch sizes. At higher numbers of concurrent requests, the throughput approaches a maximum limit as we saturate the GPU utilization.
 
-<img src="../images/trtis_dynamic_batching.png" width="80%" height="80%"> 
+<img src="../images/triton_dynamic_batching.png" width="80%" height="80%"> 
  
-Figure 4: TensorRT pipeline - Latency & Throughput vs Concurrency using dynamic Batching at client Batch size = 1, maximum server batch size=4, max_queue_delay_microseconds = 5000
+Figure 4: TensorRT pipeline - Latency & Throughput vs Concurrency using dynamic Batching at client Batch size = 1, maximum server batch size=4, max_queue_delay_microseconds = 5000
 
 
- 
 ##### TensorRT/ONNX/PyTorch JIT comparisons
 The following tables show inference and latency comparisons across all 3 backends for mixed precision and static batching. The main observations are:
 Increasing the batch size leads to higher inference throughput and latency up to a certain batch size, after which it slowly saturates.
@@ -342,7 +339,7 @@ The longer the audio length, the lower the throughput and the higher the latency
 
 ###### Throughput Comparison
 
-Following Table shows throughput benchmark results for all 3 model backends in TensorRT Inference Server using static batching under optimal concurrency
+The following table shows throughput benchmark results for all 3 model backends in Triton Inference Server using static batching under optimal concurrency.
 
 
 |Audio length in seconds|Batch Size|TensorRT (inf/s)|PyTorch (inf/s)|ONNX (inf/s)|TensorRT/PyTorch Speedup|TensorRT/Onnx Speedup|
@@ -362,7 +359,7 @@ Following Table shows throughput benchmark results for all 3 model backends in T
 
 ###### Latency Comparison
 
-Following Table shows throughput benchmark results for all 3 model backends in TensorRT Inference Server using static batching and a single concurrent request. 
+The following table shows latency benchmark results for all 3 model backends in Triton Inference Server using static batching and a single concurrent request.
 
 
 |Audio length in seconds|Batch Size|TensorRT (ms)|PyTorch (ms)|ONNX (ms)|TensorRT/PyTorch Speedup|TensorRT/Onnx Speedup|
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/jasper-client.py b/PyTorch/SpeechRecognition/Jasper/triton/jasper-client.py
similarity index 99%
rename from PyTorch/SpeechRecognition/Jasper/trtis/jasper-client.py
rename to PyTorch/SpeechRecognition/Jasper/triton/jasper-client.py
index 4e6152d5d..dfdf31715 100755
--- a/PyTorch/SpeechRecognition/Jasper/trtis/jasper-client.py
+++ b/PyTorch/SpeechRecognition/Jasper/triton/jasper-client.py
@@ -347,9 +347,9 @@ def audio_features_generator_with_buffer(input_filename,
         table = str.maketrans(punctuation, " " * len(punctuation))
 
         import json
-        if "./trtis" not in sys.path:
+        if "./triton" not in sys.path:
             sys.path.append("./")
-            sys.path.append("./trtis")
+            sys.path.append("./triton")
         from speech_utils import normalize_string, parse_transcript
         FLAGS.manifest_filename = FLAGS.manifest_filename.split(',')
         for manifest in FLAGS.manifest_filename:
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-decoder/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-decoder/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-decoder/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-decoder/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-feature-extractor/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-feature-extractor/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-feature-extractor/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-feature-extractor/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu-ensemble/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx-cpu-ensemble/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu-ensemble/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx-cpu-ensemble/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx-cpu/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-cpu/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx-cpu/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-ensemble/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx-ensemble/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx-ensemble/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx-ensemble/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-onnx/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-onnx/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt-ensemble/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-pyt-ensemble/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt-ensemble/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-pyt-ensemble/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-pyt/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-pyt/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-pyt/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt-ensemble/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-trt-ensemble/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt-ensemble/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-trt-ensemble/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt/config.pbtxt b/PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-trt/config.pbtxt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/model_repo/jasper-trt/config.pbtxt
rename to PyTorch/SpeechRecognition/Jasper/triton/model_repo/jasper-trt/config.pbtxt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/requirements.txt b/PyTorch/SpeechRecognition/Jasper/triton/requirements.txt
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/requirements.txt
rename to PyTorch/SpeechRecognition/Jasper/triton/requirements.txt
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/build.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/build.sh
similarity index 79%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/build.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/build.sh
index dc75e10f1..19008c810 100755
--- a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/build.sh
+++ b/PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/build.sh
@@ -4,5 +4,5 @@ SCRIPT_DIR=$(cd $(dirname $0); pwd)
 PROJECT_DIR=${SCRIPT_DIR}/../../../
 docker pull nvcr.io/nvidia/tensorrtserver:19.09-py3
 git submodule update --init --recursive
-docker build -t tensorrtserver_client -f ${PROJECT_DIR}/external/tensorrt-inference-server/Dockerfile.client ${PROJECT_DIR}/external/tensorrt-inference-server
+docker build -t tensorrtserver_client -f ${PROJECT_DIR}/external/triton-inference-server/Dockerfile.client ${PROJECT_DIR}/external/triton-inference-server
 docker build . --rm -f ${PROJECT_DIR}/trtis/Dockerfile -t jasper:trtis
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/launch.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/launch.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/docker/launch.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/docker/launch.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/execute_all_perf_runs.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/execute_all_perf_runs.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/execute_all_perf_runs.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/execute_all_perf_runs.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model_helper.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model_helper.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/export_model_helper.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/export_model_helper.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/generate_perf_results.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/generate_perf_results.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/generate_perf_results.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/generate_perf_results.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/prepare_model_repository.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/prepare_model_repository.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/prepare_model_repository.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/prepare_model_repository.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_client.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/run_client.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_client.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/run_client.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_perf_client.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/run_perf_client.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_perf_client.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/run_perf_client.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_server.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/run_server.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/run_server.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/run_server.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/scripts/wait_for_trtis_server.sh b/PyTorch/SpeechRecognition/Jasper/triton/scripts/wait_for_trtis_server.sh
similarity index 100%
rename from PyTorch/SpeechRecognition/Jasper/trtis/scripts/wait_for_trtis_server.sh
rename to PyTorch/SpeechRecognition/Jasper/triton/scripts/wait_for_trtis_server.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/speech_utils.py b/PyTorch/SpeechRecognition/Jasper/triton/speech_utils.py
similarity index 99%
rename from PyTorch/SpeechRecognition/Jasper/trtis/speech_utils.py
rename to PyTorch/SpeechRecognition/Jasper/triton/speech_utils.py
index 54278cc37..ac49bc3f6 100644
--- a/PyTorch/SpeechRecognition/Jasper/trtis/speech_utils.py
+++ b/PyTorch/SpeechRecognition/Jasper/triton/speech_utils.py
@@ -25,9 +25,9 @@
 from tensorrtserver.api import api_pb2
 from tensorrtserver.api import grpc_service_pb2
 from tensorrtserver.api import grpc_service_pb2_grpc
-if "./trtis" not in sys.path:
+if "./triton" not in sys.path:
     sys.path.append("./")
-    sys.path.append("./trtis")
+    sys.path.append("./triton")
 from parts.text import _clean_text
 
 WINDOWS_FNS = {"hanning": np.hanning, "hamming": np.hamming, "none": None}
diff --git a/PyTorch/SpeechRecognition/Jasper/trtis/Dockerfile b/PyTorch/SpeechRecognition/Jasper/trtis/Dockerfile
deleted file mode 100644
index 4a0b6cd7c..000000000
--- a/PyTorch/SpeechRecognition/Jasper/trtis/Dockerfile
+++ /dev/null
@@ -1,40 +0,0 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3 
-
-FROM tensorrtserver_client as trtis-client
-FROM ${FROM_IMAGE_NAME}
-RUN apt-get update && apt-get install -y python3
-ARG version=6.0.1-1+cuda10.1
-RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.1.243-1_amd64.deb \
-&& dpkg -i cuda-repo-*.deb \
-&& wget https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb \
-&& dpkg -i nvidia-machine-learning-repo-*.deb \
-&& apt-get update \
-&& apt-get install -y --no-install-recommends libnvinfer6=${version} libnvonnxparsers6=${version} libnvparsers6=${version} libnvinfer-plugin6=${version} libnvinfer-dev=${version} libnvonnxparsers-dev=${version} libnvparsers-dev=${version} libnvinfer-plugin-dev=${version} python-libnvinfer=${version} python3-libnvinfer=${version}
-RUN cp -r /usr/lib/python3.6/dist-packages/tensorrt /opt/conda/lib/python3.6/site-packages/tensorrt
-
-
-ENV PATH=$PATH:/usr/src/tensorrt/bin
-WORKDIR /tmp/onnx-trt
-COPY trt/onnx-trt.patch .
-RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && git checkout  b677b9cbf19af803fa6f76d05ce558e657e4d8b6  && git submodule update --init --recursive && \
-    patch -f < ../onnx-trt.patch && mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DGPU_ARCHS="60 70 75" && make -j16 && make install && mv -f /usr/lib/libnvonnx* /usr/lib/x86_64-linux-gnu/ && ldconfig
-
-
-# Here's a good place to install pip reqs from JoC repo.
-# At the same step, also install TRT pip reqs
-WORKDIR /tmp/pipReqs
-COPY requirements.txt /tmp/pipReqs/pytRequirements.txt
-COPY trt/requirements.txt /tmp/pipReqs/trtRequirements.txt
-COPY trtis/requirements.txt /tmp/pipReqs/trtisRequirements.txt
-RUN apt-get update && apt-get install -y --no-install-recommends portaudio19-dev && pip install -r pytRequirements.txt && pip install -r trtRequirements.txt && pip install -r trtisRequirements.txt
-
-
-#Copy the perf_client over
-COPY --from=trtis-client /workspace/install/bin/perf_client /workspace/install/bin/perf_client
-#Copy the python wheel and install with pip
-COPY --from=trtis-client /workspace/install/python/tensorrtserver*.whl /tmp/
-RUN pip install /tmp/tensorrtserver*.whl && rm /tmp/tensorrtserver*.whl
-
-WORKDIR /workspace/jasper
-COPY . .
-