NVIDIA · nv-kkudrynski · Jul 7, 2020 · Jul 7, 2020
diff --git a/PyTorch/SpeechRecognition/Jasper/.gitignore b/PyTorch/SpeechRecognition/Jasper/.gitignore
@@ -3,3 +3,7 @@ __pycache__
 results/
 datasets/
 checkpoints/
+
+*.swp
+*.swo
+*.swn
diff --git a/PyTorch/SpeechRecognition/Jasper/Dockerfile b/PyTorch/SpeechRecognition/Jasper/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3  
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
 FROM ${FROM_IMAGE_NAME}
 
 

diff --git a/PyTorch/SpeechRecognition/Jasper/README.md b/PyTorch/SpeechRecognition/Jasper/README.md
diff --git a/PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server b/PyTorch/SpeechRecognition/Jasper/external/tensorrt-inference-server
diff --git a/PyTorch/SpeechRecognition/Jasper/helpers.py b/PyTorch/SpeechRecognition/Jasper/helpers.py
@@ -19,11 +19,6 @@
 from metrics import word_error_rate
 
 
-
-
-
-AmpOptimizations = ["O0", "O1", "O2", "O3"]
-
 def print_once(msg):
     if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
         print(msg)

diff --git a/PyTorch/SpeechRecognition/Jasper/inference.py b/PyTorch/SpeechRecognition/Jasper/inference.py
@@ -19,7 +19,7 @@
 import math
 import toml
 from dataset import AudioToTextDataLayer
-from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, AmpOptimizations, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
+from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
 from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder
 from parts.features import audio_from_file
 import torch
@@ -46,21 +46,21 @@ def parse_args():
     parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint')
     parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file')
     parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. If > 0 pad batch to next multiple of value")
-    parser.add_argument("--fp16", action='store_true', help='use half precision')
-    parser.add_argument("--pyt_fp16", action='store_true', help='use half precision')
+    parser.add_argument("--amp", "--fp16", action='store_true', help='use half precision')
     parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark")
     parser.add_argument("--save_prediction", type=str, default=None, help="if specified saves predictions in text form at this location")
     parser.add_argument("--logits_save_to", default=None, type=str, help="if specified will save logits to path")
     parser.add_argument("--seed", default=42, type=int, help='seed')
-    parser.add_argument("--masked_fill", type="bool", help="Overrides the masked_fill option for the Encoder")
     parser.add_argument("--output_dir", default="results/", type=str, help="Output directory to store exported models. Only used if --export_model is used")
     parser.add_argument("--export_model", action='store_true', help="Exports the audio_featurizer, encoder and decoder using torch.jit to the output_dir")
     parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)')
+    parser.add_argument("--cpu", action="store_true", help="Run inference on CPU")
+    parser.add_argument("--ema", action="store_true", help="If available, load EMA model weights")
     return parser.parse_args()
 
-def calc_wer(data_layer, audio_processor, 
-             encoderdecoder, greedy_decoder, 
-             labels, args):
+def calc_wer(data_layer, audio_processor,
+             encoderdecoder, greedy_decoder,
+             labels, args, device):
 
     encoderdecoder = encoderdecoder.module if hasattr(encoderdecoder, 'module') else encoderdecoder
     with torch.no_grad():
@@ -74,16 +74,14 @@ def calc_wer(data_layer, audio_processor,
         # Evaluation mini-batch for loop
         for it, data in enumerate(tqdm(data_layer.data_iterator)):
 
-            tensors = []
-            for d in data:
-                tensors.append(d.cuda())
+            tensors = [t.to(device) for t in data]
 
             t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
-    
-            t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e) 
+
+            t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e)
             t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
             t_predictions_e = greedy_decoder(t_log_probs_e)
-    
+
             values_dict = dict(
                 predictions=[t_predictions_e],
                 transcript=[t_transcript_e],
@@ -92,7 +90,7 @@ def calc_wer(data_layer, audio_processor,
             )
             # values_dict will contain results from all workers
             process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
-    
+
             if args.steps is not None and it + 1 >= args.steps:
                 break
 
@@ -102,18 +100,13 @@ def calc_wer(data_layer, audio_processor,
         return wer, _global_var_dict
 
 
-def jit_export(
-         audio, audio_len,
-         audio_processor,
-         encoderdecoder,
-         greedy_decoder,
-         args):
+def jit_export(audio, audio_len, audio_processor, encoderdecoder, greedy_decoder, args):
 
                 print("##############")
 
-                module_name = "{}_{}".format(os.path.basename(args.model_toml), "fp16" if args.fp16 else "fp32")
+                module_name = "{}_{}".format(os.path.basename(args.model_toml), "fp16" if args.amp else "fp32")
 
-                if args.masked_fill is not None and args.masked_fill == False:
+                if args.use_conv_mask:
                     module_name = module_name + "_noMaskConv"
 
                 # Export just the featurizer
@@ -137,12 +130,18 @@ def jit_export(
 
                 return traced_module_feat, traced_module_acoustic, traced_module_decode
 
-def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels):
-            features = audio_processor(audio, audio_len)
-            torch.cuda.synchronize()
+def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels, device):
+            features, lens = audio_processor(audio, audio_len)
+            if not device.type == 'cpu':
+                torch.cuda.synchronize()
             t0 = time.perf_counter()
-            t_log_probs_e = encoderdecoder(features[0])
-            torch.cuda.synchronize()
+            # TorchScripted model does not support (features, lengths)
+            if isinstance(encoderdecoder, torch.jit.TracedModule):
+                t_log_probs_e = encoderdecoder(features)
+            else:
+                t_log_probs_e, _ = encoderdecoder.infer((features, lens))
+            if not device.type == 'cpu':
+                torch.cuda.synchronize()
             t1 = time.perf_counter()
             t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
             hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
@@ -157,6 +156,7 @@ def eval(
          greedy_decoder,
          labels,
          multi_gpu,
+         device,
          args):
     """performs inference / evaluation
     Args:
@@ -169,21 +169,19 @@ def eval(
         args: script input arguments
     """
     logits_save_to=args.logits_save_to
-    
+
     with torch.no_grad():
         if args.wav:
             audio, audio_len = audio_from_file(args.wav)
-            run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels)
+            run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels, device)
             if args.export_model:
-                jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(audio, audio_len, audio_processor,
-                                                                                         encoderdecoder,
-                                                                                         greedy_decoder,args)
-            run_once(jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder, audio, audio_len, labels)
+                jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(audio, audio_len, audio_processor, encoderdecoder,greedy_decoder,args)
+                run_once(jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder, audio, audio_len, labels, device)
             return
-        wer, _global_var_dict = calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args)
+        wer, _global_var_dict = calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args, device)
         if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
             print("==========>>>>>>Evaluation WER: {0}\n".format(wer))
-      
+
             if args.save_prediction is not None:
                 with open(args.save_prediction, 'w') as fp:
                     fp.write('\n'.join(_global_var_dict['predictions']))
@@ -203,26 +201,29 @@ def eval(
             #     print("===>>>Diff      : {0} %".format((wer_after - wer_before) * 100.0 / wer_before))
             #     print("")
 
-                
+
 def main(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    torch.backends.cudnn.benchmark = args.cudnn_benchmark
-    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
-    assert(torch.cuda.is_available())
 
-    if args.local_rank is not None:
-        torch.cuda.set_device(args.local_rank)
-        torch.distributed.init_process_group(backend='nccl', init_method='env://')
     multi_gpu = args.local_rank is not None
-    if multi_gpu:
-        print("DISTRIBUTED with ", torch.distributed.get_world_size())
 
-    if args.fp16:
-        optim_level = 3
+    if args.cpu:
+        assert(not multi_gpu)
+        device = torch.device('cpu')
     else:
-        optim_level = 0
+        assert(torch.cuda.is_available())
+        device = torch.device('cuda')
+        torch.backends.cudnn.benchmark = args.cudnn_benchmark
+        print("CUDNN BENCHMARK ", args.cudnn_benchmark)
+
+        if multi_gpu:
+            print("DISTRIBUTED with ", torch.distributed.get_world_size())
+            torch.cuda.set_device(args.local_rank)
+            torch.distributed.init_process_group(backend='nccl', init_method='env://')
+
+    optim_level = 3 if args.amp else 0
 
     jasper_model_definition = toml.load(args.model_toml)
     dataset_vocab = jasper_model_definition['labels']['labels']
@@ -231,32 +232,32 @@ def main(args):
     val_manifest = args.val_manifest
     featurizer_config = jasper_model_definition['input_eval']
     featurizer_config["optimization_level"] = optim_level
-    featurizer_config["fp16"] = args.fp16
-    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
+    featurizer_config["fp16"] = args.amp
 
-    if args.masked_fill is not None:
-        print("{} masked_fill".format("Enabling" if args.masked_fill else "Disabling"))
-        jasper_model_definition["encoder"]["conv_mask"] = args.masked_fill
+    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
+    if args.use_conv_mask and args.export_model:
+        print('WARNING: Masked convs currently not supported for TorchScript. Disabling.')
+        jasper_model_definition['encoder']['convmask'] = False
 
     if args.max_duration is not None:
         featurizer_config['max_duration'] = args.max_duration
     if args.pad_to is not None:
-        featurizer_config['pad_to'] = args.pad_to 
+        featurizer_config['pad_to'] = args.pad_to
 
     if featurizer_config['pad_to'] == "max":
         featurizer_config['pad_to'] = -1
-        
+
     print('=== model_config ===')
     print_dict(jasper_model_definition)
     print()
     print('=== feature_config ===')
     print_dict(featurizer_config)
     print()
     data_layer = None
-    
+
     if args.wav is None:
         data_layer = AudioToTextDataLayer(
-            dataset_dir=args.dataset_dir, 
+            dataset_dir=args.dataset_dir,
             featurizer_config=featurizer_config,
             manifest_filepath=val_manifest,
             labels=dataset_vocab,
@@ -274,10 +275,16 @@ def main(args):
             exit(0)
         else:
             checkpoint = torch.load(args.ckpt, map_location="cpu")
+            if args.ema and 'ema_state_dict' in checkpoint:
+                print('Loading EMA state dict')
+                sd = 'ema_state_dict'
+            else:
+                sd = 'state_dict'
+
             for k in audio_preprocessor.state_dict().keys():
-                checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." + k)
-            audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False)
-            encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False)
+                checkpoint[sd][k] = checkpoint[sd].pop("audio_preprocessor." + k)
+            audio_preprocessor.load_state_dict(checkpoint[sd], strict=False)
+            encoderdecoder.load_state_dict(checkpoint[sd], strict=False)
 
     greedy_decoder = GreedyCTCDecoder()
 
@@ -298,24 +305,27 @@ def main(args):
             print('-----------------')
 
     print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)
-    audio_preprocessor.cuda()
-    encoderdecoder.cuda()
-    if args.fp16:
-        encoderdecoder = amp.initialize( models=encoderdecoder,
-                                         opt_level=AmpOptimizations[optim_level])
+
+    audio_preprocessor.to(device)
+    encoderdecoder.to(device)
+
+    if args.amp:
+        encoderdecoder = amp.initialize(models=encoderdecoder,
+                                        opt_level='O'+str(optim_level))
 
     encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
     audio_preprocessor.eval()
     encoderdecoder.eval()
     greedy_decoder.eval()
-    
+
     eval(
         data_layer=data_layer,
         audio_processor=audio_preprocessor,
         encoderdecoder=encoderdecoder,
         greedy_decoder=greedy_decoder,
         labels=ctc_vocab,
         args=args,
+        device=device,
         multi_gpu=multi_gpu)
 
 if __name__=="__main__":