Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions PyTorch/SpeechRecognition/Jasper/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@ __pycache__
results/
datasets/
checkpoints/

*.swp
*.swo
*.swn
2 changes: 1 addition & 1 deletion PyTorch/SpeechRecognition/Jasper/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
FROM ${FROM_IMAGE_NAME}


Expand Down
531 changes: 277 additions & 254 deletions PyTorch/SpeechRecognition/Jasper/README.md

Large diffs are not rendered by default.

5 changes: 0 additions & 5 deletions PyTorch/SpeechRecognition/Jasper/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,6 @@
from metrics import word_error_rate





AmpOptimizations = ["O0", "O1", "O2", "O3"]

def print_once(msg):
if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
print(msg)
Expand Down
140 changes: 75 additions & 65 deletions PyTorch/SpeechRecognition/Jasper/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import math
import toml
from dataset import AudioToTextDataLayer
from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, AmpOptimizations, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
from helpers import process_evaluation_batch, process_evaluation_epoch, add_ctc_labels, print_dict, model_multi_gpu, __ctc_decoder_predictions_tensor
from model import AudioPreprocessing, GreedyCTCDecoder, JasperEncoderDecoder
from parts.features import audio_from_file
import torch
Expand All @@ -46,21 +46,21 @@ def parse_args():
parser.add_argument("--ckpt", default=None, type=str, required=True, help='path to model checkpoint')
parser.add_argument("--max_duration", default=None, type=float, help='maximum duration of sequences. if None uses attribute from model configuration file')
parser.add_argument("--pad_to", default=None, type=int, help="default is pad to value as specified in model configurations. if -1 pad to maximum duration. If > 0 pad batch to next multiple of value")
parser.add_argument("--fp16", action='store_true', help='use half precision')
parser.add_argument("--pyt_fp16", action='store_true', help='use half precision')
parser.add_argument("--amp", "--fp16", action='store_true', help='use half precision')
parser.add_argument("--cudnn_benchmark", action='store_true', help="enable cudnn benchmark")
parser.add_argument("--save_prediction", type=str, default=None, help="if specified saves predictions in text form at this location")
parser.add_argument("--logits_save_to", default=None, type=str, help="if specified will save logits to path")
parser.add_argument("--seed", default=42, type=int, help='seed')
parser.add_argument("--masked_fill", type="bool", help="Overrides the masked_fill option for the Encoder")
parser.add_argument("--output_dir", default="results/", type=str, help="Output directory to store exported models. Only used if --export_model is used")
parser.add_argument("--export_model", action='store_true', help="Exports the audio_featurizer, encoder and decoder using torch.jit to the output_dir")
parser.add_argument("--wav", type=str, help='absolute path to .wav file (16KHz)')
parser.add_argument("--cpu", action="store_true", help="Run inference on CPU")
parser.add_argument("--ema", action="store_true", help="If available, load EMA model weights")
return parser.parse_args()

def calc_wer(data_layer, audio_processor,
encoderdecoder, greedy_decoder,
labels, args):
def calc_wer(data_layer, audio_processor,
encoderdecoder, greedy_decoder,
labels, args, device):

encoderdecoder = encoderdecoder.module if hasattr(encoderdecoder, 'module') else encoderdecoder
with torch.no_grad():
Expand All @@ -74,16 +74,14 @@ def calc_wer(data_layer, audio_processor,
# Evaluation mini-batch for loop
for it, data in enumerate(tqdm(data_layer.data_iterator)):

tensors = []
for d in data:
tensors.append(d.cuda())
tensors = [t.to(device) for t in data]

t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e)

t_processed_signal = audio_processor(t_audio_signal_e, t_a_sig_length_e)
t_log_probs_e, _ = encoderdecoder.infer(t_processed_signal)
t_predictions_e = greedy_decoder(t_log_probs_e)

values_dict = dict(
predictions=[t_predictions_e],
transcript=[t_transcript_e],
Expand All @@ -92,7 +90,7 @@ def calc_wer(data_layer, audio_processor,
)
# values_dict will contain results from all workers
process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

if args.steps is not None and it + 1 >= args.steps:
break

Expand All @@ -102,18 +100,13 @@ def calc_wer(data_layer, audio_processor,
return wer, _global_var_dict


def jit_export(
audio, audio_len,
audio_processor,
encoderdecoder,
greedy_decoder,
args):
def jit_export(audio, audio_len, audio_processor, encoderdecoder, greedy_decoder, args):

print("##############")

module_name = "{}_{}".format(os.path.basename(args.model_toml), "fp16" if args.fp16 else "fp32")
module_name = "{}_{}".format(os.path.basename(args.model_toml), "fp16" if args.amp else "fp32")

if args.masked_fill is not None and args.masked_fill == False:
if args.use_conv_mask:
module_name = module_name + "_noMaskConv"

# Export just the featurizer
Expand All @@ -137,12 +130,18 @@ def jit_export(

return traced_module_feat, traced_module_acoustic, traced_module_decode

def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels):
features = audio_processor(audio, audio_len)
torch.cuda.synchronize()
def run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels, device):
features, lens = audio_processor(audio, audio_len)
if not device.type == 'cpu':
torch.cuda.synchronize()
t0 = time.perf_counter()
t_log_probs_e = encoderdecoder(features[0])
torch.cuda.synchronize()
# TorchScripted model does not support (features, lengths)
if isinstance(encoderdecoder, torch.jit.TracedModule):
t_log_probs_e = encoderdecoder(features)
else:
t_log_probs_e, _ = encoderdecoder.infer((features, lens))
if not device.type == 'cpu':
torch.cuda.synchronize()
t1 = time.perf_counter()
t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
hypotheses = __ctc_decoder_predictions_tensor(t_predictions_e, labels=labels)
Expand All @@ -157,6 +156,7 @@ def eval(
greedy_decoder,
labels,
multi_gpu,
device,
args):
"""performs inference / evaluation
Args:
Expand All @@ -169,21 +169,19 @@ def eval(
args: script input arguments
"""
logits_save_to=args.logits_save_to

with torch.no_grad():
if args.wav:
audio, audio_len = audio_from_file(args.wav)
run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels)
run_once(audio_processor, encoderdecoder, greedy_decoder, audio, audio_len, labels, device)
if args.export_model:
jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(audio, audio_len, audio_processor,
encoderdecoder,
greedy_decoder,args)
run_once(jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder, audio, audio_len, labels)
jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder = jit_export(audio, audio_len, audio_processor, encoderdecoder,greedy_decoder,args)
run_once(jit_audio_processor, jit_encoderdecoder, jit_greedy_decoder, audio, audio_len, labels, device)
return
wer, _global_var_dict = calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args)
wer, _global_var_dict = calc_wer(data_layer, audio_processor, encoderdecoder, greedy_decoder, labels, args, device)
if (not multi_gpu or (multi_gpu and torch.distributed.get_rank() == 0)):
print("==========>>>>>>Evaluation WER: {0}\n".format(wer))

if args.save_prediction is not None:
with open(args.save_prediction, 'w') as fp:
fp.write('\n'.join(_global_var_dict['predictions']))
Expand All @@ -203,26 +201,29 @@ def eval(
# print("===>>>Diff : {0} %".format((wer_after - wer_before) * 100.0 / wer_before))
# print("")


def main(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.benchmark = args.cudnn_benchmark
print("CUDNN BENCHMARK ", args.cudnn_benchmark)
assert(torch.cuda.is_available())

if args.local_rank is not None:
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
multi_gpu = args.local_rank is not None
if multi_gpu:
print("DISTRIBUTED with ", torch.distributed.get_world_size())

if args.fp16:
optim_level = 3
if args.cpu:
assert(not multi_gpu)
device = torch.device('cpu')
else:
optim_level = 0
assert(torch.cuda.is_available())
device = torch.device('cuda')
torch.backends.cudnn.benchmark = args.cudnn_benchmark
print("CUDNN BENCHMARK ", args.cudnn_benchmark)

if multi_gpu:
print("DISTRIBUTED with ", torch.distributed.get_world_size())
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')

optim_level = 3 if args.amp else 0

jasper_model_definition = toml.load(args.model_toml)
dataset_vocab = jasper_model_definition['labels']['labels']
Expand All @@ -231,32 +232,32 @@ def main(args):
val_manifest = args.val_manifest
featurizer_config = jasper_model_definition['input_eval']
featurizer_config["optimization_level"] = optim_level
featurizer_config["fp16"] = args.fp16
args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
featurizer_config["fp16"] = args.amp

if args.masked_fill is not None:
print("{} masked_fill".format("Enabling" if args.masked_fill else "Disabling"))
jasper_model_definition["encoder"]["conv_mask"] = args.masked_fill
args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)
if args.use_conv_mask and args.export_model:
print('WARNING: Masked convs currently not supported for TorchScript. Disabling.')
jasper_model_definition['encoder']['convmask'] = False

if args.max_duration is not None:
featurizer_config['max_duration'] = args.max_duration
if args.pad_to is not None:
featurizer_config['pad_to'] = args.pad_to
featurizer_config['pad_to'] = args.pad_to

if featurizer_config['pad_to'] == "max":
featurizer_config['pad_to'] = -1

print('=== model_config ===')
print_dict(jasper_model_definition)
print()
print('=== feature_config ===')
print_dict(featurizer_config)
print()
data_layer = None

if args.wav is None:
data_layer = AudioToTextDataLayer(
dataset_dir=args.dataset_dir,
dataset_dir=args.dataset_dir,
featurizer_config=featurizer_config,
manifest_filepath=val_manifest,
labels=dataset_vocab,
Expand All @@ -274,10 +275,16 @@ def main(args):
exit(0)
else:
checkpoint = torch.load(args.ckpt, map_location="cpu")
if args.ema and 'ema_state_dict' in checkpoint:
print('Loading EMA state dict')
sd = 'ema_state_dict'
else:
sd = 'state_dict'

for k in audio_preprocessor.state_dict().keys():
checkpoint['state_dict'][k] = checkpoint['state_dict'].pop("audio_preprocessor." + k)
audio_preprocessor.load_state_dict(checkpoint['state_dict'], strict=False)
encoderdecoder.load_state_dict(checkpoint['state_dict'], strict=False)
checkpoint[sd][k] = checkpoint[sd].pop("audio_preprocessor." + k)
audio_preprocessor.load_state_dict(checkpoint[sd], strict=False)
encoderdecoder.load_state_dict(checkpoint[sd], strict=False)

greedy_decoder = GreedyCTCDecoder()

Expand All @@ -298,24 +305,27 @@ def main(args):
print('-----------------')

print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)
audio_preprocessor.cuda()
encoderdecoder.cuda()
if args.fp16:
encoderdecoder = amp.initialize( models=encoderdecoder,
opt_level=AmpOptimizations[optim_level])

audio_preprocessor.to(device)
encoderdecoder.to(device)

if args.amp:
encoderdecoder = amp.initialize(models=encoderdecoder,
opt_level='O'+str(optim_level))

encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
audio_preprocessor.eval()
encoderdecoder.eval()
greedy_decoder.eval()

eval(
data_layer=data_layer,
audio_processor=audio_preprocessor,
encoderdecoder=encoderdecoder,
greedy_decoder=greedy_decoder,
labels=ctc_vocab,
args=args,
device=device,
multi_gpu=multi_gpu)

if __name__=="__main__":
Expand Down
Loading