Skip to content

Commit 49acab4

Browse files
committed
torch.hub entrypoints for tacotron2, waveglow and ncf
1 parent a9d0554 commit 49acab4

7 files changed

Lines changed: 234 additions & 24 deletions

File tree

PyTorch/Recommendation/NCF/neumf.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@
3232
import torch
3333
import torch.nn as nn
3434

35+
import sys
36+
from os.path import abspath, join, dirname
37+
# enable module discovery when this file is loaded from the global entry point
38+
sys.path.append(abspath(dirname(__file__)+'/'))
39+
3540
from logger.logger import LOGGER
3641
from logger import tags
3742

PyTorch/SpeechSynthesis/Tacotron2/inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def main():
190190
if args.tacotron2:
191191
tacotron2_t0 = time.time()
192192
with torch.no_grad():
193-
_, mel, _, _ = tacotron2.inference(sequence)
193+
_, mel, _, _ = tacotron2.infer(sequence)
194194
tacotron2_t1 = time.time()
195195
tacotron2_infer_perf = sequence.size(1)/(tacotron2_t1-tacotron2_t0)
196196
LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf)

PyTorch/SpeechSynthesis/Tacotron2/inference_perf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def main():
141141

142142
t0 = time.time()
143143
with torch.no_grad():
144-
_, mels, _, _ = model.inference(text_padded)
144+
_, mels, _, _ = model.infer(text_padded)
145145
t1 = time.time()
146146
inference_time= t1 - t0
147147
num_items = text_padded.size(0)*text_padded.size(1)

PyTorch/SpeechSynthesis/Tacotron2/models.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,22 +27,22 @@
2727

2828
from tacotron2.model import Tacotron2
2929
from waveglow.model import WaveGlow
30-
from tacotron2.arg_parser import parse_tacotron2_args
31-
from waveglow.arg_parser import parse_waveglow_args
3230
import torch
3331

3432

3533
def parse_model_args(model_name, parser, add_help=False):
    """Dispatch to the argument parser of the requested model.

    The model-specific parser modules are imported lazily, inside the
    matching branch, so that importing this module does not pull them in.

    :param model_name: either 'Tacotron2' or 'WaveGlow'
    :param parser: parser object handed over to the model-specific function
    :param add_help: forwarded unchanged to the model-specific function
    :return: whatever the model-specific parse function returns
    :raises NotImplementedError: for any other model name
    """
    if model_name == 'Tacotron2':
        from tacotron2.arg_parser import parse_tacotron2_args
        result = parse_tacotron2_args(parser, add_help)
    elif model_name == 'WaveGlow':
        from waveglow.arg_parser import parse_waveglow_args
        result = parse_waveglow_args(parser, add_help)
    else:
        raise NotImplementedError(model_name)
    return result
4242

4343

4444
def batchnorm_to_float(module):
45-
"""Converts LSTMCells to FP32"""
45+
"""Converts batch norm to FP32"""
4646
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
4747
module.float()
4848
for child in module.children():
@@ -51,7 +51,7 @@ def batchnorm_to_float(module):
5151

5252

5353
def lstmcell_to_float(module):
54-
"""Converts batch norm modules to FP32"""
54+
"""Converts LSTMCells modules to FP32"""
5555
if isinstance(module, torch.nn.LSTMCell):
5656
module.float()
5757
for child in module.children():

PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
from torch.autograd import Variable
3131
from torch import nn
3232
from torch.nn import functional as F
33+
import sys
34+
from os.path import abspath, dirname
35+
# enable module discovery when this file is loaded from the global entry point
36+
sys.path.append(abspath(dirname(__file__)+'/../'))
3337
from common.layers import ConvNorm, LinearNorm
3438
from common.utils import to_gpu, get_mask_from_lengths
3539

@@ -375,7 +379,7 @@ def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
375379

376380
return mel_outputs, gate_outputs, alignments
377381

378-
def decode(self, decoder_input, is_infer=False):
382+
def decode(self, decoder_input):
379383
""" Decoder step using stored states, attention and memory
380384
PARAMS
381385
------
@@ -390,12 +394,8 @@ def decode(self, decoder_input, is_infer=False):
390394
cell_input = torch.cat((decoder_input, self.attention_context), -1)
391395
attention_hidden_dtype = self.attention_hidden.dtype
392396

393-
if is_infer:
394-
self.attention_hidden, self.attention_cell = self.attention_rnn(
395-
cell_input, (self.attention_hidden, self.attention_cell))
396-
else:
397-
self.attention_hidden, self.attention_cell = self.attention_rnn(
398-
cell_input.float(), (self.attention_hidden.float(), self.attention_cell.float()))
397+
self.attention_hidden, self.attention_cell = self.attention_rnn(
398+
cell_input.float(), (self.attention_hidden.float(), self.attention_cell.float()))
399399

400400
self.attention_hidden = F.dropout(
401401
self.attention_hidden, self.p_attention_dropout, self.training)
@@ -418,13 +418,9 @@ def decode(self, decoder_input, is_infer=False):
418418
(self.attention_hidden, self.attention_context), -1)
419419
decoder_hidden_dtype = self.decoder_hidden.dtype
420420

421-
if is_infer:
422-
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
423-
decoder_input, (self.decoder_hidden, self.decoder_cell))
424-
else:
425-
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
426-
decoder_input.float(), (self.decoder_hidden.float(), self.decoder_cell.float()))
427-
421+
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
422+
decoder_input.float(), (self.decoder_hidden.float(), self.decoder_cell.float()))
423+
428424
self.decoder_hidden = F.dropout(
429425
self.decoder_hidden, self.p_decoder_dropout, self.training)
430426

@@ -539,7 +535,7 @@ def inference(self, memory):
539535
mel_outputs, gate_outputs, alignments = [], [], []
540536
while True:
541537
decoder_input = self.prenet(decoder_input)
542-
mel_output, gate_output, alignment = self.decode(decoder_input, is_infer=True)
538+
mel_output, gate_output, alignment = self.decode(decoder_input)
543539

544540
mel_outputs += [mel_output.squeeze(1)]
545541
gate_outputs += [gate_output]
@@ -647,7 +643,7 @@ def forward(self, inputs):
647643
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
648644
output_lengths)
649645

650-
def inference(self, inputs):
646+
def infer(self, inputs):
651647
inputs = self.parse_input(inputs)
652648
embedded_inputs = self.embedding(inputs).transpose(1, 2)
653649
encoder_outputs = self.encoder.inference(embedded_inputs)

PyTorch/SpeechSynthesis/Tacotron2/train.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def save_sample(model_name, model, waveglow_path, tacotron2_path, phrase_path, f
240240
'WaveGlow', checkpoint['config'], to_fp16=False, to_cuda=False)
241241
waveglow.eval()
242242
model.eval()
243-
mel = model.inference(phrase.cuda())[0].cpu()
243+
mel = model.infer(phrase.cuda())[0].cpu()
244244
model.train()
245245
if fp16:
246246
mel = mel.float()
@@ -254,7 +254,7 @@ def save_sample(model_name, model, waveglow_path, tacotron2_path, phrase_path, f
254254
tacotron2 = models.get_model(
255255
'Tacotron2', checkpoint['config'], to_fp16=False, to_cuda=False)
256256
tacotron2.eval()
257-
mel = tacotron2.inference(phrase)[0].cuda()
257+
mel = tacotron2.infer(phrase)[0].cuda()
258258
model.eval()
259259
if fp16:
260260
mel = mel.half()

hubconf.py

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
import urllib.request
2+
import torch
3+
4+
# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
5+
def checkpoint_from_distributed(state_dict):
    """
    Tells whether a checkpoint was generated by DistributedDataParallel.

    DDP wraps the model in an additional "module." namespace, which shows up
    in the parameter names; such a state dict needs to be unwrapped before
    single GPU inference.

    :param state_dict: model's state dict
    :return: True if at least one key contains "module.", False otherwise
    """
    return any('module.' in name for name in state_dict)
18+
19+
20+
# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py
21+
def unwrap_distributed(state_dict):
    """
    Strips the DistributedDataParallel prefixes from the keys of a state dict.

    DDP wraps the model in an additional "module." namespace; every
    occurrence of "module.1." and then of "module." is removed from each key
    so the weights can be loaded for single GPU inference.

    :param state_dict: model's state dict
    :return: new dict holding the same values under the cleaned keys
    """
    return {
        name.replace('module.1.', '').replace('module.', ''): tensor
        for name, tensor in state_dict.items()
    }
34+
35+
36+
dependencies = ['torch']
37+
38+
39+
def nvidia_ncf(pretrained=True, **kwargs):
    """Constructs an NCF model.
    For detailed information on model input and output, training recipes, inference and performance
    visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com

    Args:
        pretrained (bool, True): If True, returns a model pretrained on ml-20m dataset.
            The architecture is then read from the downloaded checkpoint and
            the sizing arguments below are ignored.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        nb_users (int): number of users (required when pretrained is False)
        nb_items (int): number of items (required when pretrained is False)
        mf_dim (int, 64): dimension of latent space in matrix factorization
        mlp_layer_sizes (list, [256,256,128,64]): sizes of layers of multi-layer-perceptron
        dropout (float, 0.5): dropout

    Raises:
        ValueError: if pretrained is False and 'nb_users' or 'nb_items' is missing.
    """

    from PyTorch.Recommendation.NCF import neumf as ncf

    fp16 = kwargs.get("model_math") == "fp16"

    config = {'nb_users': None, 'nb_items': None, 'mf_dim': 64, 'mf_reg': 0.,
              'mlp_layer_sizes': [256, 256, 128, 64], 'mlp_layer_regs': [0, 0, 0, 0], 'dropout': 0.5}

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-ncf-fp16-pyt-20190225'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-ncf-fp32-pyt-20190225'
        ckpt_file = "ncf_ckpt.pt"
        urllib.request.urlretrieve(checkpoint, ckpt_file)
        # Deserialize onto the CPU so the entrypoint also works on hosts
        # without CUDA; load_state_dict() below copies the values into the
        # model's own parameters.
        ckpt = torch.load(ckpt_file, map_location='cpu')

        if checkpoint_from_distributed(ckpt):
            ckpt = unwrap_distributed(ckpt)

        # Recover the architecture from the shapes of the stored weights.
        config['nb_users'] = ckpt['mf_user_embed.weight'].shape[0]
        config['nb_items'] = ckpt['mf_item_embed.weight'].shape[0]
        config['mf_dim'] = ckpt['mf_item_embed.weight'].shape[1]
        mlp_shapes = [ckpt[k].shape for k in ckpt
                      if 'mlp' in k and 'weight' in k and 'embed' not in k]
        # Second dim of every MLP weight, followed by the first dim of the
        # last one; works for any number of MLP layers.
        config['mlp_layer_sizes'] = [shape[1] for shape in mlp_shapes] + [mlp_shapes[-1][0]]
        config['mlp_layer_regs'] = [0] * len(config['mlp_layer_sizes'])

    else:
        if 'nb_users' not in kwargs:
            raise ValueError("Missing 'nb_users' argument.")
        if 'nb_items' not in kwargs:
            raise ValueError("Missing 'nb_items' argument.")
        # Only known configuration keys are taken over from kwargs.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
        config['mlp_layer_regs'] = [0] * len(config['mlp_layer_sizes'])

    m = ncf.NeuMF(**config)

    if fp16:
        m.half()

    if pretrained:
        m.load_state_dict(ckpt)

    return m
99+
100+
101+
def nvidia_tacotron2(pretrained=True, **kwargs):
    """Constructs a Tacotron 2 model (nn.module with additional infer(input) method).
    For detailed information on model input and output, training recipes, inference and performance
    visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com

    Args (type[, default value]):
        pretrained (bool, True): If True, returns a model pretrained on LJ Speech dataset.
            The model configuration is then taken from the downloaded
            checkpoint and the configuration arguments below are ignored.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        n_symbols (int, 148): Number of symbols used in a sequence passed to the prenet, see
            https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/text/symbols.py
        p_attention_dropout (float, 0.1): dropout probability on attention LSTM (1st LSTM layer in decoder)
        p_decoder_dropout (float, 0.1): dropout probability on decoder LSTM (2nd LSTM layer in decoder)
        max_decoder_steps (int, 1000): maximum number of generated mel spectrograms during inference
    """

    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
    from PyTorch.SpeechSynthesis.Tacotron2.models import lstmcell_to_float, batchnorm_to_float

    fp16 = kwargs.get("model_math") == "fp16"

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp16-pyt-20190306'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp32-pyt-20190306'
        ckpt_file = "tacotron2_ckpt.pt"
        urllib.request.urlretrieve(checkpoint, ckpt_file)
        # Deserialize onto the CPU so the entrypoint also works on hosts
        # without CUDA; load_state_dict() below copies the values into the
        # model's own parameters.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
    else:
        config = {'mask_padding': False, 'n_mel_channels': 80, 'n_symbols': 148,
                  'symbols_embedding_dim': 512, 'encoder_kernel_size': 5,
                  'encoder_n_convolutions': 3, 'encoder_embedding_dim': 512,
                  'attention_rnn_dim': 1024, 'attention_dim': 128,
                  'attention_location_n_filters': 32,
                  'attention_location_kernel_size': 31, 'n_frames_per_step': 1,
                  'decoder_rnn_dim': 1024, 'prenet_dim': 256,
                  'max_decoder_steps': 1000, 'gate_threshold': 0.5,
                  'p_attention_dropout': 0.1, 'p_decoder_dropout': 0.1,
                  'postnet_embedding_dim': 512, 'postnet_kernel_size': 5,
                  'postnet_n_convolutions': 5, 'decoder_no_early_stopping': False}
        # Only known configuration keys are taken over from kwargs.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v

    m = tacotron2.Tacotron2(**config)

    if fp16:
        # Cast the model to half precision, then put batch norm and LSTMCell
        # modules back to FP32 using the helpers from models.py.
        m = batchnorm_to_float(m.half())
        m = lstmcell_to_float(m)

    if pretrained:
        m.load_state_dict(state_dict)

    return m
159+
160+
161+
def nvidia_waveglow(pretrained=True, **kwargs):
    """Constructs a WaveGlow model (nn.module with additional infer(input) method).
    For detailed information on model input and output, training recipes, inference and performance
    visit: github.com/NVIDIA/DeepLearningExamples and/or ngc.nvidia.com

    Args:
        pretrained (bool): If True, returns a model pretrained on LJ Speech dataset.
            The model configuration is then taken from the downloaded
            checkpoint and configuration keyword arguments are ignored.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
    """

    from PyTorch.SpeechSynthesis.Tacotron2.waveglow import model as waveglow
    from PyTorch.SpeechSynthesis.Tacotron2.models import batchnorm_to_float

    fp16 = kwargs.get("model_math") == "fp16"

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp16-pyt-20190306'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp32-pyt-20190306'
        ckpt_file = "waveglow_ckpt.pt"
        urllib.request.urlretrieve(checkpoint, ckpt_file)
        # Deserialize onto the CPU so the entrypoint also works on hosts
        # without CUDA; load_state_dict() below copies the values into the
        # model's own parameters.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
    else:
        config = {'n_mel_channels': 80, 'n_flows': 12, 'n_group': 8,
                  'n_early_every': 4, 'n_early_size': 2,
                  'WN_config': {'n_layers': 8, 'kernel_size': 3,
                                'n_channels': 512}}
        # Top-level keys take precedence; anything else that matches a
        # 'WN_config' key is applied to the nested dict.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
            elif k in config['WN_config']:
                config['WN_config'][k] = v

    m = waveglow.WaveGlow(**config)

    if fp16:
        # Cast the model to half precision, then put batch norm modules and
        # every module in m.convinv back to FP32.
        m = batchnorm_to_float(m.half())
        for mat in m.convinv:
            mat.float()

    if pretrained:
        m.load_state_dict(state_dict)

    return m

0 commit comments

Comments
 (0)