|
| 1 | +import urllib.request |
| 2 | +import torch |
| 3 | + |
| 4 | +# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py |
def checkpoint_from_distributed(state_dict):
    """
    Tells whether a state dict looks like it was saved from a
    DistributedDataParallel-wrapped model, i.e. whether at least one of its
    keys contains the substring "module." (at any position in the key).
    :param state_dict: model's state dict
    :return: True if any key contains "module.", False otherwise (including
        for an empty state dict)
    """
    return any('module.' in name for name in state_dict)
| 18 | + |
| 19 | + |
| 20 | +# from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/inference.py |
def unwrap_distributed(state_dict):
    """
    Builds a copy of a state dict with the DistributedDataParallel naming
    removed from every key, so the weights can be loaded into an unwrapped
    model for single GPU inference.
    Each occurrence of "module.1." is stripped first, then each remaining
    occurrence of "module."; values are carried over unchanged.
    :param state_dict: model's state dict
    :return: new plain dict with the renamed keys
    """
    return {
        name.replace('module.1.', '').replace('module.', ''): tensor
        for name, tensor in state_dict.items()
    }
| 34 | + |
| 35 | + |
# Package names required by the entry points defined in this file.
# NOTE(review): presumably the `dependencies` list that torch.hub reads from a
# hubconf.py before loading an entry point -- confirm against how this file is used.
dependencies = ["torch"]
| 37 | + |
| 38 | + |
def nvidia_ncf(pretrained=True, **kwargs):
    """Constructs an NCF model.

    For detailed information on model input and output, training recipes,
    inference and performance visit: github.com/NVIDIA/DeepLearningExamples
    and/or ngc.nvidia.com

    With ``pretrained=True`` the checkpoint is downloaded to ``ncf_ckpt.pt``
    in the current working directory (overwriting any existing file) and the
    model dimensions are derived from the checkpoint tensors; only
    ``model_math`` is read from the keyword arguments.
    With ``pretrained=False``, ``nb_users`` and ``nb_items`` are required and
    any keyword argument matching a configuration key overrides its default.

    Args (type[, default value]):
        pretrained (bool, True): If True, returns a model pretrained on ml-20m dataset.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        nb_users (int): number of users
        nb_items (int): number of items
        mf_dim (int, 64): dimension of latent space in matrix factorization
        mlp_layer_sizes (list, [256,256,128,64]): sizes of layers of multi-layer-perceptron
        dropout (float, 0.5): dropout

    Returns:
        The constructed ``NeuMF`` model, converted with ``.half()`` when
        ``model_math`` is 'fp16' and loaded with the checkpoint weights when
        ``pretrained`` is True.

    Raises:
        ValueError: if ``pretrained`` is False and ``nb_users`` or
            ``nb_items`` is not supplied.
    """

    from PyTorch.Recommendation.NCF import neumf as ncf

    fp16 = kwargs.get("model_math") == "fp16"

    config = {'nb_users': None, 'nb_items': None, 'mf_dim': 64, 'mf_reg': 0.,
              'mlp_layer_sizes': [256, 256, 128, 64], 'mlp_layer_regs': [0, 0, 0, 0], 'dropout': 0.5}

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-ncf-fp16-pyt-20190225'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-ncf-fp32-pyt-20190225'
        ckpt_file = "ncf_ckpt.pt"
        urllib.request.urlretrieve(checkpoint, ckpt_file)
        # NOTE(security): torch.load unpickles the downloaded file; only the
        # fixed URLs above are ever fetched here.
        # map_location='cpu' lets tensors that were saved from a CUDA device be
        # deserialized on hosts without CUDA; load_state_dict below copies the
        # values into the model's own parameters.
        ckpt = torch.load(ckpt_file, map_location='cpu')

        if checkpoint_from_distributed(ckpt):
            ckpt = unwrap_distributed(ckpt)

        # Recover the architecture from the tensor shapes stored in the checkpoint.
        config['nb_users'] = ckpt['mf_user_embed.weight'].shape[0]
        config['nb_items'] = ckpt['mf_item_embed.weight'].shape[0]
        config['mf_dim'] = ckpt['mf_item_embed.weight'].shape[1]
        mlp_shapes = [ckpt[k].shape for k in ckpt.keys() if 'mlp' in k and 'weight' in k and 'embed' not in k]
        # Second dimension of each of the first three MLP weights, then the
        # first dimension of the third one.
        config['mlp_layer_sizes'] = [mlp_shapes[0][1], mlp_shapes[1][1], mlp_shapes[2][1], mlp_shapes[2][0]]
        config['mlp_layer_regs'] = [0] * len(config['mlp_layer_sizes'])

    else:
        if 'nb_users' not in kwargs:
            raise ValueError("Missing 'nb_users' argument.")
        if 'nb_items' not in kwargs:
            raise ValueError("Missing 'nb_items' argument.")
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
        # Always reset to one zero per layer, even if 'mlp_layer_regs' was passed in.
        config['mlp_layer_regs'] = [0] * len(config['mlp_layer_sizes'])

    m = ncf.NeuMF(**config)

    if fp16:
        m.half()

    if pretrained:
        m.load_state_dict(ckpt)

    return m
| 99 | + |
| 100 | + |
def nvidia_tacotron2(pretrained=True, **kwargs):
    """Constructs a Tacotron 2 model (nn.module with additional infer(input) method).

    For detailed information on model input and output, training recipes,
    inference and performance visit: github.com/NVIDIA/DeepLearningExamples
    and/or ngc.nvidia.com

    With ``pretrained=True`` the checkpoint is downloaded to
    ``tacotron2_ckpt.pt`` in the current working directory (overwriting any
    existing file) and the model configuration is taken from the checkpoint's
    'config' entry; only ``model_math`` is read from the keyword arguments.
    With ``pretrained=False`` any keyword argument matching a key of the
    default configuration overrides it.

    Args (type[, default value]):
        pretrained (bool, True): If True, returns a model pretrained on LJ Speech dataset.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')
        n_symbols (int, 148): Number of symbols used in a sequence passed to the prenet, see
            https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/text/symbols.py
        p_attention_dropout (float, 0.1): dropout probability on attention LSTM (1st LSTM layer in decoder)
        p_decoder_dropout (float, 0.1): dropout probability on decoder LSTM (2nd LSTM layer in decoder)
        max_decoder_steps (int, 1000): maximum number of generated mel spectrograms during inference

    Returns:
        The constructed ``Tacotron2`` model. When ``model_math`` is 'fp16' the
        model is converted with ``.half()`` and then passed through
        ``batchnorm_to_float`` and ``lstmcell_to_float``. When ``pretrained``
        is True the checkpoint weights are loaded into it.
    """

    from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
    from PyTorch.SpeechSynthesis.Tacotron2.models import lstmcell_to_float, batchnorm_to_float

    fp16 = kwargs.get("model_math") == "fp16"

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp16-pyt-20190306'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-tacotron2-fp32-pyt-20190306'
        ckpt_file = "tacotron2_ckpt.pt"
        urllib.request.urlretrieve(checkpoint, ckpt_file)
        # NOTE(security): torch.load unpickles the downloaded file; only the
        # fixed URLs above are ever fetched here.
        # map_location='cpu' lets tensors that were saved from a CUDA device be
        # deserialized on hosts without CUDA; load_state_dict below copies the
        # values into the model's own parameters.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
    else:
        config = {'mask_padding': False, 'n_mel_channels': 80, 'n_symbols': 148,
                  'symbols_embedding_dim': 512, 'encoder_kernel_size': 5,
                  'encoder_n_convolutions': 3, 'encoder_embedding_dim': 512,
                  'attention_rnn_dim': 1024, 'attention_dim': 128,
                  'attention_location_n_filters': 32,
                  'attention_location_kernel_size': 31, 'n_frames_per_step': 1,
                  'decoder_rnn_dim': 1024, 'prenet_dim': 256,
                  'max_decoder_steps': 1000, 'gate_threshold': 0.5,
                  'p_attention_dropout': 0.1, 'p_decoder_dropout': 0.1,
                  'postnet_embedding_dim': 512, 'postnet_kernel_size': 5,
                  'postnet_n_convolutions': 5, 'decoder_no_early_stopping': False}
        # Unknown keyword arguments (including 'model_math') are ignored here.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v

    m = tacotron2.Tacotron2(**config)

    if fp16:
        m = batchnorm_to_float(m.half())
        m = lstmcell_to_float(m)

    if pretrained:
        m.load_state_dict(state_dict)

    return m
| 159 | + |
| 160 | + |
def nvidia_waveglow(pretrained=True, **kwargs):
    """Constructs a WaveGlow model (nn.module with additional infer(input) method).

    For detailed information on model input and output, training recipes,
    inference and performance visit: github.com/NVIDIA/DeepLearningExamples
    and/or ngc.nvidia.com

    With ``pretrained=True`` the checkpoint is downloaded to
    ``waveglow_ckpt.pt`` in the current working directory (overwriting any
    existing file) and the model configuration is taken from the checkpoint's
    'config' entry; only ``model_math`` is read from the keyword arguments.
    With ``pretrained=False`` a keyword argument matching a top-level
    configuration key overrides it; otherwise, if it matches a key of the
    nested 'WN_config' dict, it overrides that nested entry.

    Args (type[, default value]):
        pretrained (bool, True): If True, returns a model pretrained on LJ Speech dataset.
        model_math (str, 'fp32'): returns a model in given precision ('fp32' or 'fp16')

    Returns:
        The constructed ``WaveGlow`` model. When ``model_math`` is 'fp16' the
        model is converted with ``.half()``, passed through
        ``batchnorm_to_float``, and every entry of ``m.convinv`` is converted
        back with ``.float()``. When ``pretrained`` is True the checkpoint
        weights are loaded into it.
    """

    from PyTorch.SpeechSynthesis.Tacotron2.waveglow import model as waveglow
    from PyTorch.SpeechSynthesis.Tacotron2.models import batchnorm_to_float

    fp16 = kwargs.get("model_math") == "fp16"

    if pretrained:
        if fp16:
            checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp16-pyt-20190306'
        else:
            checkpoint = 'https://developer.nvidia.com/joc-waveglow-fp32-pyt-20190306'
        ckpt_file = "waveglow_ckpt.pt"
        urllib.request.urlretrieve(checkpoint, ckpt_file)
        # NOTE(security): torch.load unpickles the downloaded file; only the
        # fixed URLs above are ever fetched here.
        # map_location='cpu' lets tensors that were saved from a CUDA device be
        # deserialized on hosts without CUDA; load_state_dict below copies the
        # values into the model's own parameters.
        ckpt = torch.load(ckpt_file, map_location='cpu')
        state_dict = ckpt['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)
        config = ckpt['config']
    else:
        config = {'n_mel_channels': 80, 'n_flows': 12, 'n_group': 8,
                  'n_early_every': 4, 'n_early_size': 2,
                  'WN_config': {'n_layers': 8, 'kernel_size': 3,
                                'n_channels': 512}}
        # Top-level keys take precedence over keys of the nested WN_config.
        for k, v in kwargs.items():
            if k in config:
                config[k] = v
            elif k in config['WN_config']:
                config['WN_config'][k] = v

    m = waveglow.WaveGlow(**config)

    if fp16:
        m = batchnorm_to_float(m.half())
        for mat in m.convinv:
            mat.float()

    if pretrained:
        m.load_state_dict(state_dict)

    return m
0 commit comments