feifeibear
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/Dockerfile‎
Lines changed: 3 additions & 1 deletion b/‎PyTorch/SpeechSynthesis/Tacotron2/Dockerfile‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/README.md‎
Lines changed: 254 additions & 118 deletions b/‎PyTorch/SpeechSynthesis/Tacotron2/README.md‎
Lines changed: 254 additions & 118 deletions
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/common/stft.py‎
Lines changed: 6 additions & 5 deletions b/‎PyTorch/SpeechSynthesis/Tacotron2/common/stft.py‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/common/utils.py‎
Lines changed: 1 addition & 1 deletion b/‎PyTorch/SpeechSynthesis/Tacotron2/common/utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_onnx.py‎
Lines changed: 1 addition & 1 deletion b/‎PyTorch/SpeechSynthesis/Tacotron2/exports/export_tacotron2_onnx.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py‎
Lines changed: 12 additions & 85 deletions b/‎PyTorch/SpeechSynthesis/Tacotron2/exports/export_waveglow_onnx.py‎
Lines changed: 12 additions & 85 deletions
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/img/Taco2WG_train_loss.png‎
52.1 KB b/‎PyTorch/SpeechSynthesis/Tacotron2/img/Taco2WG_train_loss.png‎
52.1 KB
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_amp_loss.png‎
19 KB b/‎PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_amp_loss.png‎
19 KB
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_tf32_loss.png‎
19.4 KB b/‎PyTorch/SpeechSynthesis/Tacotron2/img/tacotron2_a100_tf32_loss.png‎
19.4 KB
diff --git a/‎PyTorch/SpeechSynthesis/Tacotron2/img/waveglow_a100_amp_loss.png‎
20.5 KB b/‎PyTorch/SpeechSynthesis/Tacotron2/img/waveglow_a100_amp_loss.png‎
20.5 KB
@@ -1,6 +1,8 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.03-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3
 FROM ${FROM_IMAGE_NAME}
 
 ADD . /workspace/tacotron2
 WORKDIR /workspace/tacotron2
 RUN pip install --no-cache-dir -r requirements.txt
+
+ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4
@@ -108,11 +108,12 @@ def inverse(self, magnitude, phase):
         recombine_magnitude_phase = torch.cat(
             [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
 
-        inverse_transform = F.conv_transpose1d(
-            recombine_magnitude_phase,
-            Variable(self.inverse_basis, requires_grad=False),
-            stride=self.hop_length,
-            padding=0)
+        inverse_transform = F.conv_transpose2d(
+            recombine_magnitude_phase.unsqueeze(-1),
+            Variable(self.inverse_basis.unsqueeze(-1), requires_grad=False),
+            stride=(self.hop_length,1),
+            padding=(0,0))
+        inverse_transform = inverse_transform.squeeze(-1)
 
         if self.window is not None:
             window_sum = window_sumsquare(
 
@@ -63,4 +63,4 @@ def to_gpu(x):
 
     if torch.cuda.is_available():
         x = x.cuda(non_blocking=True)
-    return torch.autograd.Variable(x)
+    return x
@@ -297,7 +297,7 @@ def main():
     args, _ = parser.parse_known_args()
 
     tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
-                                     amp_run=args.fp16, cpu_run=False)
+                                     fp16_run=args.fp16, cpu_run=False)
 
     opset_version = 10
 
 
@@ -49,72 +49,6 @@ def parse_args(parser):
     return parser
 
 
-def convert_convinv_1d_to_2d(convinv):
-    """
-    Takes an invertible 1x1 1-d convolution and returns a 2-d convolution that does
-    the inverse
-    """
-    conv2d = torch.nn.Conv2d(convinv.W_inverse.size(1),
-                             convinv.W_inverse.size(0),
-                             1, bias=False)
-    conv2d.weight.data[:,:,:,0] = convinv.W_inverse.data
-    return conv2d
-
-
-def convert_conv_1d_to_2d(conv1d):
-    conv2d = torch.nn.Conv2d(conv1d.weight.size(1),
-                             conv1d.weight.size(0),
-                             (conv1d.weight.size(2), 1),
-                             stride=(conv1d.stride[0], 1),
-                             dilation=(conv1d.dilation[0], 1),
-                             padding=(conv1d.padding[0], 0))
-    conv2d.weight.data[:,:,:,0] = conv1d.weight.data
-    conv2d.bias.data = conv1d.bias.data
-    return conv2d
-
-
-def convert_WN_1d_to_2d_(WN):
-    """
-    Modifies the WaveNet like affine coupling layer in-place to use 2-d convolutions
-    """
-    WN.start = convert_conv_1d_to_2d(WN.start)
-    WN.end = convert_conv_1d_to_2d(WN.end)
-
-    for i in range(len(WN.in_layers)):
-        WN.in_layers[i] = convert_conv_1d_to_2d(WN.in_layers[i])
-
-    for i in range(len(WN.res_skip_layers)):
-        WN.res_skip_layers[i] = convert_conv_1d_to_2d(WN.res_skip_layers[i])
-
-    for i in range(len(WN.res_skip_layers)):
-        WN.cond_layers[i] = convert_conv_1d_to_2d(WN.cond_layers[i])
-
-def convert_1d_to_2d_(glow):
-    """
-    Caffe2 and TensorRT don't seem to support 1-d convolutions or properly
-    convert ONNX exports with 1d convolutions to 2d convolutions yet, so we
-    do the conversion to 2-d convolutions before ONNX export
-    """
-    # Convert upsample to 2d
-    upsample = torch.nn.ConvTranspose2d(glow.upsample.weight.size(0),
-                                        glow.upsample.weight.size(1),
-                                        (glow.upsample.weight.size(2), 1),
-                                        stride=(glow.upsample.stride[0], 1))
-    upsample.weight.data[:,:,:,0] = glow.upsample.weight.data
-    upsample.bias.data = glow.upsample.bias.data
-    glow.upsample = upsample.cuda()
-
-    # Convert WN to 2d
-    for WN in glow.WN:
-        convert_WN_1d_to_2d_(WN)
-
-    # Convert invertible conv to 2d
-    for i in range(len(glow.convinv)):
-        glow.convinv[i] = convert_convinv_1d_to_2d(glow.convinv[i])
-
-    glow.cuda()
-
-
 def infer_onnx(self, spect, z, sigma=0.9):
 
     spect = self.upsample(spect)
@@ -126,37 +60,33 @@ def infer_onnx(self, spect, z, sigma=0.9):
     mel_dim = 80
     batch_size = spect.size(0)
 
-    spect = torch.squeeze(spect, 3)
     spect = spect.view((batch_size, mel_dim, length_spect_group, self.n_group))
     spect = spect.permute(0, 2, 1, 3)
     spect = spect.contiguous()
     spect = spect.view((batch_size, length_spect_group, self.n_group*mel_dim))
     spect = spect.permute(0, 2, 1)
-    spect = torch.unsqueeze(spect, 3)
     spect = spect.contiguous()
 
-    audio = z[:, :self.n_remaining_channels, :, :]
-    z = z[:, self.n_remaining_channels:self.n_group, :, :]
+    audio = z[:, :self.n_remaining_channels, :]
+    z = z[:, self.n_remaining_channels:self.n_group, :]
     audio = sigma*audio
 
     for k in reversed(range(self.n_flows)):
         n_half = int(audio.size(1) / 2)
-        audio_0 = audio[:, :n_half, :, :]
-        audio_1 = audio[:, n_half:(n_half+n_half), :, :]
+        audio_0 = audio[:, :n_half, :]
+        audio_1 = audio[:, n_half:(n_half+n_half), :]
 
         output = self.WN[k]((audio_0, spect))
-        s = output[:, n_half:(n_half+n_half), :, :]
-        b = output[:, :n_half, :, :]
+        s = output[:, n_half:(n_half+n_half), :]
+        b = output[:, :n_half, :]
         audio_1 = (audio_1 - b) / torch.exp(s)
         audio = torch.cat([audio_0, audio_1], 1)
-
-        audio = self.convinv[k](audio)
+        audio = self.convinv[k].infer(audio)
 
         if k % self.n_early_every == 0 and k > 0:
-            audio = torch.cat((z[:, :self.n_early_size, :, :], audio), 1)
-            z = z[:, self.n_early_size:self.n_group, :, :]
+            audio = torch.cat((z[:, :self.n_early_size, :], audio), 1)
+            z = z[:, self.n_early_size:self.n_group, :]
 
-    audio = torch.squeeze(audio, 3)
     audio = audio.permute(0,2,1).contiguous().view(batch_size, (length_spect_group * self.n_group))
 
     return audio
@@ -165,15 +95,15 @@ def infer_onnx(self, spect, z, sigma=0.9):
 def export_onnx(parser, args):
 
     waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
-                                    amp_run=args.fp16, cpu_run=False,
+                                    fp16_run=args.fp16, cpu_run=False,
                                     forward_is_infer=False)
 
     # 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
     mel = torch.randn(1, 80, 620).cuda()
     stride = 256 # value from waveglow upsample
     n_group = 8
     z_size2 = (mel.size(2)*stride)//n_group
-    z = torch.randn(1, n_group, z_size2, 1).cuda()
+    z = torch.randn(1, n_group, z_size2).cuda()
 
     if args.fp16:
         mel = mel.half()
@@ -183,16 +113,13 @@ def export_onnx(parser, args):
         waveglow.infer(mel, sigma=args.sigma_infer)
 
         # export to ONNX
-        convert_1d_to_2d_(waveglow)
         if args.fp16:
             waveglow = waveglow.half()
 
         fType = types.MethodType
         waveglow.forward = fType(infer_onnx, waveglow)
 
-        mel = mel.unsqueeze(3)
-
-        opset_version = 10
+        opset_version = 12
 
         torch.onnx.export(waveglow, (mel, z), args.output+"/"+"waveglow.onnx",
                           opset_version=opset_version,