Add global init for multi-band MelGAN to avoid large outputs at the beginning

d3d9f835 · 小湉湉 · 9125d71a · d3d9f835 · 9125d71a · d3d9f835
4 changed files
--- a/examples/csmsc/voc3/conf/default.yaml
+++ b/examples/csmsc/voc3/conf/default.yaml
@@ -35,7 +35,7 @@ generator_params:
    stacks: 4                     # Number of stacks in a single residual stack module.
    use_weight_norm: True         # Whether to use weight normalization.
    use_causal_conv: False        # Whether to use causal convolution.
-    use_final_nonlinear_activation: False # If True, spectral_convergence_loss and sub_spectral_convergence_loss will be too large (eg.30)
+    use_final_nonlinear_activation: True
 ###########################################################
@@ -129,7 +129,7 @@ discriminator_scheduler_params:
 ###########################################################
 discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
 train_max_steps: 1000000                # Number of training steps.
-save_interval_steps: 50000              # Interval steps to save checkpoint.
+save_interval_steps: 5000              # Interval steps to save checkpoint.
 eval_interval_steps: 1000               # Interval steps to evaluate the network.
 ###########################################################

--- a/examples/csmsc/voc3/conf/use_tanh.yaml
+++ b/examples/csmsc/voc3/conf/use_tanh.yaml
-# This is the hyperparameter configuration file for MelGAN.
-# Please make sure this is adjusted for the CSMSC dataset. If you want to
-# apply to the other dataset, you might need to carefully change some parameters.
-# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
-# This configuration is based on full-band MelGAN but the hop size and sampling
-# rate is different from the paper (16kHz vs 24kHz). The number of iteraions
-# is not shown in the paper so currently we train 1M iterations (not sure enough
-# to converge). The optimizer setting is based on @dathudeptrai advice.
-# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
-###########################################################
-#                FEATURE EXTRACTION SETTING               #
-###########################################################
-fs: 24000                # Sampling rate.
-n_fft: 2048              # FFT size. (in samples)
-n_shift: 300             # Hop size. (in samples)
-win_length: 1200         # Window length. (in samples)
-                         # If set to null, it will be the same as fft_size.
-window: "hann"           # Window function.
-n_mels: 80               # Number of mel basis.
-fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
-fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
-###########################################################
-#         GENERATOR NETWORK ARCHITECTURE SETTING          #
-###########################################################
-generator_params:
-    in_channels: 80               # Number of input channels.
-    out_channels: 4               # Number of output channels.
-    kernel_size: 7                # Kernel size of initial and final conv layers.
-    channels: 384                 # Initial number of channels for conv layers.
-    upsample_scales: [5, 5, 3]    # List of Upsampling scales.
-    stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
-    stacks: 4                     # Number of stacks in a single residual stack module.
-    use_weight_norm: True         # Whether to use weight normalization.
-    use_causal_conv: False        # Whether to use causal convolution.
-    use_final_nonlinear_activation: True # If True, spectral_convergence_loss and sub_spectral_convergence_loss will be too large (eg.30)
-###########################################################
-#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
-###########################################################
-discriminator_params:
-    in_channels: 1                    # Number of input channels.
-    out_channels: 1                   # Number of output channels.
-    scales: 3                         # Number of multi-scales.
-    downsample_pooling: "AvgPool1D"   # Pooling type for the input downsampling.
-    downsample_pooling_params:        # Parameters of the above pooling function.
-        kernel_size: 4
-        stride: 2
-        padding: 1
-        exclusive: True
-    kernel_sizes: [5, 3]              # List of kernel size.
-    channels: 16                      # Number of channels of the initial conv layer.
-    max_downsample_channels: 512      # Maximum number of channels of downsampling layers.
-    downsample_scales: [4, 4, 4]      # List of downsampling scales.
-    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
-    nonlinear_activation_params:      # Parameters of nonlinear activation function.
-        negative_slope: 0.2
-    use_weight_norm: True             # Whether to use weight norm.
-###########################################################
-#                   STFT LOSS SETTING                     #
-###########################################################
-use_stft_loss: true
-stft_loss_params:
-    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
-    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
-    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
-    window: "hann"                # Window function for STFT-based loss
-use_subband_stft_loss: true
-subband_stft_loss_params:
-    fft_sizes: [384, 683, 171]  # List of FFT size for STFT-based loss.
-    hop_sizes: [30, 60, 10]     # List of hop size for STFT-based loss
-    win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
-    window: "hann"              # Window function for STFT-based loss
-###########################################################
-#               ADVERSARIAL LOSS SETTING                  #
-###########################################################
-use_feat_match_loss: false # Whether to use feature matching loss.
-lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.
-###########################################################
-#                  DATA LOADER SETTING                    #
-###########################################################
-batch_size: 64             # Batch size.
-batch_max_steps: 16200     # Length of each audio in batch. Make sure dividable by hop_size.
-num_workers: 2             # Number of workers in DataLoader.
-###########################################################
-#             OPTIMIZER & SCHEDULER SETTING               #
-###########################################################
-generator_optimizer_params:
-    epsilon: 1.0e-7                     # Generator's epsilon.
-    weight_decay: 0.0                   # Generator's weight decay coefficient.
-generator_grad_norm: -1                 # Generator's gradient norm.
-generator_scheduler_params:
-    learning_rate: 1.0e-3               # Generator's learning rate.
-    gamma: 0.5                          # Generator's scheduler gamma.
-    milestones:                         # At each milestone, lr will be multiplied by gamma.
-        - 100000
-        - 200000
-        - 300000
-        - 400000
-        - 500000
-        - 600000
-discriminator_optimizer_params:
-    epsilon: 1.0e-7                          # Discriminator's epsilon.
-    weight_decay: 0.0                       # Discriminator's weight decay coefficient.
-discriminator_grad_norm: -1                 # Discriminator's gradient norm.
-discriminator_scheduler_params:
-    learning_rate: 1.0e-3                   # Discriminator's learning rate.
-    gamma: 0.5                              # Discriminator's scheduler gamma.
-    milestones:                             # At each milestone, lr will be multiplied by gamma.
-        - 100000
-        - 200000
-        - 300000
-        - 400000
-        - 500000
-        - 600000
-###########################################################
-#                    INTERVAL SETTING                     #
-###########################################################
-discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
-train_max_steps: 1000000                # Number of training steps.
-save_interval_steps: 50000              # Interval steps to save checkpoint.
-eval_interval_steps: 1000               # Interval steps to evaluate the network.
-###########################################################
-#                     OTHER SETTING                       #
-###########################################################
-num_snapshots: 10                 # max number of snapshots to keep while training
-seed: 42                          # random seed for paddle, random, and np.random
\ No newline at end of file
--- a/parakeet/models/melgan/melgan.py
+++ b/parakeet/models/melgan/melgan.py
@@ -22,6 +22,7 @@ from paddle import nn
 from parakeet.modules.causal_conv import CausalConv1D
 from parakeet.modules.causal_conv import CausalConv1DTranspose
+from parakeet.modules.nets_utils import initialize
 from parakeet.modules.pqmf import PQMF
 from parakeet.modules.residual_stack import ResidualStack
@@ -45,7 +46,8 @@ class MelGANGenerator(nn.Layer):
            pad_params: Dict[str, Any]={"mode": "reflect"},
            use_final_nonlinear_activation: bool=True,
            use_weight_norm: bool=True,
-            use_causal_conv: bool=False, ):
+            use_causal_conv: bool=False,
+            init_type: str="xavier_uniform", ):
        """Initialize MelGANGenerator module.
        Parameters
        ----------
@@ -91,7 +93,10 @@ class MelGANGenerator(nn.Layer):
        if not use_causal_conv:
            assert (kernel_size - 1
                    ) % 2 == 0, "Not support even number kernel size."
-        # add initial layer
+        # initialize parameters
+        initialize(self, init_type)
        layers = []
        if not use_causal_conv:
            layers += [
@@ -178,6 +183,7 @@ class MelGANGenerator(nn.Layer):
        # define the model as a single function        
        self.melgan = nn.Sequential(*layers)
+        nn.initializer.set_global_initializer(None)
        # apply weight norm
        if use_weight_norm:
@@ -322,6 +328,7 @@ class MelGANDiscriminator(nn.Layer):
        assert len(kernel_sizes) == 2
        assert kernel_sizes[0] % 2 == 1
        assert kernel_sizes[1] % 2 == 1
        # add first layer
        self.layers.append(
            nn.Sequential(
@@ -417,7 +424,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.2},
            pad: str="Pad1D",
            pad_params: Dict[str, Any]={"mode": "reflect"},
-            use_weight_norm: bool=True, ):
+            use_weight_norm: bool=True,
+            init_type: str="xavier_uniform", ):
        """Initilize MelGAN multi-scale discriminator module.
        Parameters
        ----------
@@ -454,6 +462,9 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
            Whether to use causal convolution.
        """
        super().__init__()
+        # initialize parameters
+        initialize(self, init_type)
        self.discriminators = nn.LayerList()
        # add discriminators
@@ -474,6 +485,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
        self.pooling = getattr(nn, downsample_pooling)(
            **downsample_pooling_params)
+        nn.initializer.set_global_initializer(None)
        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

--- a/parakeet/modules/residual_stack.py
+++ b/parakeet/modules/residual_stack.py
@@ -106,7 +106,4 @@ class ResidualStack(nn.Layer):
        Tensor
            Output tensor (B, chennels, T).
        """
-        stack_output = self.stack(c)
+        return self.stack(c) + self.skip_layer(c)
-        skip_layer_output = self.skip_layer(c)
-        out = stack_output + skip_layer_output
-        return out