diff --git a/examples/FastSpeech/config/fastspeech.yaml b/examples/FastSpeech/config/fastspeech.yaml index 54467296c537b2b4f49fb7ec9516e5ca2138ed43..05df0aa531fecdb572a2bf4c1332c29b063f4521 100644 --- a/examples/FastSpeech/config/fastspeech.yaml +++ b/examples/FastSpeech/config/fastspeech.yaml @@ -1,14 +1,14 @@ audio: - num_mels: 80 - n_fft: 2048 - sr: 22050 - preemphasis: 0.97 - hop_length: 256 - win_length: 1024 - power: 1.2 - min_level_db: -100 - ref_level_db: 20 - outputs_per_step: 1 + num_mels: 80 #the number of mel bands when calculating mel spectrograms. + n_fft: 2048 #the number of fft components. + sr: 22050 #the sampling rate of audio data file. + preemphasis: 0.97 #the preemphasis coefficient. + hop_length: 256 #the number of samples to advance between frames. + win_length: 1024 #the length (width) of the window function. + power: 1.2 #the power to raise before griffin-lim. + min_level_db: -100 #the minimum level db. + ref_level_db: 20 #the reference level db. + outputs_per_step: 1 #the outputs per step. encoder_n_layer: 6 encoder_head: 2 @@ -35,12 +35,12 @@ epochs: 10000 lr: 0.001 save_step: 500 use_gpu: True -use_data_parallel: False +use_data_parallel: True data_path: ../../dataset/LJSpeech-1.1 transtts_path: ../TransformerTTS/checkpoint/ -transformer_step: 200000 +transformer_step: 160000 save_path: ./checkpoint log_dir: ./log #checkpoint_path: ./checkpoint -#ransformer_step: 97000 \ No newline at end of file +#transformer_step: 97000 diff --git a/examples/FastSpeech/train.py b/examples/FastSpeech/train.py index 45102e13a4832169da782390451d30c5195009a1..4f797056dca952d1fb9fba02f4054d2ee03b823e 100644 --- a/examples/FastSpeech/train.py +++ b/examples/FastSpeech/train.py @@ -51,7 +51,6 @@ def main(cfg): with fluid.unique_name.guard(): transformerTTS = TransformerTTS(cfg) model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer")) - transformerTTS.set_dict(model_dict) transformerTTS.eval() @@ -126,4 +125,4 @@ if __name__ =='__main__': parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse') add_config_options_to_parser(parser) cfg = parser.parse_args('-c config/fastspeech.yaml'.split()) - main(cfg) \ No newline at end of file + main(cfg) diff --git a/examples/TransformerTTS/config/train_transformer.yaml b/examples/TransformerTTS/config/train_transformer.yaml index fb94a416c7eb097f63ba734668a75cec0ba9337e..3065f4840e22ef5419b827aed0ef104b02430954 100644 --- a/examples/TransformerTTS/config/train_transformer.yaml +++ b/examples/TransformerTTS/config/train_transformer.yaml @@ -23,7 +23,7 @@ lr: 0.001 save_step: 1000 image_step: 2000 use_gpu: True -use_data_parallel: True +use_data_parallel: False stop_token: False data_path: ../../dataset/LJSpeech-1.1 diff --git a/parakeet/models/fastspeech/LengthRegulator.py b/parakeet/models/fastspeech/LengthRegulator.py index 2446f61480650bc4e52ddc0915f8c39cfb443ef9..d90eaa5015e771cb698728adbe933968ba7522ef 100644 --- a/parakeet/models/fastspeech/LengthRegulator.py +++ b/parakeet/models/fastspeech/LengthRegulator.py @@ -83,21 +83,21 @@ class DurationPredictor(dg.Layer): self.dropout = dropout k = math.sqrt(1 / self.input_size) - self.conv1 = Conv1D(in_channels = self.input_size, - out_channels = self.out_channels, + self.conv1 = Conv1D(num_channels = self.input_size, + num_filters = self.out_channels, filter_size = self.filter_size, padding=1, param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - data_format='NTC') + bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) + #data_format='NTC') k = math.sqrt(1 / self.out_channels) - self.conv2 = Conv1D(in_channels = self.out_channels, - out_channels = self.out_channels, + self.conv2 = Conv1D(num_channels = self.out_channels, + num_filters = self.out_channels, filter_size = self.filter_size, padding=1, param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - data_format='NTC') + bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) + #data_format='NTC') self.layer_norm1 = dg.LayerNorm(self.out_channels) self.layer_norm2 = dg.LayerNorm(self.out_channels) @@ -118,10 +118,17 @@ class DurationPredictor(dg.Layer): out (Variable), Shape(B, T, C), the output of duration predictor. """ # encoder_output.shape(N, T, C) - out = layers.dropout(layers.relu(self.layer_norm1(self.conv1(encoder_output))), self.dropout) - out = layers.dropout(layers.relu(self.layer_norm2(self.conv2(out))), self.dropout) + out = layers.transpose(encoder_output, [0,2,1]) + out = self.conv1(out) + out = layers.transpose(out, [0,2,1]) + out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout) + out = layers.transpose(out, [0,2,1]) + out = self.conv2(out) + out = layers.transpose(out, [0,2,1]) + out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout) out = layers.relu(self.linear(out)) out = layers.squeeze(out, axes=[-1]) + return out diff --git a/parakeet/models/transformerTTS/CBHG.py b/parakeet/models/transformerTTS/CBHG.py index 7ee90e3ea485884068e8033af2f2f7eb1fe31fda..94b907f9d2599b274d5471be5b7665c340ea5601 100644 --- a/parakeet/models/transformerTTS/CBHG.py +++ b/parakeet/models/transformerTTS/CBHG.py @@ -24,22 +24,20 @@ class CBHG(dg.Layer): self.projection_size = projection_size self.conv_list = [] k = math.sqrt(1 / projection_size) - self.conv_list.append(Conv1D(in_channels = projection_size, - out_channels = hidden_size, + self.conv_list.append(Conv1D(num_channels = projection_size, + num_filters = hidden_size, filter_size = 1, padding = int(np.floor(1/2)), param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - data_format = "NCT")) + bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) k = math.sqrt(1 / hidden_size) for i in range(2,K+1): - self.conv_list.append(Conv1D(in_channels = hidden_size, - out_channels = hidden_size, + self.conv_list.append(Conv1D(num_channels = hidden_size, + num_filters = hidden_size, filter_size = i, padding = int(np.floor(i/2)), param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - data_format = "NCT")) + bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) @@ -55,22 +53,20 @@ class CBHG(dg.Layer): conv_outdim = hidden_size * K k = math.sqrt(1 / conv_outdim) - self.conv_projection_1 = Conv1D(in_channels = conv_outdim, - out_channels = hidden_size, + self.conv_projection_1 = Conv1D(num_channels = conv_outdim, + num_filters = hidden_size, filter_size = 3, padding = int(np.floor(3/2)), param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - data_format = "NCT") + bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) k = math.sqrt(1 / hidden_size) - self.conv_projection_2 = Conv1D(in_channels = hidden_size, - out_channels = projection_size, + self.conv_projection_2 = Conv1D(num_channels = hidden_size, + num_filters = projection_size, filter_size = 3, padding = int(np.floor(3/2)), param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), - bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - data_format = "NCT") + bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))) self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW') diff --git a/parakeet/models/transformerTTS/encoderprenet.py b/parakeet/models/transformerTTS/encoderprenet.py index 85f89dadc3310a95cd824c4c371681e7bffbcfc2..b27f2fed666cc4b132c5c1b0200f877257a43880 100644 --- a/parakeet/models/transformerTTS/encoderprenet.py +++ b/parakeet/models/transformerTTS/encoderprenet.py @@ -17,24 +17,22 @@ class EncoderPrenet(dg.Layer): padding_idx = None) self.conv_list = [] k = math.sqrt(1 / embedding_size) - self.conv_list.append(Conv1D(in_channels = embedding_size, - out_channels = num_hidden, + self.conv_list.append(Conv1D(num_channels = embedding_size, + num_filters = num_hidden, filter_size = 5, padding = int(np.floor(5/2)), param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn, - data_format = "NCT")) + use_cudnn = use_cudnn)) k = math.sqrt(1 / num_hidden) for _ in range(2): - self.conv_list.append(Conv1D(in_channels = num_hidden, - out_channels = num_hidden, + self.conv_list.append(Conv1D(num_channels = num_hidden, + num_filters = num_hidden, filter_size = 5, padding = int(np.floor(5/2)), param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn, - data_format = "NCT")) + use_cudnn = use_cudnn)) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) diff --git a/parakeet/models/transformerTTS/post_convnet.py b/parakeet/models/transformerTTS/post_convnet.py index 7ed905b957a27c54e5449c15a1b76700fc716dec..3e393ee151ca3dacc62fcf0bcb34fa798efc2c23 100644 --- a/parakeet/models/transformerTTS/post_convnet.py +++ b/parakeet/models/transformerTTS/post_convnet.py @@ -22,34 +22,31 @@ class PostConvNet(dg.Layer): self.batchnorm_last = batchnorm_last self.conv_list = [] k = math.sqrt(1 / (n_mels * outputs_per_step)) - self.conv_list.append(Conv1D(in_channels = n_mels * outputs_per_step, - out_channels = num_hidden, + self.conv_list.append(Conv1D(num_channels = n_mels * outputs_per_step, + num_filters = num_hidden, filter_size = filter_size, padding = padding, param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn, - data_format = "NCT")) + use_cudnn = use_cudnn)) k = math.sqrt(1 / num_hidden) for _ in range(1, num_conv-1): - self.conv_list.append(Conv1D(in_channels = num_hidden, - out_channels = num_hidden, + self.conv_list.append(Conv1D(num_channels = num_hidden, + num_filters = num_hidden, filter_size = filter_size, padding = padding, param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn, - data_format = "NCT") ) + use_cudnn = use_cudnn)) - self.conv_list.append(Conv1D(in_channels = num_hidden, - out_channels = n_mels * outputs_per_step, + self.conv_list.append(Conv1D(num_channels = num_hidden, + num_filters = n_mels * outputs_per_step, filter_size = filter_size, padding = padding, param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn, - data_format = "NCT")) + use_cudnn = use_cudnn)) for i, layer in enumerate(self.conv_list): self.add_sublayer("conv_list_{}".format(i), layer) diff --git a/parakeet/models/transformerTTS/vocoder.py b/parakeet/models/transformerTTS/vocoder.py index 3973dae117ae301c3cebdb0efa74f31a251cbf61..f8e66e1ce93a2e08d056d6a031f0cc01e4272d64 100644 --- a/parakeet/models/transformerTTS/vocoder.py +++ b/parakeet/models/transformerTTS/vocoder.py @@ -10,15 +10,13 @@ class Vocoder(dg.Layer): """ def __init__(self, config): super(Vocoder, self).__init__() - self.pre_proj = Conv1D(in_channels = config.audio.num_mels, - out_channels = config.hidden_size, - filter_size=1, - data_format = "NCT") + self.pre_proj = Conv1D(num_channels = config.audio.num_mels, + num_filters = config.hidden_size, + filter_size=1) self.cbhg = CBHG(config.hidden_size, config.batch_size) - self.post_proj = Conv1D(in_channels = config.hidden_size, - out_channels = (config.audio.n_fft // 2) + 1, - filter_size=1, - data_format = "NCT") + self.post_proj = Conv1D(num_channels = config.hidden_size, + num_filters = (config.audio.n_fft // 2) + 1, + filter_size=1) def forward(self, mel): mel = layers.transpose(mel, [0,2,1]) diff --git a/parakeet/modules/ffn.py b/parakeet/modules/ffn.py index 7b06dcb2353d8f72015e03e404f717c2ef16c357..dc413bfe00eb8a0a4c6b07af2a3ec82854d408b1 100644 --- a/parakeet/modules/ffn.py +++ b/parakeet/modules/ffn.py @@ -14,23 +14,21 @@ class PositionwiseFeedForward(dg.Layer): self.dropout = dropout k = math.sqrt(1 / d_in) - self.w_1 = Conv1D(in_channels = d_in, - out_channels = num_hidden, + self.w_1 = Conv1D(num_channels = d_in, + num_filters = num_hidden, filter_size = filter_size, padding=padding, param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn, - data_format = "NTC") + use_cudnn = use_cudnn) k = math.sqrt(1 / num_hidden) - self.w_2 = Conv1D(in_channels = num_hidden, - out_channels = d_in, + self.w_2 = Conv1D(num_channels = num_hidden, + num_filters = d_in, filter_size = filter_size, padding=padding, param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()), bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)), - use_cudnn = use_cudnn, - data_format = "NTC") + use_cudnn = use_cudnn) self.layer_norm = dg.LayerNorm(d_in) def forward(self, input): @@ -42,12 +40,14 @@ class PositionwiseFeedForward(dg.Layer): Returns: output (Variable), Shape(B, T, C), the result after FFN. """ + x = layers.transpose(input, [0,2,1]) #FFN Networt - x = self.w_2(layers.relu(self.w_1(input))) + x = self.w_2(layers.relu(self.w_1(x))) # dropout x = layers.dropout(x, self.dropout) + x = layers.transpose(x, [0,2,1]) # residual connection x = x + input