diff --git a/PaddleSpeech/DeepVoice3/README_cn.md b/PaddleSpeech/DeepVoice3/README_cn.md
index 0828c9835747c795aa6398d7f4ad5dcea34aa674..a726a074e782289d2212808d118c637699d79ddf 100644
--- a/PaddleSpeech/DeepVoice3/README_cn.md
+++ b/PaddleSpeech/DeepVoice3/README_cn.md
@@ -10,7 +10,7 @@ A Paddle implementation of Deepvoice3, a convolutional-network-based text-to-speech (Text to Speech) model
 
 ### Install the paddlepaddle framework
 
-For faster training and better support, we recommend using the latest development version of paddle. Users can download the latest compiled development whl package, or compile Paddle from source.
+For faster training and better support, we recommend using the latest development version of Paddle. Users can either download the latest compiled development whl package or compile Paddle from source.
 
 1. Download the latest compiled development whl package. A suitable version can be chosen from the [**multi-version wheel list (dev)**](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev) page.
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py
index 3e43232df3542c9cd12bd8228c0557a362fa4baf..0805135ff8a55163d6a5ea840d46cfa09b1139c2 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py
@@ -31,7 +31,7 @@ class Conv1D(dg.Layer):
 
     def __init__(self,
                  name_scope,
-                 in_cahnnels,
+                 in_channels,
                  num_filters,
                  filter_size=3,
                  dilation=1,
@@ -49,7 +49,7 @@ class Conv1D(dg.Layer):
         else:
             padding = (dilation * (filter_size - 1)) // 2
 
-        self.in_channels = in_cahnnels
+        self.in_channels = in_channels
         self.num_filters = num_filters
         self.filter_size = filter_size
         self.dilation = dilation
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py
index bb9d62a31bfe44297fc53644d2d5cc4ea77ebc09..d6dc55db2a6574fa5327f6aa07141d19947ad9ad 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py
@@ -294,7 +294,6 @@ def create_batch(batch):
     text_positions = np.array(
         [_pad(np.arange(1, len(x[0]) + 1), max_input_len) for x in batch],
         dtype=np.int64)
-    text_positions = np.expand_dims(text_positions, axis=-1)
 
     max_decoder_target_len = max_target_len // r // downsample_step
 
@@ -304,7 +303,6 @@ def create_batch(batch):
         np.expand_dims(
             np.arange(
                 s, e, dtype=np.int64), axis=0), (len(batch), 1))
-    frame_positions = np.expand_dims(frame_positions, axis=-1)
 
     # done flags
     done = np.array([
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py
index 4dda07c1fe53df7a040b435e1ac0c7840ad5b840..ca6dbbb6105d4e127187632d36d6cad0a1216e4e 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py
@@ -591,10 +591,10 @@ class Decoder(dg.Layer):
                 of text inputs for each example.
             inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
                 mel-spectrogram, which is used as decoder inputs when training.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
+            frame_positions (Variable): Shape(B, T_dec // r), dtype:
                 int64. Positions indices for each decoder time steps.
             speaker_embed: shape(batch_size, speaker_dim), speaker embedding,
                 only used for multispeaker model.
@@ -717,7 +717,7 @@ class Decoder(dg.Layer):
             values (Variable): shape(B, C_emb, 1, T_enc), the value
                 representation from an encoder, where C_emb means text
                 embedding size.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
@@ -789,7 +789,7 @@ class Decoder(dg.Layer):
 
         while True:
             frame_pos = fluid.layers.fill_constant(
-                shape=[B, 1, 1], value=t + 1, dtype="int64")
+                shape=[B, 1], value=t + 1, dtype="int64")
             w = self.query_position_rate
             if self.n_speakers > 1:
                 w = w * fluid.layers.reshape(
@@ -1222,7 +1222,7 @@ class DeepVoiceTTS(dg.Layer):
         Encode text sequence and decode with ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
             valid_lengths (Variable): shape(batch_size,), dtype: int64,
                 valid lengths for each example in text_sequences.
@@ -1231,10 +1231,10 @@
             speaker_indices (Variable, optional): Shape(Batch_size),
                 dtype: int64. Speaker index for each example. This arg is
                 not None only when the model is a multispeaker model.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
+            frame_positions (Variable): Shape(B, T_dec // r), dtype:
                 int64. Positions indices for each decoder time steps.
 
         Returns:
@@ -1295,12 +1295,12 @@ class DeepVoiceTTS(dg.Layer):
         Encode text sequence and decode without ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            speaker_indices (Variable, optional): Shape(Batch_size, 1),
+            speaker_indices (Variable, optional): Shape(Batch_size),
                 dtype: int64. Speaker index for each example. This arg is
                 not None only when the model is a multispeaker model.
@@ -1423,7 +1423,7 @@ class ConvS2S(dg.Layer):
         Encode text sequence and decode with ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
             valid_lengths (Variable): shape(batch_size,), dtype: int64,
                 valid lengths for each example in text_sequences.
@@ -1432,10 +1432,10 @@
             speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim),
                 dtype: float32. Speaker embeddings. This arg is not None
                 only when the model is a multispeaker model.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
+            frame_positions (Variable): Shape(B, T_dec // r), dtype:
                 int64. Positions indices for each decoder time steps.
 
         Returns:
@@ -1466,9 +1466,9 @@ class ConvS2S(dg.Layer):
         Encode text sequence and decode without ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
             speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim),
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py
index 9c9518ce8aa73ad12ec225dfd4fcdeb5f5d5995b..c947f5bd2315b73e57f72c4396465f8c61412e35 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py
@@ -48,7 +48,7 @@ def dry_run(model):
     mel_dim = hparams.num_mels
 
     x = np.random.randint(
-        low=0, high=n_vocab, size=(batch_size, enc_length, 1), dtype="int64")
+        low=0, high=n_vocab, size=(batch_size, enc_length), dtype="int64")
     input_lengths = np.arange(
         enc_length - batch_size + 1, enc_length + 1, dtype="int64")
     mel = np.random.randn(batch_size, mel_dim, 1, mel_length).astype("float32")
@@ -60,18 +60,16 @@ def dry_run(model):
             0, enc_length, dtype="int64"), (batch_size, 1))
     text_mask = text_positions > np.expand_dims(input_lengths, 1)
     text_positions[text_mask] = 0
-    text_positions = np.expand_dims(text_positions, axis=-1)
 
     frame_positions = np.tile(
         np.arange(
             1, decoder_length + 1, dtype="int64"), (batch_size, 1))
-    frame_positions = np.expand_dims(frame_positions, axis=-1)
 
     done = np.zeros(shape=(batch_size, 1, 1, decoder_length), dtype="float32")
     target_lengths = np.array([snd_sample_length] * batch_size).astype("int64")
 
     speaker_ids = np.random.randint(
-        low=0, high=n_speakers, size=(batch_size, 1),
+        low=0, high=n_speakers, size=(batch_size,),
         dtype="int64") if n_speakers > 1 else None
     ismultispeaker = speaker_ids is not None
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py
index b2921fdd089cf1909fda9dd307db3f18dedab9a6..d651c5edc380574773e3852919c45162590f73ad 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py
@@ -366,14 +366,14 @@ class PositionEmbedding(dg.Layer):
         self._dtype = dtype
 
     def set_weight(self, array):
-        assert self.embed._w.shape == list(array.shape), "shape does not match"
-        self.embed._w.value().get_tensor().set(
-            array, fluid.framework._current_expected_place())
+        assert self.embed.weight.shape == list(
+            array.shape), "shape does not match"
+        self.embed.weight.set_value(array)
 
     def forward(self, indices, speaker_position_rate=None):
         """
         Args:
-            indices (Variable): Shape (B, T, 1), dtype: int64, position
+            indices (Variable): Shape (B, T), dtype: int64, position
                 indices, where B means the batch size, T means the time
                 steps.
             speaker_position_rate (Variable | float, optional), position rate.
                 It can be a float point number or a Variable with
@@ -391,7 +391,7 @@
             weight = compute_position_embedding(rad)
             out = self._helper.create_variable_for_type_inference(self._dtype)
             self._helper.append_op(
-                type="lookup_table",
+                type="lookup_table_v2",
                 inputs={"Ids": indices,
                         "W": weight},
                 outputs={"Out": out},
@@ -417,7 +417,7 @@
             weight = compute_position_embedding(scaled_rad)
             out = self._helper.create_variable_for_type_inference(self._dtype)
             self._helper.append_op(
-                type="lookup_table",
+                type="lookup_table_v2",
                 inputs={"Ids": indices,
                         "W": weight},
                 outputs={"Out": out},
@@ -441,7 +441,7 @@
                     self._dtype)
                 sequence = indices[i]
                 self._helper.append_op(
-                    type="lookup_table",
+                    type="lookup_table_v2",
                     inputs={"Ids": sequence,
                             "W": weight},
                     outputs={"Out": out},
diff --git a/PaddleSpeech/DeepVoice3/eval_model.py b/PaddleSpeech/DeepVoice3/eval_model.py
index 3d800f3398a7bdc78b976fa5e05748fb7d3a79d5..196b8d0c194b80e71eb5b944e6e812678dbfc4e0 100644
--- a/PaddleSpeech/DeepVoice3/eval_model.py
+++ b/PaddleSpeech/DeepVoice3/eval_model.py
@@ -67,9 +67,9 @@ def tts(model, text, p=0., speaker_id=None):
     model.eval()
 
     sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
-    sequence = np.reshape(sequence, (1, -1, 1))
+    sequence = np.reshape(sequence, (1, -1))
     text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
-    text_positions = np.reshape(text_positions, (1, -1, 1))
+    text_positions = np.reshape(text_positions, (1, -1))
 
     sequence = dg.to_variable(sequence)
     text_positions = dg.to_variable(text_positions)
@@ -191,8 +191,8 @@ def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
 
         # Mel
         writer.add_image(
-            "(Eval) Predicted mel spectrogram text{}_{}".format(
-                idx, speaker_str),
+            "Eval_Predicted_mel_spectrogram_text{}_{}".format(idx,
+                                                              speaker_str),
             prepare_spec_image(mel),
             global_step,
             dataformats='HWC')
@@ -205,8 +205,8 @@ def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
 
         try:
             writer.add_audio(
-                "(Eval) Predicted audio signal {}_{}".format(idx,
-                                                             speaker_str),
+                "Eval_Predicted_audio_signal_{}_{}".format(idx,
+                                                           speaker_str),
                 signal,
                 global_step,
                 sample_rate=hparams.sample_rate)
@@ -273,7 +273,7 @@ def save_states(global_step,
         mel_output = mel_outputs[idx].numpy().squeeze().T
         mel_output = prepare_spec_image(audio._denormalize(mel_output))
         writer.add_image(
-            "Predicted mel spectrogram",
+            "Predicted_mel_spectrogram",
            mel_output,
             global_step,
             dataformats="HWC")
@@ -282,7 +282,7 @@ def save_states(global_step,
         linear_output = linear_outputs[idx].numpy().squeeze().T
         spectrogram = prepare_spec_image(audio._denormalize(linear_output))
         writer.add_image(
-            "Predicted linear spectrogram",
+            "Predicted_linear_spectrogram",
             spectrogram,
             global_step,
             dataformats="HWC")
@@ -293,7 +293,7 @@ def save_states(global_step,
             "step{:09d}_predicted.wav".format(global_step))
         try:
             writer.add_audio(
-                "Predicted audio signal",
+                "Predicted_audio_signal",
                 signal,
                 global_step,
                 sample_rate=hparams.sample_rate)
@@ -306,7 +306,7 @@ def save_states(global_step,
         mel_output = mel[idx].numpy().squeeze().T
         mel_output = prepare_spec_image(audio._denormalize(mel_output))
         writer.add_image(
-            "Target mel spectrogram",
+            "Target_mel_spectrogram",
             mel_output,
             global_step,
             dataformats="HWC")
@@ -315,7 +315,7 @@ def save_states(global_step,
         linear_output = y[idx].numpy().squeeze().T
         spectrogram = prepare_spec_image(audio._denormalize(linear_output))
         writer.add_image(
-            "Target linear spectrogram",
+            "Target_linear_spectrogram",
             spectrogram,
             global_step,
             dataformats="HWC")
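
Note (not part of the patch): a minimal numpy sketch of the input shapes the model expects after this change, mirroring the updated dry_run.py; all sizes below are placeholder values.

    import numpy as np

    batch_size, enc_length, decoder_length = 4, 17, 6  # placeholder sizes
    n_vocab, n_speakers = 100, 2                       # placeholder sizes

    # ids and positions are now rank-2; the trailing singleton dim is gone
    text_sequences = np.random.randint(
        0, n_vocab, size=(batch_size, enc_length), dtype="int64")  # (B, T_enc)
    text_positions = np.tile(
        np.arange(0, enc_length, dtype="int64"), (batch_size, 1))  # (B, T_enc)
    frame_positions = np.tile(
        np.arange(1, decoder_length + 1, dtype="int64"),
        (batch_size, 1))                                           # (B, T_dec // r)
    speaker_ids = np.random.randint(
        0, n_speakers, size=(batch_size,), dtype="int64")          # (B,)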
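
The lookup_table -> lookup_table_v2 switch is what lets the expand_dims calls be dropped: the v1 op expects ids carrying a trailing singleton dimension such as (B, T, 1), while v2 accepts plain (B, T) ids. A small numpy sketch of the v2-style gather, to make the shape contract concrete:

    import numpy as np

    weight = np.random.randn(100, 16).astype("float32")          # (vocab, embed_dim)
    ids = np.random.randint(0, 100, size=(4, 7), dtype="int64")  # (B, T), no trailing 1
    out = weight[ids]                # numpy analogue of the v2 embedding lookup
    assert out.shape == (4, 7, 16)   # (B, T, embed_dim)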
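
Likewise, set_weight in modules.py moves from the private ._w tensor to the public parameter API. A minimal sketch of the new path, assuming a Paddle 1.7-style dygraph build where layers expose .weight and parameters support set_value; the Embedding sizes are hypothetical:

    import numpy as np
    import paddle.fluid.dygraph as dg

    with dg.guard():
        embed = dg.Embedding(size=[100, 16], dtype="float32")
        array = np.random.randn(100, 16).astype("float32")
        # same shape check the patch keeps, then an in-place value update
        assert embed.weight.shape == list(array.shape), "shape does not match"
        embed.weight.set_value(array)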