diff --git a/demo/README.md b/demo/README.md
index 8891c968853b5cfe6b67e0e8a9b50036bbacbd3d..1ae13ac94032a39d213e818b7b61b18a5584acc2 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -1,2 +1,2 @@
 ### PaddleHub Office Website:https://www.paddlepaddle.org.cn/hub
-### PaddleHub Module Searching:https://www.paddlepaddle.org.cn/hublist
+### PaddleHub Module Searching:https://www.paddlepaddle.org.cn/hublist
diff --git a/modules/audio/voice_cloning/lstm_tacotron2/audio_processor.py b/modules/audio/voice_cloning/lstm_tacotron2/audio_processor.py
index 7b28a5330e41db4e8ab5661d8df80e94a76d3e87..a06d86ae3dfc15dca2e661b7ec180da2529c044b 100644
--- a/modules/audio/voice_cloning/lstm_tacotron2/audio_processor.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/audio_processor.py
@@ -194,17 +194,14 @@ class SpeakerVerificationPreprocessor(object):
         return wav
 
     def melspectrogram(self, wav):
-        mel = librosa.feature.melspectrogram(wav,
-                                             sr=self.sampling_rate,
-                                             n_fft=self.n_fft,
-                                             hop_length=self.hop_length,
-                                             n_mels=self.n_mels)
+        mel = librosa.feature.melspectrogram(
+            wav, sr=self.sampling_rate, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels)
         mel = mel.astype(np.float32).T
         return mel
 
     def extract_mel_partials(self, wav):
-        wav_slices, mel_slices = compute_partial_slices(len(wav), self.partial_n_frames, self.hop_length,
-                                                        self.min_pad_coverage, self.partial_overlap_ratio)
+        wav_slices, mel_slices = compute_partial_slices(
+            len(wav), self.partial_n_frames, self.hop_length, self.min_pad_coverage, self.partial_overlap_ratio)
 
         # pad audio if needed
         max_wave_length = wav_slices[-1].stop
diff --git a/modules/audio/voice_cloning/lstm_tacotron2/module.py b/modules/audio/voice_cloning/lstm_tacotron2/module.py
index 88bc8c4fc0ef6735a9b1c354a635237953b53397..f4fb2960d3df075c33f8d270f1dba5eae712bb91 100644
--- a/modules/audio/voice_cloning/lstm_tacotron2/module.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/module.py
@@ -58,56 +58,53 @@ class VoiceCloner(nn.Layer):
             'waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams')
 
         # Speaker encoder
-        self.speaker_processor = SpeakerVerificationPreprocessor(sampling_rate=16000,
-                                                                 audio_norm_target_dBFS=-30,
-                                                                 vad_window_length=30,
-                                                                 vad_moving_average_width=8,
-                                                                 vad_max_silence_length=6,
-                                                                 mel_window_length=25,
-                                                                 mel_window_step=10,
-                                                                 n_mels=40,
-                                                                 partial_n_frames=160,
-                                                                 min_pad_coverage=0.75,
-                                                                 partial_overlap_ratio=0.5)
+        self.speaker_processor = SpeakerVerificationPreprocessor(
+            sampling_rate=16000,
+            audio_norm_target_dBFS=-30,
+            vad_window_length=30,
+            vad_moving_average_width=8,
+            vad_max_silence_length=6,
+            mel_window_length=25,
+            mel_window_step=10,
+            n_mels=40,
+            partial_n_frames=160,
+            min_pad_coverage=0.75,
+            partial_overlap_ratio=0.5)
         self.speaker_encoder = LSTMSpeakerEncoder(n_mels=40, num_layers=3, hidden_size=256, output_size=256)
         self.speaker_encoder.set_state_dict(paddle.load(speaker_encoder_ckpt))
         self.speaker_encoder.eval()
 
         # Voice synthesizer
-        self.synthesizer = Tacotron2(vocab_size=68,
-                                     n_tones=10,
-                                     d_mels=80,
-                                     d_encoder=512,
-                                     encoder_conv_layers=3,
-                                     encoder_kernel_size=5,
-                                     d_prenet=256,
-                                     d_attention_rnn=1024,
-                                     d_decoder_rnn=1024,
-                                     attention_filters=32,
-                                     attention_kernel_size=31,
-                                     d_attention=128,
-                                     d_postnet=512,
-                                     postnet_kernel_size=5,
-                                     postnet_conv_layers=5,
-                                     reduction_factor=1,
-                                     p_encoder_dropout=0.5,
-                                     p_prenet_dropout=0.5,
-                                     p_attention_dropout=0.1,
-                                     p_decoder_dropout=0.1,
-                                     p_postnet_dropout=0.5,
-                                     d_global_condition=256,
-                                     use_stop_token=False)
+        self.synthesizer = Tacotron2(
+            vocab_size=68,
+            n_tones=10,
+            d_mels=80,
+            d_encoder=512,
+            encoder_conv_layers=3,
+            encoder_kernel_size=5,
+            d_prenet=256,
+            d_attention_rnn=1024,
+            d_decoder_rnn=1024,
+            attention_filters=32,
+            attention_kernel_size=31,
+            d_attention=128,
+            d_postnet=512,
+            postnet_kernel_size=5,
+            postnet_conv_layers=5,
+            reduction_factor=1,
+            p_encoder_dropout=0.5,
+            p_prenet_dropout=0.5,
+            p_attention_dropout=0.1,
+            p_decoder_dropout=0.1,
+            p_postnet_dropout=0.5,
+            d_global_condition=256,
+            use_stop_token=False)
         self.synthesizer.set_state_dict(paddle.load(synthesizer_ckpt))
         self.synthesizer.eval()
 
         # Vocoder
-        self.vocoder = ConditionalWaveFlow(upsample_factors=[16, 16],
-                                           n_flows=8,
-                                           n_layers=8,
-                                           n_group=16,
-                                           channels=128,
-                                           n_mels=80,
-                                           kernel_size=[3, 3])
+        self.vocoder = ConditionalWaveFlow(
+            upsample_factors=[16, 16], n_flows=8, n_layers=8, n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])
         self.vocoder.set_state_dict(paddle.load(vocoder_ckpt))
         self.vocoder.eval()
 
diff --git a/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py b/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
index 1bec2489e2be99c6d0eaff4538ee0d4205bdd160..715121d030dfe3860d7b037b1f0d8f81a5e3942f 100644
--- a/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
@@ -237,14 +237,16 @@ def process_aishell3(dataset_root, output_dir):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle).")
-    parser.add_argument("--input",
-                        type=str,
-                        default="~/datasets/aishell3/train",
-                        help="path of the training dataset,(contains a label_train-set.txt).")
-    parser.add_argument("--output",
-                        type=str,
-                        help="the directory to save the processed transcription."
-                        "If not provided, it would be the same as the input.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="~/datasets/aishell3/train",
+        help="path of the training dataset,(contains a label_train-set.txt).")
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="the directory to save the processed transcription."
+        "If not provided, it would be the same as the input.")
     args = parser.parse_args()
     if args.output is None:
         args.output = args.input
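
# Illustrative sketch (not part of the diff above): how the speaker-preprocessing
# path touched by this change would be exercised, using only the constructor and
# method names visible in the hunks. The import path, the file "reference.wav",
# and the variable names are assumptions for illustration, not guaranteed API.
import librosa

from audio_processor import SpeakerVerificationPreprocessor  # assumed import path

# Same hyperparameters as configured in module.py above.
processor = SpeakerVerificationPreprocessor(
    sampling_rate=16000,
    audio_norm_target_dBFS=-30,
    vad_window_length=30,
    vad_moving_average_width=8,
    vad_max_silence_length=6,
    mel_window_length=25,
    mel_window_step=10,
    n_mels=40,
    partial_n_frames=160,
    min_pad_coverage=0.75,
    partial_overlap_ratio=0.5)

# Load a reference utterance at the preprocessor's sampling rate.
wav, _ = librosa.load("reference.wav", sr=16000)

# Split the waveform into fixed-length mel partials; these are what the
# LSTMSpeakerEncoder consumes to produce a speaker embedding.
mel_partials = processor.extract_mel_partials(wav)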