Commit 5b5a1ea2 authored by W wuzewu

Fix docs format issue.

Parent: 7a28aaad
### PaddleHub Official Website: https://www.paddlepaddle.org.cn/hub
### PaddleHub Module Search: https://www.paddlepaddle.org.cn/hublist
```diff
...
@@ -194,17 +194,14 @@ class SpeakerVerificationPreprocessor(object):
         return wav

     def melspectrogram(self, wav):
-        mel = librosa.feature.melspectrogram(wav,
-                                             sr=self.sampling_rate,
-                                             n_fft=self.n_fft,
-                                             hop_length=self.hop_length,
-                                             n_mels=self.n_mels)
+        mel = librosa.feature.melspectrogram(
+            wav, sr=self.sampling_rate, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels)
         mel = mel.astype(np.float32).T
         return mel

     def extract_mel_partials(self, wav):
-        wav_slices, mel_slices = compute_partial_slices(len(wav), self.partial_n_frames, self.hop_length,
-                                                        self.min_pad_coverage, self.partial_overlap_ratio)
+        wav_slices, mel_slices = compute_partial_slices(
+            len(wav), self.partial_n_frames, self.hop_length, self.min_pad_coverage, self.partial_overlap_ratio)

         # pad audio if needed
         max_wave_length = wav_slices[-1].stop
...
```
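Both call styles above are byte-for-byte equivalent at runtime; only the argument layout changes. For readers who want to try the computation itself, here is a minimal, self-contained sketch of the same mel-spectrogram step. It is my own example, not part of the commit: `n_fft=400` and `hop_length=160` are what the 25 ms window / 10 ms step at 16 kHz from the `VoiceCloner` hunk below work out to, and recent librosa releases require the audio to be passed as the keyword `y=`.

```python
import numpy as np
import librosa

sr = 16000
wav = np.random.uniform(-1.0, 1.0, sr).astype(np.float32)  # 1 s of noise as a stand-in signal

# Same computation as SpeakerVerificationPreprocessor.melspectrogram above;
# newer librosa versions only accept the audio via the keyword argument `y`.
mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=400, hop_length=160, n_mels=40)
mel = mel.astype(np.float32).T  # transpose to (frames, n_mels)
print(mel.shape)  # roughly (sr / hop_length, 40)
```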
```diff
...
@@ -58,56 +58,53 @@ class VoiceCloner(nn.Layer):
             'waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams')

         # Speaker encoder
-        self.speaker_processor = SpeakerVerificationPreprocessor(sampling_rate=16000,
-                                                                 audio_norm_target_dBFS=-30,
-                                                                 vad_window_length=30,
-                                                                 vad_moving_average_width=8,
-                                                                 vad_max_silence_length=6,
-                                                                 mel_window_length=25,
-                                                                 mel_window_step=10,
-                                                                 n_mels=40,
-                                                                 partial_n_frames=160,
-                                                                 min_pad_coverage=0.75,
-                                                                 partial_overlap_ratio=0.5)
+        self.speaker_processor = SpeakerVerificationPreprocessor(
+            sampling_rate=16000,
+            audio_norm_target_dBFS=-30,
+            vad_window_length=30,
+            vad_moving_average_width=8,
+            vad_max_silence_length=6,
+            mel_window_length=25,
+            mel_window_step=10,
+            n_mels=40,
+            partial_n_frames=160,
+            min_pad_coverage=0.75,
+            partial_overlap_ratio=0.5)
         self.speaker_encoder = LSTMSpeakerEncoder(n_mels=40, num_layers=3, hidden_size=256, output_size=256)
         self.speaker_encoder.set_state_dict(paddle.load(speaker_encoder_ckpt))
         self.speaker_encoder.eval()

         # Voice synthesizer
-        self.synthesizer = Tacotron2(vocab_size=68,
-                                     n_tones=10,
-                                     d_mels=80,
-                                     d_encoder=512,
-                                     encoder_conv_layers=3,
-                                     encoder_kernel_size=5,
-                                     d_prenet=256,
-                                     d_attention_rnn=1024,
-                                     d_decoder_rnn=1024,
-                                     attention_filters=32,
-                                     attention_kernel_size=31,
-                                     d_attention=128,
-                                     d_postnet=512,
-                                     postnet_kernel_size=5,
-                                     postnet_conv_layers=5,
-                                     reduction_factor=1,
-                                     p_encoder_dropout=0.5,
-                                     p_prenet_dropout=0.5,
-                                     p_attention_dropout=0.1,
-                                     p_decoder_dropout=0.1,
-                                     p_postnet_dropout=0.5,
-                                     d_global_condition=256,
-                                     use_stop_token=False)
+        self.synthesizer = Tacotron2(
+            vocab_size=68,
+            n_tones=10,
+            d_mels=80,
+            d_encoder=512,
+            encoder_conv_layers=3,
+            encoder_kernel_size=5,
+            d_prenet=256,
+            d_attention_rnn=1024,
+            d_decoder_rnn=1024,
+            attention_filters=32,
+            attention_kernel_size=31,
+            d_attention=128,
+            d_postnet=512,
+            postnet_kernel_size=5,
+            postnet_conv_layers=5,
+            reduction_factor=1,
+            p_encoder_dropout=0.5,
+            p_prenet_dropout=0.5,
+            p_attention_dropout=0.1,
+            p_decoder_dropout=0.1,
+            p_postnet_dropout=0.5,
+            d_global_condition=256,
+            use_stop_token=False)
         self.synthesizer.set_state_dict(paddle.load(synthesizer_ckpt))
         self.synthesizer.eval()

         # Vocoder
-        self.vocoder = ConditionalWaveFlow(upsample_factors=[16, 16],
-                                           n_flows=8,
-                                           n_layers=8,
-                                           n_group=16,
-                                           channels=128,
-                                           n_mels=80,
-                                           kernel_size=[3, 3])
+        self.vocoder = ConditionalWaveFlow(
+            upsample_factors=[16, 16], n_flows=8, n_layers=8, n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])
         self.vocoder.set_state_dict(paddle.load(vocoder_ckpt))
         self.vocoder.eval()
...
```
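The whole commit is mechanical reformatting: hanging-indent argument lists (continuation lines aligned under the opening parenthesis) become break-after-open-parenthesis lists, packed onto one line where they fit the column limit. A sketch of driving yapf programmatically to produce this kind of reflow follows; the style options are a guess on my part (the repository's actual `.style.yapf`, if any, is not shown in this diff), so treat the config as an assumption rather than the settings the author used.

```python
from yapf.yapflib.yapf_api import FormatCode

# An over-long call in the pre-commit style, as a plain string.
src = (
    "self.vocoder = ConditionalWaveFlow(upsample_factors=[16, 16], n_flows=8, n_layers=8, "
    "n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])\n"
)

# Assumed style: pep8 base with a 120-character line limit.
formatted, changed = FormatCode(src, style_config={"based_on_style": "pep8", "column_limit": 120})
print(formatted)
```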
```diff
...
@@ -237,14 +237,16 @@ def process_aishell3(dataset_root, output_dir):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle).")
-    parser.add_argument("--input",
-                        type=str,
-                        default="~/datasets/aishell3/train",
-                        help="path of the training dataset,(contains a label_train-set.txt).")
-    parser.add_argument("--output",
-                        type=str,
-                        help="the directory to save the processed transcription."
-                        "If not provided, it would be the same as the input.")
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="~/datasets/aishell3/train",
+        help="path of the training dataset,(contains a label_train-set.txt).")
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="the directory to save the processed transcription."
+        "If not provided, it would be the same as the input.")
     args = parser.parse_args()
     if args.output is None:
         args.output = args.input
...
```
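One behavioral detail worth noting alongside the reflow: `--output` is declared with no default, so the script falls back to `--input` when the flag is omitted. A self-contained toy sketch of that fallback (a minimal parser mirroring the two arguments above, help strings trimmed; not the real script):

```python
import argparse

# Toy mirror of the AiShell3 preprocessing CLI shown in the diff.
parser = argparse.ArgumentParser(description="toy mirror of the preprocessing CLI")
parser.add_argument("--input", type=str, default="~/datasets/aishell3/train")
parser.add_argument("--output", type=str)  # no default: None when omitted

args = parser.parse_args([])  # empty argv: both flags omitted

if args.output is None:  # same fallback as in the script above
    args.output = args.input
print(args.output)  # -> ~/datasets/aishell3/train
```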