Fix docs format issue.

5b5a1ea2 · wuzewu · 7a28aaad · 5b5a1ea2 · 5b5a1ea2 · 5b5a1ea2
4 changed files
--- a/demo/README.md
+++ b/demo/README.md
--- a/modules/audio/voice_cloning/lstm_tacotron2/audio_processor.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/audio_processor.py
@@ -194,17 +194,14 @@ class SpeakerVerificationPreprocessor(object):
        return wav
    def melspectrogram(self, wav):
-        mel = librosa.feature.melspectrogram(wav,
+        mel = librosa.feature.melspectrogram(
-                                             sr=self.sampling_rate,
+            wav, sr=self.sampling_rate, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels)
-                                             n_fft=self.n_fft,
-                                             hop_length=self.hop_length,
-                                             n_mels=self.n_mels)
        mel = mel.astype(np.float32).T
        return mel
    def extract_mel_partials(self, wav):
-        wav_slices, mel_slices = compute_partial_slices(len(wav), self.partial_n_frames, self.hop_length,
+        wav_slices, mel_slices = compute_partial_slices(
-                                                        self.min_pad_coverage, self.partial_overlap_ratio)
+            len(wav), self.partial_n_frames, self.hop_length, self.min_pad_coverage, self.partial_overlap_ratio)
        # pad audio if needed
        max_wave_length = wav_slices[-1].stop

--- a/modules/audio/voice_cloning/lstm_tacotron2/module.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/module.py
@@ -58,7 +58,8 @@ class VoiceCloner(nn.Layer):
                                    'waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams')
        # Speaker encoder
-        self.speaker_processor = SpeakerVerificationPreprocessor(sampling_rate=16000,
+        self.speaker_processor = SpeakerVerificationPreprocessor(
+            sampling_rate=16000,
            audio_norm_target_dBFS=-30,
            vad_window_length=30,
            vad_moving_average_width=8,
@@ -74,7 +75,8 @@ class VoiceCloner(nn.Layer):
        self.speaker_encoder.eval()
        # Voice synthesizer
-        self.synthesizer = Tacotron2(vocab_size=68,
+        self.synthesizer = Tacotron2(
+            vocab_size=68,
            n_tones=10,
            d_mels=80,
            d_encoder=512,
@@ -101,13 +103,8 @@ class VoiceCloner(nn.Layer):
        self.synthesizer.eval()
        # Vocoder
-        self.vocoder = ConditionalWaveFlow(upsample_factors=[16, 16],
+        self.vocoder = ConditionalWaveFlow(
-                                           n_flows=8,
+            upsample_factors=[16, 16], n_flows=8, n_layers=8, n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])
-                                           n_layers=8,
-                                           n_group=16,
-                                           channels=128,
-                                           n_mels=80,
-                                           kernel_size=[3, 3])
        self.vocoder.set_state_dict(paddle.load(vocoder_ckpt))
        self.vocoder.eval()

--- a/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
+++ b/modules/audio/voice_cloning/lstm_tacotron2/preprocess_transcription.py
@@ -237,11 +237,13 @@ def process_aishell3(dataset_root, output_dir):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle).")
-    parser.add_argument("--input",
+    parser.add_argument(
+        "--input",
        type=str,
        default="~/datasets/aishell3/train",
        help="path of the training dataset,(contains a label_train-set.txt).")
-    parser.add_argument("--output",
+    parser.add_argument(
+        "--output",
        type=str,
        help="the directory to save the processed transcription."
        "If not provided, it would be the same as the input.")