Commit 2e8ef14b authored by CorentinJ

Interactive generation done

Parent d025a84d
@@ -5,17 +5,14 @@ from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
if __name__ == '__main__':
## Info & args
print("This is a UI-less example of interfacing with SV2TTS. The purpose of this script is to "
"show how you can interface this project easily with your own. See the source code for "
"an explanation of what is happening.\n")
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
@@ -40,6 +37,7 @@ if __name__ == '__main__':
## Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")
if not torch.cuda.is_available():
print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
"for deep learning, ensure that the drivers are properly installed, and that your "
@@ -106,7 +104,83 @@ if __name__ == '__main__':
# recommended in general.
vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
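# (As an aside, target and overlap control how the spectrogram is folded into
# batched segments: roughly the number of samples generated per segment and the
# cross-faded overlap between consecutive segments. Treat the exact semantics as
# an assumption; they are not spelled out in this excerpt.)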
print("All tests passed! You can now synthesize speech.\n\n")
## Interactive speech generation
print("This is a GUI-less example of interfacing with SV2TTS. The purpose of this script is to "
"show how you can interface this project easily with your own. See the source code for "
"an explanation of what is happening.\n")
print("Interactive generation loop")
num_generated = 0
while True:
try:
# Get the reference audio filepath
message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
"wav, m4a, flac, ...):\n"
in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
## Computing the embedding
# First, we load the wav using the function that the speaker encoder provides. This is
# important: there is preprocessing that must be applied.
# The following two methods are equivalent:
# - Directly load from the filepath:
preprocessed_wav = encoder.preprocess_wav(in_fpath)
# - If the wav is already loaded:
original_wav, sampling_rate = librosa.load(in_fpath)
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
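# (In both cases, preprocess_wav is expected to resample the audio to the rate the
# encoder was trained on, normalize the volume and trim long silences; this is an
# assumption about the encoder's preprocessing, not something shown in this excerpt.)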
print("Loaded file successfully")
# Then we derive the embedding. The speaker encoder exposes many functions and
# parameters, most of which are intended for in-depth research. You will typically
# only need this one function (with its default parameters):
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")
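# (A minimal sanity check, as a sketch; it assumes embed_utterance returns an
# L2-normalized 1-D numpy array, which is how the speaker embedding is usually described.)
assert embed.ndim == 1
print("Embedding shape: %s, norm: %.3f" % (embed.shape, np.linalg.norm(embed)))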
## Generating the spectrogram
text = input("Write a sentence (+-20 words) to be synthesized:\n")
# The synthesizer works in batch, so you need to put your data in a list or numpy array
texts = [text]
embeds = [embed]
# If you know what the attention layer alignments are, you can retrieve them here by
# passing return_alignments=True
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")
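# (A sketch of batching, assuming the same call signature as above: several
# sentences can share one reference embedding in a single call.)
# sentences = ["First test sentence.", "Second test sentence."]
# batch_specs = synthesizer.synthesize_spectrograms(sentences, [embed] * len(sentences))
# assert len(batch_specs) == len(sentences)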
## Generating the waveform
print("Synthesizing the waveform:")
# Synthesizing the waveform is fairly straightforward. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
generated_wav = vocoder.infer_waveform(spec)
## Post-generation
# There's a bug with sounddevice that cuts the audio off one second early, so we
# pad the end with a second of silence.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
# Play the audio (non-blocking)
if not args.no_sound:
sd.stop()
sd.play(generated_wav, synthesizer.sample_rate)
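# (Playback is asynchronous; calling sd.wait() here would block until the audio
# finishes, which matters if the process could exit before playback completes.)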
# Save it on the disk
fpath = "demo_output_%02d.wav" % num_generated
print(generated_wav.dtype)
librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
synthesizer.sample_rate)
num_generated += 1
print("\nSaved output as %s\n\n" % fpath)
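# (Note: librosa.output.write_wav was removed in librosa 0.8. On newer versions,
# an equivalent using the soundfile package would be:)
# import soundfile as sf
# sf.write(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)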
except Exception as e:
print("Caught exception: %s" % repr(e))
print("Restarting\n")
@@ -8,11 +8,13 @@ from scipy.signal import lfilter
def label_2_float(x, bits) :
return 2 * x / (2**bits - 1.) - 1.
def float_2_label(x, bits) :
assert abs(x).max() <= 1.0
x = (x + 1.) * (2**bits - 1) / 2
return x.clip(0, 2**bits - 1)
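# (A quick roundtrip check of the two conversions above, as a sketch: with 9 bits,
# labels live in [0, 511], so -1.0, 0.0 and 1.0 map to 0, 255.5 and 511.)
# x = np.array([-1.0, 0.0, 1.0])
# labels = float_2_label(x, bits=9)
# assert np.allclose(label_2_float(labels, bits=9), x)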
def load_wav(path) :
return librosa.load(path, sr=hp.sample_rate)[0]