Commit 2e8ef14b authored by CorentinJ

Interactive generation done

Parent d025a84d
@@ -5,17 +5,14 @@ from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
if __name__ == '__main__':
## Info & args
print("This is a UI-less example of interfacing with SV2TTS. The purpose of this script is to "
"show how you can interface this project easily with your own. See the source code for "
"an explanation of what is happening.\n")
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
@@ -40,6 +37,7 @@ if __name__ == '__main__':
## Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")
if not torch.cuda.is_available():
print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
"for deep learning, ensure that the drivers are properly installed, and that your "
@@ -106,7 +104,83 @@ if __name__ == '__main__':
# recommended in general.
vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
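# (As an aside, target and overlap control how the spectrogram is folded into
# batched segments: roughly the number of samples generated per segment and the
# cross-faded overlap between consecutive segments. Treat the exact semantics as
# an assumption; they are not spelled out in this excerpt.)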
print("All tests passed! You can now synthesize speech.\n\n")
## Interactive speech generation
print("This is a GUI-less example of interfacing with SV2TTS. The purpose of this script is to "
"show how you can interface this project easily with your own. See the source code for "
"an explanation of what is happening.\n")
print("Interactive generation loop")
num_generated = 0
while True:
try:
# Get the reference audio filepath
message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
"wav, m4a, flac, ...):\n"
in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
## Computing the embedding
# First, we load the wav using the function that the speaker encoder provides. This is
# important: there is preprocessing that must be applied.
# The following two methods are equivalent:
# - Directly load from the filepath:
preprocessed_wav = encoder.preprocess_wav(in_fpath)
# - If the wav is already loaded:
original_wav, sampling_rate = librosa.load(in_fpath)
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
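# (In both cases, preprocess_wav is expected to resample the audio to the rate the
# encoder was trained on, normalize the volume and trim long silences; this is an
# assumption about the encoder's preprocessing, not something shown in this excerpt.)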
print("Loaded file successfully")
# Then we derive the embedding. The speaker encoder exposes many functions and
# parameters, most of which are intended for in-depth research. You will typically
# only need this one function (with its default parameters):
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")
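# (A minimal sanity check, as a sketch; it assumes embed_utterance returns an
# L2-normalized 1-D numpy array, which is how the speaker embedding is usually described.)
assert embed.ndim == 1
print("Embedding shape: %s, norm: %.3f" % (embed.shape, np.linalg.norm(embed)))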
## Generating the spectrogram
text = input("Write a sentence (+-20 words) to be synthesized:\n")
# The synthesizer works in batch, so you need to put your data in a list or numpy array
texts = [text]
embeds = [embed]
# If you know what the attention layer alignments are, you can retrieve them here by
# passing return_alignments=True
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")
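# (A sketch of batching, assuming the same call signature as above: several
# sentences can share one reference embedding in a single call.)
# sentences = ["First test sentence.", "Second test sentence."]
# batch_specs = synthesizer.synthesize_spectrograms(sentences, [embed] * len(sentences))
# assert len(batch_specs) == len(sentences)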
## Generating the waveform
print("Synthesizing the waveform:")
# Synthesizing the waveform is fairly straightforward. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
generated_wav = vocoder.infer_waveform(spec)
## Post-generation
# There's a bug with sounddevice that cuts the audio off one second early, so we
# pad the end with a second of silence.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
# Play the audio (non-blocking)
if not args.no_sound:
sd.stop()
sd.play(generated_wav, synthesizer.sample_rate)
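# (Playback is asynchronous; calling sd.wait() here would block until the audio
# finishes, which matters if the process could exit before playback completes.)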
# Save it on the disk
fpath = "demo_output_%02d.wav" % num_generated
print(generated_wav.dtype)
librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
synthesizer.sample_rate)
num_generated += 1
print("\nSaved output as %s\n\n" % fpath)
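# (Note: librosa.output.write_wav was removed in librosa 0.8. On newer versions,
# an equivalent using the soundfile package would be:)
# import soundfile as sf
# sf.write(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)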
except Exception as e:
print("Caught exception: %s" % repr(e))
print("Restarting\n")
@@ -8,11 +8,13 @@ from scipy.signal import lfilter
def label_2_float(x, bits) :
return 2 * x / (2**bits - 1.) - 1.
def float_2_label(x, bits) :
assert abs(x).max() <= 1.0
x = (x + 1.) * (2**bits - 1) / 2
return x.clip(0, 2**bits - 1)
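# (A quick roundtrip check of the two conversions above, as a sketch: with 9 bits,
# labels live in [0, 511], so -1.0, 0.0 and 1.0 map to 0, 255.5 and 511.)
# x = np.array([-1.0, 0.0, 1.0])
# labels = float_2_label(x, bits=9)
# assert np.allclose(label_2_float(labels, bits=9), x)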
def load_wav(path) :
return librosa.load(path, sr=hp.sample_rate)[0]