Commit 8229b1d7 authored by Corentin Jemine

OO-style inference for the synthesizer, experimental support for low-memory GPUs

Parent b9a072e3
......@@ -18,6 +18,8 @@ SV2TTS is a three-stage deep learning framework that allows to create a numerica
|[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
## News
**25/06/19:** Experimental support for low-memory GPUs (~2 GB) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a significant overhead, so it's not recommended if you have enough VRAM.
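For example, a hypothetical invocation from the repository root would be `python demo_cli.py --low_mem` (and likewise for `demo_toolbox.py`, alongside whatever arguments you normally pass).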
## Quick start
......
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer import inference as synthesizer
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
......@@ -28,6 +28,9 @@ if __name__ == '__main__':
parser.add_argument("-v", "--voc_model_fpath", type=Path,
default="vocoder/saved_models/pretrained/pretrained.pt",
help="Path to a saved vocoder")
parser.add_argument("--low_mem", action="store_true", help=\
"If True, the memory used by the synthesizer will be freed after each use. Adds large "
"overhead but allows to save some GPU memory for lower-end GPUs.")
parser.add_argument("--no_sound", action="store_true", help=\
"If True, audio won't be played.")
args = parser.parse_args()
......@@ -56,12 +59,10 @@ if __name__ == '__main__':
## Load the models one by one.
print("Loading the encoder, the synthesizer and the vocoder. This should take a few seconds. "
"The synthesizer will output a lot of stuff. Tensorflow is like that.")
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(args.enc_model_fpath)
synthesizer.load_model(args.syn_model_dir.joinpath("taco_pretrained"))
synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
vocoder.load_model(args.voc_model_fpath)
print("\nAll models succesfully loaded!\n")
## Run a test
......@@ -87,7 +88,7 @@ if __name__ == '__main__':
# illustrate that
embeds = [embed, np.zeros(speaker_embedding_size)]
texts = ["test 1", "test 2"]
print("\tTesting the synthesizer...")
print("\tTesting the synthesizer... (loading the model will output a lot of text)")
mels = synthesizer.synthesize_spectrograms(texts, embeds)
# The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
......
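For reference, a minimal sketch of the new object-oriented inference flow that `demo_cli.py` now uses. The checkpoint directory is a hypothetical default and the embedding is a zero vector, purely for illustration:

```python
from pathlib import Path
import numpy as np
from synthesizer.inference import Synthesizer

# Hypothetical checkpoint location; point this at your own saved model
syn = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"),
                  low_mem=True)

# One dummy 256-dimensional speaker embedding per text prompt
embeds = [np.zeros(256, dtype=np.float32)]
mels = syn.synthesize_spectrograms(["This is a test."], embeds)
print(mels[0].shape)  # (80, M): one mel spectrogram per prompt
```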
......@@ -22,6 +22,9 @@ if __name__ == '__main__':
help="Directory containing saved synthesizer models")
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
help="Directory containing saved vocoder models")
parser.add_argument("--low_mem", action="store_true", help=\
"If True, the memory used by the synthesizer will be freed after each use. Adds large "
"overhead but allows to save some GPU memory for lower-end GPUs.")
args = parser.parse_args()
# Launch the toolbox
......
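A sketch of how the new flag might be threaded through to the toolbox. The `Toolbox` constructor signature matches the change later in this commit, while the directory values are placeholders:

```python
import argparse
from pathlib import Path
from toolbox import Toolbox

parser = argparse.ArgumentParser()
parser.add_argument("--low_mem", action="store_true")
args = parser.parse_args()

# Placeholder directories; the real script derives these from its own arguments
Toolbox(datasets_root=Path("datasets"),
        enc_models_dir=Path("encoder/saved_models"),
        syn_models_dir=Path("synthesizer/saved_models"),
        voc_models_dir=Path("vocoder/saved_models"),
        low_mem=args.low_mem)
```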
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams
from synthesizer.synthesizer import Synthesizer
from multiprocess.pool import Pool # You're free to use either one
#from multiprocessing import Pool #
from synthesizer import audio
from pathlib import Path
from typing import Union, List
import tensorflow as tf
import numpy as np
import numba.cuda
import librosa
_model = None # type: Synthesizer
sample_rate = hparams.sample_rate
# TODO: allow for custom hparams throughout this module?
def load_model(checkpoints_dir: Path):
global _model
class Synthesizer:
sample_rate = hparams.sample_rate
hparams = hparams
tf.reset_default_graph()
_model = Synthesizer()
checkpoint_fpath = tf.train.get_checkpoint_state(checkpoints_dir).model_checkpoint_path
_model.load(checkpoint_fpath, hparams)
def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False):
"""
Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
needed or until load() is called.
:param checkpoints_dir: path to the directory containing the checkpoint file as well as the
weight files (.data, .index and .meta files)
:param verbose: if False, only TensorFlow's own output will be printed (TODO: suppress that too)
:param low_mem: if True, the model will be loaded in a separate process and its resources
will be released after each usage. Adds a large overhead; only recommended if your GPU
memory is low (<= 2 GB)
"""
self.verbose = verbose
self._low_mem = low_mem
# Prepare the model
self._model = None # type: Tacotron2
checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir)
if checkpoint_state is None:
raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
if verbose:
model_name = checkpoints_dir.parent.name.replace("logs-", "")
step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))
def is_loaded(self):
"""
Whether the model is loaded in GPU memory.
"""
return self._model is not None
model_name = checkpoints_dir.parent.name.replace("logs-", "")
step = int(checkpoint_fpath[checkpoint_fpath.rfind('-') + 1:])
print("Loaded synthesizer \"%s\" trained to step %d" % (model_name, step))
def is_loaded():
return _model is not None
def load(self):
"""
Effectively loads the model to GPU memory given the weights file that was passed in the
constructor.
"""
if self._low_mem:
raise Exception("Cannot load the synthesizer permanently in low mem mode")
tf.reset_default_graph()
self._model = Tacotron2(self.checkpoint_fpath, hparams)
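A small sketch of the resulting lazy-loading contract (the checkpoint directory is hypothetical):

```python
from pathlib import Path
from synthesizer.inference import Synthesizer

syn = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
assert not syn.is_loaded()  # the constructor only locates the checkpoint
syn.load()                  # builds the graph and restores the weights (disallowed in low-mem mode)
assert syn.is_loaded()
```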
def synthesize_spectrograms(self, texts: List[str],
embeddings: Union[np.ndarray, List[np.ndarray]],
return_alignments=False):
"""
Synthesizes mel spectrograms from texts and speaker embeddings.
def synthesize_spectrograms(texts: List[str], embeddings: np.ndarray, return_alignments=False):
"""
Synthesizes mel spectrograms from texts and speaker embeddings.
:param texts: a list of N text prompts to be synthesized
:param embeddings: a numpy array of (N, 256) speaker embeddings
:param return_alignments: if True, a matrix representing the alignments between the characters
and each decoder output step will be returned for each spectrogram
:return: a list of N melspectrograms as numpy arrays of shape (80, M), and possibly the
alignments.
"""
if not is_loaded():
raise Exception("Load a model first")
specs, alignments = _model.my_synthesize(embeddings, texts)
:param texts: a list of N text prompts to be synthesized
:param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
:param return_alignments: if True, a matrix representing the alignments between the
characters and each decoder output step will be returned for each spectrogram
:return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
sequence length of spectrogram i, and possibly the alignments.
"""
if not self._low_mem:
# Usual inference mode: load the model on the first request and keep it loaded.
if not self.is_loaded():
self.load()
specs, alignments = self._model.my_synthesize(embeddings, texts)
else:
# Low memory inference mode: load the model upon every request. The model has to be
# loaded in a separate process to be able to release GPU memory (a simple workaround
# for tensorflow's intricacies)
specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms,
[(self.checkpoint_fpath, embeddings, texts)])[0]
if return_alignments:
return (specs, alignments) if return_alignments else specs
@staticmethod
def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
# Load the model and forward the inputs
tf.reset_default_graph()
model = Tacotron2(checkpoint_fpath, hparams)
specs, alignments = model.my_synthesize(embeddings, texts)
# Detach the outputs (not doing so will cause the process to hang)
specs, alignments = [spec.copy() for spec in specs], alignments.copy()
# Close cuda for this process
model.session.close()
numba.cuda.select_device(0)
numba.cuda.close()
return specs, alignments
else:
return specs
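The subprocess trick is not specific to this codebase; below is a minimal, framework-agnostic sketch of the same idea (the job function and payload are hypothetical stand-ins):

```python
from multiprocess.pool import Pool  # the same library used above; multiprocessing also works

def _gpu_job(payload):
    # Stand-in for "load a model and run inference". Return plain copies
    # (e.g. numpy arrays) so the result holds no reference to GPU state.
    return payload * 2

def run_and_release(payload):
    # Single-worker pool: when the pool is torn down, the worker process
    # exits and the OS reclaims all of its GPU memory.
    with Pool(1) as pool:
        return pool.apply(_gpu_job, (payload,))

print(run_and_release(21))  # 42
```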
def load_preprocess_wav(fpath):
wav = librosa.load(fpath, hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
return wav
@staticmethod
def load_preprocess_wav(fpath):
"""
Loads and preprocesses an audio file under the same conditions as the audio files that
were used to train the synthesizer.
"""
wav = librosa.load(fpath, hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
return wav
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav = load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav
@staticmethod
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
"""
Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms
that were fed to the synthesizer during training.
"""
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
return mel_spectrogram
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
return mel_spectrogram
def griffin_lim(mel):
return audio.inv_mel_spectrogram(mel, hparams)
@staticmethod
def griffin_lim(mel):
"""
Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been
built with the parameters present in hparams.py.
"""
return audio.inv_mel_spectrogram(mel, hparams)
\ No newline at end of file
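A quick round trip through these static helpers (the input file is hypothetical):

```python
from synthesizer.inference import Synthesizer

wav = Synthesizer.load_preprocess_wav("speaker.wav")  # hypothetical audio file
mel = Synthesizer.make_spectrogram(wav)               # (80, M) float32 mel spectrogram
approx = Synthesizer.griffin_lim(mel)                 # rough waveform without a vocoder
```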
from synthesizer.synthesizer import Synthesizer
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams_debug_string
from synthesizer.infolog import log
import tensorflow as tf
......@@ -18,8 +18,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams)
synth = Tacotron2(checkpoint_path, hparams)
#Set inputs batch wise
sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i
......@@ -44,10 +43,9 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):
print(hparams_debug_string())
# Load the model in memory
synth = Synthesizer()
weights_dir = os.path.join(model_dir, "taco_pretrained")
checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
synth.load(checkpoint_fpath, hparams, gta=True)
synth = Tacotron2(checkpoint_fpath, hparams, gta=True)
# Load the metadata
with open(metadata_filename, encoding="utf-8") as f:
......
from synthesizer.utils.text import text_to_sequence
from synthesizer import audio
from synthesizer.infolog import log
from synthesizer.models import create_model
from synthesizer.utils import plot
from synthesizer import audio
import tensorflow as tf
import numpy as np
import wave
import os
class Synthesizer:
def load(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
class Tacotron2:
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
log("Constructing model: %s" % model_name)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
......@@ -65,7 +64,7 @@ class Synthesizer:
def my_synthesize(self, speaker_embeds, texts):
"""
Lighter synthesis function that directly returns the mel spectrogram.
Lighter synthesis function that directly returns the mel spectrograms.
"""
# Prepare the input
......
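With the rename, the low-level class can also be driven directly. A hedged sketch (the checkpoint path is hypothetical, and the embedding is a dummy of the documented size 256):

```python
import numpy as np
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams

ckpt = "synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-0"  # hypothetical
model = Tacotron2(ckpt, hparams)
specs, alignments = model.my_synthesize(np.zeros((1, 256), dtype=np.float32),
                                        ["A test sentence."])
```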
from toolbox.ui import UI
from encoder import inference as encoder
from synthesizer import inference as synthesizer
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder
from pathlib import Path
from time import perf_counter as timer
......@@ -28,12 +28,15 @@ recognized_datasets = [
]
class Toolbox:
def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir):
def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, low_mem):
sys.excepthook = self.excepthook
self.datasets_root = datasets_root
self.low_mem = low_mem
self.utterances = set()
self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
self.synthesizer = None # type: Synthesizer
# Initialize the events and the interface
self.ui = UI()
self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir)
......@@ -58,7 +61,9 @@ class Toolbox:
# Model selection
self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)
self.ui.synthesizer_box.currentIndexChanged.connect(self.init_synthesizer)
def func():
self.synthesizer = None
self.ui.synthesizer_box.currentIndexChanged.connect(func)
self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)
# Utterance selection
......@@ -66,7 +71,7 @@ class Toolbox:
self.ui.browser_browse_button.clicked.connect(func)
func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
self.ui.utterance_history.currentIndexChanged.connect(func)
func = lambda: self.ui.play(self.ui.selected_utterance.wav, synthesizer.sample_rate)
func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
self.ui.play_button.clicked.connect(func)
self.ui.stop_button.clicked.connect(self.ui.stop)
self.ui.record_button.clicked.connect(self.record)
......@@ -102,7 +107,7 @@ class Toolbox:
# Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
# playback, so as to have a fair comparison with the generated audio
wav = synthesizer.load_preprocess_wav(fpath)
wav = Synthesizer.load_preprocess_wav(fpath)
self.ui.log("Loaded %s" % name)
self.add_real_utterance(wav, name, speaker_name)
......@@ -119,7 +124,7 @@ class Toolbox:
def add_real_utterance(self, wav, name, speaker_name):
# Compute the mel spectrogram
spec = synthesizer.make_spectrogram(wav)
spec = Synthesizer.make_spectrogram(wav)
self.ui.draw_spec(spec, "current")
# Compute the embedding
......@@ -142,21 +147,28 @@ class Toolbox:
self.ui.draw_umap_projections(self.utterances)
def synthesize(self):
# Synthesize the spectrogram
if not synthesizer.is_loaded():
self.init_synthesizer()
self.ui.log("Generating the mel spectrogram...")
self.ui.set_loading(1)
# Synthesize the spectrogram
if self.synthesizer is None:
model_dir = self.ui.current_synthesizer_model_dir
checkpoints_dir = model_dir.joinpath("taco_pretrained")
self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem)
if not self.synthesizer.is_loaded():
self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)
texts = self.ui.text_prompt.toPlainText().split("\n")
embed = self.ui.selected_utterance.embed
embeds = np.stack([embed] * len(texts))
specs = synthesizer.synthesize_spectrograms(texts, embeds)
specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
breaks = [spec.shape[1] for spec in specs]
spec = np.concatenate(specs, axis=1)
self.ui.draw_spec(spec, "generated")
self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
self.ui.set_loading(0)
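The toolbox reuses a single speaker embedding for every prompt in the text box; a tiny shape sketch (the embedding is random, purely for illustration):

```python
import numpy as np

embed = np.random.rand(256).astype(np.float32)  # stand-in for a real speaker embedding
texts = ["First line.", "Second line."]
embeds = np.stack([embed] * len(texts))         # (2, 256): one row per prompt
```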
def vocode(self):
speaker_name, spec, breaks, _ = self.current_generated
assert spec is not None
......@@ -165,7 +177,7 @@ class Toolbox:
if not vocoder.is_loaded():
self.init_vocoder()
def vocoder_progress(i, seq_len, b_size, gen_rate):
real_time_factor = (gen_rate / synthesizer.hparams.sample_rate) * 1000
real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
% (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
self.ui.log(line, "overwrite")
......@@ -175,20 +187,20 @@ class Toolbox:
wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
else:
self.ui.log("Waveform generation with Griffin-Lim... ")
wav = synthesizer.griffin_lim(spec)
wav = Synthesizer.griffin_lim(spec)
self.ui.set_loading(0)
self.ui.log(" Done!", "append")
# Add breaks
b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
breaks = [np.zeros(int(0.15 * synthesizer.hparams.sample_rate))] * len(breaks)
breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
# Play it
wav = wav / np.abs(wav).max() * 0.97
self.ui.play(wav, synthesizer.sample_rate)
self.ui.play(wav, Synthesizer.sample_rate)
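A worked example of the break bookkeeping above (the hop_size and sample_rate values are assumptions, for illustration only):

```python
import numpy as np

hop_size, sample_rate = 200, 16000               # assumed hparams values
breaks = [100, 120]                              # spectrogram frames per prompt
b_ends = np.cumsum(np.array(breaks) * hop_size)  # [20000, 44000]: chunk end offsets in samples
b_starts = np.concatenate(([0], b_ends[:-1]))    # [0, 20000]
wav = np.zeros(b_ends[-1], dtype=np.float32)     # stand-in for the vocoded waveform
wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
silences = [np.zeros(int(0.15 * sample_rate))] * len(breaks)  # 0.15 s pause per chunk
wav_out = np.concatenate([x for w, s in zip(wavs, silences) for x in (w, s)])
```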
# Compute the embedding
# TODO: this is problematic with different sampling rates, gotta fix it
......@@ -206,7 +218,6 @@ class Toolbox:
self.ui.draw_embed(embed, name, "generated")
self.ui.draw_umap_projections(self.utterances)
def init_encoder(self):
model_fpath = self.ui.current_encoder_fpath
......@@ -216,18 +227,7 @@ class Toolbox:
encoder.load_model(model_fpath)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)
def init_synthesizer(self):
model_dir = self.ui.current_synthesizer_model_dir
checkpoints_dir = model_dir.joinpath("taco_pretrained")
self.ui.log("Loading the synthesizer %s... " % checkpoints_dir)
self.ui.set_loading(1)
start = timer()
synthesizer.load_model(checkpoints_dir)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)
def init_vocoder(self):
model_fpath = self.ui.current_vocoder_fpath
# Case of Griffin-lim
......