Commit 8229b1d7 authored by Corentin Jemine

OO-style inference for the synthesizer, experimental support for low-memory GPUs

Parent b9a072e3
......@@ -18,6 +18,8 @@ SV2TTS is a three-stage deep learning framework that allows to create a numerica
|[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
## News
**25/06/19:** Experimental support for low-memory GPUs (~2 GB) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a significant overhead, so it's not recommended if you have enough VRAM.
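For example, a hypothetical invocation from the repository root would be `python demo_cli.py --low_mem` (and likewise for `demo_toolbox.py`, alongside whatever arguments you normally pass).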
## Quick start
......
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer import inference as synthesizer
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
......@@ -28,6 +28,9 @@ if __name__ == '__main__':
parser.add_argument("-v", "--voc_model_fpath", type=Path,
default="vocoder/saved_models/pretrained/pretrained.pt",
help="Path to a saved vocoder")
parser.add_argument("--low_mem", action="store_true", help=\
"If True, the memory used by the synthesizer will be freed after each use. Adds large "
"overhead but allows to save some GPU memory for lower-end GPUs.")
parser.add_argument("--no_sound", action="store_true", help=\
"If True, audio won't be played.")
args = parser.parse_args()
......@@ -56,12 +59,10 @@ if __name__ == '__main__':
## Load the models one by one.
print("Loading the encoder, the synthesizer and the vocoder. This should take a few seconds. "
"The synthesizer will output a lot of stuff. Tensorflow is like that.")
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(args.enc_model_fpath)
synthesizer.load_model(args.syn_model_dir.joinpath("taco_pretrained"))
synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
vocoder.load_model(args.voc_model_fpath)
print("\nAll models succesfully loaded!\n")
## Run a test
......@@ -87,7 +88,7 @@ if __name__ == '__main__':
# illustrate that
embeds = [embed, np.zeros(speaker_embedding_size)]
texts = ["test 1", "test 2"]
print("\tTesting the synthesizer...")
print("\tTesting the synthesizer... (loading the model will output a lot of text)")
mels = synthesizer.synthesize_spectrograms(texts, embeds)
# The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
......
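For reference, a minimal sketch of the new object-oriented inference flow that `demo_cli.py` now uses. The checkpoint directory is a hypothetical default and the embedding is a zero vector, purely for illustration:

```python
from pathlib import Path
import numpy as np
from synthesizer.inference import Synthesizer

# Hypothetical checkpoint location; point this at your own saved model
syn = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"),
                  low_mem=True)

# One dummy 256-dimensional speaker embedding per text prompt
embeds = [np.zeros(256, dtype=np.float32)]
mels = syn.synthesize_spectrograms(["This is a test."], embeds)
print(mels[0].shape)  # (80, M): one mel spectrogram per prompt
```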
......@@ -22,6 +22,9 @@ if __name__ == '__main__':
help="Directory containing saved synthesizer models")
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
help="Directory containing saved vocoder models")
parser.add_argument("--low_mem", action="store_true", help=\
"If True, the memory used by the synthesizer will be freed after each use. Adds large "
"overhead but allows to save some GPU memory for lower-end GPUs.")
args = parser.parse_args()
# Launch the toolbox
......
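A sketch of how the new flag might be threaded through to the toolbox. The `Toolbox` constructor signature matches the change later in this commit, while the directory values are placeholders:

```python
import argparse
from pathlib import Path
from toolbox import Toolbox

parser = argparse.ArgumentParser()
parser.add_argument("--low_mem", action="store_true")
args = parser.parse_args()

# Placeholder directories; the real script derives these from its own arguments
Toolbox(datasets_root=Path("datasets"),
        enc_models_dir=Path("encoder/saved_models"),
        syn_models_dir=Path("synthesizer/saved_models"),
        voc_models_dir=Path("vocoder/saved_models"),
        low_mem=args.low_mem)
```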
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams
from synthesizer.synthesizer import Synthesizer
from multiprocess.pool import Pool # You're free to use either one
#from multiprocessing import Pool #
from synthesizer import audio
from pathlib import Path
from typing import Union, List
import tensorflow as tf
import numpy as np
import numba.cuda
import librosa
_model = None # type: Synthesizer
sample_rate = hparams.sample_rate
# TODO: allow for custom hparams throughout this module?
def load_model(checkpoints_dir: Path):
global _model
class Synthesizer:
sample_rate = hparams.sample_rate
hparams = hparams
tf.reset_default_graph()
_model = Synthesizer()
checkpoint_fpath = tf.train.get_checkpoint_state(checkpoints_dir).model_checkpoint_path
_model.load(checkpoint_fpath, hparams)
def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False):
"""
Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
needed or until load() is called.
:param checkpoints_dir: path to the directory containing the checkpoint file as well as the
weight files (.data, .index and .meta files)
:param verbose: if False, only TensorFlow's own output will be printed (TODO: suppress that too)
:param low_mem: if True, the model will be loaded in a separate process and its resources
will be released after each usage. Adds a large overhead; only recommended if your GPU
memory is low (<= 2 GB)
"""
self.verbose = verbose
self._low_mem = low_mem
# Prepare the model
self._model = None # type: Tacotron2
checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir)
if checkpoint_state is None:
raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
if verbose:
model_name = checkpoints_dir.parent.name.replace("logs-", "")
step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))
def is_loaded(self):
"""
Whether the model is loaded in GPU memory.
"""
return self._model is not None
model_name = checkpoints_dir.parent.name.replace("logs-", "")
step = int(checkpoint_fpath[checkpoint_fpath.rfind('-') + 1:])
print("Loaded synthesizer \"%s\" trained to step %d" % (model_name, step))
def is_loaded():
return _model is not None
def load(self):
"""
Effectively loads the model to GPU memory given the weights file that was passed in the
constructor.
"""
if self._low_mem:
raise Exception("Cannot load the synthesizer permanently in low mem mode")
tf.reset_default_graph()
self._model = Tacotron2(self.checkpoint_fpath, hparams)
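A small sketch of the resulting lazy-loading contract (the checkpoint directory is hypothetical):

```python
from pathlib import Path
from synthesizer.inference import Synthesizer

syn = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
assert not syn.is_loaded()  # the constructor only locates the checkpoint
syn.load()                  # builds the graph and restores the weights (disallowed in low-mem mode)
assert syn.is_loaded()
```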
def synthesize_spectrograms(self, texts: List[str],
embeddings: Union[np.ndarray, List[np.ndarray]],
return_alignments=False):
"""
Synthesizes mel spectrograms from texts and speaker embeddings.
def synthesize_spectrograms(texts: List[str], embeddings: np.ndarray, return_alignments=False):
"""
Synthesizes mel spectrograms from texts and speaker embeddings.
:param texts: a list of N text prompts to be synthesized
:param embeddings: a numpy array of (N, 256) speaker embeddings
:param return_alignments: if True, a matrix representing the alignments between the characters
and each decoder output step will be returned for each spectrogram
:return: a list of N melspectrograms as numpy arrays of shape (80, M), and possibly the
alignments.
"""
if not is_loaded():
raise Exception("Load a model first")
specs, alignments = _model.my_synthesize(embeddings, texts)
:param texts: a list of N text prompts to be synthesized
:param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
:param return_alignments: if True, a matrix representing the alignments between the
characters and each decoder output step will be returned for each spectrogram
:return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
sequence length of spectrogram i, and possibly the alignments.
"""
if not self._low_mem:
# Usual inference mode: load the model on the first request and keep it loaded.
if not self.is_loaded():
self.load()
specs, alignments = self._model.my_synthesize(embeddings, texts)
else:
# Low memory inference mode: load the model upon every request. The model has to be
# loaded in a separate process to be able to release GPU memory (a simple workaround
# for tensorflow's intricacies)
specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms,
[(self.checkpoint_fpath, embeddings, texts)])[0]
if return_alignments:
return (specs, alignments) if return_alignments else specs
@staticmethod
def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
# Load the model and forward the inputs
tf.reset_default_graph()
model = Tacotron2(checkpoint_fpath, hparams)
specs, alignments = model.my_synthesize(embeddings, texts)
# Detach the outputs (not doing so will cause the process to hang)
specs, alignments = [spec.copy() for spec in specs], alignments.copy()
# Close cuda for this process
model.session.close()
numba.cuda.select_device(0)
numba.cuda.close()
return specs, alignments
else:
return specs
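The subprocess trick is not specific to this codebase; below is a minimal, framework-agnostic sketch of the same idea (the job function and payload are hypothetical stand-ins):

```python
from multiprocess.pool import Pool  # the same library used above; multiprocessing also works

def _gpu_job(payload):
    # Stand-in for "load a model and run inference". Return plain copies
    # (e.g. numpy arrays) so the result holds no reference to GPU state.
    return payload * 2

def run_and_release(payload):
    # Single-worker pool: when the pool is torn down, the worker process
    # exits and the OS reclaims all of its GPU memory.
    with Pool(1) as pool:
        return pool.apply(_gpu_job, (payload,))

print(run_and_release(21))  # 42
```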
def load_preprocess_wav(fpath):
wav = librosa.load(fpath, hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
return wav
@staticmethod
def load_preprocess_wav(fpath):
"""
Loads and preprocesses an audio file under the same conditions as the audio files that
were used to train the synthesizer.
"""
wav = librosa.load(fpath, hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
return wav
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav = load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav
@staticmethod
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
"""
Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms
that were fed to the synthesizer during training.
"""
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
return mel_spectrogram
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
return mel_spectrogram
def griffin_lim(mel):
return audio.inv_mel_spectrogram(mel, hparams)
@staticmethod
def griffin_lim(mel):
"""
Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been
built with the parameters present in hparams.py.
"""
return audio.inv_mel_spectrogram(mel, hparams)
\ No newline at end of file
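A quick round trip through these static helpers (the input file is hypothetical):

```python
from synthesizer.inference import Synthesizer

wav = Synthesizer.load_preprocess_wav("speaker.wav")  # hypothetical audio file
mel = Synthesizer.make_spectrogram(wav)               # (80, M) float32 mel spectrogram
approx = Synthesizer.griffin_lim(mel)                 # rough waveform without a vocoder
```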
from synthesizer.synthesizer import Synthesizer
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams_debug_string
from synthesizer.infolog import log
import tensorflow as tf
......@@ -18,8 +18,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams)
synth = Tacotron2(checkpoint_path, hparams)
#Set inputs batch wise
sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i
......@@ -44,10 +43,9 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):
print(hparams_debug_string())
# Load the model in memory
synth = Synthesizer()
weights_dir = os.path.join(model_dir, "taco_pretrained")
checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
synth.load(checkpoint_fpath, hparams, gta=True)
synth = Tacotron2(checkpoint_fpath, hparams, gta=True)
# Load the metadata
with open(metadata_filename, encoding="utf-8") as f:
......
from synthesizer.utils.text import text_to_sequence
from synthesizer import audio
from synthesizer.infolog import log
from synthesizer.models import create_model
from synthesizer.utils import plot
from synthesizer import audio
import tensorflow as tf
import numpy as np
import wave
import os
class Synthesizer:
def load(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
class Tacotron2:
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
log("Constructing model: %s" % model_name)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
......@@ -65,7 +64,7 @@ class Synthesizer:
def my_synthesize(self, speaker_embeds, texts):
"""
Lighter synthesis function that directly returns the mel spectrogram.
Lighter synthesis function that directly returns the mel spectrograms.
"""
# Prepare the input
......
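With the rename, the low-level class can also be driven directly. A hedged sketch (the checkpoint path is hypothetical, and the embedding is a dummy of the documented size 256):

```python
import numpy as np
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams

ckpt = "synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-0"  # hypothetical
model = Tacotron2(ckpt, hparams)
specs, alignments = model.my_synthesize(np.zeros((1, 256), dtype=np.float32),
                                        ["A test sentence."])
```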
from toolbox.ui import UI
from encoder import inference as encoder
from synthesizer import inference as synthesizer
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder
from pathlib import Path
from time import perf_counter as timer
......@@ -28,12 +28,15 @@ recognized_datasets = [
]
class Toolbox:
def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir):
def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, low_mem):
sys.excepthook = self.excepthook
self.datasets_root = datasets_root
self.low_mem = low_mem
self.utterances = set()
self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
self.synthesizer = None # type: Synthesizer
# Initialize the events and the interface
self.ui = UI()
self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir)
......@@ -58,7 +61,9 @@ class Toolbox:
# Model selection
self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder)
self.ui.synthesizer_box.currentIndexChanged.connect(self.init_synthesizer)
def func():
self.synthesizer = None
self.ui.synthesizer_box.currentIndexChanged.connect(func)
self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder)
# Utterance selection
......@@ -66,7 +71,7 @@ class Toolbox:
self.ui.browser_browse_button.clicked.connect(func)
func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
self.ui.utterance_history.currentIndexChanged.connect(func)
func = lambda: self.ui.play(self.ui.selected_utterance.wav, synthesizer.sample_rate)
func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
self.ui.play_button.clicked.connect(func)
self.ui.stop_button.clicked.connect(self.ui.stop)
self.ui.record_button.clicked.connect(self.record)
......@@ -102,7 +107,7 @@ class Toolbox:
# Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
# playback, so as to have a fair comparison with the generated audio
wav = synthesizer.load_preprocess_wav(fpath)
wav = Synthesizer.load_preprocess_wav(fpath)
self.ui.log("Loaded %s" % name)
self.add_real_utterance(wav, name, speaker_name)
......@@ -119,7 +124,7 @@ class Toolbox:
def add_real_utterance(self, wav, name, speaker_name):
# Compute the mel spectrogram
spec = synthesizer.make_spectrogram(wav)
spec = Synthesizer.make_spectrogram(wav)
self.ui.draw_spec(spec, "current")
# Compute the embedding
......@@ -142,21 +147,28 @@ class Toolbox:
self.ui.draw_umap_projections(self.utterances)
def synthesize(self):
# Synthesize the spectrogram
if not synthesizer.is_loaded():
self.init_synthesizer()
self.ui.log("Generating the mel spectrogram...")
self.ui.set_loading(1)
# Synthesize the spectrogram
if self.synthesizer is None:
model_dir = self.ui.current_synthesizer_model_dir
checkpoints_dir = model_dir.joinpath("taco_pretrained")
self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem)
if not self.synthesizer.is_loaded():
self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)
texts = self.ui.text_prompt.toPlainText().split("\n")
embed = self.ui.selected_utterance.embed
embeds = np.stack([embed] * len(texts))
specs = synthesizer.synthesize_spectrograms(texts, embeds)
specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
breaks = [spec.shape[1] for spec in specs]
spec = np.concatenate(specs, axis=1)
self.ui.draw_spec(spec, "generated")
self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
self.ui.set_loading(0)
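The toolbox reuses a single speaker embedding for every prompt in the text box; a tiny shape sketch (the embedding is random, purely for illustration):

```python
import numpy as np

embed = np.random.rand(256).astype(np.float32)  # stand-in for a real speaker embedding
texts = ["First line.", "Second line."]
embeds = np.stack([embed] * len(texts))         # (2, 256): one row per prompt
```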
def vocode(self):
speaker_name, spec, breaks, _ = self.current_generated
assert spec is not None
......@@ -165,7 +177,7 @@ class Toolbox:
if not vocoder.is_loaded():
self.init_vocoder()
def vocoder_progress(i, seq_len, b_size, gen_rate):
real_time_factor = (gen_rate / synthesizer.hparams.sample_rate) * 1000
real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
% (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
self.ui.log(line, "overwrite")
......@@ -175,20 +187,20 @@ class Toolbox:
wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
else:
self.ui.log("Waveform generation with Griffin-Lim... ")
wav = synthesizer.griffin_lim(spec)
wav = Synthesizer.griffin_lim(spec)
self.ui.set_loading(0)
self.ui.log(" Done!", "append")
# Add breaks
b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
breaks = [np.zeros(int(0.15 * synthesizer.hparams.sample_rate))] * len(breaks)
breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
# Play it
wav = wav / np.abs(wav).max() * 0.97
self.ui.play(wav, synthesizer.sample_rate)
self.ui.play(wav, Synthesizer.sample_rate)
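A worked example of the break bookkeeping above (the hop_size and sample_rate values are assumptions, for illustration only):

```python
import numpy as np

hop_size, sample_rate = 200, 16000               # assumed hparams values
breaks = [100, 120]                              # spectrogram frames per prompt
b_ends = np.cumsum(np.array(breaks) * hop_size)  # [20000, 44000]: chunk end offsets in samples
b_starts = np.concatenate(([0], b_ends[:-1]))    # [0, 20000]
wav = np.zeros(b_ends[-1], dtype=np.float32)     # stand-in for the vocoded waveform
wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
silences = [np.zeros(int(0.15 * sample_rate))] * len(breaks)  # 0.15 s pause per chunk
wav_out = np.concatenate([x for w, s in zip(wavs, silences) for x in (w, s)])
```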
# Compute the embedding
# TODO: this is problematic with different sampling rates, gotta fix it
......@@ -206,7 +218,6 @@ class Toolbox:
self.ui.draw_embed(embed, name, "generated")
self.ui.draw_umap_projections(self.utterances)
def init_encoder(self):
model_fpath = self.ui.current_encoder_fpath
......@@ -216,18 +227,7 @@ class Toolbox:
encoder.load_model(model_fpath)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)
def init_synthesizer(self):
model_dir = self.ui.current_synthesizer_model_dir
checkpoints_dir = model_dir.joinpath("taco_pretrained")
self.ui.log("Loading the synthesizer %s... " % checkpoints_dir)
self.ui.set_loading(1)
start = timer()
synthesizer.load_model(checkpoints_dir)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)
def init_vocoder(self):
model_fpath = self.ui.current_vocoder_fpath
# Case of Griffin-lim
......