Commit 4585864f authored by Corentin Jemine

Fixed imports for tacotron

Parent 9a2f1ddd
@@ -2,6 +2,7 @@ from time import perf_counter
 from encoder import inference
+from encoder.params_data import sampling_rate
 from pathlib import Path
 import numpy as np
 import torch

 if __name__ == '__main__':
@@ -28,4 +29,5 @@ if __name__ == '__main__':
     torch.cuda.synchronize()
     print("Processed %.2fs long utterance in %.2fs" % (duration, perf_counter() - start))
+    np.set_printoptions(precision=2, suppress=True)
     print(embed)
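A note on the timing hunk above: CUDA kernels launch asynchronously, so calling torch.cuda.synchronize() before reading the clock is what makes the printed duration honest. The same pattern as a reusable helper (a sketch; any GPU-side function works as fn):

    from time import perf_counter
    import torch

    def time_gpu_work(fn, *args):
        # Bracket the work with synchronize() so perf_counter() measures actual
        # GPU execution rather than just the asynchronous kernel launches.
        torch.cuda.synchronize()
        start = perf_counter()
        result = fn(*args)
        torch.cuda.synchronize()
        return result, perf_counter() - start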
......
-from datasets.audio import inv_mel_spectrogram
-from tacotron import synthesizer
+from synthesizer.datasets.audio import inv_mel_spectrogram
 from synthesizer.hparams import hparams
+from synthesizer import synthesizer
 from vlibs import fileio
 import sounddevice as sd
 import tensorflow as tf
 import numpy as np
 import sys
-sys.path.append('../wave-rnn')
-from vocoder import inference as vocoder
 import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-sys.path.append('../encoder')
+from vocoder import inference as vocoder
 from encoder import inference as encoder

-encoder.load_model('../encoder/saved_models/all.pt', 'cuda')
+encoder.load_model('SV2TTS/encoder/saved_models/all.pt')
 vocoder.load_model('../wave-rnn/checkpoints/mu_law.pt')
......
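With the sys.path hacks gone, the demo script resolves everything as installed packages. Pieced together only from calls visible in this commit, the intended flow is roughly as follows (a sketch; the Tacotron synthesis step in the middle is not shown in this diff, so it is left as a comment):

    import numpy as np
    from encoder import inference as encoder
    from synthesizer.datasets.audio import inv_mel_spectrogram
    from synthesizer.hparams import hparams

    encoder.load_model('SV2TTS/encoder/saved_models/all.pt')
    wav = np.random.randn(16000).astype(np.float32)   # stand-in for a real utterance
    embed = encoder.embed_utterance(wav)              # speaker embedding for conditioning
    # mel = ...  (the synthesizer call that turns text + embed into a mel is not in this diff)
    # wav_out = inv_mel_spectrogram(mel, hparams)     # Griffin-Lim inversion back to audio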
@@ -13,9 +13,9 @@ int16_max = (2 ** 15) - 1
 def load(fpath):
     return librosa.load(fpath, sr=sampling_rate)[0]

-def wave_to_mel_filterbank(wave):
+def wav_to_mel_filterbank(wav):
     frames = librosa.feature.melspectrogram(
-        wave,
+        wav,
         sampling_rate,
         n_fft=int(sampling_rate * mel_window_length / 1000),
         hop_length=int(sampling_rate * mel_window_step / 1000),
@@ -23,13 +23,13 @@ def wave_to_mel_filterbank(wave):
     )
     return frames.astype(np.float32).transpose()

-def trim_long_silences(wave):
+def trim_long_silences(wav):
     """
     Ensures that segments without voice in the waveform remain no longer than a
     threshold determined by the VAD parameters in params.py.
-    :param wave: the raw waveform as a numpy array of floats
-    :return: the same waveform with silences trimmed away (length <= original wave length)
+    :param wav: the raw waveform as a numpy array of floats
+    :return: the same waveform with silences trimmed away (length <= original wav length)
     """
     # import matplotlib.pyplot as plt
@@ -37,17 +37,17 @@ def trim_long_silences(wave):
     samples_per_window = (vad_window_length * sampling_rate) // 1000

     # Trim the end of the audio to have a multiple of the window size
-    wave = wave[:len(wave) - (len(wave) % samples_per_window)]
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
     # plt.subplot(611)
-    # plt.plot(wave)
+    # plt.plot(wav)

     # Convert the float waveform to 16-bit mono PCM
-    pcm_wave = struct.pack("%dh" % len(wave), *(np.round(wave * int16_max)).astype(np.int16))
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

     # Perform voice activation detection
     voice_flags = []
     vad = webrtcvad.Vad(mode=3)
-    for window_start in range(0, len(wave), samples_per_window):
+    for window_start in range(0, len(wav), samples_per_window):
         window_end = window_start + samples_per_window
         voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                          sample_rate=sampling_rate))
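The lines elided between this hunk and the next (old lines 54-73) are where the raw voice_flags get turned into the audio_mask used below, i.e. some form of smoothing of the binary flags so that brief dips inside speech are not cut. A simplified sketch of one common way to do it (moving average, then re-binarize; the width parameter is illustrative, not taken from params.py):

    import numpy as np

    def smooth_voice_flags(voice_flags, width=7):
        # Average each binary flag with its neighbours, then threshold back to
        # booleans: brief dips inside speech rise above 0.5 and stay voiced.
        flags = np.asarray(voice_flags, dtype=np.float32)
        kernel = np.ones(width, dtype=np.float32) / width
        return np.convolve(flags, kernel, mode='same') > 0.5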
@@ -74,38 +74,38 @@ def trim_long_silences(wave):
     # Trim away the long silences in the audio
     audio_mask = np.repeat(audio_mask, samples_per_window)
     # plt.subplot(615)
-    # plt.plot(wave)
+    # plt.plot(wav)
     # plt.plot(audio_mask * 10000)
-    wave = wave[audio_mask == True]
+    wav = wav[audio_mask == True]
     # plt.subplot(616)
-    # plt.plot(wave)
-    # play_wave(wave)
+    # plt.plot(wav)
+    # play_wave(wav)
     # plt.show()
-    return wave
+    return wav

-def normalize_volume(wave, target_dBFS, increase_only=False, decrease_only=False):
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
     if increase_only and decrease_only:
         raise ValueError("Both increase only and decrease only are set")
-    rms = np.sqrt(np.mean((wave * int16_max) ** 2))
+    rms = np.sqrt(np.mean((wav * int16_max) ** 2))
     wave_dBFS = 20 * np.log10(rms / int16_max)
     dBFS_change = target_dBFS - wave_dBFS
     if dBFS_change < 0 and increase_only or dBFS_change > 0 and decrease_only:
-        return wave
-    return wave * (10 ** (dBFS_change / 20))
+        return wav
+    return wav * (10 ** (dBFS_change / 20))

-def preprocess_wave(wave):
+def preprocess_wave(wav):
     """
     This is the standard routine that should be used on every audio file before being used in
     this project.
     """
-    wave = normalize_volume(wave, audio_norm_target_dBFS, increase_only=True)
-    wave = trim_long_silences(wave)
-    return wave
+    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+    wav = trim_long_silences(wav)
+    return wav

-def plot_wave(wave):
-    plt.plot(wave)
+def plot_wave(wav):
+    plt.plot(wav)
     plt.show()

 def plot_mel_filterbank(frames):
@@ -121,16 +121,16 @@ def plot_mel_filterbank(frames):
     plt.tight_layout()
     plt.show()

-def play_wave(wave, blocking=False):
+def play_wave(wav, blocking=False):
     sounddevice.stop()
-    sounddevice.play(wave, sampling_rate, blocking=blocking)
+    sounddevice.play(wav, sampling_rate, blocking=blocking)

 def rec_wave(duration, blocking=True, verbose=True):
     if verbose:
         print('Recording %d seconds of audio' % duration)
-    wave = sounddevice.rec(duration * sampling_rate, sampling_rate, 1)
+    wav = sounddevice.rec(duration * sampling_rate, sampling_rate, 1)
     if blocking:
         sounddevice.wait()
     if verbose:
         print('Done recording!')
-    return wave.squeeze()
+    return wav.squeeze()
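The dBFS arithmetic in normalize_volume above is easy to sanity-check: 0 dBFS corresponds to RMS at full scale, and each factor of 2 in amplitude is about 6 dB. A minimal check, assuming a float waveform in [-1, 1]:

    import numpy as np

    int16_max = (2 ** 15) - 1

    # A 0.5-amplitude sine has RMS 0.5 / sqrt(2) ~ 0.354, i.e. about -9 dBFS.
    t = np.linspace(0, 1, 16000, endpoint=False)
    wav = 0.5 * np.sin(2 * np.pi * 440 * t)
    rms = np.sqrt(np.mean((wav * int16_max) ** 2))
    print(20 * np.log10(rms / int16_max))   # ~ -9.0
    # Raising it to a -6 dBFS target multiplies by 10 ** ((-6 + 9) / 20) ~ 1.41.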
@@ -116,7 +116,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
     """
     # Process the entire utterance if not using partials
     if not using_partials:
-        frames = audio.wave_to_mel_filterbank(wav)
+        frames = audio.wav_to_mel_filterbank(wav)
         embed = embed_frames_batch(frames[None, ...])[0]
         if return_partials:
             return embed, None, None
@@ -129,7 +129,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
     wav = np.pad(wav, (0, max_wave_length - len(wav)), 'constant')

     # Split the utterance into partials
-    frames = audio.wave_to_mel_filterbank(wav)
+    frames = audio.wav_to_mel_filterbank(wav)
     frames_batch = np.array([frames[s] for s in mel_slices])
     partial_embeds = embed_frames_batch(frames_batch)
......
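embed_utterance splits the utterance into overlapping partials via the precomputed mel_slices, padding the waveform first so the last partial is complete (the np.pad line above). The slice computation itself is outside this diff; a simplified sketch of the idea, with illustrative sizes (160 frames is about 1.6 s at a 10 ms hop):

    def partial_slices(n_frames, partial_frames=160, overlap=0.5):
        # Fixed-size windows over the mel frames; step sets the overlap, and the
        # last window is re-aligned to the end so no frames are dropped.
        step = max(1, int(partial_frames * (1 - overlap)))
        starts = list(range(0, max(1, n_frames - partial_frames + 1), step))
        if starts[-1] + partial_frames < n_frames:
            starts.append(n_frames - partial_frames)
        return [slice(s, s + partial_frames) for s in starts]

    # e.g. frames_batch = np.array([frames[s] for s in partial_slices(len(frames))])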
@@ -98,7 +98,7 @@ def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
             continue

         # Create the mel spectrogram, discard those that are too short
-        frames = audio.wave_to_mel_filterbank(wav)
+        frames = audio.wav_to_mel_filterbank(wav)
         if len(frames) < partials_n_frames:
             continue
......
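_preprocess_speaker_dirs above is the batch counterpart of the single-file chain in the audio module; typical use of the renamed helpers on one file, assuming the module is importable as encoder.audio (the import and the path are hypothetical):

    from encoder import audio  # assuming the file above lives at encoder/audio.py

    wav = audio.load('LibriSpeech/train-clean-100/19/198/19-198-0001.flac')  # hypothetical path
    wav = audio.preprocess_wave(wav)             # volume normalization, then VAD trimming
    frames = audio.wav_to_mel_filterbank(wav)    # float32 mel frames, shape (n_frames, n_mels)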
-import os
+from synthesizer.datasets import audio
+from encoder import inference
 from vlibs import fileio
 import numpy as np
-from datasets import audio
-import sys
-sys.path.append('../encoder')
-encoder_model_fpath = '../encoder/saved_models/all.pt'
-from encoder import inference
+import os
+encoder_model_fpath = 'SV2TTS/encoder/saved_models/all.pt'

 def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
     """
......
-import os
+from sklearn.model_selection import train_test_split
+from synthesizer.utils.text import text_to_sequence
+from synthesizer.infolog import log
+import tensorflow as tf
+import numpy as np
 import threading
 import time
-import numpy as np
-import tensorflow as tf
-from synthesizer.infolog import log
-from sklearn.model_selection import train_test_split
-from tacotron.utils.text import text_to_sequence
+import os

 _batches_per_group = 64
......
-import numpy as np
 import tensorflow as tf
+import numpy as np

 # Default hyperparameters
 hparams = tf.contrib.training.HParams(
......
@@ -3,7 +3,7 @@ All notations and variable names were used in concordance with originial tensorf
 """
 import collections
 import tensorflow as tf
-from tacotron.models.attention import _compute_attention
+from synthesizer.models.attention import _compute_attention
 from tensorflow.contrib.rnn import RNNCell
 from tensorflow.python.framework import ops, tensor_shape
 from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops
......
 from __future__ import absolute_import, division, print_function

 import collections
 import tensorflow as tf
-from tacotron.models.helpers import TacoTestHelper, TacoTrainingHelper
+from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.python.framework import ops, tensor_shape
......
 import tensorflow as tf
-from tacotron.utils.symbols import symbols
+from synthesizer.utils.symbols import symbols
 from synthesizer.infolog import log
-from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper
-from tacotron.models.modules import *
+from synthesizer.models.helpers import TacoTrainingHelper, TacoTestHelper
+from synthesizer.models.modules import *
 from tensorflow.contrib.seq2seq import dynamic_decode
-from tacotron.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
-from tacotron.models.custom_decoder import CustomDecoder
-from tacotron.models.attention import LocationSensitiveAttention
+from synthesizer.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
+from synthesizer.models.custom_decoder import CustomDecoder
+from synthesizer.models.attention import LocationSensitiveAttention
 import numpy as np
......
-import os
-import time
-from time import sleep
-import tensorflow as tf
+from synthesizer.synthesizer import Synthesizer
 from synthesizer.hparams import hparams_debug_string
 from synthesizer.infolog import log
-from tacotron.synthesizer import Synthesizer
+import tensorflow as tf
+from time import sleep
 from tqdm import tqdm
+import time
+import os

 def generate_fast(model, text):
......
-import os
-import wave
+from synthesizer.utils.text import text_to_sequence
+from synthesizer.datasets import audio
+from synthesizer.infolog import log
+from synthesizer.models import create_model
+from synthesizer.utils import plot
+import tensorflow as tf
 import numpy as np
 import pyaudio
-import tensorflow as tf
-from datasets import audio
-from synthesizer.infolog import log
-from tacotron.models import create_model
-from tacotron.utils import plot
-from tacotron.utils.text import text_to_sequence
+import wave
+import os

 class Synthesizer:
......
-import os
-import time
-import traceback
-from datetime import datetime
-from synthesizer import infolog
-import numpy as np
-import tensorflow as tf
-from datasets import audio
+from synthesizer.utils.symbols import symbols
+from synthesizer.utils.text import sequence_to_text
+from synthesizer.datasets import audio
 from synthesizer.hparams import hparams_debug_string
-from tacotron.feeder import Feeder
-from tacotron.models import create_model
-from tacotron.utils import ValueWindow, plot
-from tacotron.utils.text import sequence_to_text
-from tacotron.utils.symbols import symbols
+from synthesizer.feeder import Feeder
+from synthesizer.models import create_model
+from synthesizer.utils import ValueWindow, plot
+from synthesizer import infolog
+from datetime import datetime
 from tqdm import tqdm
+import tensorflow as tf
+import numpy as np
+import traceback
+import time
+import os

 log = infolog.log
......
@@ -11,9 +11,7 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
 '''
 import re
-from unidecode import unidecode
-from .numbers import normalize_numbers

 # Regular expression matching whitespace:
......
 import re
 import inflect

 _inflect = inflect.engine()
......
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import numpy as np
......
-import re
-from . import cleaners
 from .symbols import symbols
+from . import cleaners
+import re

 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
......
-import argparse
-import os
 from multiprocessing import cpu_count
-from datasets import preprocessor
+from synthesizer.datasets import preprocessor
 from synthesizer.hparams import hparams
 from vlibs import fileio
+import argparse
+import os

 def preprocess(args, input_folders, out_dir, hparams):
@@ -53,7 +52,6 @@ def main():
     parser.add_argument('--hparams', default='',
                         help='Hyperparameter overrides as a comma-separated list of name=value pairs')
     parser.add_argument('--output', default='Synthesizer')
-    parser.add_argument('--n_jobs', type=int, default=cpu_count())
     # Name of the LibriTTS sets to use, separated by spaces
     # (e.g. "--sets train-other-500 train-clean-360). Defaults to using all the clean training sets
......
@@ -5,8 +5,8 @@ from synthesizer import infolog
 import tensorflow as tf
 from synthesizer.hparams import hparams
 from synthesizer.infolog import log
-from tacotron.synthesize import tacotron_synthesize
-from tacotron.train import tacotron_train
+from synthesizer.synthesize import tacotron_synthesize
+from synthesizer.train import tacotron_train

 log = infolog.log
......
@@ -16,11 +16,8 @@ TODO:
 - Proper versions in requirements.txt (do something different for inference and training?)
 - Clean up the rest of the code (' to ")
 - Move root of repo to SV2TTS (the rest isn't necessary)
-- Put on github (INCLUDE VLIBS)
+- Put on github (RECHECK ALL REQUIREMENTS (+VLIBS))
 - Make demo website
 - Write something

 Some noisy speakers: 40,
-- moved files
-- removed old saved models for encoder
\ No newline at end of file