Commit 62a887f7 authored by Corentin Jemine

Backup of the encoder (1M steps)

Parent 9692bea9
from synthesizer.datasets.audio import inv_mel_spectrogram
from synthesizer.audio import inv_mel_spectrogram
from synthesizer.hparams import hparams
from synthesizer import synthesizer
import sounddevice as sd
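For context, a minimal sketch of how the relocated helper is typically driven. This is not part of the commit: the mel file name is hypothetical, and it assumes inv_mel_spectrogram(mel, hparams) returns a float waveform at hparams.sample_rate.

# Sketch only: invert a saved mel prediction and play it back.
import numpy as np
mel = np.load("example_mel.npy")                  # hypothetical (num_mels, T) prediction
wav = inv_mel_spectrogram(mel, hparams)           # waveform as a float array
sd.play(wav, hparams.sample_rate, blocking=True)  # assumes hparams.sample_rate is defined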
......
@@ -3,11 +3,11 @@ import matplotlib.pyplot as plt
import librosa.display
import librosa
import numpy as np
import sounddevice
import webrtcvad
import struct
from encoder.params_data import *
int16_max = (2 ** 15) - 1
def load(fpath):
@@ -121,16 +121,16 @@ def plot_mel_filterbank(frames):
plt.tight_layout()
plt.show()
def play_wave(wav, blocking=False):
sounddevice.stop()
sounddevice.play(wav, sampling_rate, blocking=blocking)
def rec_wave(duration, blocking=True, verbose=True):
if verbose:
print("Recording %d seconds of audio" % duration)
wav = sounddevice.rec(duration * sampling_rate, sampling_rate, 1)
if blocking:
sounddevice.wait()
if verbose:
print("Done recording!")
return wav.squeeze()
# def play_wave(wav, blocking=False):
# sounddevice.stop()
# sounddevice.play(wav, sampling_rate, blocking=blocking)
#
# def rec_wave(duration, blocking=True, verbose=True):
# if verbose:
# print("Recording %d seconds of audio" % duration)
# wav = sounddevice.rec(duration * sampling_rate, sampling_rate, 1)
# if blocking:
# sounddevice.wait()
# if verbose:
# print("Done recording!")
# return wav.squeeze()
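Commenting out play_wave and rec_wave drops this module's hard dependency on sounddevice. If interactive playback and recording are still wanted, one hedged alternative (not part of this commit) is to keep the helpers but import sounddevice lazily, so the module still loads on machines without a working audio backend; sampling_rate is assumed to come from the wildcard import of encoder.params_data shown above.

# Sketch only: same helpers with a lazy sounddevice import.
def play_wave_lazy(wav, blocking=False):
    import sounddevice                      # imported only when playback is requested
    sounddevice.stop()
    sounddevice.play(wav, sampling_rate, blocking=blocking)

def rec_wave_lazy(duration, blocking=True, verbose=True):
    import sounddevice
    if verbose:
        print("Recording %d seconds of audio" % duration)
    # sounddevice.rec expects an integer frame count
    wav = sounddevice.rec(int(duration * sampling_rate), sampling_rate, 1)
    if blocking:
        sounddevice.wait()
    if verbose:
        print("Done recording!")
    return wav.squeeze()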
@@ -9,6 +9,7 @@ from matplotlib import cm
_model = None # type: SpeakerEncoder
_device = None # type: torch.device
def load_model(weights_fpath, device=None):
"""
Loads the model in memory. If this function is not explicitly called, it will be run on the
......
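For reference, the minimal way this loader is used downstream; the checkpoint path below is hypothetical, not taken from this diff.

# Sketch only: load the speaker-encoder weights once before computing any embeddings;
# load_model() populates the module-level _model and _device globals shown above.
from pathlib import Path
from encoder import inference

inference.load_model(Path("encoder_weights.pt"))   # hypothetical checkpoint path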
@@ -2,7 +2,7 @@ from synthesizer.datasets import audio
from multiprocessing.pool import Pool
from functools import partial
from itertools import chain
from encoder import inference as speaker_encoder
# from encoder import inference as speaker_encoder
from pathlib import Path
from tqdm import tqdm
import numpy as np
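Commenting out the speaker-encoder import decouples the synthesizer preprocessing from the encoder package. A hedged alternative (not in this commit) is to keep the import optional rather than disabled:

# Sketch only: fall back gracefully when the encoder package is unavailable.
try:
    from encoder import inference as speaker_encoder
except ImportError:
    speaker_encoder = None   # embeddings are skipped when the encoder cannot be imported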
@@ -32,7 +32,7 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, wav_out_dir: Path
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
func = partial(preprocess_speaker, mel_out_dir=mel_out_dir, wav_out_dir=wav_out_dir,
skip_existing=skip_existing, hparams=hparams)
job = Pool().imap(func, speaker_dirs)
job = Pool(1).imap(func, speaker_dirs)
for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"):
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
......
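Switching Pool() to Pool(1) runs the LibriSpeech preprocessing in a single worker process, which simplifies debugging (especially with the speaker-encoder import disabled above) at the cost of parallel throughput. A hedged sketch of exposing that choice instead of hard-coding it; n_processes and map_speakers are hypothetical names, not in this diff.

# Sketch only: make the worker count a parameter so switching between serial and
# parallel preprocessing is a one-line change at the call site.
def map_speakers(func, speaker_dirs, n_processes=1):
    pool = Pool(n_processes)
    job = pool.imap(func, speaker_dirs)
    results = []
    for speaker_metadata in tqdm(job, "LibriSpeech", len(speaker_dirs), unit="speakers"):
        results.extend(speaker_metadata)
    pool.close()
    pool.join()
    return results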