Commit 4585864f authored by Corentin Jemine

Fixed imports for tacotron

Parent 9a2f1ddd
@@ -2,6 +2,7 @@ from time import perf_counter
 from encoder import inference
+from encoder.params_data import sampling_rate
 from pathlib import Path
 import numpy as np
 import torch

 if __name__ == '__main__':
@@ -28,4 +29,5 @@ if __name__ == '__main__':
     torch.cuda.synchronize()
     print("Processed %.2fs long utterance in %.2fs" % (duration, perf_counter() - start))
+    np.set_printoptions(precision=2, suppress=True)
     print(embed)
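A note on the timing hunk above: CUDA kernels launch asynchronously, so calling torch.cuda.synchronize() before reading the clock is what makes the printed duration honest. The same pattern as a reusable helper (a sketch; any GPU-side function works as fn):

    from time import perf_counter
    import torch

    def time_gpu_work(fn, *args):
        # Bracket the work with synchronize() so perf_counter() measures actual
        # GPU execution rather than just the asynchronous kernel launches.
        torch.cuda.synchronize()
        start = perf_counter()
        result = fn(*args)
        torch.cuda.synchronize()
        return result, perf_counter() - start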
......
-from datasets.audio import inv_mel_spectrogram
-from tacotron import synthesizer
+from synthesizer.datasets.audio import inv_mel_spectrogram
 from synthesizer.hparams import hparams
+from synthesizer import synthesizer
 from vlibs import fileio
 import sounddevice as sd
 import tensorflow as tf
 import numpy as np
 import sys
-sys.path.append('../wave-rnn')
-from vocoder import inference as vocoder
 import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-sys.path.append('../encoder')
+from vocoder import inference as vocoder
 from encoder import inference as encoder

-encoder.load_model('../encoder/saved_models/all.pt', 'cuda')
+encoder.load_model('SV2TTS/encoder/saved_models/all.pt')
 vocoder.load_model('../wave-rnn/checkpoints/mu_law.pt')
......
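With the sys.path hacks gone, the demo script resolves everything as installed packages. Pieced together only from calls visible in this commit, the intended flow is roughly as follows (a sketch; the Tacotron synthesis step in the middle is not shown in this diff, so it is left as a comment):

    import numpy as np
    from encoder import inference as encoder
    from synthesizer.datasets.audio import inv_mel_spectrogram
    from synthesizer.hparams import hparams

    encoder.load_model('SV2TTS/encoder/saved_models/all.pt')
    wav = np.random.randn(16000).astype(np.float32)   # stand-in for a real utterance
    embed = encoder.embed_utterance(wav)              # speaker embedding for conditioning
    # mel = ...  (the synthesizer call that turns text + embed into a mel is not in this diff)
    # wav_out = inv_mel_spectrogram(mel, hparams)     # Griffin-Lim inversion back to audio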
@@ -13,9 +13,9 @@ int16_max = (2 ** 15) - 1
 def load(fpath):
     return librosa.load(fpath, sr=sampling_rate)[0]

-def wave_to_mel_filterbank(wave):
+def wav_to_mel_filterbank(wav):
     frames = librosa.feature.melspectrogram(
-        wave,
+        wav,
         sampling_rate,
         n_fft=int(sampling_rate * mel_window_length / 1000),
         hop_length=int(sampling_rate * mel_window_step / 1000),
@@ -23,13 +23,13 @@ def wave_to_mel_filterbank(wave):
     )
     return frames.astype(np.float32).transpose()

-def trim_long_silences(wave):
+def trim_long_silences(wav):
     """
     Ensures that segments without voice in the waveform remain no longer than a
     threshold determined by the VAD parameters in params.py.
-    :param wave: the raw waveform as a numpy array of floats
-    :return: the same waveform with silences trimmed away (length <= original wave length)
+    :param wav: the raw waveform as a numpy array of floats
+    :return: the same waveform with silences trimmed away (length <= original wav length)
     """
     # import matplotlib.pyplot as plt
@@ -37,17 +37,17 @@ def trim_long_silences(wave):
     samples_per_window = (vad_window_length * sampling_rate) // 1000

     # Trim the end of the audio to have a multiple of the window size
-    wave = wave[:len(wave) - (len(wave) % samples_per_window)]
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
     # plt.subplot(611)
-    # plt.plot(wave)
+    # plt.plot(wav)

     # Convert the float waveform to 16-bit mono PCM
-    pcm_wave = struct.pack("%dh" % len(wave), *(np.round(wave * int16_max)).astype(np.int16))
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

     # Perform voice activation detection
     voice_flags = []
     vad = webrtcvad.Vad(mode=3)
-    for window_start in range(0, len(wave), samples_per_window):
+    for window_start in range(0, len(wav), samples_per_window):
         window_end = window_start + samples_per_window
         voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                          sample_rate=sampling_rate))
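The lines elided between this hunk and the next (old lines 54-73) are where the raw voice_flags get turned into the audio_mask used below, i.e. some form of smoothing of the binary flags so that brief dips inside speech are not cut. A simplified sketch of one common way to do it (moving average, then re-binarize; the width parameter is illustrative, not taken from params.py):

    import numpy as np

    def smooth_voice_flags(voice_flags, width=7):
        # Average each binary flag with its neighbours, then threshold back to
        # booleans: brief dips inside speech rise above 0.5 and stay voiced.
        flags = np.asarray(voice_flags, dtype=np.float32)
        kernel = np.ones(width, dtype=np.float32) / width
        return np.convolve(flags, kernel, mode='same') > 0.5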
@@ -74,38 +74,38 @@ def trim_long_silences(wave):
     # Trim away the long silences in the audio
     audio_mask = np.repeat(audio_mask, samples_per_window)
     # plt.subplot(615)
-    # plt.plot(wave)
+    # plt.plot(wav)
     # plt.plot(audio_mask * 10000)
-    wave = wave[audio_mask == True]
+    wav = wav[audio_mask == True]
     # plt.subplot(616)
-    # plt.plot(wave)
-    # play_wave(wave)
+    # plt.plot(wav)
+    # play_wave(wav)
     # plt.show()
-    return wave
+    return wav

-def normalize_volume(wave, target_dBFS, increase_only=False, decrease_only=False):
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
     if increase_only and decrease_only:
         raise ValueError("Both increase only and decrease only are set")
-    rms = np.sqrt(np.mean((wave * int16_max) ** 2))
+    rms = np.sqrt(np.mean((wav * int16_max) ** 2))
     wave_dBFS = 20 * np.log10(rms / int16_max)
     dBFS_change = target_dBFS - wave_dBFS
     if dBFS_change < 0 and increase_only or dBFS_change > 0 and decrease_only:
-        return wave
-    return wave * (10 ** (dBFS_change / 20))
+        return wav
+    return wav * (10 ** (dBFS_change / 20))

-def preprocess_wave(wave):
+def preprocess_wave(wav):
     """
     This is the standard routine that should be used on every audio file before being used in
     this project.
     """
-    wave = normalize_volume(wave, audio_norm_target_dBFS, increase_only=True)
-    wave = trim_long_silences(wave)
-    return wave
+    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+    wav = trim_long_silences(wav)
+    return wav

-def plot_wave(wave):
-    plt.plot(wave)
+def plot_wave(wav):
+    plt.plot(wav)
     plt.show()

 def plot_mel_filterbank(frames):
@@ -121,16 +121,16 @@ def plot_mel_filterbank(frames):
     plt.tight_layout()
     plt.show()

-def play_wave(wave, blocking=False):
+def play_wave(wav, blocking=False):
     sounddevice.stop()
-    sounddevice.play(wave, sampling_rate, blocking=blocking)
+    sounddevice.play(wav, sampling_rate, blocking=blocking)

 def rec_wave(duration, blocking=True, verbose=True):
     if verbose:
         print('Recording %d seconds of audio' % duration)
-    wave = sounddevice.rec(duration * sampling_rate, sampling_rate, 1)
+    wav = sounddevice.rec(duration * sampling_rate, sampling_rate, 1)
     if blocking:
         sounddevice.wait()
     if verbose:
         print('Done recording!')
-    return wave.squeeze()
+    return wav.squeeze()
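The dBFS arithmetic in normalize_volume above is easy to sanity-check: 0 dBFS corresponds to RMS at full scale, and each factor of 2 in amplitude is about 6 dB. A minimal check, assuming a float waveform in [-1, 1]:

    import numpy as np

    int16_max = (2 ** 15) - 1

    # A 0.5-amplitude sine has RMS 0.5 / sqrt(2) ~ 0.354, i.e. about -9 dBFS.
    t = np.linspace(0, 1, 16000, endpoint=False)
    wav = 0.5 * np.sin(2 * np.pi * 440 * t)
    rms = np.sqrt(np.mean((wav * int16_max) ** 2))
    print(20 * np.log10(rms / int16_max))   # ~ -9.0
    # Raising it to a -6 dBFS target multiplies by 10 ** ((-6 + 9) / 20) ~ 1.41.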
@@ -116,7 +116,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
     """
     # Process the entire utterance if not using partials
     if not using_partials:
-        frames = audio.wave_to_mel_filterbank(wav)
+        frames = audio.wav_to_mel_filterbank(wav)
         embed = embed_frames_batch(frames[None, ...])[0]
         if return_partials:
             return embed, None, None
@@ -129,7 +129,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
     wav = np.pad(wav, (0, max_wave_length - len(wav)), 'constant')

     # Split the utterance into partials
-    frames = audio.wave_to_mel_filterbank(wav)
+    frames = audio.wav_to_mel_filterbank(wav)
     frames_batch = np.array([frames[s] for s in mel_slices])
     partial_embeds = embed_frames_batch(frames_batch)
......
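embed_utterance splits the utterance into overlapping partials via the precomputed mel_slices, padding the waveform first so the last partial is complete (the np.pad line above). The slice computation itself is outside this diff; a simplified sketch of the idea, with illustrative sizes (160 frames is about 1.6 s at a 10 ms hop):

    def partial_slices(n_frames, partial_frames=160, overlap=0.5):
        # Fixed-size windows over the mel frames; step sets the overlap, and the
        # last window is re-aligned to the end so no frames are dropped.
        step = max(1, int(partial_frames * (1 - overlap)))
        starts = list(range(0, max(1, n_frames - partial_frames + 1), step))
        if starts[-1] + partial_frames < n_frames:
            starts.append(n_frames - partial_frames)
        return [slice(s, s + partial_frames) for s in starts]

    # e.g. frames_batch = np.array([frames[s] for s in partial_slices(len(frames))])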
@@ -98,7 +98,7 @@ def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
             continue

         # Create the mel spectrogram, discard those that are too short
-        frames = audio.wave_to_mel_filterbank(wav)
+        frames = audio.wav_to_mel_filterbank(wav)
         if len(frames) < partials_n_frames:
             continue
......
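_preprocess_speaker_dirs above is the batch counterpart of the single-file chain in the audio module; typical use of the renamed helpers on one file, assuming the module is importable as encoder.audio (the import and the path are hypothetical):

    from encoder import audio  # assuming the file above lives at encoder/audio.py

    wav = audio.load('LibriSpeech/train-clean-100/19/198/19-198-0001.flac')  # hypothetical path
    wav = audio.preprocess_wave(wav)             # volume normalization, then VAD trimming
    frames = audio.wav_to_mel_filterbank(wav)    # float32 mel frames, shape (n_frames, n_mels)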
-import os
+from synthesizer.datasets import audio
+from encoder import inference
 from vlibs import fileio
 import numpy as np
-from datasets import audio
-import sys
-sys.path.append('../encoder')
-encoder_model_fpath = '../encoder/saved_models/all.pt'
-from encoder import inference
+import os
+encoder_model_fpath = 'SV2TTS/encoder/saved_models/all.pt'

 def build_from_path(hparams, input_dirs, mel_dir, embed_dir, wav_dir):
     """
......
-import os
+from sklearn.model_selection import train_test_split
+from synthesizer.utils.text import text_to_sequence
+from synthesizer.infolog import log
+import tensorflow as tf
+import numpy as np
 import threading
 import time
-import numpy as np
-import tensorflow as tf
-from synthesizer.infolog import log
-from sklearn.model_selection import train_test_split
-from tacotron.utils.text import text_to_sequence
+import os

 _batches_per_group = 64
......
-import numpy as np
 import tensorflow as tf
+import numpy as np

 # Default hyperparameters
 hparams = tf.contrib.training.HParams(
......
@@ -3,7 +3,7 @@ All notations and variable names were used in concordance with originial tensorf
 """
 import collections
 import tensorflow as tf
-from tacotron.models.attention import _compute_attention
+from synthesizer.models.attention import _compute_attention
 from tensorflow.contrib.rnn import RNNCell
 from tensorflow.python.framework import ops, tensor_shape
 from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops
......
 from __future__ import absolute_import, division, print_function

 import collections
 import tensorflow as tf
-from tacotron.models.helpers import TacoTestHelper, TacoTrainingHelper
+from synthesizer.models.helpers import TacoTestHelper, TacoTrainingHelper
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.python.framework import ops, tensor_shape
......
 import tensorflow as tf
-from tacotron.utils.symbols import symbols
+from synthesizer.utils.symbols import symbols
 from synthesizer.infolog import log
-from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper
-from tacotron.models.modules import *
+from synthesizer.models.helpers import TacoTrainingHelper, TacoTestHelper
+from synthesizer.models.modules import *
 from tensorflow.contrib.seq2seq import dynamic_decode
-from tacotron.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
-from tacotron.models.custom_decoder import CustomDecoder
-from tacotron.models.attention import LocationSensitiveAttention
+from synthesizer.models.architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
+from synthesizer.models.custom_decoder import CustomDecoder
+from synthesizer.models.attention import LocationSensitiveAttention
 import numpy as np
......
-import os
-import time
-from time import sleep
-import tensorflow as tf
+from synthesizer.synthesizer import Synthesizer
 from synthesizer.hparams import hparams_debug_string
 from synthesizer.infolog import log
-from tacotron.synthesizer import Synthesizer
+import tensorflow as tf
+from time import sleep
 from tqdm import tqdm
+import time
+import os

 def generate_fast(model, text):
......
-import os
-import wave
+from synthesizer.utils.text import text_to_sequence
+from synthesizer.datasets import audio
+from synthesizer.infolog import log
+from synthesizer.models import create_model
+from synthesizer.utils import plot
+import tensorflow as tf
 import numpy as np
 import pyaudio
-import tensorflow as tf
-from datasets import audio
-from synthesizer.infolog import log
-from tacotron.models import create_model
-from tacotron.utils import plot
-from tacotron.utils.text import text_to_sequence
+import wave
+import os

 class Synthesizer:
......
-import os
-import time
-import traceback
-from datetime import datetime
-from synthesizer import infolog
-import numpy as np
-import tensorflow as tf
-from datasets import audio
+from synthesizer.utils.symbols import symbols
+from synthesizer.utils.text import sequence_to_text
+from synthesizer.datasets import audio
 from synthesizer.hparams import hparams_debug_string
-from tacotron.feeder import Feeder
-from tacotron.models import create_model
-from tacotron.utils import ValueWindow, plot
-from tacotron.utils.text import sequence_to_text
-from tacotron.utils.symbols import symbols
+from synthesizer.feeder import Feeder
+from synthesizer.models import create_model
+from synthesizer.utils import ValueWindow, plot
+from synthesizer import infolog
+from datetime import datetime
 from tqdm import tqdm
+import tensorflow as tf
+import numpy as np
+import traceback
+import time
+import os

 log = infolog.log
......
@@ -11,9 +11,7 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
 '''
 import re
-from unidecode import unidecode
-from .numbers import normalize_numbers

 # Regular expression matching whitespace:
......
 import re
 import inflect

 _inflect = inflect.engine()
......
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import numpy as np
......
-import re
-from . import cleaners
 from .symbols import symbols
+from . import cleaners
+import re

 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
......
-import argparse
-import os
 from multiprocessing import cpu_count
-from datasets import preprocessor
+from synthesizer.datasets import preprocessor
 from synthesizer.hparams import hparams
 from vlibs import fileio
+import argparse
+import os

 def preprocess(args, input_folders, out_dir, hparams):
@@ -53,7 +52,6 @@ def main():
     parser.add_argument('--hparams', default='',
                         help='Hyperparameter overrides as a comma-separated list of name=value pairs')
     parser.add_argument('--output', default='Synthesizer')
-    parser.add_argument('--n_jobs', type=int, default=cpu_count())
     # Name of the LibriTTS sets to use, separated by spaces
     # (e.g. "--sets train-other-500 train-clean-360). Defaults to using all the clean training sets
......
@@ -5,8 +5,8 @@ from synthesizer import infolog
 import tensorflow as tf
 from synthesizer.hparams import hparams
 from synthesizer.infolog import log
-from tacotron.synthesize import tacotron_synthesize
-from tacotron.train import tacotron_train
+from synthesizer.synthesize import tacotron_synthesize
+from synthesizer.train import tacotron_train

 log = infolog.log
......
@@ -16,11 +16,8 @@ TODO:
 - Proper versions in requirements.txt (do something different for inference and training?)
 - Clean up the rest of the code (' to ")
 - Move root of repo to SV2TTS (the rest isn't necessary)
-- Put on github (INCLUDE VLIBS)
+- Put on github (RECHECK ALL REQUIREMENTS (+VLIBS))
 - Make demo website
 - Write something

 Some noisy speakers: 40,
-- moved files
-- removed old saved models for encoder
\ No newline at end of file