提交 b7008db2 编写于 作者: C Corentin Jemine

Inference demo for Tacotron

上级 0e8571db
from datasets.audio import inv_mel_spectrogram
from tacotron import synthesizer
from hparams import hparams
from vlibs import fileio
import sounddevice as sd
import tensorflow as tf
import numpy as np
import os
# Set the TF_CPP_MIN_LOG_LEVEL environment variable to '3' so that TensorFlow's
# native logging stays quiet on the console during the interactive demo.
# NOTE(review): this assignment runs after `import tensorflow` above - confirm
# the variable is still picked up at this point.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
def get_speaker_embed(speaker_id, embed_root=r"E:\Datasets\Synthesizer\embed"):
    """
    Build a single unit-norm embedding for a speaker by averaging its stored embeddings.

    :param speaker_id: integer speaker identifier; it is formatted into the
        "embed-<id>-" pattern used to select that speaker's embedding files.
    :param embed_root: directory searched for the embedding files. Defaults to the
        location the demo was originally written against.
    :return: the L2-normalized mean embedding with a leading axis of size 1 added
        (i.e. a batch of one).
    :raises FileNotFoundError: if no embedding file is found for the speaker.
    """
    # NOTE(review): fileio.get_files presumably yields the paths under embed_root whose
    # names match the given pattern - confirm against vlibs.fileio.
    embed_fpaths = fileio.get_files(embed_root, "embed-%d-" % speaker_id)
    embeds = [np.load(embed_fpath) for embed_fpath in embed_fpaths]

    # Averaging an empty list would silently produce NaN and only fail much later, when
    # the malformed embedding is fed to the model. Fail here with a clear message instead.
    if not embeds:
        raise FileNotFoundError(
            "No embedding found for speaker %d in %s" % (speaker_id, embed_root))

    speaker_embed = np.mean(embeds, axis=0)
    # Rescale the averaged embedding to unit L2 norm.
    speaker_embed /= np.linalg.norm(speaker_embed, 2)
    return speaker_embed[None, ...]
if __name__ == '__main__':
    # Interactive demo: load the pretrained synthesizer once, then repeatedly ask for a
    # speaker ID and a sentence, synthesize it and play the result.
    checkpoint_dir = os.path.join('logs-conditioned', 'taco_pretrained')
    checkpoint_state = tf.train.get_checkpoint_state(checkpoint_dir)
    # get_checkpoint_state returns None when the directory holds no valid checkpoint;
    # report that explicitly rather than failing on an attribute access of None.
    if checkpoint_state is None:
        raise FileNotFoundError("No Tacotron checkpoint found in %s" % checkpoint_dir)
    synth = synthesizer.Synthesizer()
    synth.load(checkpoint_state.model_checkpoint_path, hparams)

    while True:
        # A typo in the speaker ID should not end the whole session: ask again.
        try:
            speaker_id = int(input("Speaker ID: "))
        except ValueError:
            print("The speaker ID must be an integer.")
            continue
        speaker_embed = get_speaker_embed(speaker_id)
        text = input("Text: ")

        mel = synth.my_synthesize(speaker_embed, text)
        # The spectrogram is transposed before being inverted to a waveform.
        wav = inv_mel_spectrogram(mel.T, hparams)
        # NOTE(review): the playback rate is hard-coded to 16000; presumably it should
        # match the sample rate configured in hparams - confirm.
        sd.play(wav, 16000)
        # Block until playback has finished before prompting again.
        sd.wait()
Synthesizing new speech with a new voice!
I hope my thesis will work out nicely.
Can you pass me the butter?
Automatic multispeaker voice cloning.
Can you pass me the butter?
There was no world in which Icarus felt not the need to strike his blade.
This sentence should be the last one.
\ No newline at end of file
......@@ -14,7 +14,6 @@ from tqdm import tqdm
def generate_fast(model, text):
    """
    Call ``model.synthesize`` on ``text`` with its four remaining arguments set to None.

    :param model: object exposing a ``synthesize`` method taking five positional arguments
    :param text: forwarded unchanged as the first argument of ``synthesize``
    :return: None; whatever ``synthesize`` returns is discarded
    """
    unused_args = (None,) * 4
    model.synthesize(text, *unused_args)
def run_live(args, checkpoint_path, hparams):
#Log to Terminal without keeping any records in files
log(hparams_debug_string())
......@@ -114,6 +113,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
log('synthesized mel spectrograms at {}'.format(synth_dir))
return os.path.join(synth_dir, 'map.txt')
def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
output_dir = 'tacotron_' + args.output_dir
......
import os
import wave
from datetime import datetime
import numpy as np
import pyaudio
import sounddevice as sd
import tensorflow as tf
from datasets import audio
from infolog import log
from librosa import effects
from tacotron.models import create_model
from tacotron.utils import plot
from tacotron.utils.text import text_to_sequence
......@@ -19,7 +16,7 @@ class Synthesizer:
log('Constructing model: %s' % model_name)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
input_lengths = tf.placeholder(tf.int32, (None), name='input_lengths')
input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
name='speaker_embeddings')
targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
......@@ -68,6 +65,45 @@ class Synthesizer:
saver = tf.train.Saver()
saver.restore(self.session, checkpoint_path)
def my_synthesize(self, speaker_embed, text, raise_exception=False):
    """
    Lighter synthesis function that directly returns the mel spectrogram.

    :param speaker_embed: speaker embedding fed to the ``speaker_embeddings`` placeholder.
        NOTE(review): presumably a batch of one, i.e. shaped
        (1, hparams.speaker_embedding_size) - confirm against the caller.
    :param text: the text to synthesize
    :param raise_exception: if True, raise when no frame has a stop token that rounds
        to 1. If False, the untrimmed spectrogram is returned in that case.
    :return: the predicted mel spectrogram for the text, cut just before the first frame
        whose rounded stop token equals 1 (when such a frame exists).
    :raises Exception: if no stop token was generated and ``raise_exception`` is True.
    """
    # Prepare the input
    cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')]
    seqs = [np.asarray(text_to_sequence(text, cleaner_names))]
    input_lengths = [len(seq) for seq in seqs]
    input_seqs, max_seq_len = self._prepare_inputs(seqs)
    split_infos = [[max_seq_len, 0, 0, 0]]
    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        self.split_infos: np.asarray(split_infos, dtype=np.int32),
        self.speaker_embeddings: speaker_embed
    }

    # Forward it. Only the outputs that are actually used below are fetched.
    mels, stop_tokens = self.session.run(
        [self.mel_outputs, self.stop_token_prediction],
        feed_dict=feed_dict)
    # Keep the first entry of the first output group.
    # NOTE(review): presumably indexed as [tower][batch item] - confirm.
    mel, stop_token = mels[0][0], stop_tokens[0][0]

    # Trim the output at the first frame flagged as "stop". NumPy arrays have no
    # .index() method, so the positions are searched with array operations.
    # NOTE(review): assumes stop_token holds one value per output frame - confirm.
    stop_frames = np.flatnonzero(np.round(stop_token) == 1)
    if stop_frames.size > 0:
        mel = mel[:stop_frames[0], :]
    elif raise_exception:
        raise Exception("Tacotron could not generate a stop token.")

    return mel
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
hparams = self._hparams
......@@ -116,8 +152,7 @@ class Synthesizer:
assert len(np_targets) == len(texts)
feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
embed_fpath = r"E:\Datasets\Synthesizer\embed\embed-85-121551-0036.npy"
feed_dict[self.speaker_embeddings] = np.load(embed_fpath)[None, ...]
feed_dict[self.speaker_embeddings] = np.zeros((len(texts), 256))
if self.gta or not hparams.predict_linear:
mels, alignments, stop_tokens = self.session.run(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册