Inference demo for Tacotron

b7008db2 · Corentin Jemine · 0e8571db · 0e8571db · b7008db2 · b7008db2
5 changed files
--- a/tacotron2/__init__.py
+++ b/tacotron2/__init__.py
--- a/tacotron2/inference_demo.py
+++ b/tacotron2/inference_demo.py
+from datasets.audio import inv_mel_spectrogram
+from tacotron import synthesizer
+from hparams import hparams
+from vlibs import fileio
+import sounddevice as sd
+import tensorflow as tf
+import numpy as np
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+def get_speaker_embed(speaker_id, embed_root=r"E:\Datasets\Synthesizer\embed"):
+    """
+    Builds a single speaker embedding by averaging all the embeddings stored for a speaker and 
+    L2-normalizing the result.
+    
+    :param speaker_id: integer ID of the speaker. It selects the embedding files through the 
+    name pattern "embed-<speaker_id>-".
+    :param embed_root: directory in which the embeddings were saved as numpy files. Defaults to 
+    the location used when this demo was written.
+    :return: the averaged and normalized embedding, with a new leading axis of size 1.
+    :raises ValueError: if no embedding is found for the speaker.
+    """
+    # NOTE(review): fileio.get_files is assumed to return the paths of the files under 
+    # embed_root whose name matches the given pattern - confirm against vlibs.
+    embed_fpaths = fileio.get_files(embed_root, "embed-%d-" % speaker_id)
+    embeds = [np.load(embed_fpath) for embed_fpath in embed_fpaths]
+    
+    # Averaging an empty list silently yields NaN, which would then be fed to the model
+    if not embeds:
+        raise ValueError("No embedding found for speaker %d in %s" % (speaker_id, embed_root))
+    
+    speaker_embed = np.mean(embeds, axis=0)
+    speaker_embed /= np.linalg.norm(speaker_embed, 2)
+    return speaker_embed[None, ...]
+
+if __name__ == '__main__':
+    # Locate the latest checkpoint of the pretrained model
+    checkpoint_dir = os.path.join('logs-conditioned', 'taco_pretrained')
+    checkpoint_state = tf.train.get_checkpoint_state(checkpoint_dir)
+    if checkpoint_state is None:
+        # get_checkpoint_state returns None when the directory holds no valid checkpoint
+        raise FileNotFoundError("No checkpoint found in %s" % checkpoint_dir)
+    checkpoint_fpath = checkpoint_state.model_checkpoint_path
+
+    synth = synthesizer.Synthesizer()
+    synth.load(checkpoint_fpath, hparams)
+    
+    # Interactive loop: ask for a speaker and a text, synthesize the mel spectrogram, invert it 
+    # to a waveform and play it. Runs until the process is interrupted.
+    while True:
+        try:
+            speaker_id = int(input("Speaker ID: "))
+        except ValueError:
+            # Ask again rather than crashing the demo on a typo
+            print("The speaker ID must be an integer.")
+            continue
+        speaker_embed = get_speaker_embed(speaker_id)
+        text = input("Text: ")
+        mel = synth.my_synthesize(speaker_embed, text)
+        wav = inv_mel_spectrogram(mel.T, hparams)
+        # NOTE(review): the playback rate is hard-coded; presumably it should match the sample 
+        # rate configured in hparams - confirm.
+        sd.play(wav, 16000)
+        sd.wait()
--- a/tacotron2/sentences.txt
+++ b/tacotron2/sentences.txt
 Synthesizing new speech with a new voice!
-I hope my thesis will work out nicely.
-Can you pass me the butter?
 Automatic multispeaker voice cloning.
+Can you pass me the butter?
+There was no world in which Icarus felt not the need to strike his blade.
 This sentence should be the last one.
\ No newline at end of file
--- a/tacotron2/tacotron/synthesize.py
+++ b/tacotron2/tacotron/synthesize.py
@@ -14,7 +14,6 @@ from tqdm import tqdm
 def generate_fast(model, text):
 	"""Calls model.synthesize on text, passing None for its four other arguments. Returns nothing."""
 	model.synthesize(text, None, None, None, None)

-
 def run_live(args, checkpoint_path, hparams):
 	#Log to Terminal without keeping any records in files
 	log(hparams_debug_string())
@@ -114,6 +113,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
 	log('synthesized mel spectrograms at {}'.format(synth_dir))
 	return os.path.join(synth_dir, 'map.txt')

+
 def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
 	output_dir = 'tacotron_' + args.output_dir


--- a/tacotron2/tacotron/synthesizer.py
+++ b/tacotron2/tacotron/synthesizer.py
 import os
 import wave
-from datetime import datetime

 import numpy as np
 import pyaudio
-import sounddevice as sd
 import tensorflow as tf
 from datasets import audio
 from infolog import log
-from librosa import effects
 from tacotron.models import create_model
 from tacotron.utils import plot
 from tacotron.utils.text import text_to_sequence
@@ -19,7 +16,7 @@ class Synthesizer:
 		log('Constructing model: %s' % model_name)
 		#Force the batch size to be known in order to use attention masking in batch synthesis
 		inputs = tf.placeholder(tf.int32, (None, None), name='inputs')
-		input_lengths = tf.placeholder(tf.int32, (None), name='input_lengths')
+		input_lengths = tf.placeholder(tf.int32, (None,), name='input_lengths')
 		speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size),
 					   name='speaker_embeddings')
 		targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name='mel_targets')
@@ -68,6 +65,45 @@ class Synthesizer:
 		saver = tf.train.Saver()
 		saver.restore(self.session, checkpoint_path)

+	def my_synthesize(self, speaker_embed, text, raise_exception=False):
+		"""
+		Lighter synthesis function that directly returns the mel spectrogram.
+		
+		:param speaker_embed: the speaker embedding, fed as is to the speaker_embeddings 
+		placeholder. Presumably of shape (1, hparams.speaker_embedding_size) since a single text 
+		is synthesized - confirm against the caller.
+		:param text: the text to synthesize 
+		:param raise_exception: if True, an exception is raised when no stop token is predicted. 
+		If False, the untrimmed mel spectrogram is returned in that case.
+		:return: the predicted mel spectrogram, cut at the first frame whose stop token 
+		prediction rounds to 1 (that frame excluded).
+		:raises Exception: if no stop token was predicted and raise_exception is True.
+		"""
+		
+		# Prepare the input
+		cleaner_names = [x.strip() for x in self._hparams.cleaners.split(',')]
+		seqs = [np.asarray(text_to_sequence(text, cleaner_names))]
+		input_lengths = [len(seq) for seq in seqs]
+		input_seqs, max_seq_len = self._prepare_inputs(seqs)
+		split_infos = [[max_seq_len, 0, 0, 0]]
+		feed_dict = {
+			self.inputs: input_seqs,
+			self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
+			self.split_infos: np.asarray(split_infos, dtype=np.int32),
+			self.speaker_embeddings: speaker_embed
+		}
+		
+		# Forward it. Only the mel outputs and the stop tokens are needed here.
+		mels, stop_tokens = self.session.run(
+			[self.mel_outputs, self.stop_token_prediction], 
+			feed_dict=feed_dict)
+		mel, stop_token = mels[0][0], stop_tokens[0][0]
+		
+		# Trim the output. np.round returns an ndarray, which has no index() method: it must be 
+		# converted to a list before looking for the first stop token.
+		rounded_stop_token = np.round(stop_token).tolist()
+		try:
+			target_length = rounded_stop_token.index(1)
+		except ValueError:
+			# No frame was flagged as the end of the utterance
+			if raise_exception:
+				raise Exception("Tacotron could not generate a stop token.")
+		else:
+			mel = mel[:target_length, :]
+			
+		return mel
+

 	def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
 		hparams = self._hparams
@@ -116,8 +152,7 @@ class Synthesizer:
 			assert len(np_targets) == len(texts)

 		feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
-		embed_fpath = r"E:\Datasets\Synthesizer\embed\embed-85-121551-0036.npy"
-		feed_dict[self.speaker_embeddings] = np.load(embed_fpath)[None, ...]
+		feed_dict[self.speaker_embeddings] = np.zeros((len(texts), 256))

 		if self.gta or not hparams.predict_linear:
 			mels, alignments, stop_tokens = self.session.run(