Commit 0e8571db authored by Corentin Jemine

First prototype

Parent 8968ffc1
......@@ -20,5 +20,6 @@ tacotron2/debug
tacotron2/papers
tacotron2/wav_out
tacotron2/_old/
+tacotron2/logs-conditioned
WaveGlow
torch-tacotron2
\ No newline at end of file
......@@ -25,7 +25,7 @@ for i in range(1, len(fnames)):
wav = inv_mel_spectrogram(mel_spectro.T, hparams)
sounddevice.wait()
print(fnames[i])
-sounddevice.play(wav, 22050)
+sounddevice.play(wav, 16000)
sounddevice.wait()
quit()
......
Scientists at the CERN laboratory say they have discovered a new particle.
There's a way to measure the acute emotional intelligence that has never gone out of style.
President Trump met with other leaders at the Group of 20 conference.
The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.
Generative adversarial network or variational auto-encoder.
Basilar membrane and otolaryngology are not auto-correlations.
He has read the whole thing.
He reads books.
He thought it was time to present the present.
Thisss isrealy awhsome.
Punctuation sensitivity, is working.
Punctuation sensitivity is working.
Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.
Tajima Airport serves Toyooka.
Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that the adopted architecture is able to perform this task with wild success.
Thank you so much for your support!
\ No newline at end of file
Synthesizing new speech with a new voice!
I hope my thesis will work out nicely.
Can you pass me the butter?
Automatic multispeaker voice cloning.
This sentence should be the last one.
\ No newline at end of file
......@@ -124,11 +124,13 @@ def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
raise RuntimeError('Failed to load checkpoint at {}'.format(checkpoint))
if hparams.tacotron_synthesis_batch_size < hparams.tacotron_num_gpus:
-raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} (num_gpus)! Please verify your synthesis batch size choice.'.format(
+raise ValueError('Defined synthesis batch size {} is smaller than minimum required {} '
+                 '(num_gpus)! Please verify your synthesis batch size choice.'.format(
hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))
if hparams.tacotron_synthesis_batch_size % hparams.tacotron_num_gpus != 0:
-raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! Please verify your synthesis batch size choice!'.format(
+raise ValueError('Defined synthesis batch size {} is not a multiple of {} (num_gpus)! '
+                 'Please verify your synthesis batch size choice!'.format(
hparams.tacotron_synthesis_batch_size, hparams.tacotron_num_gpus))
if args.mode == 'eval':
......
......@@ -52,6 +52,7 @@ class Synthesizer:
self.inputs = inputs
self.input_lengths = input_lengths
+self.speaker_embeddings = speaker_embeddings
self.targets = targets
self.split_infos = split_infos
......@@ -115,9 +116,13 @@ class Synthesizer:
assert len(np_targets) == len(texts)
feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
+embed_fpath = r"E:\Datasets\Synthesizer\embed\embed-85-121551-0036.npy"
+feed_dict[self.speaker_embeddings] = np.load(embed_fpath)[None, ...]
if self.gta or not hparams.predict_linear:
-mels, alignments, stop_tokens = self.session.run([self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)
+mels, alignments, stop_tokens = self.session.run(
+    [self.mel_outputs, self.alignments, self.stop_token_prediction],
+    feed_dict=feed_dict)
#Linearize outputs (1D arrays)
mels = [mel for gpu_mels in mels for mel in gpu_mels]
alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
......@@ -133,7 +138,10 @@ class Synthesizer:
assert len(mels) == len(texts)
else:
-linears, mels, alignments, stop_tokens = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)
+linears, mels, alignments, stop_tokens = self.session.run(
+    [self.linear_outputs, self.mel_outputs, self.alignments,
+     self.stop_token_prediction],
+    feed_dict=feed_dict)
#Linearize outputs (1D arrays)
linears = [linear for gpu_linear in linears for linear in gpu_linear]
mels = [mel for gpu_mels in mels for mel in gpu_mels]
......@@ -215,8 +223,6 @@ class Synthesizer:
plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
title='{}'.format(texts[i]), split_title=True, auto_aspect=True)
return saved_mels_paths, speaker_ids
def _round_up(self, x, multiple):
......
......@@ -115,11 +115,11 @@ def main():
'in Tacotron synthesis mode')
parser.add_argument('--restore', type=bool, default=True,
help='Set this to False to do a fresh training')
-parser.add_argument('--summary_interval', type=int, default=500,
+parser.add_argument('--summary_interval', type=int, default=2500,
help='Steps between running summary ops')
parser.add_argument('--embedding_interval', type=int, default=10000,
help='Steps between updating embeddings projection visualization')
-parser.add_argument('--checkpoint_interval', type=int, default=2000, # Was 5000
+parser.add_argument('--checkpoint_interval', type=int, default=2500, # Was 5000
help='Steps between writing checkpoints')
parser.add_argument('--eval_interval', type=int, default=10000,
help='Steps between eval on test data')
......