Set up the training configuration for ASR

4cd60d96 · Corentin Jemine · 6ac1ff26 · 4cd60d96 · 4cd60d96 · 4cd60d96
8 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -17,4 +17,5 @@ encoder/saved_models/*_backups
 tacotron2/logs-*
 waveglow
 torch-tacotron2
-wave-rnn/checkpoints
\ No newline at end of file
+wave-rnn/checkpoints
+wave-rnn/model_outputs
\ No newline at end of file
--- a/tacotron2/hparams.py
+++ b/tacotron2/hparams.py
@@ -103,7 +103,7 @@ hparams = tf.contrib.training.HParams(
    clip_mels_length=True,
    # For cases of OOM (Not really recommended, only use if facing unsolvable OOM errors, 
 	# also consider clipping your samples to smaller chunks)
-    max_mel_frames=1200,
+    max_mel_frames=900,
    # Only relevant when clip_mels_length = True, please only use after trying output_per_steps=3
 	#  and still getting OOM errors.
    
@@ -252,7 +252,7 @@ hparams = tf.contrib.training.HParams(
    # major slowdowns! Only use when critical!)
    
    # train/test split ratios, mini-batches sizes
-    tacotron_batch_size=19,  # number of training samples on each training steps (was 32)
+    tacotron_batch_size=25,  # number of training samples on each training steps (was 32)
    # Tacotron Batch synthesis supports ~16x the training batch size (no gradients during 
    # testing). 
    # Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times

--- a/tacotron2/preprocess.py
+++ b/tacotron2/preprocess.py
@@ -54,11 +54,11 @@ def main():
 	parser.add_argument('--base_dir', default='')
 	parser.add_argument('--hparams', default='',
 		help='Hyperparameter overrides as a comma-separated list of name=value pairs')
-	parser.add_argument('--output', default='Synthesizer3')
+	parser.add_argument('--output', default='Synthesizer')
 	parser.add_argument('--n_jobs', type=int, default=cpu_count())
 	
 	# Name of the LibriSpeech sets to use, separated by spaces 
-	# (e.g. "--sets train-other-500 train-clean-360). Defaults to using all the training sets 
+	# (e.g. "--sets train-other-500 train-clean-360"). Defaults to using all the clean training sets 
 	# present in the LibriSpeech directory.
 	parser.add_argument('--sets', type=str, nargs='+', default=None)
 	

--- a/tacotron2/synthesize.py
+++ b/tacotron2/synthesize.py
@@ -8,17 +8,16 @@ import tensorflow as tf
 from hparams import hparams
 from infolog import log
 from tacotron.synthesize import tacotron_synthesize
-from wavenet_vocoder.synthesize import wavenet_synthesize


 def prepare_run(args):
    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    
-    run_name = args.name or args.tacotron_name or args.model
+    run_name = args.name
    taco_checkpoint = os.path.join('logs-' + run_name, 'taco_' + args.checkpoint)
    
-    run_name = args.name or args.wavenet_name or args.model
+    run_name = args.name
    wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint)
    return taco_checkpoint, wave_checkpoint, modified_hp

@@ -33,7 +32,7 @@ def get_sentences(args):


 def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences):
-    log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model))
+    log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name))
    log('Synthesizing mel-spectrograms from text..')
    wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
    # Delete Tacotron model from graph
@@ -42,7 +41,7 @@ def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences):
    # synthesizing
    sleep(0.5)
    log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
-    wavenet_synthesize(args, hparams, wave_checkpoint)
+    raise NotImplementedError('Vocoder synthesis was removed together with wavenet_vocoder')
    log('Tacotron-2 TTS synthesis complete!')


@@ -80,43 +79,17 @@ def main():
 							 'ids')
    args = parser.parse_args()
    
-    accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2']
-    
-    if args.model not in accepted_models:
-        raise ValueError(
-            'please enter a valid model to synthesize with: {}'.format(accepted_models))
    
    if args.mode not in accepted_modes:
        raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode))
    
-    if args.mode == 'live' and args.model == 'Wavenet':
-        raise RuntimeError(
-            'Wavenet vocoder cannot be tested live due to its slow generation. Live only works '
-			'with Tacotron!')
-    
    if args.GTA not in ('True', 'False'):
        raise ValueError('GTA option must be either True or False')
    
-    if args.model == 'Tacotron-2':
-        if args.mode == 'live':
-            warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!')
-        if args.mode == 'synthesis':
-            raise ValueError(
-                'I don\'t recommend running WaveNet on entire dataset.. The world might end '
-				'before the synthesis :) (only eval allowed)')
-    
    taco_checkpoint, wave_checkpoint, hparams = prepare_run(args)
    sentences = get_sentences(args)
    
-    if args.model == 'Tacotron':
-        _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
-    elif args.model == 'WaveNet':
-        wavenet_synthesize(args, hparams, wave_checkpoint)
-    elif args.model == 'Tacotron-2':
-        synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences)
-    else:
-        raise ValueError('Model provided {} unknown! {}'.format(args.model, accepted_models))
-
+    _ = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)

 if __name__ == '__main__':
    main()
--- a/tacotron2/tacotron/synthesize.py
+++ b/tacotron2/tacotron/synthesize.py
@@ -42,9 +42,6 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
 	eval_dir = os.path.join(output_dir, 'eval')
 	log_dir = os.path.join(output_dir, 'logs-eval')

-	if args.model == 'Tacotron-2':
-		assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)
-
 	#Create output path if it doesn't exist
 	os.makedirs(eval_dir, exist_ok=True)
 	os.makedirs(log_dir, exist_ok=True)

--- a/tacotron2/tacotron/train.py
+++ b/tacotron2/tacotron/train.py
@@ -88,10 +88,7 @@ def time_string():

 def model_train_mode(args, feeder, hparams, global_step):
    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
-        model_name = None
-        if args.model == 'Tacotron-2':
-            model_name = 'Tacotron'
-        model = create_model(model_name or args.model, hparams)
+        model = create_model('Tacotron', hparams)
        if hparams.predict_linear:
            model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings, 
                             feeder.mel_targets, feeder.token_targets, 
@@ -111,10 +108,7 @@ def model_train_mode(args, feeder, hparams, global_step):

 def model_test_mode(args, feeder, hparams, global_step):
    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
-        model_name = None
-        if args.model == 'Tacotron-2':
-            model_name = 'Tacotron'
-        model = create_model(model_name or args.model, hparams)
+        model = create_model('Tacotron', hparams)
        if hparams.predict_linear:
            model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, 
                             feeder.speaker_embeddings, feeder.eval_mel_targets, 
@@ -161,7 +155,7 @@ def train(log_dir, args, hparams):
    
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
-    log('Using model: {}'.format(args.model))
+    log('Using model: Tacotron')
    log(hparams_debug_string())
    
    # Start by setting a seed for repeatability
@@ -323,7 +317,7 @@ def train(log_dir, args, hparams):
                    
                    plot.plot_alignment(align, os.path.join(eval_plot_dir,
                                                            'step-{}-eval-align.png'.format(step)),
-                                        title='{}, {}, step={}, loss={:.5f}'.format(args.model,
+                                        title='{}, {}, step={}, loss={:.5f}'.format('Tacotron',
                                                                                    time_string(),
                                                                                    step,
                                                                                    eval_loss),
@@ -332,7 +326,7 @@ def train(log_dir, args, hparams):
                                                              'step-{'
 															  '}-eval-mel-spectrogram.png'.format(
                                                                  step)),
-                                          title='{}, {}, step={}, loss={:.5f}'.format(args.model,
+                                          title='{}, {}, step={}, loss={:.5f}'.format('Tacotron',
                                                                                      time_string(),
                                                                                      step,
                                                                                      eval_loss),
@@ -344,7 +338,7 @@ def train(log_dir, args, hparams):
                                                                  'step-{}-eval-linear-spectrogram.png'.format(
                                                                      step)),
                                              title='{}, {}, step={}, loss={:.5f}'.format(
-                                                  args.model, time_string(), step, eval_loss),
+                                                  'Tacotron', time_string(), step, eval_loss),
                                              target_spectrogram=lin_t,
                                              max_len=t_len, auto_aspect=True)
                    
@@ -387,7 +381,7 @@ def train(log_dir, args, hparams):
                                                                              'step-{}-linear-spectrogram.png'.format(
                                                                                  step)),
                                              title='{}, {}, step={}, loss={:.5f}'.format(
-                                                  args.model, time_string(), step, loss),
+                                                  'Tacotron', time_string(), step, loss),
                                              target_spectrogram=linear_target,
                                              max_len=target_length, auto_aspect=True)
                    
@@ -414,7 +408,7 @@ def train(log_dir, args, hparams):
                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(alignment,
                                        os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
-                                        title='{}, {}, step={}, loss={:.5f}'.format(args.model,
+                                        title='{}, {}, step={}, loss={:.5f}'.format('Tacotron',
                                                                                    time_string(),
                                                                                    step, loss),
                                        max_len=target_length // hparams.outputs_per_step)
@@ -422,7 +416,7 @@ def train(log_dir, args, hparams):
                    plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir,
                                                                       'step-{}-mel-spectrogram.png'.format(
                                                                           step)),
-                                          title='{}, {}, step={}, loss={:.5f}'.format(args.model,
+                                          title='{}, {}, step={}, loss={:.5f}'.format('Tacotron',
                                                                                      time_string(),
                                                                                      step, loss),
                                          target_spectrogram=target,

--- a/tacotron2/temp.py
+++ b/tacotron2/temp.py
+"""One-off cleanup script: prune training samples whose mel spectrogram is too long.
+
+Reads ``train.txt`` under ``root``, keeps every entry whose saved mel array has at
+most ``MAX_MEL_FRAMES`` rows, writes the kept entries to ``train2.txt`` and then
+deletes the audio, mel and embedding files of the rejected entries.
+"""
+from vlibs import fileio
+import numpy as np
+
+root = r'E:\Datasets\Synthesizer'
+
+# Same value as max_mel_frames=900 in tacotron2/hparams.py; keep the two in sync.
+MAX_MEL_FRAMES = 900
+
+lines = fileio.read_all_lines(fileio.join(root, "train.txt"))
+out = []
+to_remove = []
+pruned = 0
+intact = 0
+for line in lines:
+    line = line.rstrip()
+    if not line:
+        # A blank line has no '|' separated fields and would break the unpacking below.
+        continue
+    audio_fname, mel_fname, embed_fname, *_ = line.split('|')
+    mel = np.load(fileio.join(root, "mels", mel_fname))
+    # len() is the size of axis 0 -- presumably the frame axis; TODO confirm against preprocessing.
+    if len(mel) > MAX_MEL_FRAMES:
+        to_remove.append((audio_fname, mel_fname, embed_fname))
+        pruned += 1
+    else:
+        intact += 1
+        out.append(line)
+        if intact % 100 == 0:
+            print("%d / %d" % (intact, pruned))
+out.append('')
+# Write the new manifest before deleting anything: if the run is interrupted during
+# the deletions, train2.txt already lists only files that are kept on disk.
+fileio.write_all_lines(fileio.join(root, "train2.txt"), out)
+for audio_fname, mel_fname, embed_fname in to_remove:
+    fileio.remove(fileio.join(root, "audio", audio_fname))
+    fileio.remove(fileio.join(root, "mels", mel_fname))
+    fileio.remove(fileio.join(root, "embed", embed_fname))
+print("%d / %d" % (intact, pruned))
--- a/tacotron2/train.py
+++ b/tacotron2/train.py
@@ -34,7 +34,7 @@ def read_seq(file):
 def prepare_run(args):
    modified_hp = hparams.parse(args.hparams)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_log_level)
-    run_name = args.name or args.model
+    run_name = args.name
    log_dir = os.path.join(args.base_dir, 'logs-{}'.format(run_name))
    os.makedirs(log_dir, exist_ok=True)
    infolog.init(os.path.join(log_dir, 'Terminal_train_log'), run_name, args.slack_url)
@@ -109,7 +109,7 @@ def main():
                        help='Steps between writing checkpoints')
    parser.add_argument('--eval_interval', type=int, default=10000,
                        help='Steps between eval on test data')
-    parser.add_argument('--tacotron_train_steps', type=int, default=200000, # Was 100000
+    parser.add_argument('--tacotron_train_steps', type=int, default=500000, # Was 100000
                        help='total number of tacotron training steps')
    parser.add_argument('--tf_log_level', type=int, default=1, help='Tensorflow C++ log level.')
    parser.add_argument('--slack_url', default=None,