diff --git a/examples/fastspeech/alignments/get_alignments.py b/examples/fastspeech/alignments/get_alignments.py index 3e0eec9da93e6dca4b070df818f1aff0e7cab2de..d31bafca4a454049386afdd43500df7ec6069e11 100644 --- a/examples/fastspeech/alignments/get_alignments.py +++ b/examples/fastspeech/alignments/get_alignments.py @@ -115,15 +115,10 @@ def alignments(args): mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0]) mel_lens = mel_input.shape[1] - dec_slf_mask = get_triu_tensor(mel_input, - mel_input).astype(np.float32) - dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0) - dec_slf_mask = fluid.layers.cast( - dg.to_variable(dec_slf_mask != 0), np.float32) * (-2**32 + 1) pos_mel = np.arange(1, mel_input.shape[1] + 1) pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0]) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( - text, mel_input, pos_text, pos_mel, dec_slf_mask) + text, mel_input, pos_text, pos_mel) mel_input = fluid.layers.concat( [mel_input, postnet_pred[:, -1:, :]], axis=1) diff --git a/examples/fastspeech/configs/ljspeech.yaml b/examples/fastspeech/configs/ljspeech.yaml index 88b335ab0f67bcd0c3f423f5c00e19c9b5889201..96b0d54bbf6cfdc664597820065f0ec74da2db0e 100644 --- a/examples/fastspeech/configs/ljspeech.yaml +++ b/examples/fastspeech/configs/ljspeech.yaml @@ -29,5 +29,5 @@ train: grad_clip_thresh: 0.1 #the threshold of grad clip. checkpoint_interval: 1000 - max_epochs: 10000 + max_iteration: 500000 diff --git a/examples/fastspeech/train.py b/examples/fastspeech/train.py index 4ad8dca1ab1b6ee3aef6816d3a49f96d6ca8fe17..e575d0fbfa3803c29a11c2a4bc41c21bd5ee385b 100644 --- a/examples/fastspeech/train.py +++ b/examples/fastspeech/train.py @@ -62,7 +62,8 @@ def main(args): cfg = yaml.load(f, Loader=yaml.Loader) global_step = 0 - place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace() + place = fluid.CUDAPlace(dg.parallel.Env() + .dev_id) if args.use_gpu else fluid.CPUPlace() fluid.enable_dygraph(place) if not os.path.exists(args.output): @@ -88,7 +89,8 @@ def main(args): cfg['train']['batch_size'], nranks, local_rank, - shuffle=True).reader() + shuffle=True).reader + iterator = iter(tqdm(reader)) # Load parameters. global_step = io.load_parameters( @@ -103,52 +105,53 @@ def main(args): strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) - for epoch in range(cfg['train']['max_epochs']): - pbar = tqdm(reader) - - for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d' % epoch) - (character, mel, pos_text, pos_mel, alignment) = data - - global_step += 1 - - #Forward - result = model( - character, pos_text, mel_pos=pos_mel, length_target=alignment) - mel_output, mel_output_postnet, duration_predictor_output, _, _ = result - mel_loss = layers.mse_loss(mel_output, mel) - mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) - duration_loss = layers.mean( - layers.abs( - layers.elementwise_sub(duration_predictor_output, - alignment))) - total_loss = mel_loss + mel_postnet_loss + duration_loss - - if local_rank == 0: - writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) - writer.add_scalar('post_mel_loss', - mel_postnet_loss.numpy(), global_step) - writer.add_scalar('duration_loss', - duration_loss.numpy(), global_step) - writer.add_scalar('learning_rate', - optimizer._learning_rate.step().numpy(), - global_step) - - if parallel: - total_loss = model.scale_loss(total_loss) - total_loss.backward() - model.apply_collective_grads() - else: - total_loss.backward() - optimizer.minimize(total_loss) - model.clear_gradients() - - # save checkpoint - if local_rank == 0 and global_step % cfg['train'][ - 'checkpoint_interval'] == 0: - io.save_parameters( - os.path.join(args.output, 'checkpoints'), global_step, - model, optimizer) + while global_step <= cfg['train']['max_iteration']: + try: + batch = next(iterator) + except StopIteration as e: + iterator = iter(tqdm(reader)) + batch = next(iterator) + + (character, mel, pos_text, pos_mel, alignment) = batch + + global_step += 1 + + #Forward + result = model( + character, pos_text, mel_pos=pos_mel, length_target=alignment) + mel_output, mel_output_postnet, duration_predictor_output, _, _ = result + mel_loss = layers.mse_loss(mel_output, mel) + mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) + duration_loss = layers.mean( + layers.abs( + layers.elementwise_sub(duration_predictor_output, alignment))) + total_loss = mel_loss + mel_postnet_loss + duration_loss + + if local_rank == 0: + writer.add_scalar('mel_loss', mel_loss.numpy(), global_step) + writer.add_scalar('post_mel_loss', + mel_postnet_loss.numpy(), global_step) + writer.add_scalar('duration_loss', + duration_loss.numpy(), global_step) + writer.add_scalar('learning_rate', + optimizer._learning_rate.step().numpy(), + global_step) + + if parallel: + total_loss = model.scale_loss(total_loss) + total_loss.backward() + model.apply_collective_grads() + else: + total_loss.backward() + optimizer.minimize(total_loss) + model.clear_gradients() + + # save checkpoint + if local_rank == 0 and global_step % cfg['train'][ + 'checkpoint_interval'] == 0: + io.save_parameters( + os.path.join(args.output, 'checkpoints'), global_step, model, + optimizer) if local_rank == 0: writer.close() diff --git a/examples/transformer_tts/README.md b/examples/transformer_tts/README.md index 1ff74a9bb2a9d5be3fd76ed8db293fc7e8921784..0be870c7e39a2ef2ab81a948472adef9fdc43a83 100644 --- a/examples/transformer_tts/README.md +++ b/examples/transformer_tts/README.md @@ -53,7 +53,7 @@ During synthesis, results are saved in `${output}/samples` and tensorboard log i TransformerTTS model can be trained by running ``train_transformer.py``. ```bash -python train_trasformer.py \ +python train_transformer.py \ --use_gpu=1 \ --data=${DATAPATH} \ --output='./experiment' \ diff --git a/examples/transformer_tts/configs/ljspeech.yaml b/examples/transformer_tts/configs/ljspeech.yaml index 33300ab29964629b92ac14ae242ef0f41cd8648a..f5aabf93b1ce85d94fd6fe554ad4a6e271913c76 100644 --- a/examples/transformer_tts/configs/ljspeech.yaml +++ b/examples/transformer_tts/configs/ljspeech.yaml @@ -31,7 +31,7 @@ train: checkpoint_interval: 1000 image_interval: 2000 - max_epochs: 10000 + max_iteration: 500000 diff --git a/examples/transformer_tts/train_transformer.py b/examples/transformer_tts/train_transformer.py index 8cd766b7af263ac116478846b7dfff09476b6821..646176f6f50f3e87b7809017431b46af4b136dbd 100644 --- a/examples/transformer_tts/train_transformer.py +++ b/examples/transformer_tts/train_transformer.py @@ -102,105 +102,110 @@ def main(args): cfg['train']['batch_size'], nranks, local_rank, - shuffle=True).reader() + shuffle=True).reader - for epoch in range(cfg['train']['max_epochs']): - pbar = tqdm(reader) - for i, data in enumerate(pbar): - pbar.set_description('Processing at epoch %d' % epoch) - character, mel, mel_input, pos_text, pos_mel = data + iterator = iter(tqdm(reader)) - global_step += 1 + global_step += 1 - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( - character, mel_input, pos_text, pos_mel) + while global_step <= cfg['train']['max_iteration']: + try: + batch = next(iterator) + except StopIteration as e: + iterator = iter(tqdm(reader)) + batch = next(iterator) - mel_loss = layers.mean( - layers.abs(layers.elementwise_sub(mel_pred, mel))) - post_mel_loss = layers.mean( - layers.abs(layers.elementwise_sub(postnet_pred, mel))) - loss = mel_loss + post_mel_loss + character, mel, mel_input, pos_text, pos_mel = batch - # Note: When used stop token loss the learning did not work. - if cfg['network']['stop_token']: - label = (pos_mel == 0).astype(np.float32) - stop_loss = cross_entropy(stop_preds, label) - loss = loss + stop_loss - - if local_rank == 0: - writer.add_scalars('training_loss', { - 'mel_loss': mel_loss.numpy(), - 'post_mel_loss': post_mel_loss.numpy() - }, global_step) + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( + character, mel_input, pos_text, pos_mel) + + mel_loss = layers.mean( + layers.abs(layers.elementwise_sub(mel_pred, mel))) + post_mel_loss = layers.mean( + layers.abs(layers.elementwise_sub(postnet_pred, mel))) + loss = mel_loss + post_mel_loss - if cfg['network']['stop_token']: - writer.add_scalar('stop_loss', - stop_loss.numpy(), global_step) - - if parallel: - writer.add_scalars('alphas', { - 'encoder_alpha': model._layers.encoder.alpha.numpy(), - 'decoder_alpha': model._layers.decoder.alpha.numpy(), - }, global_step) - else: - writer.add_scalars('alphas', { - 'encoder_alpha': model.encoder.alpha.numpy(), - 'decoder_alpha': model.decoder.alpha.numpy(), - }, global_step) - - writer.add_scalar('learning_rate', - optimizer._learning_rate.step().numpy(), - global_step) - - if global_step % cfg['train']['image_interval'] == 1: - for i, prob in enumerate(attn_probs): - for j in range(cfg['network']['decoder_num_head']): - x = np.uint8( - cm.viridis(prob.numpy()[j * cfg['train'][ - 'batch_size'] // 2]) * 255) - writer.add_image( - 'Attention_%d_0' % global_step, - x, - i * 4 + j, - dataformats="HWC") - - for i, prob in enumerate(attn_enc): - for j in range(cfg['network']['encoder_num_head']): - x = np.uint8( - cm.viridis(prob.numpy()[j * cfg['train'][ - 'batch_size'] // 2]) * 255) - writer.add_image( - 'Attention_enc_%d_0' % global_step, - x, - i * 4 + j, - dataformats="HWC") - - for i, prob in enumerate(attn_dec): - for j in range(cfg['network']['decoder_num_head']): - x = np.uint8( - cm.viridis(prob.numpy()[j * cfg['train'][ - 'batch_size'] // 2]) * 255) - writer.add_image( - 'Attention_dec_%d_0' % global_step, - x, - i * 4 + j, - dataformats="HWC") + # Note: When used stop token loss the learning did not work. + if cfg['network']['stop_token']: + label = (pos_mel == 0).astype(np.float32) + stop_loss = cross_entropy(stop_preds, label) + loss = loss + stop_loss + + if local_rank == 0: + writer.add_scalars('training_loss', { + 'mel_loss': mel_loss.numpy(), + 'post_mel_loss': post_mel_loss.numpy() + }, global_step) + + if cfg['network']['stop_token']: + writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) if parallel: - loss = model.scale_loss(loss) - loss.backward() - model.apply_collective_grads() + writer.add_scalars('alphas', { + 'encoder_alpha': model._layers.encoder.alpha.numpy(), + 'decoder_alpha': model._layers.decoder.alpha.numpy(), + }, global_step) else: - loss.backward() - optimizer.minimize(loss) - model.clear_gradients() - - # save checkpoint - if local_rank == 0 and global_step % cfg['train'][ - 'checkpoint_interval'] == 0: - io.save_parameters( - os.path.join(args.output, 'checkpoints'), global_step, - model, optimizer) + writer.add_scalars('alphas', { + 'encoder_alpha': model.encoder.alpha.numpy(), + 'decoder_alpha': model.decoder.alpha.numpy(), + }, global_step) + + writer.add_scalar('learning_rate', + optimizer._learning_rate.step().numpy(), + global_step) + + if global_step % cfg['train']['image_interval'] == 1: + for i, prob in enumerate(attn_probs): + for j in range(cfg['network']['decoder_num_head']): + x = np.uint8( + cm.viridis(prob.numpy()[j * cfg['train'][ + 'batch_size'] // nranks]) * 255) + writer.add_image( + 'Attention_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + + for i, prob in enumerate(attn_enc): + for j in range(cfg['network']['encoder_num_head']): + x = np.uint8( + cm.viridis(prob.numpy()[j * cfg['train'][ + 'batch_size'] // nranks]) * 255) + writer.add_image( + 'Attention_enc_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + + for i, prob in enumerate(attn_dec): + for j in range(cfg['network']['decoder_num_head']): + x = np.uint8( + cm.viridis(prob.numpy()[j * cfg['train'][ + 'batch_size'] // nranks]) * 255) + writer.add_image( + 'Attention_dec_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + + if parallel: + loss = model.scale_loss(loss) + loss.backward() + model.apply_collective_grads() + else: + loss.backward() + optimizer.minimize(loss) + model.clear_gradients() + + # save checkpoint + if local_rank == 0 and global_step % cfg['train'][ + 'checkpoint_interval'] == 0: + io.save_parameters( + os.path.join(args.output, 'checkpoints'), global_step, model, + optimizer) + global_step += 1 if local_rank == 0: writer.close() diff --git a/parakeet/models/fastspeech/length_regulator.py b/parakeet/models/fastspeech/length_regulator.py index 478d780e4182cfcd24bd6c4ffc2388c06ea21427..ecf03279cd1ed40799d8bf39a795680a77c98efc 100644 --- a/parakeet/models/fastspeech/length_regulator.py +++ b/parakeet/models/fastspeech/length_regulator.py @@ -94,7 +94,8 @@ class LengthRegulator(dg.Layer): else: duration_predictor_output = layers.round(duration_predictor_output) output = self.LR(x, duration_predictor_output, alpha) - mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1)) + mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1)).astype( + np.int64) mel_pos = layers.unsqueeze(mel_pos, [0]) return output, mel_pos