diff --git a/examples/fastspeech/parse.py b/examples/fastspeech/parse.py
index 52068d3434e9385dae65746b4e2b7231f1fe8bae..6143d411c378675c5077c38828c2a199ad1f73be 100644
--- a/examples/fastspeech/parse.py
+++ b/examples/fastspeech/parse.py
@@ -18,7 +18,7 @@ def add_config_options_to_parser(parser):
     parser.add_argument(
         '--config_path',
         type=str,
-        default='config/fastspeech.yaml',
+        default='configs/fastspeech.yaml',
         help="the yaml config file path.")
     parser.add_argument(
         '--batch_size', type=int, default=32, help="batch size for training.")
@@ -87,7 +87,7 @@ def add_config_options_to_parser(parser):
     parser.add_argument(
         '--transtts_path',
         type=str,
-        default='./log',
+        default='../transformer_tts/checkpoint',
         help="the directory to load pretrain transformerTTS model.")
     parser.add_argument(
         '--transformer_step',
diff --git a/examples/fastspeech/train.sh b/examples/fastspeech/train.sh
index 2301ab35b21a00db1fac5a7643baa039d383630b..0d5f0a34a3efea5479777703787a5345a17d5cc2 100644
--- a/examples/fastspeech/train.sh
+++ b/examples/fastspeech/train.sh
@@ -10,7 +10,7 @@ python -u train.py \
 --use_data_parallel=0 \
 --data_path='../../dataset/LJSpeech-1.1' \
 --transtts_path='../transformer_tts/checkpoint' \
---transformer_step=160000 \
+--transformer_step=120000 \
 --save_path='./checkpoint' \
 --log_dir='./log' \
 --config_path='configs/fastspeech.yaml' \
diff --git a/examples/transformer_tts/README.md b/examples/transformer_tts/README.md
index ce9cd5b179ab9cc90941f1aa7c47e9a38fc52f22..14ccc5d838d28dd1fe093f02ec5ed6659b6b4ade 100644
--- a/examples/transformer_tts/README.md
+++ b/examples/transformer_tts/README.md
@@ -55,6 +55,8 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
 
 If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``.
 
+**Note: To ensure good training quality, we recommend multi-GPU training to enlarge the batch size, with at least 16 samples per GPU in each batch.**
+
 For more help on arguments: ``python train_transformer.py --help``.
diff --git a/examples/transformer_tts/data.py b/examples/transformer_tts/data.py
index f8e85452d375c69e217271c193a43c69b4abdf4b..6ad9a9729a7c419452d27478df395319acf8363d 100644
--- a/examples/transformer_tts/data.py
+++ b/examples/transformer_tts/data.py
@@ -23,7 +23,7 @@ from parakeet import audio
 from parakeet.data.sampler import *
 from parakeet.data.datacargo import DataCargo
 from parakeet.data.batch import TextIDBatcher, SpecBatcher
-from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset
+from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset, SliceDataset
 from parakeet.models.transformer_tts.utils import *
 
 
@@ -44,7 +44,7 @@ class LJSpeechLoader:
             dataset = CacheDataset(dataset)
 
         sampler = DistributedSampler(
-            len(metadata), nranks, rank, shuffle=shuffle)
+            len(dataset), nranks, rank, shuffle=shuffle)
 
         assert args.batch_size % nranks == 0
         each_bs = args.batch_size // nranks
@@ -64,7 +64,6 @@ class LJSpeechLoader:
                 shuffle=shuffle,
                 batch_fn=batch_examples,
                 drop_last=True)
-
         self.reader = fluid.io.DataLoader.from_generator(
             capacity=32,
             iterable=True,
@@ -199,12 +198,13 @@ def batch_examples(batch):
         SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))  #(B,T,num_mels)
     mel_inputs = np.transpose(
         SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))  #(B,T,num_mels)
-    enc_slf_mask = get_attn_key_pad_mask(pos_texts, texts).astype(np.float32)
+
+    enc_slf_mask = get_attn_key_pad_mask(pos_texts).astype(np.float32)
     enc_query_mask = get_non_pad_mask(pos_texts).astype(np.float32)
     dec_slf_mask = get_dec_attn_key_pad_mask(pos_mels,
                                              mel_inputs).astype(np.float32)
-    enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0],
-                                         mel_inputs).astype(np.float32)
+    enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0]).astype(
+        np.float32)
     dec_query_slf_mask = get_non_pad_mask(pos_mels).astype(np.float32)
     dec_query_mask = get_non_pad_mask(pos_mels).astype(np.float32)
diff --git a/examples/transformer_tts/parse.py b/examples/transformer_tts/parse.py
index e7f124adda687bb47dae9a2665f915d3b9cfbbcc..f4d2387d53b1c8c3d0d75b89662fa8ac253c804b 100644
--- a/examples/transformer_tts/parse.py
+++ b/examples/transformer_tts/parse.py
@@ -18,7 +18,7 @@ def add_config_options_to_parser(parser):
     parser.add_argument(
         '--config_path',
         type=str,
-        default='config/train_transformer.yaml',
+        default='configs/train_transformer.yaml',
         help="the yaml config file path.")
     parser.add_argument(
         '--batch_size', type=int, default=32, help="batch size for training.")
diff --git a/examples/transformer_tts/train_transformer.py b/examples/transformer_tts/train_transformer.py
index b63fafc3818f3a3bae489b0b39b5432821792376..c539afe946e449278106cb372dd14c5f7d990ad6 100644
--- a/examples/transformer_tts/train_transformer.py
+++ b/examples/transformer_tts/train_transformer.py
@@ -14,7 +14,6 @@ import os
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
-#from pathlib import Path
 from collections import OrderedDict
 import argparse
 from parse import add_config_options_to_parser
@@ -69,9 +68,6 @@ def main(args):
                 cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
             parameter_list=model.parameters())
 
-        reader = LJSpeechLoader(
-            cfg, args, nranks, local_rank, shuffle=True).reader()
-
         if args.checkpoint_path is not None:
             model_dict, opti_dict = load_checkpoint(
                 str(args.transformer_step),
@@ -85,6 +81,9 @@ def main(args):
             strategy = dg.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
+        reader = LJSpeechLoader(
+            cfg, args, nranks, local_rank, shuffle=True).reader()
+
         for epoch in range(args.epochs):
             pbar = tqdm(reader)
             for i, data in enumerate(pbar):
@@ -148,7 +147,8 @@ def main(args):
                     for i, prob in enumerate(attn_probs):
                         for j in range(4):
                             x = np.uint8(
-                                cm.viridis(prob.numpy()[j * 16]) * 255)
+                                cm.viridis(prob.numpy()[j * args.batch_size
+                                                        // 2]) * 255)
                             writer.add_image(
                                 'Attention_%d_0' % global_step,
                                 x,
@@ -158,7 +158,8 @@ def main(args):
                     for i, prob in enumerate(attn_enc):
                         for j in range(4):
                             x = np.uint8(
-                                cm.viridis(prob.numpy()[j * 16]) * 255)
+                                cm.viridis(prob.numpy()[j * args.batch_size
+                                                        // 2]) * 255)
                             writer.add_image(
                                 'Attention_enc_%d_0' % global_step,
                                 x,
@@ -168,7 +169,8 @@ def main(args):
                     for i, prob in enumerate(attn_dec):
                         for j in range(4):
                             x = np.uint8(
-                                cm.viridis(prob.numpy()[j * 16]) * 255)
+                                cm.viridis(prob.numpy()[j * args.batch_size
+                                                        // 2]) * 255)
                             writer.add_image(
                                 'Attention_dec_%d_0' % global_step,
                                 x,
diff --git a/parakeet/models/transformer_tts/utils.py b/parakeet/models/transformer_tts/utils.py
index 4b525272ecaf1f1e5e55b4cfc05f55ff0a37ac3c..30c42dfef67b608c696cf8a9f78c0e40e9bdf723 100644
--- a/parakeet/models/transformer_tts/utils.py
+++ b/parakeet/models/transformer_tts/utils.py
@@ -56,15 +56,13 @@ def get_non_pad_mask(seq):
     return mask
 
 
-def get_attn_key_pad_mask(seq_k, seq_q):
+def get_attn_key_pad_mask(seq_k):
     ''' For masking out the padding part of key sequence. '''
-    # Expand to fit the shape of key query attention matrix.
-    len_q = seq_q.shape[1]
     padding_mask = (seq_k != 0).astype(np.float32)
     padding_mask = np.expand_dims(padding_mask, axis=1)
-    padding_mask = padding_mask.repeat([len_q], axis=1)
-    padding_mask = (padding_mask == 0).astype(np.float32) * (-2**32 + 1)
+    padding_mask = (
+        padding_mask == 0).astype(np.float32) * -1e30  #* (-2**32 + 1)
     return padding_mask
 
 
@@ -72,12 +70,12 @@ def get_dec_attn_key_pad_mask(seq_k, seq_q):
     ''' For masking out the padding part of key sequence. '''
 
     # Expand to fit the shape of key query attention matrix.
-    len_q = seq_q.shape[1]
     padding_mask = (seq_k == 0).astype(np.float32)
     padding_mask = np.expand_dims(padding_mask, axis=1)
     triu_tensor = get_triu_tensor(seq_q, seq_q)
-    padding_mask = padding_mask.repeat([len_q], axis=1) + triu_tensor
-    padding_mask = (padding_mask != 0).astype(np.float32) * (-2**32 + 1)
+    padding_mask = padding_mask + triu_tensor
+    padding_mask = (
+        padding_mask != 0).astype(np.float32) * -1e30  #* (-2**32 + 1)
     return padding_mask
 
 
@@ -85,12 +83,7 @@ def get_triu_tensor(seq_k, seq_q):
     ''' For make a triu tensor '''
     len_k = seq_k.shape[1]
     len_q = seq_q.shape[1]
-    batch_size = seq_k.shape[0]
     triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
-    triu_tensor = np.repeat(
-        np.expand_dims(
-            triu_tensor, axis=0), batch_size, axis=0)
-
     return triu_tensor
diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py
index 4c350b2ca0cebdd44864996e06342368d504e427..2d4792e19827470b048fa0678381621e8e0431f8 100644
--- a/parakeet/modules/multihead_attention.py
+++ b/parakeet/modules/multihead_attention.py
@@ -89,7 +89,7 @@ class ScaledDotProductAttention(dg.Layer):
         # Mask key to ignore padding
         if mask is not None:
             attention = attention + mask
-        attention = layers.softmax(attention)
+        attention = layers.softmax(attention, use_cudnn=True)
         attention = layers.dropout(
             attention, dropout, dropout_implementation='upscale_in_train')
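
The core change in the `utils.py` hunks above is that `get_attn_key_pad_mask` no longer materializes the query axis: it returns a `(B, 1, T_k)` additive mask and relies on broadcasting against the `(B, T_q, T_k)` attention logits, avoiding a `repeat` that made the mask `T_q` times larger per batch. (The fill value also moves from `-(2**32 + 1)` to `-1e30`, which still fits in float32.) Below is a minimal NumPy sketch of the broadcasting idea; the mask function mirrors the patched code, while `seq_k` and `logits` are toy data invented for illustration:

```python
import numpy as np

def get_attn_key_pad_mask(seq_k):
    # Mirrors the patched utils.py: (B, T_k) -> (B, 1, T_k) additive mask.
    padding_mask = (seq_k != 0).astype(np.float32)
    padding_mask = np.expand_dims(padding_mask, axis=1)
    return (padding_mask == 0).astype(np.float32) * -1e30

B, T_q, T_k = 2, 4, 5
seq_k = np.array([[3, 7, 2, 0, 0],
                  [5, 1, 0, 0, 0]])            # 0 marks padding
logits = np.random.randn(B, T_q, T_k).astype(np.float32)

# (B, 1, T_k) broadcasts over the query axis to (B, T_q, T_k).
masked = logits + get_attn_key_pad_mask(seq_k)
probs = np.exp(masked - masked.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)
print(probs[0, :, 3:])                         # ~0 weight on padded keys
```

This matches the call-site changes in `data.py`, where the now-unused `seq_q` argument is dropped from both `enc_slf_mask` and `enc_dec_mask`.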
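The decoder-side mask gets the same treatment: `get_triu_tensor` now returns a plain `(T_k, T_q)` strictly upper-triangular matrix with no batch axis, and `get_dec_attn_key_pad_mask` adds it directly to the `(B, 1, T)` padding mask, letting broadcasting produce the combined causal-plus-padding mask of shape `(B, T, T)`. A sketch in the same spirit (the two functions mirror the patch; the toy `pos_mels` is invented):

```python
import numpy as np

def get_triu_tensor(seq_k, seq_q):
    # Strictly upper-triangular (T_k, T_q) matrix; the old per-batch repeat is gone.
    return np.triu(np.ones([seq_k.shape[1], seq_q.shape[1]]), 1)

def get_dec_attn_key_pad_mask(seq_k, seq_q):
    padding_mask = (seq_k == 0).astype(np.float32)       # (B, T)
    padding_mask = np.expand_dims(padding_mask, axis=1)  # (B, 1, T)
    triu_tensor = get_triu_tensor(seq_q, seq_q)          # (T, T)
    padding_mask = padding_mask + triu_tensor            # broadcasts to (B, T, T)
    return (padding_mask != 0).astype(np.float32) * -1e30

pos_mels = np.array([[1, 2, 3, 0]])                      # last frame is padding
mask = get_dec_attn_key_pad_mask(pos_mels, pos_mels)
print(mask[0])  # 0 where attending to past non-pad frames, -1e30 elsewhere
```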
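One more subtle fix: `DistributedSampler` in `data.py` is now constructed with `len(dataset)` rather than `len(metadata)`. This matters whenever the wrapped dataset's length differs from the raw metadata length, which the newly imported `SliceDataset` presumably enables; sampling from the metadata length could then produce out-of-range indices. A toy sketch of the failure mode, assuming simple round-robin sharding (an assumption made for illustration; parakeet's actual `DistributedSampler` may partition indices differently):

```python
def shard_indices(num_samples, nranks, rank):
    # Rank r takes indices r, r + nranks, r + 2 * nranks, ...
    return list(range(rank, num_samples, nranks))

len_metadata, len_dataset = 13100, 1000   # e.g. full LJSpeech vs. a sliced view
bad = shard_indices(len_metadata, nranks=4, rank=0)
good = shard_indices(len_dataset, nranks=4, rank=0)
assert max(bad) >= len_dataset            # old code: indices past the end
assert max(good) < len_dataset            # fixed: indices stay in range
```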