diff --git a/examples/clarinet/utils.py b/examples/clarinet/utils.py
index a0ec74615ece50a2d2f5ae9a934c96f1334e30f8..8e2f39bc25be8179072e60b43aa8a91380b8db71 100644
--- a/examples/clarinet/utils.py
+++ b/examples/clarinet/utils.py
@@ -67,13 +67,13 @@ def save_checkpoint(model, optim, checkpoint_dir, global_step):
 
 def load_model(model, path):
     model_dict, _ = dg.load_dygraph(path)
-    model.state_dict(model_dict)
+    model.set_dict(model_dict)
     print("loaded model from {}.pdparams".format(path))
 
 
 def load_checkpoint(model, optim, path):
     model_dict, optim_dict = dg.load_dygraph(path)
-    model.state_dict(model_dict)
+    model.set_dict(model_dict)
     print("loaded model from {}.pdparams".format(path))
     if optim_dict:
         optim.set_dict(optim_dict)
diff --git a/examples/deepvoice3/utils.py b/examples/deepvoice3/utils.py
index 756d008cd20c9e0f9175456559a6bf6b218866db..1996d7d03a84e033dce8a4f0bcfb2c945074cf87 100644
--- a/examples/deepvoice3/utils.py
+++ b/examples/deepvoice3/utils.py
@@ -69,7 +69,6 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
         padding_idx=None,
         embedding_weight_std=embedding_std,
         convolutions=encoder_convolutions,
-        max_positions=max_positions,
         dropout=dropout)
     if freeze_embedding:
         freeze(enc.embed)
@@ -91,7 +90,6 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
         mel_dim,
         r=r,
         max_positions=max_positions,
-        padding_idx=padding_idx,
         preattention=prenet_convolutions,
         convolutions=attentive_convolutions,
         attention=attention,
diff --git a/examples/fastspeech/config/fastspeech.yaml b/examples/fastspeech/configs/fastspeech.yaml
similarity index 100%
rename from examples/fastspeech/config/fastspeech.yaml
rename to examples/fastspeech/configs/fastspeech.yaml
diff --git a/examples/fastspeech/config/synthesis.yaml b/examples/fastspeech/configs/synthesis.yaml
similarity index 88%
rename from examples/fastspeech/config/synthesis.yaml
rename to examples/fastspeech/configs/synthesis.yaml
index 9a43dfff4e5aef6fadf2279c3406267292d7216c..ab9dbb48e9756b0d5b8ed4a00edf608ce1e7531a 100644
--- a/examples/fastspeech/config/synthesis.yaml
+++ b/examples/fastspeech/configs/synthesis.yaml
@@ -3,8 +3,8 @@ audio:
   n_fft: 2048
   sr: 22050
   preemphasis: 0.97
-  hop_length: 275
-  win_length: 1102
+  hop_length: 256
+  win_length: 1024
   power: 1.2
   min_level_db: -100
   ref_level_db: 20
diff --git a/examples/fastspeech/parse.py b/examples/fastspeech/parse.py
index 690f4b2e44ce646505e8c9c8031e2000faeba9d1..52068d3434e9385dae65746b4e2b7231f1fe8bae 100644
--- a/examples/fastspeech/parse.py
+++ b/examples/fastspeech/parse.py
@@ -52,6 +52,12 @@ def add_config_options_to_parser(parser):
         type=int,
         default=0,
         help="use data parallel or not during training.")
 
+    parser.add_argument(
+        '--alpha',
+        type=float,
+        default=1.0,
+        help="The hyperparameter to determine the length of the expanded sequence \
+        mel, thereby controlling the voice speed.")
     parser.add_argument(
         '--data_path',
diff --git a/examples/fastspeech/synthesis.py b/examples/fastspeech/synthesis.py
index 802d4e4b9fa2e7f5ad4967e2acb62b527496784d..774a67fa032c656f4758097556405ade3fbcea2b 100644
--- a/examples/fastspeech/synthesis.py
+++ b/examples/fastspeech/synthesis.py
@@ -24,6 +24,7 @@ import paddle.fluid.dygraph as dg
 from parakeet.g2p.en import text_to_sequence
 from parakeet import audio
 from parakeet.models.fastspeech.fastspeech import FastSpeech
+from parakeet.models.transformer_tts.utils import *
 
 
 def load_checkpoint(step, model_path):
@@ -59,12 +60,26 @@ def synthesis(text_input, args):
     model.eval()
     text =
np.asarray(text_to_sequence(text_input)) - text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) + text = np.expand_dims(text, axis=0) pos_text = np.arange(1, text.shape[1] + 1) - pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) + pos_text = np.expand_dims(pos_text, axis=0) + enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32) + enc_slf_attn_mask = get_attn_key_pad_mask(pos_text, + text).astype(np.float32) + + text = dg.to_variable(text) + pos_text = dg.to_variable(pos_text) + enc_non_pad_mask = dg.to_variable(enc_non_pad_mask) + enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask) mel_output, mel_output_postnet = model( - text, pos_text, alpha=args.alpha) + text, + pos_text, + alpha=args.alpha, + enc_non_pad_mask=enc_non_pad_mask, + enc_slf_attn_mask=enc_slf_attn_mask, + dec_non_pad_mask=None, + dec_slf_attn_mask=None) _ljspeech_processor = audio.AudioProcessor( sample_rate=cfg['audio']['sr'], diff --git a/examples/fastspeech/train.py b/examples/fastspeech/train.py index f1b59a2ea2428e89a9c56b235cb648e5a761e8ab..7565ac950baa890c54741cbe770517d2f50113f8 100644 --- a/examples/fastspeech/train.py +++ b/examples/fastspeech/train.py @@ -21,6 +21,7 @@ from parse import add_config_options_to_parser from pprint import pprint from ruamel import yaml from tqdm import tqdm +from matplotlib import cm from collections import OrderedDict from tensorboardX import SummaryWriter import paddle.fluid.dygraph as dg @@ -66,12 +67,12 @@ def main(args): with dg.guard(place): with fluid.unique_name.guard(): - transformerTTS = TransformerTTS(cfg) + transformer_tts = TransformerTTS(cfg) model_dict, _ = load_checkpoint( str(args.transformer_step), os.path.join(args.transtts_path, "transformer")) - transformerTTS.set_dict(model_dict) - transformerTTS.eval() + transformer_tts.set_dict(model_dict) + transformer_tts.eval() model = FastSpeech(cfg) model.train() @@ -100,13 +101,33 @@ def main(args): for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d' % epoch) - character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data + (character, mel, mel_input, pos_text, pos_mel, text_length, + mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask, + enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data - _, _, attn_probs, _, _, _ = transformerTTS( - character, mel_input, pos_text, pos_mel) - alignment = dg.to_variable( - get_alignment(attn_probs, mel_lens, cfg[ - 'transformer_head'])).astype(np.float32) + _, _, attn_probs, _, _, _ = transformer_tts( + character, + mel_input, + pos_text, + pos_mel, + dec_slf_mask=dec_slf_mask, + enc_slf_mask=enc_slf_mask, + enc_query_mask=enc_query_mask, + enc_dec_mask=enc_dec_mask, + dec_query_slf_mask=dec_query_slf_mask, + dec_query_mask=dec_query_mask) + alignment, max_attn = get_alignment(attn_probs, mel_lens, + cfg['transformer_head']) + alignment = dg.to_variable(alignment).astype(np.float32) + + if local_rank == 0 and global_step % 5 == 1: + x = np.uint8( + cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255) + writer.add_image( + 'Attention_%d_0' % global_step, + x, + 0, + dataformats="HWC") global_step += 1 @@ -115,7 +136,11 @@ def main(args): character, pos_text, mel_pos=pos_mel, - length_target=alignment) + length_target=alignment, + enc_non_pad_mask=enc_query_mask, + enc_slf_attn_mask=enc_slf_mask, + dec_non_pad_mask=dec_query_slf_mask, + dec_slf_attn_mask=dec_slf_mask) mel_output, mel_output_postnet, duration_predictor_output, _, _ = result mel_loss = layers.mse_loss(mel_output, mel) mel_postnet_loss = 
layers.mse_loss(mel_output_postnet, mel) diff --git a/examples/fastspeech/train.sh b/examples/fastspeech/train.sh index d293c0cd59b897b97143d8f0478c01877a2960a0..11e78c4e8449c7c24ac5b51394e0a37e20428319 100644 --- a/examples/fastspeech/train.sh +++ b/examples/fastspeech/train.sh @@ -1,6 +1,6 @@ # train model # if you wish to resume from an exists model, uncomment --checkpoint_path and --fastspeech_step -CUDA_VISIBLE_DEVICES=0\ +export CUDA_VISIBLE_DEVICES=0 python -u train.py \ --batch_size=32 \ --epochs=10000 \ diff --git a/examples/transformer_tts/config/synthesis.yaml b/examples/transformer_tts/configs/synthesis.yaml similarity index 72% rename from examples/transformer_tts/config/synthesis.yaml rename to examples/transformer_tts/configs/synthesis.yaml index 217dd8511667827497575a69f503b2ed7d08d273..c23b029354a2d69c1dda2f50953eddb74c2c4c67 100644 --- a/examples/transformer_tts/config/synthesis.yaml +++ b/examples/transformer_tts/configs/synthesis.yaml @@ -8,4 +8,7 @@ audio: power: 1.2 min_level_db: -100 ref_level_db: 20 - outputs_per_step: 1 \ No newline at end of file + outputs_per_step: 1 + +hidden_size: 256 +embedding_size: 512 \ No newline at end of file diff --git a/examples/transformer_tts/config/train_transformer.yaml b/examples/transformer_tts/configs/train_transformer.yaml similarity index 100% rename from examples/transformer_tts/config/train_transformer.yaml rename to examples/transformer_tts/configs/train_transformer.yaml diff --git a/examples/transformer_tts/config/train_vocoder.yaml b/examples/transformer_tts/configs/train_vocoder.yaml similarity index 100% rename from examples/transformer_tts/config/train_vocoder.yaml rename to examples/transformer_tts/configs/train_vocoder.yaml diff --git a/examples/transformer_tts/data.py b/examples/transformer_tts/data.py index 99c6739329de9be22c1778b30a8d7353a7f0370c..f8e85452d375c69e217271c193a43c69b4abdf4b 100644 --- a/examples/transformer_tts/data.py +++ b/examples/transformer_tts/data.py @@ -23,7 +23,8 @@ from parakeet import audio from parakeet.data.sampler import * from parakeet.data.datacargo import DataCargo from parakeet.data.batch import TextIDBatcher, SpecBatcher -from parakeet.data.dataset import DatasetMixin, TransformDataset +from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset +from parakeet.models.transformer_tts.utils import * class LJSpeechLoader: @@ -40,6 +41,8 @@ class LJSpeechLoader: metadata = LJSpeechMetaData(LJSPEECH_ROOT) transformer = LJSpeech(config) dataset = TransformDataset(metadata, transformer) + dataset = CacheDataset(dataset) + sampler = DistributedSampler( len(metadata), nranks, rank, shuffle=shuffle) @@ -196,8 +199,18 @@ def batch_examples(batch): SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels) mel_inputs = np.transpose( SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels) + enc_slf_mask = get_attn_key_pad_mask(pos_texts, texts).astype(np.float32) + enc_query_mask = get_non_pad_mask(pos_texts).astype(np.float32) + dec_slf_mask = get_dec_attn_key_pad_mask(pos_mels, + mel_inputs).astype(np.float32) + enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0], + mel_inputs).astype(np.float32) + dec_query_slf_mask = get_non_pad_mask(pos_mels).astype(np.float32) + dec_query_mask = get_non_pad_mask(pos_mels).astype(np.float32) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), - np.array(mel_lens)) + np.array(mel_lens), enc_slf_mask, enc_query_mask, dec_slf_mask, + enc_dec_mask, dec_query_slf_mask, dec_query_mask) def 
batch_examples_vocoder(batch): diff --git a/examples/transformer_tts/synthesis.py b/examples/transformer_tts/synthesis.py index de833626980b25f759dbc383afd74754b7a08ea9..2896634feaa95a2e619da15aa675644564d99f45 100644 --- a/examples/transformer_tts/synthesis.py +++ b/examples/transformer_tts/synthesis.py @@ -16,6 +16,7 @@ from scipy.io.wavfile import write from parakeet.g2p.en import text_to_sequence import numpy as np from tqdm import tqdm +from matplotlib import cm from tensorboardX import SummaryWriter from ruamel import yaml import paddle.fluid as fluid @@ -25,6 +26,7 @@ import argparse from parse import add_config_options_to_parser from pprint import pprint from collections import OrderedDict +from parakeet.models.transformer_tts.utils import * from parakeet import audio from parakeet.models.transformer_tts.vocoder import Vocoder from parakeet.models.transformer_tts.transformer_tts import TransformerTTS @@ -78,14 +80,18 @@ def synthesis(text_input, args): pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) pbar = tqdm(range(args.max_len)) - for i in pbar: + dec_slf_mask = get_triu_tensor( + mel_input.numpy(), mel_input.numpy()).astype(np.float32) + dec_slf_mask = fluid.layers.cast( + dg.to_variable(dec_slf_mask == 0), np.float32) pos_mel = np.arange(1, mel_input.shape[1] + 1) pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0]) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( - text, mel_input, pos_text, pos_mel) + text, mel_input, pos_text, pos_mel, dec_slf_mask) mel_input = fluid.layers.concat( [mel_input, postnet_pred[:, -1:, :]], axis=1) + mag_pred = model_vocoder(postnet_pred) _ljspeech_processor = audio.AudioProcessor( @@ -111,6 +117,33 @@ def synthesis(text_input, args): wav = _ljspeech_processor.inv_spectrogram( fluid.layers.transpose( fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy()) + global_step = 0 + for i, prob in enumerate(attn_probs): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j]) * 255) + writer.add_image( + 'Attention_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + + for i, prob in enumerate(attn_enc): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j]) * 255) + writer.add_image( + 'Attention_enc_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + + for i, prob in enumerate(attn_dec): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j]) * 255) + writer.add_image( + 'Attention_dec_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) if not os.path.exists(args.sample_path): os.mkdir(args.sample_path) @@ -124,4 +157,6 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description="Synthesis model") add_config_options_to_parser(parser) args = parser.parse_args() - synthesis("Transformer model is so fast!", args) + synthesis( + "They emphasized the necessity that the information now being furnished be handled with judgment and care.", + args) diff --git a/examples/transformer_tts/synthesis.sh b/examples/transformer_tts/synthesis.sh index 8cb137ac25e94f876c9ebd0a08708a259f0406b6..42b704da2477a47ba7bb8042e620c32197bc7000 100644 --- a/examples/transformer_tts/synthesis.sh +++ b/examples/transformer_tts/synthesis.sh @@ -2,10 +2,10 @@ # train model CUDA_VISIBLE_DEVICES=0 \ python -u synthesis.py \ ---max_len=50 \ +--max_len=600 \ --transformer_step=160000 \ ---vocoder_step=70000 \ ---use_gpu=1 +--vocoder_step=90000 \ +--use_gpu=1 \ --checkpoint_path='./checkpoint' \ --log_dir='./log' \ 
--sample_path='./sample' \ diff --git a/examples/transformer_tts/train_transformer.py b/examples/transformer_tts/train_transformer.py index f3dd0231b052d1f837eb4fbd9e7b3b4efda70f79..b63fafc3818f3a3bae489b0b39b5432821792376 100644 --- a/examples/transformer_tts/train_transformer.py +++ b/examples/transformer_tts/train_transformer.py @@ -14,7 +14,7 @@ import os from tqdm import tqdm from tensorboardX import SummaryWriter -from pathlib import Path +#from pathlib import Path from collections import OrderedDict import argparse from parse import add_config_options_to_parser @@ -89,21 +89,31 @@ def main(args): pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d' % epoch) - character, mel, mel_input, pos_text, pos_mel, text_length, _ = data + character, mel, mel_input, pos_text, pos_mel, text_length, _, enc_slf_mask, enc_query_mask, dec_slf_mask, enc_dec_mask, dec_query_slf_mask, dec_query_mask = data global_step += 1 - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( - character, mel_input, pos_text, pos_mel) - label = (pos_mel == 0).astype(np.float32) + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( + character, + mel_input, + pos_text, + pos_mel, + dec_slf_mask=dec_slf_mask, + enc_slf_mask=enc_slf_mask, + enc_query_mask=enc_query_mask, + enc_dec_mask=enc_dec_mask, + dec_query_slf_mask=dec_query_slf_mask, + dec_query_mask=dec_query_mask) mel_loss = layers.mean( layers.abs(layers.elementwise_sub(mel_pred, mel))) post_mel_loss = layers.mean( layers.abs(layers.elementwise_sub(postnet_pred, mel))) loss = mel_loss + post_mel_loss + # Note: When used stop token loss the learning did not work. if args.stop_token: + label = (pos_mel == 0).astype(np.float32) stop_loss = cross_entropy(stop_preds, label) loss = loss + stop_loss diff --git a/examples/transformer_tts/train_transformer.sh b/examples/transformer_tts/train_transformer.sh index cdb24cfb39fa149980e12701b4e7304d509cbc40..346d3512fefab0e80238684f454f55e35b5b3b7c 100644 --- a/examples/transformer_tts/train_transformer.sh +++ b/examples/transformer_tts/train_transformer.sh @@ -1,7 +1,7 @@ # train model # if you wish to resume from an exists model, uncomment --checkpoint_path and --transformer_step -CUDA_VISIBLE_DEVICES=0 \ +export CUDA_VISIBLE_DEVICES=2 python -u train_transformer.py \ --batch_size=32 \ --epochs=10000 \ diff --git a/examples/waveflow/README.md b/examples/waveflow/README.md index e21039a7c3553f83e90ba1bc66734a37e0cd4698..d36f0f30f13dc42e30e2feed71e432b1f12c000f 100644 --- a/examples/waveflow/README.md +++ b/examples/waveflow/README.md @@ -4,7 +4,7 @@ PaddlePaddle dynamic graph implementation of [WaveFlow: A Compact Flow-based Mod - WaveFlow can synthesize 22.05 kHz high-fidelity speech around 40x faster than real-time on a Nvidia V100 GPU without engineered inference kernels, which is faster than [WaveGlow] (https://github.com/NVIDIA/waveglow) and serveral orders of magnitude faster than WaveNet. - WaveFlow is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smalller than WaveGlow (87.9M) and comparable to WaveNet (4.6M). -- WaveFlow is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in Parallel WaveNet and ClariNet, which simplifies the training pipeline and reduces the cost of development. 
+- WaveFlow is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in Parallel WaveNet and ClariNet, which simplifies the training pipeline and reduces the cost of development. ## Project Structure ```text @@ -99,7 +99,7 @@ python -u synthesis.py \ --sigma=1.0 ``` -In this example, `--output` specifies where to save the synthesized audios and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, by default contains the first 16 audio samples) to synthesize based on the mel-spectrograms computed from the ground truth sample audio, e.g., `--sample=0` means to synthesize the first audio in the valid dataset. +In this example, `--output` specifies where to save the synthesized audios and `--sample` (<16) specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, by default contains the first 16 audio samples) to synthesize based on the mel-spectrograms computed from the ground truth sample audio, e.g., `--sample=0` means to synthesize the first audio in the valid dataset. ### Benchmarking diff --git a/examples/waveflow/utils.py b/examples/waveflow/utils.py index da9b4ba90b2099c21204be56b407d0748d95762f..b89907378c46552b1289f2386d0b09962d1db0a9 100644 --- a/examples/waveflow/utils.py +++ b/examples/waveflow/utils.py @@ -109,6 +109,16 @@ def add_yaml_config(config): def load_latest_checkpoint(checkpoint_dir, rank=0): + """Get the iteration number corresponding to the latest saved checkpoint + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + rank (int, optional): the rank of the process in multi-process setting. + Defaults to 0. + + Returns: + int: the latest iteration number. + """ checkpoint_path = os.path.join(checkpoint_dir, "checkpoint") # Create checkpoint index file if not exist. if (not os.path.isfile(checkpoint_path)) and rank == 0: @@ -129,6 +139,15 @@ def load_latest_checkpoint(checkpoint_dir, rank=0): def save_latest_checkpoint(checkpoint_dir, iteration): + """Save the iteration number of the latest model to be checkpointed. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. + + Returns: + None + """ checkpoint_path = os.path.join(checkpoint_dir, "checkpoint") # Update the latest checkpoint index. with open(checkpoint_path, "w") as handle: @@ -142,6 +161,24 @@ def load_parameters(checkpoint_dir, iteration=None, file_path=None, dtype="float32"): + """Load a specific model checkpoint from disk. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + rank (int): the rank of the process in multi-process setting. + model (obj): model to load parameters. + optimizer (obj, optional): optimizer to load states if needed. + Defaults to None. + iteration (int, optional): if specified, load the specific checkpoint, + if not specified, load the latest one. Defaults to None. + file_path (str, optional): if specified, load the checkpoint + stored in the file_path. Defaults to None. + dtype (str, optional): precision of the model parameters. + Defaults to float32. + + Returns: + None + """ if file_path is None: if iteration is None: iteration = load_latest_checkpoint(checkpoint_dir, rank) @@ -165,6 +202,18 @@ def load_parameters(checkpoint_dir, def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None): + """Checkpoint the latest trained model parameters. + + Args: + checkpoint_dir (str): the directory where checkpoint is saved. 
+ iteration (int): the latest iteration number. + model (obj): model to be checkpointed. + optimizer (obj, optional): optimizer to be checkpointed. + Defaults to None. + + Returns: + None + """ file_path = "{}/step-{}".format(checkpoint_dir, iteration) model_dict = model.state_dict() dg.save_dygraph(model_dict, file_path) diff --git a/parakeet/data/batch.py b/parakeet/data/batch.py index 22c24e4b1af362464922e68c75c4b24a39df523e..6a7f35d4ce027ff6dd6fccc098807f84ab0c5246 100644 --- a/parakeet/data/batch.py +++ b/parakeet/data/batch.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -functions to make batch for arrays which satisfy some conditions. +Utility functions to create batch for arrays which satisfy some conditions. +Batch functions for text sequences, audio and spectrograms are provided. """ import numpy as np class TextIDBatcher(object): - """A wrapper class for a function to build a functor, which holds the configs to pass to the function.""" + """A wrapper class for `batch_text_id`.""" def __init__(self, pad_id=0, dtype=np.int64): self.pad_id = pad_id @@ -30,9 +31,15 @@ class TextIDBatcher(object): def batch_text_id(minibatch, pad_id=0, dtype=np.int64): - """ - minibatch: List[Example] - Example: ndarray, shape(T,), dtype: int64 + """Pad sequences to text_ids to the largest length and batch them. + + Args: + minibatch (List[np.ndarray]): list of rank-1 arrays, shape(T,), dtype: np.int64, text_ids. + pad_id (int, optional): the id which correspond to the special pad token. Defaults to 0. + dtype (np.dtype, optional): the data dtype of the output. Defaults to np.int64. + + Returns: + np.ndarray: rank-2 array of text_ids, shape(B, T), B stands for batch_size, T stands for length. The output batch. """ peek_example = minibatch[0] assert len(peek_example.shape) == 1, "text example is an 1D tensor" @@ -53,6 +60,8 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): class WavBatcher(object): + """A wrapper class for `batch_wav`.""" + def __init__(self, pad_value=0., dtype=np.float32): self.pad_value = pad_value self.dtype = dtype @@ -63,19 +72,25 @@ class WavBatcher(object): def batch_wav(minibatch, pad_value=0., dtype=np.float32): + """pad audios to the largest length and batch them. + + Args: + minibatch (List[np.ndarray]): list of rank-1 float arrays(mono-channel audio, shape(T,)) or list of rank-2 float arrays(multi-channel audio, shape(C, T), C stands for numer of channels, T stands for length), dtype: float. + pad_value (float, optional): the pad value. Defaults to 0.. + dtype (np.dtype, optional): the data type of the output. Defaults to np.float32. + + Returns: + np.ndarray: the output batch. It is a rank-2 float array of shape(B, T) if the minibatch is a list of mono-channel audios, or a rank-3 float array of shape(B, C, T) if the minibatch is a list of multi-channel audios. 
""" - minibatch: List[Example] - Example: ndarray, shape(C, T) for multi-channel wav, shape(T,) for mono-channel wav, dtype: float32 - """ - # detect data format, maybe better to specify it in __init__ + peek_example = minibatch[0] if len(peek_example.shape) == 1: mono_channel = True elif len(peek_example.shape) == 2: mono_channel = False - lengths = [example.shape[-1] for example in minibatch - ] # assume (channel, n_samples) or (n_samples, ) + # assume (channel, n_samples) or (n_samples, ) + lengths = [example.shape[-1] for example in minibatch] max_len = np.max(lengths) batch = [] @@ -90,12 +105,14 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): batch.append( np.pad(example, [(0, 0), (0, pad_len)], mode='constant', - constant_values=pad_value)) # what about PCM, no + constant_values=pad_value)) return np.array(batch, dtype=dtype) class SpecBatcher(object): + """A wrapper class for `batch_spec`""" + def __init__(self, pad_value=0., dtype=np.float32): self.pad_value = pad_value self.dtype = dtype @@ -106,9 +123,15 @@ class SpecBatcher(object): def batch_spec(minibatch, pad_value=0., dtype=np.float32): - """ - minibatch: List[Example] - Example: ndarray, shape(C, F, T) for multi-channel spectrogram, shape(F, T) for mono-channel spectrogram, dtype: float32 + """Pad spectra to the largest length and batch them. + + Args: + minibatch (List[np.ndarray]): list of rank-2 arrays of shape(F, T) for mono-channel spectrograms, or list of rank-3 arrays of shape(C, F, T) for multi-channel spectrograms(F stands for frequency bands.), dtype: float. + pad_value (float, optional): the pad value. Defaults to 0.. + dtype (np.dtype, optional): data type of the output. Defaults to np.float32. + + Returns: + np.ndarray: a rank-3 array of shape(B, F, T) when the minibatch is a list of mono-channel spectrograms, or a rank-4 array of shape(B, C, F, T) when the minibatch is a list of multi-channel spectorgrams. """ # assume (F, T) or (C, F, T) peek_example = minibatch[0] @@ -117,8 +140,8 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): elif len(peek_example.shape) == 3: mono_channel = False - lengths = [example.shape[-1] for example in minibatch - ] # assume (channel, F, n_frame) or (F, n_frame) + # assume (channel, F, n_frame) or (F, n_frame) + lengths = [example.shape[-1] for example in minibatch] max_len = np.max(lengths) batch = [] @@ -133,6 +156,6 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): batch.append( np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', - constant_values=pad_value)) # what about PCM, no + constant_values=pad_value)) return np.array(batch, dtype=dtype) diff --git a/parakeet/data/datacargo.py b/parakeet/data/datacargo.py index c8b4547a068103e0eb52a0bdce5f3343f9645938..a88829ce7d033cb6634339ff027f2ee14c59f0e5 100644 --- a/parakeet/data/datacargo.py +++ b/parakeet/data/datacargo.py @@ -25,6 +25,17 @@ class DataCargo(object): shuffle=False, batch_sampler=None, drop_last=False): + """An Iterable object of batches. It requires a dataset, a batch function and a sampler. The sampler yields the example ids, then the corresponding examples in the dataset are collected and transformed into a batch with the batch function. + + Args: + dataset (Dataset): the dataset used to build a data cargo. + batch_fn (callable, optional): a callable that takes a list of examples of `dataset` and return a batch, it can be None if the dataset has a `_batch_examples` method which satisfy the requirement. Defaults to None. 
+ batch_size (int, optional): number of examples in a batch. Defaults to 1. + sampler (Sampler, optional): an iterable of example ids(intergers), the example ids are used to pick examples. Defaults to None. + shuffle (bool, optional): when sampler is not provided, shuffle = True creates a RandomSampler and shuffle=False creates a SequentialSampler internally. Defaults to False. + batch_sampler (BatchSampler, optional): an iterable of lists of example ids(intergers), the list is used to pick examples, `batch_sampler` option is mutually exclusive with `batch_size`, `shuffle`, `sampler`, and `drop_last`. Defaults to None. + drop_last (bool, optional): whether to drop the last minibatch. Defaults to False. + """ self.dataset = dataset self.batch_fn = batch_fn or self.dataset._batch_examples @@ -59,11 +70,12 @@ class DataCargo(object): return DataIterator(self) def __call__(self): + # protocol for paddle's DataLoader return DataIterator(self) @property def _auto_collation(self): - # we will auto batching + # use auto batching return self.batch_sampler is not None @property @@ -79,6 +91,11 @@ class DataCargo(object): class DataIterator(object): def __init__(self, loader): + """Iterator object of DataCargo. + + Args: + loader (DataCargo): the data cargo to iterate. + """ self.loader = loader self._dataset = loader.dataset @@ -90,11 +107,9 @@ class DataIterator(object): return self def __next__(self): - - index = self._next_index( - ) # may raise StopIteration, TODO(chenfeiyu): use dynamic batch size - minibatch = [self._dataset[i] for i in index - ] # we can abstract it, too to use dynamic batch size + # TODO(chenfeiyu): use dynamic batch size + index = self._next_index() + minibatch = [self._dataset[i] for i in index] minibatch = self._batch_fn(minibatch) # list[Example] -> Batch return minibatch diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py index 6ab4ebb32c1b915ee5cc520c7a841b90aeac3515..87ef39325ae2c3d83d0708795be2d80f730ca1bd 100644 --- a/parakeet/data/dataset.py +++ b/parakeet/data/dataset.py @@ -14,12 +14,27 @@ import six import numpy as np +from tqdm import tqdm class DatasetMixin(object): - """standard indexing interface for dataset.""" + """Standard indexing interface for dataset. Inherit this class to + get the indexing interface. Since it is a mixin class which does + not have an `__init__` class, the subclass not need to call + `super().__init__()`. + """ def __getitem__(self, index): + """Standard indexing interface for dataset. + + Args: + index (slice, list[int], np.array or int): the index. if can be int, slice, list of integers, or ndarray of integers. It calls `get_example` to pick an example. + + Returns: + Example, or List[Example]: If `index` is an interger, it returns an + example. If `index` is a slice, a list of intergers or an array of intergers, + it returns a list of examples. + """ if isinstance(index, slice): start, stop, step = index.indices(len(self)) return [ @@ -32,6 +47,12 @@ class DatasetMixin(object): return self.get_example(index) def get_example(self, i): + """Get an example from the dataset. Custom datasets should have + this method implemented. + + Args: + i (int): example index. + """ raise NotImplementedError def __len__(self): @@ -43,9 +64,13 @@ class DatasetMixin(object): class TransformDataset(DatasetMixin): - """Transform a dataset to another with a transform.""" - def __init__(self, dataset, transform): + """Dataset which is transformed from another with a transform. + + Args: + dataset (DatasetMixin): the base dataset. 
+ transform (callable): the transform which takes an example of the base dataset as parameter and return a new example. + """ self._dataset = dataset self._transform = transform @@ -53,14 +78,17 @@ class TransformDataset(DatasetMixin): return len(self._dataset) def get_example(self, i): - # CAUTION: only int is supported? - # CAUTION: dataset support support __getitem__ and __len__ in_data = self._dataset[i] return self._transform(in_data) class CacheDataset(DatasetMixin): def __init__(self, dataset): + """A lazy cache of the base dataset. + + Args: + dataset (DatasetMixin): the base dataset to cache. + """ self._dataset = dataset self._cache = dict() @@ -75,6 +103,11 @@ class CacheDataset(DatasetMixin): class TupleDataset(object): def __init__(self, *datasets): + """A compound dataset made from several datasets of the same length. An example of the `TupleDataset` is a tuple of examples from the constituent datasets. + + Args: + datasets: tuple[DatasetMixin], the constituent datasets. + """ if not datasets: raise ValueError("no datasets are given") length = len(datasets[0]) @@ -105,6 +138,11 @@ class TupleDataset(object): class DictDataset(object): def __init__(self, **datasets): + """A compound dataset made from several datasets of the same length. An example of the `DictDataset` is a dict of examples from the constituent datasets. + + Args: + datasets: Dict[DatasetMixin], the constituent datasets. + """ if not datasets: raise ValueError("no datasets are given") length = None @@ -134,6 +172,14 @@ class DictDataset(object): class SliceDataset(DatasetMixin): def __init__(self, dataset, start, finish, order=None): + """A Dataset which is a slice of the base dataset. + + Args: + dataset (DatasetMixin): the base dataset. + start (int): the start of the slice. + finish (int): the end of the slice, not inclusive. + order (List[int], optional): the order, it is a permutation of the valid example ids of the base dataset. If `order` is provided, the slice is taken in `order`. Defaults to None. + """ if start < 0 or finish > len(dataset): raise ValueError("subset overruns the dataset.") self._dataset = dataset @@ -168,6 +214,12 @@ class SliceDataset(DatasetMixin): class SubsetDataset(DatasetMixin): def __init__(self, dataset, indices): + """A Dataset which is a subset of the base dataset. + + Args: + dataset (DatasetMixin): the base dataset. + indices (Iterable[int]): the indices of the examples to pick. + """ self._dataset = dataset if len(indices) > len(dataset): raise ValueError("subset's size larger that dataset's size!") @@ -184,6 +236,12 @@ class SubsetDataset(DatasetMixin): class FilterDataset(DatasetMixin): def __init__(self, dataset, filter_fn): + """A filtered dataset. + + Args: + dataset (DatasetMixin): the base dataset. + filter_fn (callable): a callable which takes an example of the base dataset and return a boolean. + """ self._dataset = dataset self._indices = [ i for i in range(len(dataset)) if filter_fn(dataset[i]) @@ -200,6 +258,11 @@ class FilterDataset(DatasetMixin): class ChainDataset(DatasetMixin): def __init__(self, *datasets): + """A concatenation of the several datasets which the same structure. + + Args: + datasets (Iterable[DatasetMixin]): datasets to concat. 
+ """ self._datasets = datasets def __len__(self): diff --git a/parakeet/data/sampler.py b/parakeet/data/sampler.py index b4ef097706f2563fc47bd7c0d7b388778f563ce0..2b9d4908fe8509ba0c3d0f68c976b33ee5263bb6 100644 --- a/parakeet/data/sampler.py +++ b/parakeet/data/sampler.py @@ -14,7 +14,7 @@ """ At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__. -This suffices for a sampler. We implemente sampler as iterable of valid indices. By valid, we mean 0 <= index < N, where N is the length of the dataset. We then collect several indices within a batch and use it to collect examples from the dataset with __getitem__. Then collate this examples to form a batch. +This suffices for a sampler. We implemente sampler as iterable of valid indices. By valid, we mean 0 <= index < N, where N is the length of the dataset. We then collect several indices within a batch and use them to collect examples from the dataset with __getitem__. Then transform these examples into a batch. So the sampler is only responsible for generating valid indices. """ @@ -24,9 +24,6 @@ import random class Sampler(object): - def __init__(self, data_source): - pass - def __iter__(self): # return a iterator of indices # or a iterator of list[int], for BatchSampler @@ -35,6 +32,11 @@ class Sampler(object): class SequentialSampler(Sampler): def __init__(self, data_source): + """Sequential sampler, the simplest sampler that samples indices from 0 to N - 1, where N is the dataset is length. + + Args: + data_source (DatasetMixin): the dataset. This is used to get the dataset's length. + """ self.data_source = data_source def __iter__(self): @@ -46,6 +48,13 @@ class SequentialSampler(Sampler): class RandomSampler(Sampler): def __init__(self, data_source, replacement=False, num_samples=None): + """Random sampler. + + Args: + data_source (DatasetMixin): the dataset. This is used to get the dataset's length. + replacement (bool, optional): whether replacement is enabled in sampling. When `replacement` is True, `num_samples` must be provided. Defaults to False. + num_samples (int, optional): numbers of indices to draw. This option should only be provided when replacement is True. Defaults to None. + """ self.data_source = data_source self.replacement = replacement self._num_samples = num_samples @@ -66,7 +75,6 @@ class RandomSampler(Sampler): @property def num_samples(self): - # dataset size might change at runtime if self._num_samples is None: return len(self.data_source) return self._num_samples @@ -84,12 +92,16 @@ class RandomSampler(Sampler): class SubsetRandomSampler(Sampler): - r"""Samples elements randomly from a given list of indices, without replacement. + """Samples elements randomly from a given list of indices, without replacement. Arguments: indices (sequence): a sequence of indices """ def __init__(self, indices): + """ + Args: + indices (List[int]): indices to sample from. + """ self.indices = indices def __iter__(self): @@ -112,6 +124,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): batch_size=4, batch_group_size=None, permutate=True): + """[summary] + + Args: + lengths (List[int]): The length of the examples of the dataset. This is the key to be considered as 'time length'. + batch_size (int, optional): batch size. Defaults to 4. + batch_group_size (int, optional): the size of a small batch. Random shuffling is applied within such patches. 
If `batch_group_size` is not provided, it is set to min(batch_size * 32, len(self.lengths)). Batch_group_size should be perfectly divided by batch_size. Defaults to None. + permutate (bool, optional): permutate batches. Defaults to True. + """ _lengths = np.array( lengths, dtype=np.int64) # maybe better implement length as a sort key @@ -157,13 +177,11 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): class WeightedRandomSampler(Sampler): - r"""Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights). + """Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights). Args: - weights (sequence) : a sequence of weights, not necessary summing up to one - num_samples (int): number of samples to draw - replacement (bool): if ``True``, samples are drawn with replacement. - If not, they are drawn without replacement, which means that when a - sample index is drawn for a row, it cannot be drawn again for that row. + weights (List[float]): a sequence of weights, not necessary summing up to 1. + num_samples (int): number of samples to draw. + replacement (bool): whether samples are drawn with replacement. When replacement is False, num_samples should not be larger than len(weights). Example: >>> list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 3.0, 0.6], 5, replacement=True)) [0, 0, 0, 1, 0] @@ -179,6 +197,10 @@ class WeightedRandomSampler(Sampler): self.weights = np.array(weights, dtype=np.float64) self.num_samples = num_samples self.replacement = replacement + if replacement is False and num_samples > len(weights): + raise ValueError( + "when replacement is False, num_samples should not be" + "larger that length of weight.") def __iter__(self): return iter( @@ -194,6 +216,21 @@ class WeightedRandomSampler(Sampler): class DistributedSampler(Sampler): def __init__(self, dataset_size, num_trainers, rank, shuffle=True): + """Sampler used for data parallel training. Indices are divided into num_trainers parts. Each trainer gets a subset and iter that subset. If the dataset has 16 examples, and there are 4 trainers. + + Trainer 0 gets [0, 4, 8, 12]; + Trainer 1 gets [1, 5, 9, 13]; + Trainer 2 gets [2, 6, 10, 14]; + trainer 3 gets [3, 7, 11, 15]. + + It ensures that trainer get different parts of the dataset. If dataset's length cannot be perfectly devidef by num_trainers, some examples appended to the dataset, to ensures that every trainer gets the same amounts of examples. + + Args: + dataset_size (int): the length of the dataset. + num_trainers (int): number of trainers(training processes). + rank (int): local rank of the trainer. + shuffle (bool, optional): whether to shuffle the indices before iteration. Defaults to True. + """ self.dataset_size = dataset_size self.num_trainers = num_trainers self.rank = rank @@ -222,20 +259,20 @@ class DistributedSampler(Sampler): class BatchSampler(Sampler): - r"""Wraps another sampler to yield a mini-batch of indices. - Args: - sampler (Sampler): Base sampler. - batch_size (int): Size of mini-batch. 
- drop_last (bool): If ``True``, the sampler will drop the last batch if - its size would be less than ``batch_size`` - Example: - >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False)) - [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] - >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True)) - [[0, 1, 2], [3, 4, 5], [6, 7, 8]] - """ + """Wraps another sampler to yield a mini-batch of indices.""" def __init__(self, sampler, batch_size, drop_last): + """ + Args: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. + drop_last (bool): If True, the sampler will drop the last batch if its size is less than batch_size. + Example: + >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False)) + [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] + >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True)) + [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + """ if not isinstance(sampler, Sampler): raise ValueError("sampler should be an instance of " "Sampler, but got sampler={}".format(sampler)) diff --git a/parakeet/datasets/ljspeech.py b/parakeet/datasets/ljspeech.py index 62209e9219a50855fbd94f7deab96537407c4d18..3ab8ac92edd06b5411033ea9407545aa6ee6dd75 100644 --- a/parakeet/datasets/ljspeech.py +++ b/parakeet/datasets/ljspeech.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pathlib import Path +import os import numpy as np import pandas as pd import librosa @@ -27,13 +27,11 @@ from ..data.batch import TextIDBatcher, SpecBatcher class LJSpeech(DatasetMixin): def __init__(self, root): super(LJSpeech, self).__init__() - assert isinstance(root, ( - str, Path)), "root should be a string or Path object" - self.root = root if isinstance(root, Path) else Path(root) + self.root = root self.metadata = self._prepare_metadata() def _prepare_metadata(self): - csv_path = self.root.joinpath("metadata.csv") + csv_path = os.path.join(self.root, "metadata.csv") metadata = pd.read_csv( csv_path, sep="|", @@ -51,7 +49,7 @@ class LJSpeech(DatasetMixin): """ fname, raw_text, normalized_text = metadatum - wav_path = self.root.joinpath("wavs", fname + ".wav") + wav_path = os.path.join(self.root, "wavs", fname + ".wav") # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize wav, sample_rate = librosa.load( diff --git a/parakeet/models/clarinet/net.py b/parakeet/models/clarinet/net.py index 35f0f03a668da42c7a27a6f6f31015382a9e817e..52adbce116d8f0beb09ea64e339a358cd981398d 100644 --- a/parakeet/models/clarinet/net.py +++ b/parakeet/models/clarinet/net.py @@ -37,28 +37,41 @@ class Clarinet(dg.Layer): stft, min_log_scale=-6.0, lmd=4.0): + """Clarinet model. + + Args: + encoder (UpsampleNet): an UpsampleNet to upsample mel spectrogram. + teacher (WaveNet): a WaveNet, the teacher. + student (ParallelWaveNet): a ParallelWaveNet model, the student. + stft (STFT): a STFT model to perform differentiable stft transform. + min_log_scale (float, optional): used only for computing loss, the minimal value of log standard deviation of the output distribution of both the teacher and the student . Defaults to -6.0. + lmd (float, optional): weight for stft loss. Defaults to 4.0. 
+ """ super(Clarinet, self).__init__() - self.lmd = lmd self.encoder = encoder self.teacher = teacher self.student = student + self.stft = stft + self.lmd = lmd self.min_log_scale = min_log_scale - self.stft = stft def forward(self, audio, mel, audio_start, clip_kl=True): - """Compute loss for a distill model - - Arguments: - audio {Variable} -- shape(batch_size, time_steps), target waveform. - mel {Variable} -- shape(batch_size, condition_dim, time_steps // hop_length), original mel spectrogram, not upsampled yet. - audio_starts {Variable} -- shape(batch_size, ), the index of the start sample. - clip_kl (bool) -- whether to clip kl divergence if it is greater than 10.0. - + """Compute loss of Clarinet model. + + Args: + audio (Variable): shape(B, T_audio), dtype: float, ground truth waveform. + mel (Variable): shape(B, F, T_mel), dtype: float, condition(mel spectrogram here). + audio_start (Variable): shape(B, ), dtype: int64, audio starts positions. + clip_kl (bool, optional): whether to clip kl_loss by maximum=100. Defaults to True. + Returns: - Variable -- shape(1,), loss + Dict(str, Variable) + loss (Variable): shape(1, ), dtype: float, total loss. + kl (Variable): shape(1, ), dtype: float, kl divergence between the teacher's output distribution and student's output distribution. + regularization (Variable): shape(1, ), dtype: float, a regularization term of the KL divergence. + spectrogram_frame_loss (Variable): shape(1, ), dytpe: float, stft loss, the L1-distance of the magnitudes of the spectrograms of the ground truth waveform and synthesized waveform. """ - batch_size, audio_length = audio.shape # audio clip's length z = F.gaussian_random(audio.shape) @@ -104,13 +117,13 @@ class Clarinet(dg.Layer): @dg.no_grad def synthesis(self, mel): - """Synthesize waveform conditioned on the mel spectrogram. - - Arguments: - mel {Variable} -- shape(batch_size, frequqncy_bands, frames) - + """Synthesize waveform using the encoder and the student network. + + Args: + mel (Variable): shape(B, F, T_mel), the condition(mel spectrogram here). + Returns: - Variable -- shape(batch_size, frames * upsample_factor) + Variable: shape(B, T_audio), the synthesized waveform. (T_audio = T_mel * upscale_factor, where upscale_factor is the `upscale_factor` of the encoder.) """ condition = self.encoder(mel) samples_shape = (condition.shape[0], condition.shape[-1]) @@ -121,6 +134,14 @@ class Clarinet(dg.Layer): class STFT(dg.Layer): def __init__(self, n_fft, hop_length, win_length, window="hanning"): + """A module for computing differentiable stft transform. See `librosa.stft` for more details. + + Args: + n_fft (int): number of samples in a frame. + hop_length (int): number of samples shifted between adjacent frames. + win_length (int): length of the window function. + window (str, optional): name of window function, see `scipy.signal.get_window` for more details. Defaults to "hanning". + """ super(STFT, self).__init__() self.hop_length = hop_length self.n_bin = 1 + n_fft // 2 @@ -146,6 +167,16 @@ class STFT(dg.Layer): self.weight = dg.to_variable(w) def forward(self, x): + """Compute the stft transform. + + Args: + x (Variable): shape(B, T), dtype: float, the input waveform. + + Returns: + (real, imag) + real (Variable): shape(B, C, 1, T), dtype: float, the real part of the spectrogram. (C = 1 + n_fft // 2) + imag (Variable): shape(B, C, 1, T), dtype: float, the image part of the spectrogram. 
(C = 1 + n_fft // 2) + """ # x(batch_size, time_steps) # pad it first with reflect mode pad_start = F.reverse(x[:, 1:1 + self.n_fft // 2], axis=1) @@ -159,11 +190,31 @@ class STFT(dg.Layer): return real, imag def power(self, x): + """Compute the power spectrogram. + + Args: + (real, imag) + real (Variable): shape(B, C, 1, T), dtype: float, the real part of the spectrogram. + imag (Variable): shape(B, C, 1, T), dtype: float, the image part of the spectrogram. + + Returns: + Variable: shape(B, C, 1, T), dtype: float, the power spectrogram. + """ real, imag = self(x) power = real**2 + imag**2 return power def magnitude(self, x): + """Compute the magnitude spectrogram. + + Args: + (real, imag) + real (Variable): shape(B, C, 1, T), dtype: float, the real part of the spectrogram. + imag (Variable): shape(B, C, 1, T), dtype: float, the image part of the spectrogram. + + Returns: + Variable: shape(B, C, 1, T), dtype: float, the magnitude spectrogram. It is the square root of the power spectrogram. + """ power = self.power(x) magnitude = F.sqrt(power) return magnitude diff --git a/parakeet/models/clarinet/parallel_wavenet.py b/parakeet/models/clarinet/parallel_wavenet.py index be30b7bd86c88e0002f24a10ac4e4ce2259ce5a2..ec297d555c40cef9b21d253ceddc13efb1370713 100644 --- a/parakeet/models/clarinet/parallel_wavenet.py +++ b/parakeet/models/clarinet/parallel_wavenet.py @@ -29,6 +29,15 @@ from parakeet.models.wavenet import WaveNet class ParallelWaveNet(dg.Layer): def __init__(self, n_loops, n_layers, residual_channels, condition_dim, filter_size): + """ParallelWaveNet, an inverse autoregressive flow model, it contains several flows(WaveNets). + + Args: + n_loops (List[int]): `n_loop` for each flow. + n_layers (List[int]): `n_layer` for each flow. + residual_channels (int): `residual_channels` for every flow. + condition_dim (int): `condition_dim` for every flow. + filter_size (int): `filter_size` for every flow. + """ super(ParallelWaveNet, self).__init__() self.flows = dg.LayerList() for n_loop, n_layer in zip(n_loops, n_layers): @@ -38,20 +47,18 @@ class ParallelWaveNet(dg.Layer): filter_size, "mog", -100.0)) def forward(self, z, condition=None): - """Inverse Autoregressive Flow. Several wavenets. - - Arguments: - z {Variable} -- shape(batch_size, time_steps), hidden variable, sampled from a standard normal distribution. - - Keyword Arguments: - condition {Variable} -- shape(batch_size, condition_dim, time_steps), condition, basically upsampled mel spectrogram. (default: {None}) - + """Transform a random noise sampled from a standard Gaussian distribution into sample from the target distribution. And output the mean and log standard deviation of the output distribution. + + Args: + z (Variable): shape(B, T), random noise sampled from a standard gaussian disribution. + condition (Variable, optional): shape(B, F, T), dtype: float, the upsampled condition. Defaults to None. + Returns: - Variable -- shape(batch_size, time_steps), transformed z. - Variable -- shape(batch_size, time_steps), output distribution's mu. - Variable -- shape(batch_size, time_steps), output distribution's log_std. + (z, out_mu, out_log_std) + z (Variable): shape(B, T), dtype: float, transformed noise, it is the synthesized waveform. + out_mu (Variable): shape(B, T), dtype: float, means of the output distributions. + out_log_std (Variable): shape(B, T), dtype: float, log standard deviations of the output distributions. 
""" - for i, flow in enumerate(self.flows): theta = flow(z, condition) # w, mu, log_std [0: T] w, mu, log_std = F.split(theta, 3, dim=-1) # (B, T, 1) for each diff --git a/parakeet/models/deepvoice3/attention.py b/parakeet/models/deepvoice3/attention.py index 33ffc11277e700745c5d7ba6aee71dc55c5757af..d9536180730a5ec6eaf08e44e87651d999ab1795 100644 --- a/parakeet/models/deepvoice3/attention.py +++ b/parakeet/models/deepvoice3/attention.py @@ -31,6 +31,16 @@ class Attention(dg.Layer): window_range=WindowRange(-1, 3), key_projection=True, value_projection=True): + """Attention Layer for Deep Voice 3. + + Args: + query_dim (int): the dimension of query vectors. (The size of a single vector of query.) + embed_dim (int): the dimension of keys and values. + dropout (float, optional): dropout probability of attention. Defaults to 0.0. + window_range (WindowRange, optional): range of attention, this is only used at inference. Defaults to WindowRange(-1, 3). + key_projection (bool, optional): whether the `Attention` Layer has a Linear Layer for the keys to pass through before computing attention. Defaults to True. + value_projection (bool, optional): whether the `Attention` Layer has a Linear Layer for the values to pass through before computing attention. Defaults to True. + """ super(Attention, self).__init__() std = np.sqrt(1 / query_dim) self.query_proj = Linear( @@ -54,29 +64,19 @@ class Attention(dg.Layer): def forward(self, query, encoder_out, mask=None, last_attended=None): """ - Compute pooled context representation and alignment scores. + Compute contextualized representation and alignment scores. Args: - query (Variable): shape(B, T_dec, C_q), the query tensor, - where C_q means the channel of query. - encoder_out (Tuple(Variable, Variable)): - keys (Variable): shape(B, T_enc, C_emb), the key - representation from an encoder, where C_emb means - text embedding size. - values (Variable): shape(B, T_enc, C_emb), the value - representation from an encoder, where C_emb means - text embedding size. - mask (Variable, optional): Shape(B, T_enc), mask generated with - valid text lengths. - last_attended (int, optional): The position that received most - attention at last timestep. This is only used at decoding. + query (Variable): shape(B, T_dec, C_q), dtype: float, the query tensor, where C_q means the query dim. + encoder_out (keys, values): + keys (Variable): shape(B, T_enc, C_emb), dtype: float, the key representation from an encoder, where C_emb means embed dim. + values (Variable): shape(B, T_enc, C_emb), dtype: float, the value representation from an encoder, where C_emb means embed dim. + mask (Variable, optional): shape(B, T_enc), dtype: float, mask generated with valid text lengths. Pad tokens corresponds to 1, and valid tokens correspond to 0. + last_attended (int, optional): The position that received the most attention at last time step. This is only used at inference. Outpus: - x (Variable): Shape(B, T_dec, C_q), the context representation - pooled from attention mechanism. - attn_scores (Variable): shape(B, T_dec, T_enc), the alignment - tensor, where T_dec means the number of decoder time steps and - T_enc means number the number of decoder time steps. + x (Variable): shape(B, T_dec, C_q), dtype: float, the contextualized representation from attention mechanism. + attn_scores (Variable): shape(B, T_dec, T_enc), dtype: float, the alignment tensor, where T_dec means the number of decoder time steps and T_enc means number the number of decoder time steps. 
""" keys, values = encoder_out residual = query @@ -85,7 +85,6 @@ class Attention(dg.Layer): if self.key_projection: keys = self.key_proj(keys) x = self.query_proj(query) - # TODO: check the code x = F.matmul(x, keys, transpose_y=True) @@ -97,7 +96,6 @@ class Attention(dg.Layer): # if last_attended is provided, focus only on a window range around it # to enforce monotonic attention. - # TODO: if last attended is a shape(B,) array if last_attended is not None: locality_mask = np.ones(shape=x.shape, dtype=np.float32) backward, ahead = self.window_range @@ -116,7 +114,7 @@ class Attention(dg.Layer): x, self.dropout, dropout_implementation="upscale_in_train") x = F.matmul(x, values) encoder_length = keys.shape[1] - # CAUTION: is it wrong? let it be now + x = F.scale(x, encoder_length * np.sqrt(1.0 / encoder_length)) x = self.out_proj(x) x = F.scale((x + residual), np.sqrt(0.5)) diff --git a/parakeet/models/deepvoice3/conv1dglu.py b/parakeet/models/deepvoice3/conv1dglu.py index 584c3d7a4c9d7e935d8877c99e880674473ee4b8..174e825282396d486272af096a73a571149fc708 100644 --- a/parakeet/models/deepvoice3/conv1dglu.py +++ b/parakeet/models/deepvoice3/conv1dglu.py @@ -24,10 +24,7 @@ from parakeet.modules.weight_norm import Conv1D, Conv1DCell, Conv2D, Linear class Conv1DGLU(dg.Layer): """ - A Convolution 1D block with GLU activation. It also applys dropout for the - input x. It fuses speaker embeddings through a FC activated by softsign. It - has residual connection from the input x, and scale the output by - np.sqrt(0.5). + A Convolution 1D block with GLU activation. It also applys dropout for the input x. It integrates speaker embeddings through a Linear activated by softsign. It has residual connection from the input x, and scale the output by np.sqrt(0.5). """ def __init__(self, @@ -41,8 +38,21 @@ class Conv1DGLU(dg.Layer): dropout=0.0, causal=False, residual=True): - super(Conv1DGLU, self).__init__() + """[summary] + Args: + n_speakers (int): number of speakers. + speaker_dim (int): speaker embedding's size. + in_channels (int): channels of the input. + num_filters (int): channels of the output. + filter_size (int, optional): filter size of the internal Conv1DCell. Defaults to 1. + dilation (int, optional): dilation of the internal Conv1DCell. Defaults to 1. + std_mul (float, optional): [description]. Defaults to 4.0. + dropout (float, optional): dropout probability. Defaults to 0.0. + causal (bool, optional): padding of the Conv1DCell. It shoudl be True if `add_input` method of `Conv1DCell` is ever used. Defaults to False. + residual (bool, optional): whether to use residual connection. If True, in_channels shoudl equals num_filters. Defaults to True. + """ + super(Conv1DGLU, self).__init__() # conv spec self.in_channels = in_channels self.n_speakers = n_speakers @@ -83,18 +93,12 @@ class Conv1DGLU(dg.Layer): def forward(self, x, speaker_embed=None): """ Args: - x (Variable): Shape(B, C_in, T), the input of Conv1DGLU - layer, where B means batch_size, C_in means the input channels - T means input time steps. - speaker_embed_bct1 (Variable): Shape(B, C_sp), expanded - speaker embed, where C_sp means speaker embedding size. Note - that when using residual connection, the Conv1DGLU does not - change the number of channels, so out channels equals input - channels. + x (Variable): shape(B, C_in, T), dtype: float, the input of Conv1DGLU layer, where B means batch_size, C_in means the input channels T means input time steps. 
+ speaker_embed (Variable): shape(B, C_sp), dtype: float, speaker embed, where C_sp means speaker embedding size. Returns: - x (Variable): Shape(B, C_out, T), the output of Conv1DGLU, where - C_out means the output channels of Conv1DGLU. + x (Variable): shape(B, C_out, T), the output of Conv1DGLU, where + C_out means the `num_filters`. """ residual = x x = F.dropout( @@ -114,22 +118,20 @@ class Conv1DGLU(dg.Layer): return x def start_sequence(self): + """Prepare the Conv1DGLU to generate a new sequence. This method should be called before starting calling `add_input` multiple times. + """ self.conv.start_sequence() def add_input(self, x_t, speaker_embed=None): """ + Takes a step of inputs and return a step of outputs. It works similarily with the `forward` method, but in a `step-in-step-out` fashion. + Args: - x (Variable): Shape(B, C_in), the input of Conv1DGLU - layer, where B means batch_size, C_in means the input channels. - speaker_embed_bct1 (Variable): Shape(B, C_sp), expanded - speaker embed, where C_sp means speaker embedding size. Note - that when using residual connection, the Conv1DGLU does not - change the number of channels, so out channels equals input - channels. + x_t (Variable): shape(B, C_in, T=1), dtype: float, the input of Conv1DGLU layer, where B means batch_size, C_in means the input channels. + speaker_embed (Variable): Shape(B, C_sp), dtype: float, speaker embed, where C_sp means speaker embedding size. Returns: - x (Variable): Shape(B, C_out), the output of Conv1DGLU, where - C_out means the output channels of Conv1DGLU. + x (Variable): shape(B, C_out), the output of Conv1DGLU, where C_out means the `num_filter`. """ residual = x_t x_t = F.dropout( diff --git a/parakeet/models/deepvoice3/converter.py b/parakeet/models/deepvoice3/converter.py index 5181a5cfd56618704ebb07192b03416dba5b0d59..9fffc5e1713b6b17bcc9c3d8fdd67d0bf4438c8e 100644 --- a/parakeet/models/deepvoice3/converter.py +++ b/parakeet/models/deepvoice3/converter.py @@ -25,6 +25,17 @@ from parakeet.models.deepvoice3.encoder import ConvSpec def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): + """Return a list of Layers that upsamples the input by 4 times in time dimension. + + Args: + n_speakers (int): number of speakers of the Conv1DGLU layers used. + speaker_dim (int): speaker embedding size of the Conv1DGLU layers used. + target_channels (int): channels of the input and the output.(the list of layers does not change the number of channels.) + dropout (float): dropout probability. + + Returns: + List[Layer]: upsampling layers. + """ # upsampling convolitions upsampling_convolutions = [ Conv1DTranspose( @@ -41,42 +52,56 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout): 3, dilation=1, std_mul=1., - dropout=dropout), Conv1DGLU( - n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout), Conv1DTranspose( - target_channels, - target_channels, - 2, - stride=2, - param_attr=I.Normal(scale=np.sqrt( - 4. 
/ (2 * target_channels)))), Conv1DGLU( - n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=1, - std_mul=1., - dropout=dropout), Conv1DGLU( - n_speakers, - speaker_dim, - target_channels, - target_channels, - 3, - dilation=3, - std_mul=4., - dropout=dropout) + dropout=dropout), + Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout), + Conv1DTranspose( + target_channels, + target_channels, + 2, + stride=2, + param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))), + Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=1, + std_mul=1., + dropout=dropout), + Conv1DGLU( + n_speakers, + speaker_dim, + target_channels, + target_channels, + 3, + dilation=3, + std_mul=4., + dropout=dropout), ] return upsampling_convolutions def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout): + """Return a list of Layers that upsamples the input by 2 times in time dimension. + + Args: + n_speakers (int): number of speakers of the Conv1DGLU layers used. + speaker_dim (int): speaker embedding size of the Conv1DGLU layers used. + target_channels (int): channels of the input and the output.(the list of layers does not change the number of channels.) + dropout (float): dropout probability. + + Returns: + List[Layer]: upsampling layers. + """ upsampling_convolutions = [ Conv1DTranspose( target_channels, @@ -106,6 +131,17 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout): def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout): + """Return a list of Layers that upsamples the input by 1 times in time dimension. + + Args: + n_speakers (int): number of speakers of the Conv1DGLU layers used. + speaker_dim (int): speaker embedding size of the Conv1DGLU layers used. + target_channels (int): channels of the input and the output.(the list of layers does not change the number of channels.) + dropout (float): dropout probability. + + Returns: + List[Layer]: upsampling layers. + """ upsampling_convolutions = [ Conv1DGLU( n_speakers, @@ -121,10 +157,7 @@ def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout): class Converter(dg.Layer): - """ - Vocoder that transforms mel spectrogram (or ecoder hidden states) - to waveform. - """ + """Vocoder that transforms mel spectrogram (or ecoder hidden states) to waveform.""" def __init__(self, n_speakers, @@ -134,6 +167,17 @@ class Converter(dg.Layer): convolutions=(ConvSpec(256, 5, 1), ) * 4, time_upsampling=1, dropout=0.0): + """[summary] + + Args: + n_speakers (int): number of speakers. + speaker_dim (int): speaker embedding size. + in_channels (int): channels of the input. + linear_dim (int): channels of the linear spectrogram. + convolutions (Iterable[ConvSpec], optional): specifications of the internal convolutional layers. ConvSpec is a namedtuple of (output_channels, filter_size, dilation) Defaults to (ConvSpec(256, 5, 1), )*4. + time_upsampling (int, optional): time upsampling factor of the converter, possible options are {1, 2, 4}. Note that this should equals the downsample factor of the mel spectrogram. Defaults to 1. + dropout (float, optional): dropout probability. Defaults to 0.0. + """ super(Converter, self).__init__() self.n_speakers = n_speakers @@ -215,23 +259,12 @@ class Converter(dg.Layer): Convert mel spectrogram or decoder hidden states to linear spectrogram. 
Args: - x (Variable): Shape(B, T_mel, C_in), converter inputs, where - C_in means the input channel for the converter. Note that it - can be either C_mel (channel of mel spectrogram) or C_dec // r. - When use mel_spectrogram as the input of converter, C_in = - C_mel; and when use decoder states as the input of converter, - C_in = C_dec // r. In this scenario, decoder hidden states are - treated as if they were r outputs per decoder step and are - unpacked before passing to the converter. - speaker_embed (Variable, optional): shape(B, C_sp), speaker - embedding, where C_sp means the speaker embedding size. + x (Variable): Shape(B, T_mel, C_in), dtype: float, converter inputs, where C_in means the input channel for the converter. Note that it can be either C_mel (channel of mel spectrogram) or C_dec // r. + When use mel_spectrogram as the input of converter, C_in = C_mel; and when use decoder states as the input of converter, C_in = C_dec // r. + speaker_embed (Variable, optional): shape(B, C_sp), dtype: float, speaker embedding, where C_sp means the speaker embedding size. Returns: - out (Variable): Shape(B, T_lin, C_lin), the output linear - spectrogram, where C_lin means the channel of linear - spectrogram and T_linear means the length(time steps) of linear - spectrogram. T_line = time_upsampling * T_mel, which depends - on the time_upsampling converter. + out (Variable): Shape(B, T_lin, C_lin), the output linear spectrogram, where C_lin means the channel of linear spectrogram and T_linear means the length(time steps) of linear spectrogram. T_line = time_upsampling * T_mel, which depends on the time_upsampling of the converter. """ x = F.transpose(x, [0, 2, 1]) x = self.first_conv_proj(x) diff --git a/parakeet/models/deepvoice3/decoder.py b/parakeet/models/deepvoice3/decoder.py index 7b7f5812741c3f8b450558abf5ddd7145dddabd7..24a5a4078d1d2a8ad2e7b1186938576125d85892 100644 --- a/parakeet/models/deepvoice3/decoder.py +++ b/parakeet/models/deepvoice3/decoder.py @@ -36,15 +36,12 @@ def gen_mask(valid_lengths, max_len, dtype="float32"): [0, 0, 0, 0, 0, 0, 0]]. Args: - valid_lengths (Variable): Shape(B), dtype: int64. A 1D-Tensor containing - the valid lengths (timesteps) of each example, where B means - beatch_size. - max_len (int): The length (number of timesteps) of the mask. - dtype (str, optional): A string that specifies the data type of the - returned mask. + valid_lengths (Variable): shape(B, ), dtype: int64. A rank-1 Tensor containing the valid lengths (timesteps) of each example, where B means beatch_size. + max_len (int): The length (number of time steps) of the mask. + dtype (str, optional): A string that specifies the data type of the returned mask. Defaults to 'float32'. Returns: - mask (Variable): A mask computed from valid lengths. + mask (Variable): shape(B, max_len), dtype: float, a mask computed from valid lengths. """ mask = F.sequence_mask(valid_lengths, maxlen=max_len, dtype=dtype) mask = 1 - mask @@ -54,14 +51,13 @@ def gen_mask(valid_lengths, max_len, dtype="float32"): def fold_adjacent_frames(frames, r): """fold multiple adjacent frames. - Arguments: - frames {Variable} -- shape(batch_size, time_steps, channels), the spectrogram - r {int} -- frames per step. + Args: + frames (Variable): shape(B, T, C), the spectrogram. + r (int): frames per step. Returns: - Variable -- shape(batch_size, time_steps // r, r *channels), folded frames + Variable: shape(B, T // r, r * C), folded frames. 
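# fold_adjacent_frames / unfold_adjacent_frames above pack r adjacent spectrogram frames
# into the channel axis and back: (B, T, C) <-> (B, T // r, r * C). A rough numpy sketch
# of that reshape (illustration only; the library functions operate on paddle Variables
# and may differ in details):
import numpy as np


def fold_frames(frames, r):
    b, t, c = frames.shape
    return frames.reshape(b, t // r, r * c)


def unfold_frames(folded, r):
    b, t, c = folded.shape
    return folded.reshape(b, t * r, c // r)


mel = np.random.randn(2, 12, 80).astype("float32")  # (B, T, C_mel)
folded = fold_frames(mel, r=4)                       # (2, 3, 320)
assert np.allclose(unfold_frames(folded, r=4), mel)  # unfolding recovers the original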
""" - if r == 1: return frames batch_size, time_steps, channels = frames.shape @@ -75,16 +71,15 @@ def fold_adjacent_frames(frames, r): def unfold_adjacent_frames(folded_frames, r): - """fold multiple adjacent frames. + """unfold the folded frames. - Arguments: - folded_frames {Variable} -- shape(batch_size, time_steps // r, r * channels), the spectrogram - r {int} -- frames per step. + Args: + folded_frames (Variable): shape(B, T, C), the folded spectrogram. + r (int): frames per step. Returns: - Variable -- shape(batch_size, time_steps, channels), folded frames + Variable: shape(B, T * r, C // r), unfolded frames. """ - if r == 1: return folded_frames batch_size, time_steps, channels = folded_frames.shape @@ -93,26 +88,44 @@ def unfold_adjacent_frames(folded_frames, r): class Decoder(dg.Layer): - def __init__( - self, - n_speakers, - speaker_dim, - embed_dim, - mel_dim, - r=1, - max_positions=512, - padding_idx=None, # remove it! - preattention=(ConvSpec(128, 5, 1), ) * 4, - convolutions=(ConvSpec(128, 5, 1), ) * 4, - attention=True, - dropout=0.0, - use_memory_mask=False, - force_monotonic_attention=False, - query_position_rate=1.0, - key_position_rate=1.0, - window_range=WindowRange(-1, 3), - key_projection=True, - value_projection=True): + def __init__(self, + n_speakers, + speaker_dim, + embed_dim, + mel_dim, + r=1, + max_positions=512, + preattention=(ConvSpec(128, 5, 1), ) * 4, + convolutions=(ConvSpec(128, 5, 1), ) * 4, + attention=True, + dropout=0.0, + use_memory_mask=False, + force_monotonic_attention=False, + query_position_rate=1.0, + key_position_rate=1.0, + window_range=WindowRange(-1, 3), + key_projection=True, + value_projection=True): + """Decoder of the Deep Voice 3 model. + + Args: + n_speakers (int): number of speakers. + speaker_dim (int): speaker embedding size. + embed_dim (int): text embedding size. + mel_dim (int): channel of mel input.(mel bands) + r (int, optional): number of frames generated per decoder step. Defaults to 1. + max_positions (int, optional): max position for text and decoder steps. Defaults to 512. + convolutions (Iterable[ConvSpec], optional): specification of causal convolutional layers inside the decoder. ConvSpec is a namedtuple of output_channels, filter_size and dilation. Defaults to (ConvSpec(128, 5, 1), )*4. + attention (bool or List[bool], optional): whether to use attention, it should have the same length with `convolutions` if it is a list of bool, indicating whether to have an Attention layer coupled with the corresponding convolutional layer. If it is a bool, it is repeated len(convolutions) times internally. Defaults to True. + dropout (float, optional): dropout probability. Defaults to 0.0. + use_memory_mask (bool, optional): whether to use memory mask at the Attention layer. It should have the same length with `attention` if it is a list of bool, indicating whether to use memory mask at the corresponding Attention layer. If it is a bool, it is repeated len(attention) times internally. Defaults to False. + force_monotonic_attention (bool, optional): whether to use monotonic_attention at the Attention layer when inferencing. It should have the same length with `attention` if it is a list of bool, indicating whether to use monotonic_attention at the corresponding Attention layer. If it is a bool, it is repeated len(attention) times internally. Defaults to False. + query_position_rate (float, optional): position_rate of the PositionEmbedding for query. Defaults to 1.0. 
+ key_position_rate (float, optional): position_rate of the PositionEmbedding for key. Defaults to 1.0. + window_range (WindowRange, optional): window range of monotonic attention. Defaults to WindowRange(-1, 3). + key_projection (bool, optional): `key_projection` of Attention layers. Defaults to True. + value_projection (bool, optional): `value_projection` of Attention layers Defaults to True. + """ super(Decoder, self).__init__() self.dropout = dropout @@ -125,10 +138,9 @@ class Decoder(dg.Layer): conv_channels = convolutions[0].out_channels # only when padding idx is 0 can we easilt handle it - self.embed_keys_positions = PositionEmbedding( - max_positions, embed_dim, padding_idx=0) - self.embed_query_positions = PositionEmbedding( - max_positions, conv_channels, padding_idx=0) + self.embed_keys_positions = PositionEmbedding(max_positions, embed_dim) + self.embed_query_positions = PositionEmbedding(max_positions, + conv_channels) if n_speakers > 1: std = np.sqrt((1 - dropout) / speaker_dim) @@ -248,41 +260,20 @@ class Decoder(dg.Layer): Compute decoder outputs with ground truth mel spectrogram. Args: - encoder_out (Tuple(Variable, Variable)): - keys (Variable): shape(B, T_enc, C_emb), the key - representation from an encoder, where C_emb means - text embedding size. - values (Variable): shape(B, T_enc, C_emb), the value - representation from an encoder, where C_emb means - text embedding size. - lengths (Variable): shape(batch_size,), dtype: int64, valid lengths - of text inputs for each example. - inputs (Variable): shape(B, T_mel, C_mel), ground truth - mel-spectrogram, which is used as decoder inputs when training. - text_positions (Variable): shape(B, T_enc), dtype: int64. - Positions indices for text inputs for the encoder, where - T_enc means the encoder timesteps. - frame_positions (Variable): shape(B, T_mel // r), dtype: - int64. Positions indices for each decoder time steps. - speaker_embed: shape(batch_size, speaker_dim), speaker embedding, - only used for multispeaker model. - + encoder_out (keys, values): + keys (Variable): shape(B, T_enc, C_emb), dtype: float, the key representation from an encoder, where C_emb means text embedding size. + values (Variable): shape(B, T_enc, C_emb), dtype: float, the value representation from an encoder, where C_emb means text embedding size. + lengths (Variable): shape(batch_size,), dtype: int64, valid lengths of text inputs for each example. + inputs (Variable): shape(B, T_mel, C_mel), ground truth mel-spectrogram, which is used as decoder inputs when training. + text_positions (Variable): shape(B, T_enc), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. + frame_positions (Variable): shape(B, T_mel // r), dtype: int64. Positions indices for each decoder time steps. + speaker_embed (Variable, optionals): shape(batch_size, speaker_dim), speaker embedding, only used for multispeaker model. Returns: - outputs (Variable): Shape(B, T_mel // r, r * C_mel). Decoder - outputs, where C_mel means the channels of mel-spectrogram, r - means the outputs per decoder step, T_mel means the length(time - steps) of mel spectrogram. Note that, when r > 1, the decoder - outputs r frames of mel spectrogram per step. - alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment - tensor between the decoder and the encoder, where N means number - of Attention Layers, T_mel means the length of mel spectrogram, - r means the outputs per decoder step, T_enc means the encoder - time steps. 
- done (Variable): Shape(B, T_mel // r), probability that the - outputs should stop. - decoder_states (Variable): Shape(B, T_mel // r, C_dec), decoder - hidden states, where C_dec means the channels of decoder states. + outputs (Variable): shape(B, T_mel, C_mel), dtype: float, decoder outputs, where C_mel means the channels of mel-spectrogram, T_mel means the length (time steps) of mel spectrogram. + alignments (Variable): shape(N, B, T_mel // r, T_enc), dtype: float, the alignment tensor between the decoder and the encoder, where N means number of Attention Layers, T_mel means the length of mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. + done (Variable): shape(B, T_mel // r), dtype: float, probability that the last frame has been generated. + decoder_states (Variable): shape(B, T_mel, C_dec // r), dtype: float, decoder hidden states, where C_dec means the channels of decoder states (the output channels of the last `convolutions`). Note that it should be perfectly divisible by `r`. """ if speaker_embed is not None: speaker_embed = F.dropout( @@ -366,6 +357,8 @@ class Decoder(dg.Layer): return r def start_sequence(self): + """Prepare the Decoder to decode. This method is called by `decode`. + """ for layer in self.prenet: if isinstance(layer, Conv1DGLU): layer.start_sequence() @@ -379,6 +372,25 @@ class Decoder(dg.Layer): text_positions, speaker_embed=None, test_inputs=None): + """Decode from the encoder's output and other conditions. + + Args: + encoder_out (keys, values): + keys (Variable): shape(B, T_enc, C_emb), dtype: float, the key representation from an encoder, where C_emb means text embedding size. + values (Variable): shape(B, T_enc, C_emb), dtype: float, the value representation from an encoder, where C_emb means text embedding size. + text_positions (Variable): shape(B, T_enc), dtype: int64. Position indices for text inputs for the encoder, where T_enc means the encoder timesteps. + speaker_embed (Variable, optional): shape(B, C_sp), speaker embedding, only used for the multispeaker model. + test_inputs (Variable, optional): shape(B, T_test, C_mel). Test input, only used for debugging. Defaults to None. + + Returns: + outputs (Variable): shape(B, T_mel, C_mel), dtype: float, decoder outputs, where C_mel means the channels of mel-spectrogram, T_mel means the length (time steps) of mel spectrogram. + alignments (Variable): shape(N, B, T_mel // r, T_enc), dtype: float, the alignment tensor between the decoder and the encoder, where N means number of Attention Layers, T_mel means the length of mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. + done (Variable): shape(B, T_mel // r), dtype: float, probability that the last frame has been generated. If the probability is larger than 0.5 at a step, the generation stops. + decoder_states (Variable): shape(B, T_mel, C_dec // r), dtype: float, decoder hidden states, where C_dec means the channels of decoder states (the output channels of the last `convolutions`). Note that it should be perfectly divisible by `r`. + + Note: + Only single instance inference is supported now, so B = 1.
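# The stop criterion documented above can be sketched with a stubbed step function
# (fake_decoder_step is a stand-in, not the real decoder): each step emits r frames and a
# done probability, and generation stops once done > 0.5 or a step limit is reached.
import numpy as np


def fake_decoder_step(step, r=4, mel_dim=80):
    frames = np.random.randn(1, r, mel_dim).astype("float32")  # (B=1, r, C_mel)
    done = 1.0 if step >= 10 else 0.0  # pretend the "done" flag fires at step 10
    return frames, done


outputs, step, max_steps = [], 0, 100
while step < max_steps:
    frames, done = fake_decoder_step(step)
    outputs.append(frames)
    step += 1
    if done > 0.5:
        break
mel = np.concatenate(outputs, axis=1)  # (1, step * r, C_mel)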
+ """ self.start_sequence() keys, values = encoder_out batch_size = keys.shape[0] diff --git a/parakeet/models/deepvoice3/encoder.py b/parakeet/models/deepvoice3/encoder.py index b3e8bfb7c0f8e05457db437a8c285d4911759761..d428f72ec824a5f1b2ede4d194a3dab0ab1736d0 100644 --- a/parakeet/models/deepvoice3/encoder.py +++ b/parakeet/models/deepvoice3/encoder.py @@ -34,10 +34,20 @@ class Encoder(dg.Layer): padding_idx=None, embedding_weight_std=0.1, convolutions=(ConvSpec(64, 5, 1), ) * 7, - max_positions=512, dropout=0.): - super(Encoder, self).__init__() + """[summary] + Args: + n_vocab (int): vocabulary size of the text embedding. + embed_dim (int): embedding size of the text embedding. + n_speakers (int): number of speakers. + speaker_dim (int): speaker embedding size. + padding_idx (int, optional): padding index of text embedding. Defaults to None. + embedding_weight_std (float, optional): standard deviation of the embedding weights when intialized. Defaults to 0.1. + convolutions (Iterable[ConvSpec], optional): specifications of the convolutional layers. ConvSpec is a namedtuple of output channels, filter_size and dilation. Defaults to (ConvSpec(64, 5, 1), )*7. + dropout (float, optional): dropout probability. Defaults to 0.. + """ + super(Encoder, self).__init__() self.embedding_weight_std = embedding_weight_std self.embed = dg.Embedding( (n_vocab, embed_dim), @@ -101,18 +111,12 @@ class Encoder(dg.Layer): Encode text sequence. Args: - x (Variable): Shape(B, T_enc), dtype: int64. Ihe input text - indices. T_enc means the timesteps of decoder input x. - speaker_embed (Variable, optional): Shape(batch_size, speaker_dim), - dtype: float32. Speaker embeddings. This arg is not None only - when the model is a multispeaker model. + x (Variable): shape(B, T_enc), dtype: int64. Ihe input text indices. T_enc means the timesteps of decoder input x. + speaker_embed (Variable, optional): shape(B, C_sp), dtype: float, speaker embeddings. This arg is not None only when the model is a multispeaker model. Returns: - keys (Variable), Shape(B, T_enc, C_emb), the encoded - representation for keys, where C_emb menas the text embedding - size. - values (Variable), Shape(B, T_enc, C_emb), the encoded - representation for values. + keys (Variable), Shape(B, T_enc, C_emb), dtype: float, the encoded epresentation for keys, where C_emb menas the text embedding size. + values (Variable), Shape(B, T_enc, C_emb), dtype: float, the encoded representation for values. """ x = self.embed(x) x = F.dropout( diff --git a/parakeet/models/deepvoice3/loss.py b/parakeet/models/deepvoice3/loss.py index be6f0bde6f4001cd87e5ba81bf2dd66ef8cdc92b..ace96da97cf3226dcf2e3cb4f1246d2d5123708c 100644 --- a/parakeet/models/deepvoice3/loss.py +++ b/parakeet/models/deepvoice3/loss.py @@ -23,12 +23,10 @@ import paddle.fluid.dygraph as dg def masked_mean(inputs, mask): """ Args: - inputs (Variable): Shape(B, T, C), the input, where B means - batch size, C means channels of input, T means timesteps of - the input. - mask (Variable): Shape(B, T), a mask. + inputs (Variable): shape(B, T, C), dtype: float, the input. + mask (Variable): shape(B, T), dtype: float, a mask. Returns: - loss (Variable): Shape(1, ), masked mean. + loss (Variable): shape(1, ), dtype: float, masked mean. """ channels = inputs.shape[-1] masked_inputs = F.elementwise_mul(inputs, mask, axis=0) @@ -38,6 +36,18 @@ def masked_mean(inputs, mask): @jit(nopython=True) def guided_attention(N, max_N, T, max_T, g): + """Generate an diagonal attention guide. 
+ + Args: + N (int): valid length of encoder. + max_N (int): max length of encoder. + T (int): valid length of decoder. + max_T (int): max length of decoder. + g (float): sigma to adjust the degree of diagonal guide. + + Returns: + np.ndarray: shape(max_N, max_T), dtype: float, the diagonal guide. + """ W = np.zeros((max_N, max_T), dtype=np.float32) for n in range(N): for t in range(T): @@ -47,6 +57,17 @@ def guided_attention(N, max_N, T, max_T, g): def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len, g=0.2): + """Generate a diagonal attention guide for a batch. + + Args: + encoder_lengths (np.ndarray): shape(B, ), dtype: int64, encoder valid lengths. + decoder_lengths (np.ndarray): shape(B, ), dtype: int64, decoder valid lengths. + max_decoder_len (int): max length of decoder. + g (float, optional): sigma to adjust the degree of diagonal guide.. Defaults to 0.2. + + Returns: + np.ndarray: shape(B, max_T, max_N), dtype: float, the diagonal guide. (max_N: max encoder length, max_T: max decoder length.) + """ B = len(encoder_lengths) max_input_len = encoder_lengths.max() W = np.zeros((B, max_decoder_len, max_input_len), dtype=np.float32) @@ -65,6 +86,17 @@ class TTSLoss(object): guided_attention_sigma=0.2, downsample_factor=4, r=1): + """Compute loss for Deep Voice 3 model. + + Args: + masked_weight (float, optional): the weight of masked loss. Defaults to 0.0. + priority_bin ([type], optional): frequency bands for linear spectrogram loss to be prioritized. Defaults to None. + priority_weight (float, optional): weight for the prioritized frequency bands. Defaults to 0.0. + binary_divergence_weight (float, optional): weight for binary cross entropy (used for spectrogram loss). Defaults to 0.0. + guided_attention_sigma (float, optional): `sigma` for attention guide. Defaults to 0.2. + downsample_factor (int, optional): the downsample factor for mel spectrogram. Defaults to 4. + r (int, optional): frames per decoder step. Defaults to 1. + """ self.masked_weight = masked_weight self.priority_bin = priority_bin # only used for lin-spec loss self.priority_weight = priority_weight # only used for lin-spec loss @@ -76,6 +108,17 @@ class TTSLoss(object): self.downsample_factor = downsample_factor def l1_loss(self, prediction, target, mask, priority_bin=None): + """L1 loss for spectrogram. + + Args: + prediction (Variable): shape(B, T, C), dtype: float, predicted spectrogram. + target (Variable): shape(B, T, C), dtype: float, target spectrogram. + mask (Variable): shape(B, T), mask. + priority_bin (int, optional): frequency bands for linear spectrogram loss to be prioritized. Defaults to None. + + Returns: + Variable: shape(1,), dtype: float, l1 loss(with mask and possibly priority bin applied.) + """ abs_diff = F.abs(prediction - target) # basic mask-weighted l1 loss @@ -103,6 +146,16 @@ class TTSLoss(object): return loss def binary_divergence(self, prediction, target, mask): + """Binary cross entropy loss for spectrogram. All the values in the spectrogram are treated as logits in a logistic regression. + + Args: + prediction (Variable): shape(B, T, C), dtype: float, predicted spectrogram. + target (Variable): shape(B, T, C), dtype: float, target spectrogram. + mask (Variable): shape(B, T), mask. + + Returns: + Variable: shape(1,), dtype: float, binary cross entropy loss. 
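# The attention guide built by guided_attention / guided_attentions above is the usual
# diagonal guide from guided-attention training: entries far from the diagonal approach 1
# and are penalized. A plain numpy sketch of that formula (assumed here for illustration;
# the jit-compiled helper may differ in details):
import numpy as np


def diagonal_guide(N, max_N, T, max_T, g=0.2):
    W = np.zeros((max_N, max_T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1.0 - np.exp(-((n / N - t / T) ** 2) / (2.0 * g * g))
    return W


guide = diagonal_guide(N=40, max_N=50, T=160, max_T=200)
print(guide[0, 0], guide[0, 159])  # ~0 on the diagonal, ~1 far off the diagonal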
+ """ flattened_prediction = F.reshape(prediction, [-1, 1]) flattened_target = F.reshape(target, [-1, 1]) flattened_loss = F.log_loss( @@ -119,6 +172,15 @@ class TTSLoss(object): @staticmethod def done_loss(done_hat, done): + """Compute done loss + + Args: + done_hat (Variable): shape(B, T), dtype: float, predicted done probability(the probability that the final frame has been generated.) + done (Variable): shape(B, T), dtype: float, ground truth done probability(the probability that the final frame has been generated.) + + Returns: + Variable: shape(1, ), dtype: float, done loss. + """ flat_done_hat = F.reshape(done_hat, [-1, 1]) flat_done = F.reshape(done, [-1, 1]) loss = F.log_loss(flat_done_hat, flat_done, epsilon=1e-8) @@ -128,21 +190,15 @@ class TTSLoss(object): def attention_loss(self, predicted_attention, input_lengths, target_lengths): """ - Given valid encoder_lengths and decoder_lengths, compute a diagonal - guide, and compute loss from the predicted attention and the guide. + Given valid encoder_lengths and decoder_lengths, compute a diagonal guide, and compute loss from the predicted attention and the guide. Args: - predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the - alignment tensor, where B means batch size, T_dec means number - of time steps of the decoder, T_enc means the number of time - steps of the encoder, * means other possible dimensions. - input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths - (time steps) of encoder outputs. - target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64, - valid lengths (time steps) of decoder outputs. + predicted_attention (Variable): shape(*, B, T_dec, T_enc), dtype: float, the alignment tensor, where B means batch size, T_dec means number of time steps of the decoder, T_enc means the number of time steps of the encoder, * means other possible dimensions. + input_lengths (numpy.ndarray): shape(B,), dtype:int64, valid lengths (time steps) of encoder outputs. + target_lengths (numpy.ndarray): shape(batch_size,), dtype:int64, valid lengths (time steps) of decoder outputs. Returns: - loss (Variable): Shape(1, ) attention loss. + loss (Variable): shape(1, ), dtype: float, attention loss. """ n_attention, batch_size, max_target_len, max_input_len = ( predicted_attention.shape) @@ -167,6 +223,26 @@ class TTSLoss(object): compute_mel_loss=True, compute_done_loss=True, compute_attn_loss=True): + """Total loss + + Args: + mel_hyp (Variable): shape(B, T, C_mel), dtype, float, predicted mel spectrogram. + lin_hyp (Variable): shape(B, T, C_lin), dtype, float, predicted linear spectrogram. + done_hyp (Variable): shape(B, T), dtype, float, predicted done probability. + attn_hyp (Variable): shape(N, B, T_dec, T_enc), dtype: float, predicted attention. + mel_ref (Variable): shape(B, T, C_mel), dtype, float, ground truth mel spectrogram. + lin_ref (Variable): shape(B, T, C_lin), dtype, float, ground truth linear spectrogram. + done_ref (Variable): shape(B, T), dtype, float, ground truth done flag. + input_lengths (Variable): shape(B, ), dtype: int, encoder valid lengths. + n_frames (Variable): shape(B, ), dtype: int, decoder valid lengths. + compute_lin_loss (bool, optional): whether to compute linear loss. Defaults to True. + compute_mel_loss (bool, optional): whether to compute mel loss. Defaults to True. + compute_done_loss (bool, optional): whether to compute done loss. Defaults to True. + compute_attn_loss (bool, optional): whether to compute atention loss. Defaults to True. 
+ + Returns: + Dict(str, Variable): details of loss. + """ total_loss = 0. # n_frames # mel_lengths # decoder_lengths diff --git a/parakeet/models/deepvoice3/model.py b/parakeet/models/deepvoice3/model.py index f2fb271c21aad346ae27c81683f5fa0c927d414a..a635d6eeff2ee596cc14be69a79113142a8a935e 100644 --- a/parakeet/models/deepvoice3/model.py +++ b/parakeet/models/deepvoice3/model.py @@ -22,6 +22,15 @@ import paddle.fluid.dygraph as dg class DeepVoice3(dg.Layer): def __init__(self, encoder, decoder, converter, speaker_embedding, use_decoder_states): + """Deep Voice 3 TTS model. + + Args: + encoder (Layer): the encoder. + decoder (Layer): the decoder. + converter (Layer): the converter. + speaker_embedding (Layer): the speaker embedding (for multispeaker cases). + use_decoder_states (bool): use decoder states instead of predicted mel spectrogram as the input of the converter. + """ super(DeepVoice3, self).__init__() if speaker_embedding is None: self.n_speakers = 1 @@ -34,6 +43,24 @@ class DeepVoice3(dg.Layer): def forward(self, text_sequences, text_positions, valid_lengths, speaker_indices, mel_inputs, frame_positions): + """Compute predicted value in a teacher forcing training manner. + + Args: + text_sequences (Variable): shape(B, T_enc), dtype: int64, text indices. + text_positions (Variable): shape(B, T_enc), dtype: int64, positions of text indices. + valid_lengths (Variable): shape(B, ), dtype: int64, valid lengths of utterances. + speaker_indices (Variable): shape(B, ), dtype: int64, speaker indices for utterances. + mel_inputs (Variable): shape(B, T_mel, C_mel), dytpe: int64, ground truth mel spectrogram. + frame_positions (Variable): shape(B, T_dec), dtype: int64, positions of decoder steps. + + Returns: + (mel_outputs, linear_outputs, alignments, done) + mel_outputs (Variable): shape(B, T_mel, C_mel), dtype: float, predicted mel spectrogram. + mel_outputs (Variable): shape(B, T_mel, C_mel), dtype: float, predicted mel spectrogram. + alignments (Variable): shape(N, B, T_dec, T_enc), dtype: float, predicted attention. + done (Variable): shape(B, T_dec), dtype: float, predicted done probability. + (T_mel: time steps of mel spectrogram, T_lin: time steps of linear spectrogra, T_dec, time steps of decoder, T_enc: time steps of encoder.) + """ if hasattr(self, "speaker_embedding"): speaker_embed = self.speaker_embedding(speaker_indices) else: @@ -49,6 +76,21 @@ class DeepVoice3(dg.Layer): return mel_outputs, linear_outputs, alignments, done def transduce(self, text_sequences, text_positions, speaker_indices=None): + """Generate output without teacher forcing. Only batch_size = 1 is supported. + + Args: + text_sequences (Variable): shape(B, T_enc), dtype: int64, text indices. + text_positions (Variable): shape(B, T_enc), dtype: int64, positions of text indices. + speaker_indices (Variable): shape(B, ), dtype: int64, speaker indices for utterances. + + Returns: + (mel_outputs, linear_outputs, alignments, done) + mel_outputs (Variable): shape(B, T_mel, C_mel), dtype: float, predicted mel spectrogram. + mel_outputs (Variable): shape(B, T_mel, C_mel), dtype: float, predicted mel spectrogram. + alignments (Variable): shape(B, T_dec, T_enc), dtype: float, predicted average attention of all attention layers. + done (Variable): shape(B, T_dec), dtype: float, predicted done probability. + (T_mel: time steps of mel spectrogram, T_lin: time steps of linear spectrogra, T_dec, time steps of decoder, T_enc: time steps of encoder.) 
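# A sketch of how transduce is typically driven at synthesis time. Model construction is
# omitted: `dv3` stands for an already-built single-speaker DeepVoice3 instance inside a
# dygraph guard (an assumption of this sketch, not shown in the patch).
import numpy as np
import paddle.fluid.dygraph as dg
from parakeet.g2p.en import text_to_sequence

text = np.array([text_to_sequence("hello world")], dtype="int64")    # (B=1, T_enc)
positions = np.arange(1, text.shape[1] + 1, dtype="int64")[None, :]  # 1-based, 0 is padding
mel, linear, alignments, done = dv3.transduce(
    dg.to_variable(text), dg.to_variable(positions))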
+ """ if hasattr(self, "speaker_embedding"): speaker_embed = self.speaker_embedding(speaker_indices) else: diff --git a/parakeet/models/deepvoice3/position_embedding.py b/parakeet/models/deepvoice3/position_embedding.py index 88ef5cbd1f5f285d9366a4de5031ab529b2247cd..c3f5c27b2bfcc3b1cd06534b08f2dc83e6894d14 100644 --- a/parakeet/models/deepvoice3/position_embedding.py +++ b/parakeet/models/deepvoice3/position_embedding.py @@ -19,14 +19,14 @@ import paddle.fluid.dygraph as dg def compute_position_embedding(radians, speaker_position_rate): - """compute sin/cos separately and scatter them to a zero. + """Compute sin/cos interleaved matrix from the radians. - Arguments: - radians {Variable} -- shape(n_vocab, embed_dim), the radians matrix. - speaker_position_rate {Variable} -- shape(batch_size, ), speaker positioning rate. + Arg: + radians (Variable): shape(n_vocab, embed_dim), dtype: float, the radians matrix. + speaker_position_rate (Variable): shape(B, ), speaker positioning rate. Returns: - Variable -- shape(batch_size, n_vocab, embed_dim), the sin, cos matrix. + Variable: shape(B, n_vocab, embed_dim), the sin, cos interleaved matrix. """ _, embed_dim = radians.shape batch_size = speaker_position_rate.shape[0] @@ -48,10 +48,20 @@ def position_encoding_init(n_position, d_pos_vec, position_rate=1.0, padding_idx=None): - """init the position encoding table""" + """Init the position encoding. + + Args: + n_position (int): max position, vocab size for position embedding. + d_pos_vec (int): position embedding size. + position_rate (float, optional): position rate (this should only be used when all the utterances are from one speaker.). Defaults to 1.0. + padding_idx (int, optional): padding index for the position embedding(it is set as 0 internally if not provided.). Defaults to None. + + Returns: + [type]: [description] + """ + # init the position encoding table # keep idx 0 for padding token position encoding zero vector # CAUTION: it is radians here, sin and cos are not applied - # CAUTION: difference here indices_range = np.expand_dims(np.arange(n_position), -1) embed_range = 2 * (np.arange(d_pos_vec) // 2) radians = position_rate \ @@ -63,31 +73,32 @@ def position_encoding_init(n_position, class PositionEmbedding(dg.Layer): - def __init__(self, - n_position, - d_pos_vec, - position_rate=1.0, - param_attr=None, - max_norm=None, - padding_idx=None): + def __init__(self, n_position, d_pos_vec, position_rate=1.0): + """Position Embedding for Deep Voice 3. + + Args: + n_position (int): max position, vocab size for position embedding. + d_pos_vec (int): position embedding size. + position_rate (float, optional): position rate (this should only be used when all the utterances are from one speaker.). Defaults to 1.0. + """ super(PositionEmbedding, self).__init__() self.weight = self.create_parameter((n_position, d_pos_vec)) self.weight.set_value( - position_encoding_init(n_position, d_pos_vec, position_rate, - padding_idx).astype("float32")) + position_encoding_init(n_position, d_pos_vec, position_rate) + .astype("float32")) def forward(self, indices, speaker_position_rate=None): """ Args: - indices (Variable): Shape (B, T), dtype: int64, position + indices (Variable): shape (B, T), dtype: int64, position indices, where B means the batch size, T means the time steps. speaker_position_rate (Variable | float, optional), position rate. It can be a float point number or a Variable with shape (1,), then this speaker_position_rate is used for every - example. 
It can also be a Variable with shape (B, 1), which - contains a speaker position rate for each speaker. + example. It can also be a Variable with shape (B, ), which + contains a speaker position rate for each utterance. Returns: - out (Variable): Shape(B, T, C_pos), position embedding, where C_pos + out (Variable): shape(B, T, C_pos), dtype: float, position embedding, where C_pos means position embedding size. """ batch_size, time_steps = indices.shape diff --git a/parakeet/models/fastspeech/decoder.py b/parakeet/models/fastspeech/decoder.py index 46eb391ba0ae76f29267e080d6457921a36aa1c1..8432fc5ba7f21ab9e3b3e7f18a5168fcb41f5d16 100644 --- a/parakeet/models/fastspeech/decoder.py +++ b/parakeet/models/fastspeech/decoder.py @@ -32,6 +32,7 @@ class Decoder(dg.Layer): super(Decoder, self).__init__() n_position = len_max_seq + 1 + self.n_head = n_head self.pos_inp = get_sinusoid_encoding_table( n_position, d_model, padding_idx=0) self.position_enc = dg.Embedding( @@ -55,7 +56,7 @@ class Decoder(dg.Layer): for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) - def forward(self, enc_seq, enc_pos): + def forward(self, enc_seq, enc_pos, non_pad_mask, slf_attn_mask=None): """ Decoder layer of FastSpeech. @@ -69,10 +70,7 @@ class Decoder(dg.Layer): dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. """ dec_slf_attn_list = [] - - # -- Prepare masks - slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) - non_pad_mask = get_non_pad_mask(enc_pos) + slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1]) # -- Forward dec_output = enc_seq + self.position_enc(enc_pos) diff --git a/parakeet/models/fastspeech/encoder.py b/parakeet/models/fastspeech/encoder.py index 15c8d60e6e016fc1955111f4702aa1f4e2c478e2..15d634eca1aa96a8c0af2b9eac40424cf9c23d7e 100644 --- a/parakeet/models/fastspeech/encoder.py +++ b/parakeet/models/fastspeech/encoder.py @@ -32,14 +32,17 @@ class Encoder(dg.Layer): dropout=0.1): super(Encoder, self).__init__() n_position = len_max_seq + 1 + self.n_head = n_head self.src_word_emb = dg.Embedding( - size=[n_src_vocab, d_model], padding_idx=0) + size=[n_src_vocab, d_model], + padding_idx=0, + param_attr=fluid.initializer.Normal( + loc=0.0, scale=1.0)) self.pos_inp = get_sinusoid_encoding_table( n_position, d_model, padding_idx=0) self.position_enc = dg.Embedding( size=[n_position, d_model], - padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( self.pos_inp), @@ -58,7 +61,7 @@ class Encoder(dg.Layer): for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) - def forward(self, character, text_pos): + def forward(self, character, text_pos, non_pad_mask, slf_attn_mask=None): """ Encoder layer of FastSpeech. @@ -74,10 +77,7 @@ class Encoder(dg.Layer): enc_slf_attn_list (list), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list. 
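# The FastSpeech encoder/decoder above now receive their padding masks precomputed by the
# caller. A numpy sketch of how such masks can be built from 1-based position indices
# (0 = padding), mirroring the helpers in parakeet/models/transformer_tts/utils.py, and of
# why a per-example mask is tiled n_head times (multi-head attention folds the heads into
# the batch dimension):
import numpy as np

pos = np.array([[1, 2, 3, 4, 0, 0],
                [1, 2, 3, 4, 5, 6]], dtype="int64")      # (B=2, T), zeros are padding

non_pad_mask = (pos != 0).astype("float32")[:, :, None]  # (B, T, 1), zeroes out pad rows
key_pad = (pos != 0).astype("float32")[:, None, :]       # (B, 1, T_k)
attn_mask = np.repeat(key_pad, pos.shape[1], axis=1)     # (B, T_q, T_k)
attn_mask = (attn_mask == 0).astype("float32") * (-2**32 + 1)  # large negative at pad keys

n_head = 4
attn_mask_all_heads = np.tile(attn_mask, (n_head, 1, 1))  # (B * n_head, T_q, T_k)
print(non_pad_mask.shape, attn_mask.shape, attn_mask_all_heads.shape)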
""" enc_slf_attn_list = [] - # -- prepare masks - # shape character (N, T) - slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character) - non_pad_mask = get_non_pad_mask(character) + slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1]) # -- Forward enc_output = self.src_word_emb(character) + self.position_enc( @@ -90,4 +90,4 @@ class Encoder(dg.Layer): slf_attn_mask=slf_attn_mask) enc_slf_attn_list += [enc_slf_attn] - return enc_output, non_pad_mask, enc_slf_attn_list + return enc_output, enc_slf_attn_list diff --git a/parakeet/models/fastspeech/fastspeech.py b/parakeet/models/fastspeech/fastspeech.py index 91478af59d67db3565982e17e49cfadb6249e386..a37d5fac06dce8379738f95781c85abcdaa241a4 100644 --- a/parakeet/models/fastspeech/fastspeech.py +++ b/parakeet/models/fastspeech/fastspeech.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import math +import numpy as np import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.g2p.text.symbols import symbols +from parakeet.models.transformer_tts.utils import * from parakeet.models.transformer_tts.post_convnet import PostConvNet from parakeet.models.fastspeech.length_regulator import LengthRegulator from parakeet.models.fastspeech.encoder import Encoder @@ -78,6 +80,10 @@ class FastSpeech(dg.Layer): def forward(self, character, text_pos, + enc_non_pad_mask, + dec_non_pad_mask, + enc_slf_attn_mask=None, + dec_slf_attn_mask=None, mel_pos=None, length_target=None, alpha=1.0): @@ -106,14 +112,20 @@ class FastSpeech(dg.Layer): dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. """ - encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder( - character, text_pos) + encoder_output, enc_slf_attn_list = self.encoder( + character, + text_pos, + enc_non_pad_mask, + slf_attn_mask=enc_slf_attn_mask) if fluid.framework._dygraph_tracer()._train_mode: length_regulator_output, duration_predictor_output = self.length_regulator( encoder_output, target=length_target, alpha=alpha) decoder_output, dec_slf_attn_list = self.decoder( - length_regulator_output, mel_pos) + length_regulator_output, + mel_pos, + dec_non_pad_mask, + slf_attn_mask=dec_slf_attn_mask) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output @@ -122,8 +134,18 @@ class FastSpeech(dg.Layer): else: length_regulator_output, decoder_pos = self.length_regulator( encoder_output, alpha=alpha) - decoder_output, _ = self.decoder(length_regulator_output, - decoder_pos) + slf_attn_mask = get_triu_tensor( + decoder_pos.numpy(), decoder_pos.numpy()).astype(np.float32) + slf_attn_mask = fluid.layers.cast( + dg.to_variable(slf_attn_mask == 0), np.float32) + slf_attn_mask = dg.to_variable(slf_attn_mask) + dec_non_pad_mask = fluid.layers.unsqueeze( + (decoder_pos != 0).astype(np.float32), [-1]) + decoder_output, _ = self.decoder( + length_regulator_output, + decoder_pos, + dec_non_pad_mask, + slf_attn_mask=slf_attn_mask) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output diff --git a/parakeet/models/fastspeech/fft_block.py b/parakeet/models/fastspeech/fft_block.py index f50f11a189d8194bf2bee5c9b0115d556753bbcb..0c0ed4fda024735691fc6c4ddf39ef29ffeb4f4a 100644 --- a/parakeet/models/fastspeech/fft_block.py +++ b/parakeet/models/fastspeech/fft_block.py @@ -46,7 +46,7 @@ class FFTBlock(dg.Layer): padding=padding, dropout=dropout) - def forward(self, enc_input, 
non_pad_mask=None, slf_attn_mask=None): + def forward(self, enc_input, non_pad_mask, slf_attn_mask=None): """ Feed Forward Transformer block in FastSpeech. @@ -63,6 +63,7 @@ class FFTBlock(dg.Layer): """ output, slf_attn = self.slf_attn( enc_input, enc_input, enc_input, mask=slf_attn_mask) + output *= non_pad_mask output = self.pos_ffn(output) diff --git a/parakeet/models/fastspeech/length_regulator.py b/parakeet/models/fastspeech/length_regulator.py index 331597ab663de4ea5c66e2b2522d64bc87149a78..f6bc8037f032004f54bc4791cfce9b6611685f49 100644 --- a/parakeet/models/fastspeech/length_regulator.py +++ b/parakeet/models/fastspeech/length_regulator.py @@ -146,11 +146,17 @@ class DurationPredictor(dg.Layer): out = layers.transpose(encoder_output, [0, 2, 1]) out = self.conv1(out) out = layers.transpose(out, [0, 2, 1]) - out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout) + out = layers.dropout( + layers.relu(self.layer_norm1(out)), + self.dropout, + dropout_implementation='upscale_in_train') out = layers.transpose(out, [0, 2, 1]) out = self.conv2(out) out = layers.transpose(out, [0, 2, 1]) - out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout) + out = layers.dropout( + layers.relu(self.layer_norm2(out)), + self.dropout, + dropout_implementation='upscale_in_train') out = layers.relu(self.linear(out)) out = layers.squeeze(out, axes=[-1]) diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py index 5e680f08b618b120e10ead6165c2557772f073e7..cfd6d47cb5005412ed3044a82bd032c811411f1e 100644 --- a/parakeet/models/fastspeech/utils.py +++ b/parakeet/models/fastspeech/utils.py @@ -18,7 +18,6 @@ def get_alignment(attn_probs, mel_lens, n_head): max_F = 0 assert attn_probs[0].shape[0] % n_head == 0 batch_size = int(attn_probs[0].shape[0] // n_head) - #max_attn = attn_probs[0].numpy()[0,batch_size] for i in range(len(attn_probs)): multi_attn = attn_probs[i].numpy() for j in range(n_head): @@ -28,7 +27,7 @@ def get_alignment(attn_probs, mel_lens, n_head): max_F = F max_attn = attn alignment = compute_duration(max_attn, mel_lens) - return alignment + return alignment, max_attn def score_F(attn): diff --git a/parakeet/models/transformer_tts/decoder.py b/parakeet/models/transformer_tts/decoder.py index 3d7adf15b878c3a3a75af80039a74e13d33e06f1..5b17a7a2f7674e5c76ba8776a8a6aa015f029d12 100644 --- a/parakeet/models/transformer_tts/decoder.py +++ b/parakeet/models/transformer_tts/decoder.py @@ -14,7 +14,7 @@ import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid -from parakeet.modules.utils import * +from parakeet.models.transformer_tts.utils import * from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.models.transformer_tts.prenet import PreNet @@ -25,6 +25,7 @@ class Decoder(dg.Layer): def __init__(self, num_hidden, config, num_head=4): super(Decoder, self).__init__() self.num_hidden = num_hidden + self.num_head = num_head param = fluid.ParamAttr() self.alpha = self.create_parameter( shape=(1, ), @@ -98,30 +99,29 @@ class Decoder(dg.Layer): outputs_per_step=config['audio']['outputs_per_step'], use_cudnn=True) - def forward(self, key, value, query, c_mask, positional): + def forward(self, + key, + value, + query, + positional, + mask, + m_mask=None, + m_self_mask=None, + zero_mask=None): # get decoder mask with triangular matrix if fluid.framework._dygraph_tracer()._train_mode: - m_mask = get_non_pad_mask(positional) - mask = 
get_attn_key_pad_mask((positional == 0).astype(np.float32), - query) - triu_tensor = dg.to_variable( - get_triu_tensor(query.numpy(), query.numpy())).astype( - np.float32) - mask = mask + triu_tensor - mask = fluid.layers.cast(mask == 0, np.float32) - - # (batch_size, decoder_len, encoder_len) - zero_mask = get_attn_key_pad_mask( - layers.squeeze(c_mask, [-1]), query) + m_mask = layers.expand(m_mask, [self.num_head, 1, key.shape[1]]) + m_self_mask = layers.expand(m_self_mask, + [self.num_head, 1, query.shape[1]]) + mask = layers.expand(mask, [self.num_head, 1, 1]) + zero_mask = layers.expand(zero_mask, [self.num_head, 1, 1]) + else: - mask = get_triu_tensor(query.numpy(), - query.numpy()).astype(np.float32) - mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) - m_mask, zero_mask = None, None + m_mask, m_self_mask, zero_mask = None, None, None - # Decoder pre-network +# Decoder pre-network query = self.decoder_prenet(query) # Centered position @@ -132,7 +132,8 @@ class Decoder(dg.Layer): query = positional * self.alpha + query #positional dropout - query = fluid.layers.dropout(query, 0.1) + query = fluid.layers.dropout( + query, 0.1, dropout_implementation='upscale_in_train') # Attention decoder-decoder, encoder-decoder selfattn_list = list() @@ -141,12 +142,13 @@ class Decoder(dg.Layer): for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): query, attn_dec = selfattn( - query, query, query, mask=mask, query_mask=m_mask) + query, query, query, mask=mask, query_mask=m_self_mask) query, attn_dot = attn( key, value, query, mask=zero_mask, query_mask=m_mask) query = ffn(query) selfattn_list.append(attn_dec) attn_list.append(attn_dot) + # Mel linear projection mel_out = self.mel_linear(query) # Post Mel Network diff --git a/parakeet/models/transformer_tts/encoder.py b/parakeet/models/transformer_tts/encoder.py index 548ea8e4640f317b29de486b1d58f710d042d852..ef3821ff1667cf0029ac9c5f077b0ffe95a6c70d 100644 --- a/parakeet/models/transformer_tts/encoder.py +++ b/parakeet/models/transformer_tts/encoder.py @@ -23,6 +23,7 @@ class Encoder(dg.Layer): def __init__(self, embedding_size, num_hidden, num_head=4): super(Encoder, self).__init__() self.num_hidden = num_hidden + self.num_head = num_head param = fluid.ParamAttr(initializer=fluid.initializer.Constant( value=1.0)) self.alpha = self.create_parameter( @@ -31,7 +32,6 @@ class Encoder(dg.Layer): 1024, self.num_hidden, padding_idx=0) self.pos_emb = dg.Embedding( size=[1024, num_hidden], - padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( self.pos_inp), @@ -56,13 +56,15 @@ class Encoder(dg.Layer): for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - def forward(self, x, positional): + def forward(self, x, positional, mask=None, query_mask=None): + if fluid.framework._dygraph_tracer()._train_mode: - query_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask(positional, x) + seq_len_key = x.shape[1] + query_mask = layers.expand(query_mask, + [self.num_head, 1, seq_len_key]) + mask = layers.expand(mask, [self.num_head, 1, 1]) else: query_mask, mask = None, None - # Encoder pre_network x = self.encoder_prenet(x) #(N,T,C) @@ -72,7 +74,7 @@ class Encoder(dg.Layer): x = positional * self.alpha + x #(N, T, C) # Positional dropout - x = layers.dropout(x, 0.1) + x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train') # Self attention encoder attentions = list() @@ -81,4 +83,4 @@ class Encoder(dg.Layer): x = ffn(x) 
attentions.append(attention) - return x, query_mask, attentions + return x, attentions diff --git a/parakeet/models/transformer_tts/encoderprenet.py b/parakeet/models/transformer_tts/encoderprenet.py index d7014240eb8066cee18a890f4f6d509d3d4a09f7..e953dab062c80d4e9218612981e65030a5fc0270 100644 --- a/parakeet/models/transformer_tts/encoderprenet.py +++ b/parakeet/models/transformer_tts/encoderprenet.py @@ -27,7 +27,10 @@ class EncoderPrenet(dg.Layer): self.num_hidden = num_hidden self.use_cudnn = use_cudnn self.embedding = dg.Embedding( - size=[len(symbols), embedding_size], padding_idx=None) + size=[len(symbols), embedding_size], + padding_idx=0, + param_attr=fluid.initializer.Normal( + loc=0.0, scale=1.0)) self.conv_list = [] k = math.sqrt(1 / embedding_size) self.conv_list.append( @@ -78,10 +81,14 @@ class EncoderPrenet(dg.Layer): low=-k, high=k))) def forward(self, x): + x = self.embedding(x) #(batch_size, seq_len, embending_size) x = layers.transpose(x, [0, 2, 1]) for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): - x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) + x = layers.dropout( + layers.relu(batch_norm(conv(x))), + 0.2, + dropout_implementation='upscale_in_train') x = layers.transpose(x, [0, 2, 1]) #(N,T,C) x = self.projection(x) diff --git a/parakeet/models/transformer_tts/post_convnet.py b/parakeet/models/transformer_tts/post_convnet.py index 8882e79687e0308633132737237f74b560920fd8..60e93824a501f3a9003ac6c89aaa7ce90ccf52da 100644 --- a/parakeet/models/transformer_tts/post_convnet.py +++ b/parakeet/models/transformer_tts/post_convnet.py @@ -108,11 +108,16 @@ class PostConvNet(dg.Layer): conv = self.conv_list[i] input = layers.dropout( - layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout) + layers.tanh(batch_norm(conv(input)[:, :, :len])), + self.dropout, + dropout_implementation='upscale_in_train') conv = self.conv_list[self.num_conv - 1] input = conv(input)[:, :, :len] if self.batchnorm_last: batch_norm = self.batch_norm_list[self.num_conv - 1] - input = layers.dropout(batch_norm(input), self.dropout) + input = layers.dropout( + batch_norm(input), + self.dropout, + dropout_implementation='upscale_in_train') output = layers.transpose(input, [0, 2, 1]) return output diff --git a/parakeet/models/transformer_tts/prenet.py b/parakeet/models/transformer_tts/prenet.py index 6039b6033dce5c861f4e7b94597807310f04c9a7..b47a9f8b58195ed67c85339f86a273867759648a 100644 --- a/parakeet/models/transformer_tts/prenet.py +++ b/parakeet/models/transformer_tts/prenet.py @@ -56,6 +56,12 @@ class PreNet(dg.Layer): Returns: x (Variable), Shape(B, T, C), the result after pernet. 
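# Several layers in this patch switch to dropout_implementation='upscale_in_train'. The
# semantics can be sketched in numpy (illustration only, not paddle code): kept activations
# are scaled by 1 / (1 - p) during training, so at inference the layer becomes a plain
# identity instead of scaling its output down by (1 - p).
import numpy as np


def dropout_upscale_in_train(x, p, training=True, seed=0):
    if not training:
        return x  # identity at inference
    rng = np.random.default_rng(seed)
    keep = (rng.random(x.shape) >= p).astype(x.dtype)
    return x * keep / (1.0 - p)  # rescale so the expected output equals x


x = np.ones((2, 4), dtype="float32")
print(dropout_upscale_in_train(x, p=0.5))                  # kept entries become 2.0
print(dropout_upscale_in_train(x, p=0.5, training=False))  # unchanged at inference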
""" - x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate) - x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate) + x = layers.dropout( + layers.relu(self.linear1(x)), + self.dropout_rate, + dropout_implementation='upscale_in_train') + x = layers.dropout( + layers.relu(self.linear2(x)), + self.dropout_rate, + dropout_implementation='upscale_in_train') return x diff --git a/parakeet/models/transformer_tts/transformer_tts.py b/parakeet/models/transformer_tts/transformer_tts.py index 1205c6b939efe48bca523824ae3aa3ce25894cce..a7fffbd38b04f17bb2b5392d1f4cb83183be3d6d 100644 --- a/parakeet/models/transformer_tts/transformer_tts.py +++ b/parakeet/models/transformer_tts/transformer_tts.py @@ -24,11 +24,29 @@ class TransformerTTS(dg.Layer): self.decoder = Decoder(config['hidden_size'], config) self.config = config - def forward(self, characters, mel_input, pos_text, pos_mel): - - key, c_mask, attns_enc = self.encoder(characters, pos_text) + def forward(self, + characters, + mel_input, + pos_text, + pos_mel, + dec_slf_mask, + enc_slf_mask=None, + enc_query_mask=None, + enc_dec_mask=None, + dec_query_slf_mask=None, + dec_query_mask=None): + key, attns_enc = self.encoder( + characters, pos_text, mask=enc_slf_mask, query_mask=enc_query_mask) mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder( - key, key, mel_input, c_mask, pos_mel) + key, + key, + mel_input, + pos_mel, + mask=dec_slf_mask, + zero_mask=enc_dec_mask, + m_self_mask=dec_query_slf_mask, + m_mask=dec_query_mask) + return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec diff --git a/parakeet/models/transformer_tts/utils.py b/parakeet/models/transformer_tts/utils.py index 22127446b463a9fd1f2407f29ceca6f2639ac2cc..4b525272ecaf1f1e5e55b4cfc05f55ff0a37ac3c 100644 --- a/parakeet/models/transformer_tts/utils.py +++ b/parakeet/models/transformer_tts/utils.py @@ -51,7 +51,9 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): def get_non_pad_mask(seq): - return layers.unsqueeze((seq != 0).astype(np.float32), [-1]) + mask = (seq != 0).astype(np.float32) + mask = np.expand_dims(mask, axis=-1) + return mask def get_attn_key_pad_mask(seq_k, seq_q): @@ -60,8 +62,22 @@ def get_attn_key_pad_mask(seq_k, seq_q): # Expand to fit the shape of key query attention matrix. len_q = seq_q.shape[1] padding_mask = (seq_k != 0).astype(np.float32) - padding_mask = layers.expand( - layers.unsqueeze(padding_mask, [1]), [1, len_q, 1]) + padding_mask = np.expand_dims(padding_mask, axis=1) + padding_mask = padding_mask.repeat([len_q], axis=1) + padding_mask = (padding_mask == 0).astype(np.float32) * (-2**32 + 1) + return padding_mask + + +def get_dec_attn_key_pad_mask(seq_k, seq_q): + ''' For masking out the padding part of key sequence. ''' + + # Expand to fit the shape of key query attention matrix. 
+ len_q = seq_q.shape[1] + padding_mask = (seq_k == 0).astype(np.float32) + padding_mask = np.expand_dims(padding_mask, axis=1) + triu_tensor = get_triu_tensor(seq_q, seq_q) + padding_mask = padding_mask.repeat([len_q], axis=1) + triu_tensor + padding_mask = (padding_mask != 0).astype(np.float32) * (-2**32 + 1) return padding_mask diff --git a/parakeet/models/waveflow/data.py b/parakeet/models/waveflow/data.py index 0c1e914271214de93de514098872183b94843375..33e2ee55b1ef8ca3ceb6cbcdaeb425ce64914891 100644 --- a/parakeet/models/waveflow/data.py +++ b/parakeet/models/waveflow/data.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random import librosa @@ -32,7 +33,7 @@ class Dataset(ljspeech.LJSpeech): def _get_example(self, metadatum): fname, _, _ = metadatum - wav_path = self.root.joinpath("wavs", fname + ".wav") + wav_path = os.path.join(self.root, "wavs", fname + ".wav") loaded_sr, audio = read(wav_path) assert loaded_sr == self.config.sample_rate @@ -79,6 +80,7 @@ class Subset(DatasetMixin): # whole audio for valid set pass else: + # Randomly crop segment_length from audios in the training set. # audio shape: [len] if audio.shape[0] >= segment_length: max_audio_start = audio.shape[0] - segment_length diff --git a/parakeet/models/waveflow/waveflow.py b/parakeet/models/waveflow/waveflow.py index a8bd8afaae297d50b6b000d5e99daef8fd9fea6e..101bb66c0265b211f1041770133c7a7329a3dc3b 100644 --- a/parakeet/models/waveflow/waveflow.py +++ b/parakeet/models/waveflow/waveflow.py @@ -28,6 +28,25 @@ from .waveflow_modules import WaveFlowLoss, WaveFlowModule class WaveFlow(): + """Wrapper class of WaveFlow model that supports multiple APIs. + + This module provides APIs for model building, training, validation, + inference, benchmarking, and saving. + + Args: + config (obj): config info. + checkpoint_dir (str): path for checkpointing. + parallel (bool, optional): whether use multiple GPUs for training. + Defaults to False. + rank (int, optional): the rank of the process in a multi-process + scenario. Defaults to 0. + nranks (int, optional): the total number of processes. Defaults to 1. + tb_logger (obj, optional): logger to visualize metrics. + Defaults to None. + + Returns: + WaveFlow + """ def __init__(self, config, checkpoint_dir, @@ -44,6 +63,15 @@ class WaveFlow(): self.dtype = "float16" if config.use_fp16 else "float32" def build(self, training=True): + """Initialize the model. + + Args: + training (bool, optional): Whether the model is built for training or inference. + Defaults to True. + + Returns: + None + """ config = self.config dataset = LJSpeech(config, self.nranks, self.rank) self.trainloader = dataset.trainloader @@ -99,6 +127,14 @@ class WaveFlow(): self.waveflow = waveflow def train_step(self, iteration): + """Train the model for one step. + + Args: + iteration (int): current iteration number. + + Returns: + None + """ self.waveflow.train() start_time = time.time() @@ -135,6 +171,14 @@ class WaveFlow(): @dg.no_grad def valid_step(self, iteration): + """Run the model on the validation dataset. + + Args: + iteration (int): current iteration number. + + Returns: + None + """ self.waveflow.eval() tb = self.tb_logger @@ -167,6 +211,14 @@ class WaveFlow(): @dg.no_grad def infer(self, iteration): + """Run the model to synthesize audios. + + Args: + iteration (int): iteration number of the loaded checkpoint. 
+ + Returns: + None + """ self.waveflow.eval() config = self.config @@ -179,10 +231,13 @@ class WaveFlow(): mels_list = [mels for _, mels in self.validloader()] if sample is not None: mels_list = [mels_list[sample]] + else: + sample = 0 - for sample, mel in enumerate(mels_list): - filename = "{}/valid_{}.wav".format(output, sample) - print("Synthesize sample {}, save as {}".format(sample, filename)) + for idx, mel in enumerate(mels_list): + abs_idx = sample + idx + filename = "{}/valid_{}.wav".format(output, abs_idx) + print("Synthesize sample {}, save as {}".format(abs_idx, filename)) start_time = time.time() audio = self.waveflow.synthesize(mel, sigma=self.config.sigma) @@ -200,6 +255,14 @@ class WaveFlow(): @dg.no_grad def benchmark(self): + """Run the model to benchmark synthesis speed. + + Args: + None + + Returns: + None + """ self.waveflow.eval() mels_list = [mels for _, mels in self.validloader()] @@ -220,6 +283,14 @@ class WaveFlow(): print("{} X real-time".format(audio_time / syn_time)) def save(self, iteration): + """Save model checkpoint. + + Args: + iteration (int): iteration number of the model to be saved. + + Returns: + None + """ utils.save_latest_parameters(self.checkpoint_dir, iteration, self.waveflow, self.optimizer) utils.save_latest_checkpoint(self.checkpoint_dir, iteration) diff --git a/parakeet/models/waveflow/waveflow_modules.py b/parakeet/models/waveflow/waveflow_modules.py index 46dfba782b97bb9657a9e3b334b1accb25afec1a..f480cd9b3a627bf7df07e228f64fd15e59feb904 100644 --- a/parakeet/models/waveflow/waveflow_modules.py +++ b/parakeet/models/waveflow/waveflow_modules.py @@ -293,6 +293,14 @@ class Flow(dg.Layer): class WaveFlowModule(dg.Layer): + """WaveFlow model implementation. + + Args: + config (obj): model configuration parameters. + + Returns: + WaveFlowModule + """ def __init__(self, config): super(WaveFlowModule, self).__init__() self.n_flows = config.n_flows @@ -321,6 +329,22 @@ class WaveFlowModule(dg.Layer): self.perms.append(perm) def forward(self, audio, mel): + """Training forward pass. + + Use a conditioner to upsample mel spectrograms into hidden states. + These hidden states along with the audio are passed to a stack of Flow + modules to obtain the final latent variable z and a list of log scaling + variables, which are then passed to the WaveFlowLoss module to calculate + the negative log likelihood. + + Args: + audio (obj): audio samples. + mel (obj): mel spectrograms. + + Returns: + z (obj): latent variable. + log_s_list(list): list of log scaling variables. + """ mel = self.conditioner(mel) assert mel.shape[2] >= audio.shape[1] # Prune out the tail of audio/mel so that time/n_group == 0. @@ -361,6 +385,20 @@ class WaveFlowModule(dg.Layer): return z, log_s_list def synthesize(self, mel, sigma=1.0): + """Use model to synthesize waveform. + + Use a conditioner to upsample mel spectrograms into hidden states. + These hidden states along with initial random gaussian latent variable + are passed to a stack of Flow modules to obtain the audio output. + + Args: + mel (obj): mel spectrograms. + sigma (float, optional): standard deviation of the guassian latent + variable. Defaults to 1.0. + + Returns: + audio (obj): synthesized audio. 
+        """
         if self.dtype == "float16":
             mel = fluid.layers.cast(mel, self.dtype)
         mel = self.conditioner.infer(mel)
diff --git a/parakeet/models/wavenet/net.py b/parakeet/models/wavenet/net.py
index 72b9ad5c5245f101706a67dcf2068c22cfb4e759..4817bd309ef48e83e38fa70495ff1bb830a87934 100644
--- a/parakeet/models/wavenet/net.py
+++ b/parakeet/models/wavenet/net.py
@@ -27,17 +27,16 @@ from parakeet.models.wavenet.wavenet import WaveNet
 
 
 def crop(x, audio_start, audio_length):
-    """Crop mel spectrogram.
-
+    """Crop the upsampled condition to match audio_length. The upsampled condition has the same time steps as the whole audio does. But since audios are randomly sliced to 0.5 seconds while conditions are not, the upsampled condition should also be sliced to exactly match the time steps of the audio slice.
+
     Args:
-        x (Variable): shape(batch_size, channels, time_steps), the condition, upsampled mel spectrogram.
-        audio_start (int): starting point.
-        audio_length (int): length.
-
+        x (Variable): shape(B, C, T), dtype: float, the upsampled condition.
+        audio_start (Variable): shape(B, ), dtype: int64, the start index of each audio slice.
+        audio_length (int): the length of the audio (the number of samples it contains).
+
     Returns:
-        out: cropped condition.
+        Variable: shape(B, C, audio_length), cropped condition.
     """
-    # crop audio
     slices = []  # for each example
     starts = audio_start.numpy()
@@ -51,12 +50,15 @@ def crop(x, audio_start, audio_length):
 
 
 class UpsampleNet(dg.Layer):
-    """A upsampling net (bridge net) in clarinet to upsample spectrograms from frame level to sample level.
-    It consists of several(2) layers of transposed_conv2d. in time and frequency.
-    The time dim is dilated hop_length times. The frequency bands retains.
-    """
-
-    def __init__(self, upscale_factors=[16, 16]):
+    """UpsamplingNet.
+    It consists of several layers of Conv2DTranspose. Each Conv2DTranspose layer upsamples the time dimension by its `stride` times, and each Conv2DTranspose's filter_size at the frequency dimension is 3.
+
+    Args:
+        upscale_factors (list[int], optional): time upsampling factors for each Conv2DTranspose layer. The `UpsampleNet` contains len(upscale_factors) Conv2DTranspose layers. Each upscale_factor is used as the `stride` for the corresponding Conv2DTranspose. Defaults to [16, 16].
+    Note:
+        np.prod(upscale_factors) should equal the `hop_length` of the stft transformation used to extract spectrogram features from audios. For example, 16 * 16 = 256 corresponds to an stft transformation whose `hop_length` is 256. See `librosa.stft` for more details.
+    """
+    def __init__(self, upscale_factors=[16, 16]):
         super(UpsampleNet, self).__init__()
         self.upscale_factors = list(upscale_factors)
         self.upsample_convs = dg.LayerList()
@@ -74,13 +76,13 @@ class UpsampleNet(dg.Layer):
         return np.prod(self.upscale_factors)
 
     def forward(self, x):
-        """upsample local condition to match time steps of input signals. i.e. upsample mel spectrogram to match time steps for waveform, for each layer of a wavenet.
-
-        Arguments:
-            x {Variable} -- shape(batch_size, frequency, time_steps), local condition
-
+        """Compute the upsampled condition.
+
+        Args:
+            x (Variable): shape(B, F, T), dtype: float, the condition (the mel spectrogram here), where F means the frequency bands. In the internal Conv2DTransposes, the frequency dimension is treated as the `height` dimension instead of `in_channels`.
+
         Returns:
-            Variable -- shape(batch_size, frequency, time_steps * np.prod(upscale_factors)), upsampled condition for each layer.
+ Variable: shape(B, F, T * upscale_factor), dtype: float, the upsampled condition. """ x = F.unsqueeze(x, axes=[1]) for sublayer in self.upsample_convs: @@ -91,27 +93,31 @@ class UpsampleNet(dg.Layer): # AutoRegressive Model class ConditionalWavenet(dg.Layer): - def __init__(self, encoder: UpsampleNet, decoder: WaveNet): + def __init__(self, encoder, decoder): + """Conditional Wavenet, which contains an UpsampleNet as the encoder and a WaveNet as the decoder. It is an autoregressive model. + + Args: + encoder (UpsampleNet): the UpsampleNet as the encoder. + decoder (WaveNet): the WaveNet as the decoder. + """ super(ConditionalWavenet, self).__init__() self.encoder = encoder self.decoder = decoder def forward(self, audio, mel, audio_start): - """forward - - Arguments: - audio {Variable} -- shape(batch_size, time_steps), waveform of 0.5 seconds - mel {Variable} -- shape(batch_size, frequency_bands, frames), mel spectrogram of the whole sentence - audio_start {Variable} -- shape(batch_size, ), audio start positions - + """Compute the output distribution given the mel spectrogram and the input(for teacher force training). + + Args: + audio (Variable): shape(B, T_audio), dtype: float, ground truth waveform, used for teacher force training. + mel ([Variable): shape(B, F, T_mel), dtype: float, mel spectrogram. Note that it is the spectrogram for the whole utterance. + audio_start (Variable): shape(B, ), dtype: int, audio slices' start positions for each utterance. + Returns: - Variable -- shape(batch_size, time_steps - 1, output_dim), output distribution parameters + Variable: shape(B, T_audio - 1, C_putput), parameters for the output distribution.(C_output is the `output_dim` of the decoder.) """ - audio_length = audio.shape[1] # audio clip's length condition = self.encoder(mel) - condition_slice = crop(condition, audio_start, - audio_length) # crop audio + condition_slice = crop(condition, audio_start, audio_length) # shifting 1 step audio = audio[:, :-1] @@ -121,43 +127,41 @@ class ConditionalWavenet(dg.Layer): return y def loss(self, y, t): - """compute loss - - Arguments: - y {Variable} -- shape(batch_size, time_steps - 1, output_dim), output distribution parameters - t {Variable} -- shape(batch_size, time_steps), target waveform - + """compute loss with respect to the output distribution and the targer audio. + + Args: + y (Variable): shape(B, T - 1, C_output), dtype: float, parameters of the output distribution. + t (Variable): shape(B, T), dtype: float, target waveform. + Returns: - Variable -- shape(1, ), reduced loss + Variable: shape(1, ), dtype: float, the loss. """ t = t[:, 1:] loss = self.decoder.loss(y, t) return loss def sample(self, y): - """sample from output distribution - - Arguments: - y {Variable} -- shape(batch_size, time_steps, output_dim), output distribution parameters - + """Sample from the output distribution. + + Args: + y (Variable): shape(B, T, C_output), dtype: float, parameters of the output distribution. + Returns: - Variable -- shape(batch_size, time_steps) samples + Variable: shape(B, T), dtype: float, sampled waveform from the output distribution. """ - samples = self.decoder.sample(y) return samples @dg.no_grad def synthesis(self, mel): - """synthesize waveform from mel spectrogram - - Arguments: - mel {Variable} -- shape(batch_size, frequency_bands, frames), mel-spectrogram - + """Synthesize waveform from mel spectrogram. + + Args: + mel (Variable): shape(B, F, T), condition(mel spectrogram here). 
+
         Returns:
-            Variable -- shape(batch_size, time_steps), synthesized waveform.
+            Variable: shape(B, T * upscale_factor), synthesized waveform, where `upscale_factor` is the `upscale_factor` of the encoder `UpsampleNet`.
         """
-
         condition = self.encoder(mel)
         batch_size, _, time_steps = condition.shape
         samples = []
diff --git a/parakeet/models/wavenet/wavenet.py b/parakeet/models/wavenet/wavenet.py
index 4c355f406d1cafc81e17048b9470339a465190f9..da369d1ed49523275a8eb29d6eb47d6b17eff3a8 100644
--- a/parakeet/models/wavenet/wavenet.py
+++ b/parakeet/models/wavenet/wavenet.py
@@ -27,11 +27,29 @@ from parakeet.modules.weight_norm import Linear, Conv1D, Conv1DCell, Conv2DTrans
 
 
 # for wavenet with softmax loss
 def quantize(values, n_bands):
+    """Linearly quantize a float Tensor in [-1, 1) to an integer Tensor in [0, n_bands).
+
+    Args:
+        values (Variable): dtype: float32 or float64, the floating point value.
+        n_bands (int): the number of bands. The output integer Tensor's values are in the range [0, n_bands).
+
+    Returns:
+        Variable: the quantized tensor, dtype: int64.
+    """
     quantized = F.cast((values + 1.0) / 2.0 * n_bands, "int64")
     return quantized
 
 
 def dequantize(quantized, n_bands):
+    """Linearly dequantize an integer Tensor into a float Tensor in the range [-1, 1).
+
+    Args:
+        quantized (Variable): dtype: int64. The quantized value in the range [0, n_bands).
+        n_bands (int): number of bands. The input integer Tensor's values are in the range [0, n_bands).
+
+    Returns:
+        Variable: the dequantized tensor, dtype: float32.
+    """
     value = (F.cast(quantized, "float32") + 0.5) * (2.0 / n_bands) - 1.0
     return value
 
 
@@ -39,6 +57,14 @@ def dequantize(quantized, n_bands):
 class ResidualBlock(dg.Layer):
     def __init__(self, residual_channels, condition_dim, filter_size,
                  dilation):
+        """A residual block in wavenet. It does not have parametric residual or skip connection. It consists of a Conv1DCell and a Conv1D(filter_size = 1) to integrate the condition.
+
+        Args:
+            residual_channels (int): the channels of the input, residual and skip.
+            condition_dim (int): the channels of the condition.
+            filter_size (int): filter size of the internal convolution cell.
+            dilation (int): dilation of the internal convolution cell.
+        """
         super(ResidualBlock, self).__init__()
         dilated_channels = 2 * residual_channels
         # following clarinet's implementation, we do not have parametric residual
@@ -64,17 +90,16 @@ class ResidualBlock(dg.Layer):
         self.condition_dim = condition_dim
 
     def forward(self, x, condition=None):
-        """Conv1D gated tanh Block
-
-        Arguments:
-            x {Variable} -- shape(batch_size, residual_channels, time_steps), the input.
-
-        Keyword Arguments:
-            condition {Variable} -- shape(batch_size, condition_dim, time_steps), upsampled local condition, it has the shape time steps as the input x. (default: {None})
-
+        """Conv1D gated-tanh block.
+
+        Args:
+            x (Variable): shape(B, C_res, T), dtype: float, the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.)
+            condition (Variable, optional): shape(B, C_cond, T), the condition; it has been upsampled in time steps, so it has the same time steps as the input does. (C_cond stands for the condition's channels.) Defaults to None.
+
         Returns:
-            Variable -- shape(batch_size, residual_channels, time_steps), the output which is used as the input of the next layer.
-            Variable -- shape(batch_size, residual_channels, time_steps), the output which is stacked alongside with other layers' as the output of wavenet.
+ (residual, skip_connection) + residual (Variable): shape(B, C_res, T), the residual, which is used as the input to the next layer of ResidualBlock. + skip_connection (Variable): shape(B, C_res, T), the skip connection. This output is accumulated with that of other ResidualBlocks. """ time_steps = x.shape[-1] h = x @@ -98,20 +123,21 @@ class ResidualBlock(dg.Layer): return residual, skip_connection def start_sequence(self): + """Prepare the ResidualBlock to generate a new sequence. This method should be called before starting calling `add_input` multiple times. + """ self.conv.start_sequence() def add_input(self, x, condition=None): - """add a step input. - - Arguments: - x {Variable} -- shape(batch_size, in_channels, time_steps=1), step input - - Keyword Arguments: - condition {Variable} -- shape(batch_size, condition_dim, time_steps=1) (default: {None}) - + """Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion. + + Args: + x (Variable): shape(B, C_res, T=1), input for a step, dtype: float. + condition (Variable, optional): shape(B, C_cond, T=1). condition for a step, dtype: float. Defaults to None. + Returns: - Variable -- shape(batch_size, in_channels, time_steps=1), residual connection, which is the input for the next layer - Variable -- shape(batch_size, in_channels, time_steps=1), skip connection + (residual, skip_connection) + residual (Variable): shape(B, C_res, T=1), the residual for a step, which is used as the input to the next layer of ResidualBlock. + skip_connection (Variable): shape(B, C_res, T=1), the skip connection for a step. This output is accumulated with that of other ResidualBlocks. """ h = x @@ -135,6 +161,15 @@ class ResidualBlock(dg.Layer): class ResidualNet(dg.Layer): def __init__(self, n_loop, n_layer, residual_channels, condition_dim, filter_size): + """The residual network in wavenet. It consists of `n_layer` stacks, each of which consists of `n_loop` ResidualBlocks. + + Args: + n_loop (int): number of ResidualBlocks in a stack. + n_layer (int): number of stacks in the `ResidualNet`. + residual_channels (int): channels of each `ResidualBlock`'s input. + condition_dim (int): channels of the condition. + filter_size (int): filter size of the internal Conv1DCell of each `ResidualBlock`. + """ super(ResidualNet, self).__init__() # double the dilation at each layer in a loop(n_loop layers) dilations = [2**i for i in range(n_loop)] * n_layer @@ -145,19 +180,14 @@ class ResidualNet(dg.Layer): ]) def forward(self, x, condition=None): - """n_layer layers of n_loop Residual Blocks. - - Arguments: - x {Variable} -- shape(batch_size, residual_channels, time_steps), input of the residual net. - - Keyword Arguments: - condition {Variable} -- shape(batch_size, condition_dim, time_steps), upsampled conditions, which has the same time steps as the input. (default: {None}) - - Returns: - Variable -- shape(batch_size, skip_channels, time_steps), output of the residual net. """ + Args: + x (Variable): shape(B, C_res, T), dtype: float, the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.) + condition (Variable, optional): shape(B, C_cond, T), dtype: float, the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels) Defaults to None. - #before_resnet = time.time() + Returns: + skip_connection (Variable): shape(B, C_res, T), dtype: float, the output. 
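+                It is the sum of the skip connections from all ResidualBlocks; the running sum is rescaled by np.sqrt(0.5) after each addition.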
+ """ for i, func in enumerate(self.residual_blocks): x, skip = func(x, condition) if i == 0: @@ -165,24 +195,23 @@ class ResidualNet(dg.Layer): else: skip_connections = F.scale(skip_connections + skip, np.sqrt(0.5)) - #print("resnet: ", time.time() - before_resnet) return skip_connections def start_sequence(self): + """Prepare the ResidualNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times. + """ for block in self.residual_blocks: block.start_sequence() def add_input(self, x, condition=None): - """add step input and return step output. - - Arguments: - x {Variable} -- shape(batch_size, residual_channels, time_steps=1), step input. - - Keyword Arguments: - condition {Variable} -- shape(batch_size, condition_dim, time_steps=1), step condition (default: {None}) - + """Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion. + + Args: + x (Variable): shape(B, C_res, T=1), dtype: float, input for a step. + condition (Variable, optional): shape(B, C_cond, T=1), dtype: float, condition for a step. Defaults to None. + Returns: - Variable -- shape(batch_size, skip_channels, time_steps=1), step output, parameters of the output distribution. + skip_connection (Variable): shape(B, C_res, T=1), dtype: float, the output for a step. """ for i, func in enumerate(self.residual_blocks): @@ -198,6 +227,18 @@ class ResidualNet(dg.Layer): class WaveNet(dg.Layer): def __init__(self, n_loop, n_layer, residual_channels, output_dim, condition_dim, filter_size, loss_type, log_scale_min): + """Wavenet that transform upsampled mel spectrogram into waveform. + + Args: + n_loop (int): n_loop for the internal ResidualNet. + n_layer (int): n_loop for the internal ResidualNet. + residual_channels (int): the channel of the input. + output_dim (int): the channel of the output distribution. + condition_dim (int): the channel of the condition. + filter_size (int): the filter size of the internal ResidualNet. + loss_type (str): loss type of the wavenet. Possible values are 'softmax' and 'mog'. If `loss_type` is 'softmax', the output is the logits of the catrgotical(multinomial) distribution, `output_dim` means the number of classes of the categorical distribution. If `loss_type` is mog(mixture of gaussians), the output is the parameters of a mixture of gaussians, which consists of weight(in the form of logit) of each gaussian distribution and its mean and log standard deviaton. So when `loss_type` is 'mog', `output_dim` should be perfectly divided by 3. + log_scale_min (int): the minimum value of log standard deviation of the output gaussian distributions. Note that this value is only used for computing loss if `loss_type` is 'mog', values less than `log_scale_min` is clipped when computing loss. + """ super(WaveNet, self).__init__() if loss_type not in ["softmax", "mog"]: raise ValueError("loss_type {} is not supported".format(loss_type)) @@ -225,19 +266,16 @@ class WaveNet(dg.Layer): self.log_scale_min = log_scale_min def forward(self, x, condition=None): - """(Possibly) Conditonal Wavenet. - - Arguments: - x {Variable} -- shape(batch_size, time_steps), the input signal of wavenet. The waveform in 0.5 seconds. - - Keyword Arguments: - conditions {Variable} -- shape(batch_size, condition_dim, 1, time_steps), the upsampled local condition. (default: {None}) - + """compute the output distribution (represented by its parameters). + + Args: + x (Variable): shape(B, T), dtype: float, the input waveform. 
+ condition (Variable, optional): shape(B, C_cond, T), dtype: float, the upsampled condition. Defaults to None. + Returns: - Variable -- shape(batch_size, time_steps, output_dim), output distributions at each time_steps. + Variable: shape(B, T, C_output), dtype: float, the parameter of the output distributions. """ - # CAUTION: rank-4 condition here # Causal Conv if self.loss_type == "softmax": x = F.clip(x, min=-1., max=0.99999) @@ -258,21 +296,20 @@ class WaveNet(dg.Layer): return y def start_sequence(self): + """Prepare the WaveNet to generate a new sequence. This method should be called before starting calling `add_input` multiple times. + """ self.resnet.start_sequence() def add_input(self, x, condition=None): - """add step input - - Arguments: - x {Variable} -- shape(batch_size, time_steps=1), step input. - - Keyword Arguments: - condition {Variable} -- shape(batch_size, condition_dim , 1, time_steps=1) (default: {None}) - + """compute the output distribution (represented by its parameters) for a step. It works similarily with the `forward` method but in a `step-in-step-out` fashion. + + Args: + x (Variable): shape(B, T=1), dtype: float, a step of the input waveform. + condition (Variable, optional): shape(B, C_cond, T=1), dtype: float, a step of the upsampled condition. Defaults to None. + Returns: - Variable -- ouput parameter for the distribution. + Variable: shape(B, T=1, C_output), dtype: float, the parameter of the output distributions. """ - # Causal Conv if self.loss_type == "softmax": x = quantize(x, self.output_dim) @@ -292,16 +329,15 @@ class WaveNet(dg.Layer): return y def compute_softmax_loss(self, y, t): - """compute loss, it is basically a language_model-like loss. - - Arguments: - y {Variable} -- shape(batch_size, time_steps - 1, output_dim), output distribution of multinomial distribution. - t {Variable} -- shape(batch_size, time_steps - 1), target waveform. - + """compute the loss where output distribution is a categorial distribution. + + Args: + y (Variable): shape(B, T, C_output), dtype: float, the logits of the output distribution. + t (Variable): shape(B, T), dtype: float, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution. And output distribution whose input contains padding is neglected in loss computation. + Returns: - Variable -- shape(1,), loss + Variable: shape(1, ), dtype: float, the loss. """ - # context size is not taken into account y = y[:, self.context_size:, :] t = t[:, self.context_size:] @@ -314,15 +350,14 @@ class WaveNet(dg.Layer): return reduced_loss def sample_from_softmax(self, y): - """sample from output distribution. - - Arguments: - y {Variable} -- shape(batch_size, time_steps - 1, output_dim), output distribution. - + """Sample from the output distribution where the output distribution is a categorical distriobution. + + Args: + y (Variable): shape(B, T, C_output), the logits of the output distribution + Returns: - Variable -- shape(batch_size, time_steps - 1), samples. + Variable: shape(B, T), waveform sampled from the output distribution. """ - # dequantize batch_size, time_steps, output_dim, = y.shape y = F.reshape(y, (batch_size * time_steps, output_dim)) @@ -333,17 +368,15 @@ class WaveNet(dg.Layer): return samples def compute_mog_loss(self, y, t): - """compute the loss with an mog output distribution. - WARNING: this is not a legal probability, but a density. so it might be greater than 1. 
-
-        Arguments:
-            y {Variable} -- shape(batch_size, time_steps, output_dim), output distribution's parameter. To represent a mixture of Gaussians. The output for each example at each time_step consists of 3 parts. The mean, the stddev, and a weight for that gaussian.
-            t {Variable} -- shape(batch_size, time_steps), target waveform.
+        """compute the loss where the output distribution is a mixture of Gaussians.
+
+        Args:
+            y (Variable): shape(B, T, C_output), dtype: float, the parameters of the output distribution. It is the concatenation of 3 parts: the logit of each Gaussian, the mean of each Gaussian, and the log standard deviation of each Gaussian. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
+            t (Variable): shape(B, T), dtype: float, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution, and output distributions whose inputs contain padding are neglected in loss computation.
 
         Returns:
-            Variable -- loss, note that it is computed with the pdf of the MoG distribution.
+            Variable: shape(1, ), dtype: float, the loss.
         """
-
         n_mixture = self.output_dim // 3
 
         # context size is not taken in to account
@@ -373,15 +406,13 @@ class WaveNet(dg.Layer):
         return loss
 
     def sample_from_mog(self, y):
-        """sample from output distribution.
-
-        Arguments:
-            y {Variable} -- shape(batch_size, time_steps - 1, output_dim), output distribution.
-
+        """Sample from the output distribution where the output distribution is a mixture of Gaussians.
+        Args:
+            y (Variable): shape(B, T, C_output), dtype: float, the parameters of the output distribution. It is the concatenation of 3 parts: the logit of each Gaussian, the mean of each Gaussian, and the log standard deviation of each Gaussian. Each part's shape is (B, T, n_mixture), where `n_mixture` means the number of Gaussians in the mixture.
+
         Returns:
-            Variable -- shape(batch_size, time_steps - 1), samples.
+            Variable: shape(B, T), waveform sampled from the output distribution.
         """
-
         batch_size, time_steps, output_dim = y.shape
         n_mixture = output_dim // 3
 
@@ -405,31 +436,28 @@ class WaveNet(dg.Layer):
         return samples
 
     def sample(self, y):
-        """sample from output distribution.
-
-        Arguments:
-            y {Variable} -- shape(batch_size, time_steps - 1, output_dim), output distribution.
-
+        """Sample from the output distribution.
+        Args:
+            y (Variable): shape(B, T, C_output), dtype: float, the parameters of the output distribution.
+
         Returns:
-            Variable -- shape(batch_size, time_steps - 1), samples.
+            Variable: shape(B, T), waveform sampled from the output distribution.
         """
-
         if self.loss_type == "softmax":
             return self.sample_from_softmax(y)
         else:
             return self.sample_from_mog(y)
 
     def loss(self, y, t):
-        """compute loss.
-
-        Arguments:
-            y {Variable} -- shape(batch_size, time_steps - 1, output_dim), output distribution of multinomial distribution.
-            t {Variable} -- shape(batch_size, time_steps - 1), target waveform.
-
+        """compute the loss given the output distribution and the target waveform; it dispatches to compute_softmax_loss or compute_mog_loss according to `loss_type`.
+
+        Args:
+            y (Variable): shape(B, T, C_output), dtype: float, the parameters of the output distribution.
+            t (Variable): shape(B, T), dtype: float, the target audio. Note that the target's corresponding time index is one step ahead of the output distribution, and output distributions whose inputs contain padding are neglected in loss computation.
+
         Returns:
-            Variable -- shape(1,), loss
+            Variable: shape(1, ), dtype: float, the loss.
""" - if self.loss_type == "softmax": return self.compute_softmax_loss(y, t) else: diff --git a/parakeet/modules/dynamic_gru.py b/parakeet/modules/dynamic_gru.py index 3a6602e3dddf2a18c69d0b9741ec2d6b3b5fe5e7..9e55688fedf3adb45dce67750ec9cd98ebf46cc0 100644 --- a/parakeet/modules/dynamic_gru.py +++ b/parakeet/modules/dynamic_gru.py @@ -53,11 +53,9 @@ class DynamicGRU(dg.Layer): if self.is_reverse: i = inputs.shape[1] - 1 - i input_ = inputs[:, i:i + 1, :] - input_ = layers.reshape( - input_, [-1, input_.shape[2]], inplace=False) + input_ = layers.reshape(input_, [-1, input_.shape[2]]) hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False) + hidden_ = layers.reshape(hidden, [-1, 1, hidden.shape[1]]) res.append(hidden_) if self.is_reverse: res = res[::-1] diff --git a/parakeet/modules/ffn.py b/parakeet/modules/ffn.py index 3fa8c16e9e97868d6df27f3b2fb3ff8b21d909be..fe39d3cec8a721191180cb31e919033f6dd935a8 100644 --- a/parakeet/modules/ffn.py +++ b/parakeet/modules/ffn.py @@ -71,7 +71,8 @@ class PositionwiseFeedForward(dg.Layer): x = self.w_2(layers.relu(self.w_1(x))) # dropout - x = layers.dropout(x, self.dropout) + x = layers.dropout( + x, self.dropout, dropout_implementation='upscale_in_train') x = layers.transpose(x, [0, 2, 1]) # residual connection diff --git a/parakeet/modules/modules.py b/parakeet/modules/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..72a8d2dfefb26ad67a269b32feb73cdf2d7ecba6 --- /dev/null +++ b/parakeet/modules/modules.py @@ -0,0 +1,610 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import fluid +import paddle.fluid.dygraph as dg + +import numpy as np + +from . import conv +from . 
import weight_norm + + +def FC(name_scope, + in_features, + size, + num_flatten_dims=1, + relu=False, + dropout=0.0, + epsilon=1e-30, + act=None, + is_test=False, + dtype="float32"): + """ + A special Linear Layer, when it is used with dropout, the weight is + initialized as normal(0, std=np.sqrt((1-dropout) / in_features)) + """ + + # stds + if isinstance(in_features, int): + in_features = [in_features] + + stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features] + if relu: + stds = [std * np.sqrt(2.0) for std in stds] + + weight_inits = [ + fluid.initializer.NormalInitializer(scale=std) for std in stds + ] + bias_init = fluid.initializer.ConstantInitializer(0.0) + + # param attrs + weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits] + bias_attr = fluid.ParamAttr(initializer=bias_init) + + layer = weight_norm.FC(name_scope, + size, + num_flatten_dims=num_flatten_dims, + param_attr=weight_attrs, + bias_attr=bias_attr, + act=act, + dtype=dtype) + return layer + + +def Conv1D(name_scope, + in_channels, + num_filters, + filter_size=3, + dilation=1, + groups=None, + causal=False, + std_mul=1.0, + dropout=0.0, + use_cudnn=True, + act=None, + dtype="float32"): + """ + A special Conv1D Layer, when it is used with dropout, the weight is + initialized as + normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features))) + """ + # std + std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels)) + weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std) + bias_init = fluid.initializer.ConstantInitializer(0.0) + + # param attrs + weight_attr = fluid.ParamAttr(initializer=weight_init) + bias_attr = fluid.ParamAttr(initializer=bias_init) + + layer = conv.Conv1D( + name_scope, + in_channels, + num_filters, + filter_size, + dilation, + groups=groups, + causal=causal, + param_attr=weight_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + return layer + + +def Embedding(name_scope, + num_embeddings, + embed_dim, + is_sparse=False, + is_distributed=False, + padding_idx=None, + std=0.01, + dtype="float32"): + # param attrs + weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=std)) + layer = dg.Embedding( + name_scope, (num_embeddings, embed_dim), + padding_idx=padding_idx, + param_attr=weight_attr, + dtype=dtype) + return layer + + +class Conv1DGLU(dg.Layer): + """ + A Convolution 1D block with GLU activation. It also applys dropout for the + input x. It fuses speaker embeddings through a FC activated by softsign. It + has residual connection from the input x, and scale the output by + np.sqrt(0.5). 
+ """ + + def __init__(self, + name_scope, + n_speakers, + speaker_dim, + in_channels, + num_filters, + filter_size, + dilation, + std_mul=4.0, + dropout=0.0, + causal=False, + residual=True, + dtype="float32"): + super(Conv1DGLU, self).__init__(name_scope, dtype=dtype) + + # conv spec + self.in_channels = in_channels + self.n_speakers = n_speakers + self.speaker_dim = speaker_dim + self.num_filters = num_filters + self.filter_size = filter_size + self.dilation = dilation + self.causal = causal + self.residual = residual + + # weight init and dropout + self.std_mul = std_mul + self.dropout = dropout + + if residual: + assert ( + in_channels == num_filters + ), "this block uses residual connection"\ + "the input_channes should equals num_filters" + + self.conv = Conv1D( + self.full_name(), + in_channels, + 2 * num_filters, + filter_size, + dilation, + causal=causal, + std_mul=std_mul, + dropout=dropout, + dtype=dtype) + + if n_speakers > 1: + assert (speaker_dim is not None + ), "speaker embed should not be null in multi-speaker case" + self.fc = Conv1D( + self.full_name(), + speaker_dim, + num_filters, + filter_size=1, + dilation=1, + causal=False, + act="softsign", + dtype=dtype) + + def forward(self, x, speaker_embed_bc1t=None): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU + layer, where B means batch_size, C_in means the input channels + T means input time steps. + speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded + speaker embed, where C_sp means speaker embedding size. Note + that when using residual connection, the Conv1DGLU does not + change the number of channels, so out channels equals input + channels. + + Returns: + x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where + C_out means the output channels of Conv1DGLU. 
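+                When `residual` is True, the output is the sum of the gated output and the input x, scaled by np.sqrt(0.5).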
+ """ + + residual = x + x = fluid.layers.dropout(x, self.dropout) + x = self.conv(x) + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + if speaker_embed_bc1t is not None: + sp = self.fc(speaker_embed_bc1t) + content = content + sp + + # glu + x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) + + if self.residual: + x = fluid.layers.scale(x + residual, np.sqrt(0.5)) + return x + + def add_input(self, x, speaker_embed_bc11=None): + """ + Inputs: + x: shape(B, num_filters, 1, time_steps) + speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps) + + Outputs: + out: shape(B, num_filters, 1, time_steps), where time_steps = 1 + """ + + residual = x + + # add step input and produce step output + x = fluid.layers.dropout(x, self.dropout) + x = self.conv.add_input(x) + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + if speaker_embed_bc11 is not None: + sp = self.fc(speaker_embed_bc11) + content = content + sp + + x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) + + if self.residual: + x = fluid.layers.scale(x + residual, np.sqrt(0.5)) + return x + + +def Conv1DTranspose(name_scope, + in_channels, + num_filters, + filter_size, + padding=0, + stride=1, + dilation=1, + groups=None, + std_mul=1.0, + dropout=0.0, + use_cudnn=True, + act=None, + dtype="float32"): + std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size)) + weight_init = fluid.initializer.NormalInitializer(scale=std) + weight_attr = fluid.ParamAttr(initializer=weight_init) + bias_init = fluid.initializer.ConstantInitializer(0.0) + bias_attr = fluid.ParamAttr(initializer=bias_init) + layer = conv.Conv1DTranspose( + name_scope, + in_channels, + num_filters, + filter_size, + padding=padding, + stride=stride, + dilation=dilation, + groups=groups, + param_attr=weight_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + return layer + + +def compute_position_embedding(rad): + # rad is a transposed radius, shape(embed_dim, n_vocab) + embed_dim, n_vocab = rad.shape + + even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32")) + odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32")) + + even_rads = fluid.layers.gather(rad, even_dims) + odd_rads = fluid.layers.gather(rad, odd_dims) + + sines = fluid.layers.sin(even_rads) + cosines = fluid.layers.cos(odd_rads) + + temp = fluid.layers.scatter(rad, even_dims, sines) + out = fluid.layers.scatter(temp, odd_dims, cosines) + out = fluid.layers.transpose(out, perm=[1, 0]) + return out + + +def position_encoding_init(n_position, + d_pos_vec, + position_rate=1.0, + sinusoidal=True): + """ Init the sinusoid position encoding table """ + + # keep idx 0 for padding token position encoding zero vector + position_enc = np.array([[ + position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) + for i in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + + if sinusoidal: + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + + return position_enc + + +class PositionEmbedding(dg.Layer): + def __init__(self, + name_scope, + n_position, + d_pos_vec, + position_rate=1.0, + is_sparse=False, + is_distributed=False, + param_attr=None, + max_norm=None, + padding_idx=None, + dtype="float32"): + super(PositionEmbedding, self).__init__(name_scope, dtype=dtype) + self.embed = dg.Embedding( + self.full_name(), + size=(n_position, d_pos_vec), + 
is_sparse=is_sparse, + is_distributed=is_distributed, + padding_idx=None, + param_attr=param_attr, + dtype=dtype) + self.set_weight( + position_encoding_init( + n_position, + d_pos_vec, + position_rate=position_rate, + sinusoidal=False).astype(dtype)) + + self._is_sparse = is_sparse + self._is_distributed = is_distributed + self._remote_prefetch = self._is_sparse and (not self._is_distributed) + if self._remote_prefetch: + assert self._is_sparse is True and self._is_distributed is False + + self._padding_idx = (-1 if padding_idx is None else padding_idx if + padding_idx >= 0 else (n_position + padding_idx)) + self._position_rate = position_rate + self._max_norm = max_norm + self._dtype = dtype + + def set_weight(self, array): + assert self.embed._w.shape == list(array.shape), "shape does not match" + self.embed._w._ivar.value().get_tensor().set( + array, fluid.framework._current_expected_place()) + + def forward(self, indices, speaker_position_rate=None): + """ + Args: + indices (Variable): Shape (B, T, 1), dtype: int64, position + indices, where B means the batch size, T means the time steps. + speaker_position_rate (Variable | float, optional), position + rate. It can be a float point number or a Variable with + shape (1,), then this speaker_position_rate is used for every + example. It can also be a Variable with shape (B, 1), which + contains a speaker position rate for each speaker. + Returns: + out (Variable): Shape(B, C_pos), position embedding, where C_pos + means position embedding size. + """ + rad = fluid.layers.transpose(self.embed._w, perm=[1, 0]) + batch_size = indices.shape[0] + + if speaker_position_rate is None: + weight = compute_position_embedding(rad) + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="lookup_table", + inputs={"Ids": indices, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + "is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": + self._padding_idx, # special value for lookup table op + }) + return out + + elif (np.isscalar(speaker_position_rate) or + isinstance(speaker_position_rate, fluid.framework.Variable) and + speaker_position_rate.shape == [1, 1]): + # # make a weight + # scale the weight (the operand for sin & cos) + if np.isscalar(speaker_position_rate): + scaled_rad = fluid.layers.scale(rad, speaker_position_rate) + else: + scaled_rad = fluid.layers.elementwise_mul( + rad, speaker_position_rate[0]) + weight = compute_position_embedding(scaled_rad) + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="lookup_table", + inputs={"Ids": indices, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + "is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": + self._padding_idx, # special value for lookup table op + }) + return out + + elif np.prod(speaker_position_rate.shape) > 1: + assert speaker_position_rate.shape == [batch_size, 1] + outputs = [] + for i in range(batch_size): + rate = speaker_position_rate[i] # rate has shape [1] + scaled_rad = fluid.layers.elementwise_mul(rad, rate) + weight = compute_position_embedding(scaled_rad) + out = self._helper.create_variable_for_type_inference( + self._dtype) + sequence = indices[i] + self._helper.append_op( + type="lookup_table", + inputs={"Ids": sequence, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + 
"is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": -1, + }) + outputs.append(out) + out = fluid.layers.stack(outputs) + return out + else: + raise Exception("Then you can just use position rate at init") + + +class Conv1D_GU(dg.Layer): + def __init__(self, + name_scope, + conditioner_dim, + in_channels, + num_filters, + filter_size, + dilation, + causal=False, + residual=True, + dtype="float32"): + super(Conv1D_GU, self).__init__(name_scope, dtype=dtype) + + self.conditioner_dim = conditioner_dim + self.in_channels = in_channels + self.num_filters = num_filters + self.filter_size = filter_size + self.dilation = dilation + self.causal = causal + self.residual = residual + + if residual: + assert ( + in_channels == num_filters + ), "this block uses residual connection"\ + "the input_channels should equals num_filters" + + self.conv = Conv1D( + self.full_name(), + in_channels, + 2 * num_filters, + filter_size, + dilation, + causal=causal, + dtype=dtype) + + self.fc = Conv1D( + self.full_name(), + conditioner_dim, + 2 * num_filters, + filter_size=1, + dilation=1, + causal=False, + dtype=dtype) + + def forward(self, x, skip=None, conditioner=None): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU + layer, where B means batch_size, C_in means the input channels + T means input time steps. + skip (Variable): Shape(B, C_in, 1, T), skip connection. + conditioner (Variable): Shape(B, C_con, 1, T), expanded mel + conditioner, where C_con is conditioner hidden dim which + equals the num of mel bands. Note that when using residual + connection, the Conv1D_GU does not change the number of + channels, so out channels equals input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where + C_out means the output channels of Conv1D_GU. + skip (Variable): Shape(B, C_out, 1, T), skip connection. + """ + residual = x + x = self.conv(x) + + if conditioner is not None: + cond_bias = self.fc(conditioner) + x += cond_bias + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + # Gated Unit. + x = fluid.layers.elementwise_mul( + fluid.layers.sigmoid(gate), fluid.layers.tanh(content)) + + if skip is None: + skip = x + else: + skip = fluid.layers.scale(skip + x, np.sqrt(0.5)) + + if self.residual: + x = fluid.layers.scale(residual + x, np.sqrt(0.5)) + + return x, skip + + def add_input(self, x, skip=None, conditioner=None): + """ + Inputs: + x: shape(B, num_filters, 1, time_steps) + skip: shape(B, num_filters, 1, time_steps), skip connection + conditioner: shape(B, conditioner_dim, 1, time_steps) + Outputs: + x: shape(B, num_filters, 1, time_steps), where time_steps = 1 + skip: skip connection, same shape as x + """ + residual = x + + # add step input and produce step output + x = self.conv.add_input(x) + + if conditioner is not None: + cond_bias = self.fc(conditioner) + x += cond_bias + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + # Gated Unit. 
+ x = fluid.layers.elementwise_mul( + fluid.layers.sigmoid(gate), fluid.layers.tanh(content)) + + if skip is None: + skip = x + else: + skip = fluid.layers.scale(skip + x, np.sqrt(0.5)) + + if self.residual: + x = fluid.layers.scale(residual + x, np.sqrt(0.5)) + + return x, skip + + +def Conv2DTranspose(name_scope, + num_filters, + filter_size, + padding=0, + stride=1, + dilation=1, + use_cudnn=True, + act=None, + dtype="float32"): + val = 1.0 / (filter_size[0] * filter_size[1]) + weight_init = fluid.initializer.ConstantInitializer(val) + weight_attr = fluid.ParamAttr(initializer=weight_init) + + layer = weight_norm.Conv2DTranspose( + name_scope, + num_filters, + filter_size=filter_size, + padding=padding, + stride=stride, + dilation=dilation, + param_attr=weight_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + + return layer diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py index 89783b987bc391b7669f5f543b9085668c821c99..624d3ae6ecd8419af16769a28880239774bd2758 100644 --- a/parakeet/modules/multihead_attention.py +++ b/parakeet/modules/multihead_attention.py @@ -78,17 +78,15 @@ class ScaledDotProductAttention(dg.Layer): """ # Compute attention score attention = layers.matmul( - query, key, transpose_y=True) #transpose the last dim in y - attention = attention / math.sqrt(self.d_key) + query, key, transpose_y=True, alpha=self.d_key + **-0.5) #transpose the last dim in y # Mask key to ignore padding if mask is not None: - attention = attention * mask - mask = (mask == 0).astype(np.float32) * (-2**32 + 1) attention = attention + mask - attention = layers.softmax(attention) - attention = layers.dropout(attention, dropout) + attention = layers.dropout( + attention, dropout, dropout_implementation='upscale_in_train') # Mask query to ignore padding if query_mask is not None: @@ -142,17 +140,11 @@ class MultiheadAttention(dg.Layer): result (Variable), Shape(B, T, C), the result of mutihead attention. attention (Variable), Shape(n_head * B, T, C), the attention of key. """ + batch_size = key.shape[0] seq_len_key = key.shape[1] seq_len_query = query_input.shape[1] - # repeat masks h times - if query_mask is not None: - query_mask = layers.expand(query_mask, - [self.num_head, 1, seq_len_key]) - if mask is not None: - mask = layers.expand(mask, (self.num_head, 1, 1)) - # Make multihead attention # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) key = layers.reshape( @@ -176,6 +168,18 @@ class MultiheadAttention(dg.Layer): result, attention = self.scal_attn( key, value, query, mask=mask, query_mask=query_mask) + key = layers.reshape( + layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k]) + value = layers.reshape( + layers.transpose(value, [2, 0, 1, 3]), + [-1, seq_len_key, self.d_k]) + query = layers.reshape( + layers.transpose(query, [2, 0, 1, 3]), + [-1, seq_len_query, self.d_q]) + + result, attention = self.scal_attn( + key, value, query, mask=mask, query_mask=query_mask) + # concat all multihead result result = layers.reshape( result, [self.num_head, batch_size, seq_len_query, self.d_q]) @@ -184,7 +188,10 @@ class MultiheadAttention(dg.Layer): [batch_size, seq_len_query, -1]) if self.is_concat: result = layers.concat([query_input, result], axis=-1) - result = layers.dropout(self.fc(result), self.dropout) + result = layers.dropout( + self.fc(result), + self.dropout, + dropout_implementation='upscale_in_train') result = result + query_input result = self.layer_norm(result)