diff --git a/examples/fastspeech/config/fastspeech.yaml b/examples/fastspeech/configs/fastspeech.yaml similarity index 100% rename from examples/fastspeech/config/fastspeech.yaml rename to examples/fastspeech/configs/fastspeech.yaml diff --git a/examples/fastspeech/config/synthesis.yaml b/examples/fastspeech/configs/synthesis.yaml similarity index 88% rename from examples/fastspeech/config/synthesis.yaml rename to examples/fastspeech/configs/synthesis.yaml index 9a43dfff4e5aef6fadf2279c3406267292d7216c..ab9dbb48e9756b0d5b8ed4a00edf608ce1e7531a 100644 --- a/examples/fastspeech/config/synthesis.yaml +++ b/examples/fastspeech/configs/synthesis.yaml @@ -3,8 +3,8 @@ audio: n_fft: 2048 sr: 22050 preemphasis: 0.97 - hop_length: 275 - win_length: 1102 + hop_length: 256 + win_length: 1024 power: 1.2 min_level_db: -100 ref_level_db: 20 diff --git a/examples/fastspeech/parse.py b/examples/fastspeech/parse.py index 690f4b2e44ce646505e8c9c8031e2000faeba9d1..52068d3434e9385dae65746b4e2b7231f1fe8bae 100644 --- a/examples/fastspeech/parse.py +++ b/examples/fastspeech/parse.py @@ -52,6 +52,12 @@ def add_config_options_to_parser(parser): type=int, default=0, help="use data parallel or not during training.") + parser.add_argument( + '--alpha', + type=float, + default=1.0, + help="The hyperparameter that determines the length of the expanded \ + mel sequence, thereby controlling the voice speed.") parser.add_argument( '--data_path', diff --git a/examples/fastspeech/synthesis.py b/examples/fastspeech/synthesis.py index 802d4e4b9fa2e7f5ad4967e2acb62b527496784d..774a67fa032c656f4758097556405ade3fbcea2b 100644 --- a/examples/fastspeech/synthesis.py +++ b/examples/fastspeech/synthesis.py @@ -24,6 +24,7 @@ import paddle.fluid.dygraph as dg from parakeet.g2p.en import text_to_sequence from parakeet import audio from parakeet.models.fastspeech.fastspeech import FastSpeech +from parakeet.models.transformer_tts.utils import * def load_checkpoint(step, model_path): @@ -59,12 +60,26 @@ def synthesis(text_input, args): model.eval() text = np.asarray(text_to_sequence(text_input)) - text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) + text = np.expand_dims(text, axis=0) pos_text = np.arange(1, text.shape[1] + 1) - pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) + pos_text = np.expand_dims(pos_text, axis=0) + enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32) + enc_slf_attn_mask = get_attn_key_pad_mask(pos_text, + text).astype(np.float32) + + text = dg.to_variable(text) + pos_text = dg.to_variable(pos_text) + enc_non_pad_mask = dg.to_variable(enc_non_pad_mask) + enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask) mel_output, mel_output_postnet = model( - text, pos_text, alpha=args.alpha) + text, + pos_text, + alpha=args.alpha, + enc_non_pad_mask=enc_non_pad_mask, + enc_slf_attn_mask=enc_slf_attn_mask, + dec_non_pad_mask=None, + dec_slf_attn_mask=None) _ljspeech_processor = audio.AudioProcessor( sample_rate=cfg['audio']['sr'],
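Note: the mask helpers imported from parakeet.models.transformer_tts.utils above are now plain numpy functions, applied before anything is copied to the device. A minimal numpy sketch of their semantics (toy values, not the library code itself):

    import numpy as np

    text = np.array([[13, 7, 42, 0, 0]])  # (1, T); 0 is the padding id

    # get_non_pad_mask: 1.0 at real positions, 0.0 at padding, shape (B, T, 1)
    non_pad_mask = np.expand_dims((text != 0).astype(np.float32), axis=-1)

    # get_attn_key_pad_mask: additive mask broadcast over the query axis,
    # 0.0 where the key is real, a large negative number where it is padding
    key_mask = (text != 0).astype(np.float32)
    key_mask = np.expand_dims(key_mask, axis=1).repeat(text.shape[1], axis=1)
    attn_mask = (key_mask == 0).astype(np.float32) * (-2**32 + 1)

Since synthesis feeds a single unpadded utterance (pos_text counts from 1, so it contains no zeros), both masks are effectively all-pass here; they exist to satisfy the model's new forward signature.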
diff --git a/examples/fastspeech/train.py b/examples/fastspeech/train.py index f1b59a2ea2428e89a9c56b235cb648e5a761e8ab..7565ac950baa890c54741cbe770517d2f50113f8 100644 --- a/examples/fastspeech/train.py +++ b/examples/fastspeech/train.py @@ -21,6 +21,7 @@ from parse import add_config_options_to_parser from pprint import pprint from ruamel import yaml from tqdm import tqdm +from matplotlib import cm from collections import OrderedDict from tensorboardX import SummaryWriter import paddle.fluid.dygraph as dg @@ -66,12 +67,12 @@ def main(args): with dg.guard(place): with fluid.unique_name.guard(): - transformerTTS = TransformerTTS(cfg) + transformer_tts = TransformerTTS(cfg) model_dict, _ = load_checkpoint( str(args.transformer_step), os.path.join(args.transtts_path, "transformer")) - transformerTTS.set_dict(model_dict) - transformerTTS.eval() + transformer_tts.set_dict(model_dict) + transformer_tts.eval() model = FastSpeech(cfg) model.train() @@ -100,13 +101,33 @@ def main(args): for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d' % epoch) - character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data + (character, mel, mel_input, pos_text, pos_mel, text_length, + mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask, + enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data - _, _, attn_probs, _, _, _ = transformerTTS( - character, mel_input, pos_text, pos_mel) - alignment = dg.to_variable( - get_alignment(attn_probs, mel_lens, cfg[ - 'transformer_head'])).astype(np.float32) + _, _, attn_probs, _, _, _ = transformer_tts( + character, + mel_input, + pos_text, + pos_mel, + dec_slf_mask=dec_slf_mask, + enc_slf_mask=enc_slf_mask, + enc_query_mask=enc_query_mask, + enc_dec_mask=enc_dec_mask, + dec_query_slf_mask=dec_query_slf_mask, + dec_query_mask=dec_query_mask) + alignment, max_attn = get_alignment(attn_probs, mel_lens, + cfg['transformer_head']) + alignment = dg.to_variable(alignment).astype(np.float32) + + if local_rank == 0 and global_step % 5 == 1: + x = np.uint8( + cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255) + writer.add_image( + 'Attention_%d_0' % global_step, + x, + 0, + dataformats="HWC") global_step += 1 @@ -115,7 +136,11 @@ character, pos_text, mel_pos=pos_mel, - length_target=alignment) + length_target=alignment, + enc_non_pad_mask=enc_query_mask, + enc_slf_attn_mask=enc_slf_mask, + dec_non_pad_mask=dec_query_slf_mask, + dec_slf_attn_mask=dec_slf_mask) mel_output, mel_output_postnet, duration_predictor_output, _, _ = result mel_loss = layers.mse_loss(mel_output, mel) mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) diff --git a/examples/fastspeech/train.sh b/examples/fastspeech/train.sh index d293c0cd59b897b97143d8f0478c01877a2960a0..11e78c4e8449c7c24ac5b51394e0a37e20428319 100644 --- a/examples/fastspeech/train.sh +++ b/examples/fastspeech/train.sh @@ -1,6 +1,6 @@ # train model # if you wish to resume from an existing model, uncomment --checkpoint_path and --fastspeech_step -CUDA_VISIBLE_DEVICES=0\ +export CUDA_VISIBLE_DEVICES=0 python -u train.py \ --batch_size=32 \ --epochs=10000 \ diff --git a/examples/transformer_tts/config/synthesis.yaml b/examples/transformer_tts/configs/synthesis.yaml similarity index 72% rename from examples/transformer_tts/config/synthesis.yaml rename to examples/transformer_tts/configs/synthesis.yaml index 217dd8511667827497575a69f503b2ed7d08d273..c23b029354a2d69c1dda2f50953eddb74c2c4c67 100644 --- a/examples/transformer_tts/config/synthesis.yaml +++ b/examples/transformer_tts/configs/synthesis.yaml @@ -8,4 +8,7 @@ audio: power: 1.2 min_level_db: -100 ref_level_db: 20 - outputs_per_step: 1 \ No newline at end of file + outputs_per_step: 1 + +hidden_size: 256 +embedding_size: 512 \ No newline at end of file diff --git a/examples/transformer_tts/config/train_transformer.yaml b/examples/transformer_tts/configs/train_transformer.yaml similarity index 100% rename from examples/transformer_tts/config/train_transformer.yaml rename to examples/transformer_tts/configs/train_transformer.yaml diff --git a/examples/transformer_tts/config/train_vocoder.yaml
b/examples/transformer_tts/configs/train_vocoder.yaml similarity index 100% rename from examples/transformer_tts/config/train_vocoder.yaml rename to examples/transformer_tts/configs/train_vocoder.yaml diff --git a/examples/transformer_tts/data.py b/examples/transformer_tts/data.py index 99c6739329de9be22c1778b30a8d7353a7f0370c..f8e85452d375c69e217271c193a43c69b4abdf4b 100644 --- a/examples/transformer_tts/data.py +++ b/examples/transformer_tts/data.py @@ -23,7 +23,8 @@ from parakeet import audio from parakeet.data.sampler import * from parakeet.data.datacargo import DataCargo from parakeet.data.batch import TextIDBatcher, SpecBatcher -from parakeet.data.dataset import DatasetMixin, TransformDataset +from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset +from parakeet.models.transformer_tts.utils import * class LJSpeechLoader: @@ -40,6 +41,8 @@ class LJSpeechLoader: metadata = LJSpeechMetaData(LJSPEECH_ROOT) transformer = LJSpeech(config) dataset = TransformDataset(metadata, transformer) + dataset = CacheDataset(dataset) + sampler = DistributedSampler( len(metadata), nranks, rank, shuffle=shuffle) @@ -196,8 +199,18 @@ def batch_examples(batch): SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels) mel_inputs = np.transpose( SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels) + enc_slf_mask = get_attn_key_pad_mask(pos_texts, texts).astype(np.float32) + enc_query_mask = get_non_pad_mask(pos_texts).astype(np.float32) + dec_slf_mask = get_dec_attn_key_pad_mask(pos_mels, + mel_inputs).astype(np.float32) + enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0], + mel_inputs).astype(np.float32) + dec_query_slf_mask = get_non_pad_mask(pos_mels).astype(np.float32) + dec_query_mask = get_non_pad_mask(pos_mels).astype(np.float32) + return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), - np.array(mel_lens)) + np.array(mel_lens), enc_slf_mask, enc_query_mask, dec_slf_mask, + enc_dec_mask, dec_query_slf_mask, dec_query_mask) def batch_examples_vocoder(batch): diff --git a/examples/transformer_tts/synthesis.py b/examples/transformer_tts/synthesis.py index de833626980b25f759dbc383afd74754b7a08ea9..2896634feaa95a2e619da15aa675644564d99f45 100644 --- a/examples/transformer_tts/synthesis.py +++ b/examples/transformer_tts/synthesis.py @@ -16,6 +16,7 @@ from scipy.io.wavfile import write from parakeet.g2p.en import text_to_sequence import numpy as np from tqdm import tqdm +from matplotlib import cm from tensorboardX import SummaryWriter from ruamel import yaml import paddle.fluid as fluid @@ -25,6 +26,7 @@ import argparse from parse import add_config_options_to_parser from pprint import pprint from collections import OrderedDict +from parakeet.models.transformer_tts.utils import * from parakeet import audio from parakeet.models.transformer_tts.vocoder import Vocoder from parakeet.models.transformer_tts.transformer_tts import TransformerTTS @@ -78,14 +80,18 @@ def synthesis(text_input, args): pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) pbar = tqdm(range(args.max_len)) - for i in pbar: + dec_slf_mask = get_triu_tensor( + mel_input.numpy(), mel_input.numpy()).astype(np.float32) + dec_slf_mask = fluid.layers.cast( + dg.to_variable(dec_slf_mask == 0), np.float32) pos_mel = np.arange(1, mel_input.shape[1] + 1) pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0]) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( - text, mel_input, pos_text, pos_mel) + text, mel_input, pos_text,
pos_mel, dec_slf_mask) mel_input = fluid.layers.concat( [mel_input, postnet_pred[:, -1:, :]], axis=1) + mag_pred = model_vocoder(postnet_pred) _ljspeech_processor = audio.AudioProcessor( @@ -111,6 +117,33 @@ wav = _ljspeech_processor.inv_spectrogram( fluid.layers.transpose( fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy()) + global_step = 0 + for i, prob in enumerate(attn_probs): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j]) * 255) + writer.add_image( + 'Attention_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + + for i, prob in enumerate(attn_enc): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j]) * 255) + writer.add_image( + 'Attention_enc_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") + + for i, prob in enumerate(attn_dec): + for j in range(4): + x = np.uint8(cm.viridis(prob.numpy()[j]) * 255) + writer.add_image( + 'Attention_dec_%d_0' % global_step, + x, + i * 4 + j, + dataformats="HWC") writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) if not os.path.exists(args.sample_path): os.mkdir(args.sample_path) @@ -124,4 +157,6 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description="Synthesis model") add_config_options_to_parser(parser) args = parser.parse_args() - synthesis("Transformer model is so fast!", args) + synthesis( + "They emphasized the necessity that the information now being furnished be handled with judgment and care.", + args) diff --git a/examples/transformer_tts/synthesis.sh b/examples/transformer_tts/synthesis.sh index 8cb137ac25e94f876c9ebd0a08708a259f0406b6..42b704da2477a47ba7bb8042e620c32197bc7000 100644 --- a/examples/transformer_tts/synthesis.sh +++ b/examples/transformer_tts/synthesis.sh @@ -2,10 +2,10 @@ # train model CUDA_VISIBLE_DEVICES=0 \ python -u synthesis.py \ ---max_len=50 \ +--max_len=600 \ --transformer_step=160000 \ ---vocoder_step=70000 \ +--vocoder_step=90000 \ ---use_gpu=1 +--use_gpu=1 \ --checkpoint_path='./checkpoint' \ --log_dir='./log' \ --sample_path='./sample' \
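Note: the synthesis path above builds its decoder self-attention mask with get_triu_tensor rather than from batch padding. A hedged numpy sketch of the intended causal pattern, assuming get_triu_tensor returns ones strictly above the diagonal for the given query length:

    import numpy as np

    def causal_mask(length):
        # 1.0 strictly above the diagonal marks future positions
        future = np.triu(np.ones((length, length), dtype=np.float32), k=1)
        # the cast(dec_slf_mask == 0) step in synthesis.py then flips this
        # into 1.0 where attention is allowed and 0.0 at future positions
        return (future == 0).astype(np.float32)

    causal_mask(4)  # row t admits keys 0..t only

The mask is rebuilt from the current mel_input, so it grows as predicted postnet frames are appended.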
diff --git a/examples/transformer_tts/train_transformer.py b/examples/transformer_tts/train_transformer.py index f3dd0231b052d1f837eb4fbd9e7b3b4efda70f79..b63fafc3818f3a3bae489b0b39b5432821792376 100644 --- a/examples/transformer_tts/train_transformer.py +++ b/examples/transformer_tts/train_transformer.py @@ -14,7 +14,7 @@ import os from tqdm import tqdm from tensorboardX import SummaryWriter -from pathlib import Path +#from pathlib import Path from collections import OrderedDict import argparse from parse import add_config_options_to_parser @@ -89,21 +89,31 @@ def main(args): pbar = tqdm(reader) for i, data in enumerate(pbar): pbar.set_description('Processing at epoch %d' % epoch) - character, mel, mel_input, pos_text, pos_mel, text_length, _ = data + character, mel, mel_input, pos_text, pos_mel, text_length, _, enc_slf_mask, enc_query_mask, dec_slf_mask, enc_dec_mask, dec_query_slf_mask, dec_query_mask = data global_step += 1 - mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( - character, mel_input, pos_text, pos_mel) - label = (pos_mel == 0).astype(np.float32) + mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( + character, + mel_input, + pos_text, + pos_mel, + dec_slf_mask=dec_slf_mask, + enc_slf_mask=enc_slf_mask, + enc_query_mask=enc_query_mask, + enc_dec_mask=enc_dec_mask, + dec_query_slf_mask=dec_query_slf_mask, + dec_query_mask=dec_query_mask) mel_loss = layers.mean( layers.abs(layers.elementwise_sub(mel_pred, mel))) post_mel_loss = layers.mean( layers.abs(layers.elementwise_sub(postnet_pred, mel))) loss = mel_loss + post_mel_loss + # Note: learning did not converge when the stop token loss was used. if args.stop_token: + label = (pos_mel == 0).astype(np.float32) stop_loss = cross_entropy(stop_preds, label) loss = loss + stop_loss diff --git a/examples/transformer_tts/train_transformer.sh b/examples/transformer_tts/train_transformer.sh index cdb24cfb39fa149980e12701b4e7304d509cbc40..346d3512fefab0e80238684f454f55e35b5b3b7c 100644 --- a/examples/transformer_tts/train_transformer.sh +++ b/examples/transformer_tts/train_transformer.sh @@ -1,7 +1,7 @@ # train model # if you wish to resume from an existing model, uncomment --checkpoint_path and --transformer_step -CUDA_VISIBLE_DEVICES=0 \ +export CUDA_VISIBLE_DEVICES=2 python -u train_transformer.py \ --batch_size=32 \ --epochs=10000 \ diff --git a/parakeet/data/dataset.py b/parakeet/data/dataset.py index 6ab4ebb32c1b915ee5cc520c7a841b90aeac3515..4aefde55d418e898816200eed0ad5c25357133de 100644 --- a/parakeet/data/dataset.py +++ b/parakeet/data/dataset.py @@ -14,6 +14,7 @@ import six import numpy as np +from tqdm import tqdm class DatasetMixin(object): diff --git a/parakeet/models/fastspeech/decoder.py b/parakeet/models/fastspeech/decoder.py index 46eb391ba0ae76f29267e080d6457921a36aa1c1..8432fc5ba7f21ab9e3b3e7f18a5168fcb41f5d16 100644 --- a/parakeet/models/fastspeech/decoder.py +++ b/parakeet/models/fastspeech/decoder.py @@ -32,6 +32,7 @@ class Decoder(dg.Layer): super(Decoder, self).__init__() n_position = len_max_seq + 1 + self.n_head = n_head self.pos_inp = get_sinusoid_encoding_table( n_position, d_model, padding_idx=0) self.position_enc = dg.Embedding( @@ -55,7 +56,7 @@ for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) - def forward(self, enc_seq, enc_pos): + def forward(self, enc_seq, enc_pos, non_pad_mask, slf_attn_mask=None): """ Decoder layer of FastSpeech. @@ -69,10 +70,7 @@ dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
""" dec_slf_attn_list = [] - - # -- Prepare masks - slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos) - non_pad_mask = get_non_pad_mask(enc_pos) + slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1]) # -- Forward dec_output = enc_seq + self.position_enc(enc_pos) diff --git a/parakeet/models/fastspeech/encoder.py b/parakeet/models/fastspeech/encoder.py index 15c8d60e6e016fc1955111f4702aa1f4e2c478e2..15d634eca1aa96a8c0af2b9eac40424cf9c23d7e 100644 --- a/parakeet/models/fastspeech/encoder.py +++ b/parakeet/models/fastspeech/encoder.py @@ -32,14 +32,17 @@ class Encoder(dg.Layer): dropout=0.1): super(Encoder, self).__init__() n_position = len_max_seq + 1 + self.n_head = n_head self.src_word_emb = dg.Embedding( - size=[n_src_vocab, d_model], padding_idx=0) + size=[n_src_vocab, d_model], + padding_idx=0, + param_attr=fluid.initializer.Normal( + loc=0.0, scale=1.0)) self.pos_inp = get_sinusoid_encoding_table( n_position, d_model, padding_idx=0) self.position_enc = dg.Embedding( size=[n_position, d_model], - padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( self.pos_inp), @@ -58,7 +61,7 @@ class Encoder(dg.Layer): for i, layer in enumerate(self.layer_stack): self.add_sublayer('fft_{}'.format(i), layer) - def forward(self, character, text_pos): + def forward(self, character, text_pos, non_pad_mask, slf_attn_mask=None): """ Encoder layer of FastSpeech. @@ -74,10 +77,7 @@ class Encoder(dg.Layer): enc_slf_attn_list (list), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list. """ enc_slf_attn_list = [] - # -- prepare masks - # shape character (N, T) - slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character) - non_pad_mask = get_non_pad_mask(character) + slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1]) # -- Forward enc_output = self.src_word_emb(character) + self.position_enc( @@ -90,4 +90,4 @@ class Encoder(dg.Layer): slf_attn_mask=slf_attn_mask) enc_slf_attn_list += [enc_slf_attn] - return enc_output, non_pad_mask, enc_slf_attn_list + return enc_output, enc_slf_attn_list diff --git a/parakeet/models/fastspeech/fastspeech.py b/parakeet/models/fastspeech/fastspeech.py index 91478af59d67db3565982e17e49cfadb6249e386..a37d5fac06dce8379738f95781c85abcdaa241a4 100644 --- a/parakeet/models/fastspeech/fastspeech.py +++ b/parakeet/models/fastspeech/fastspeech.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import math +import numpy as np import paddle.fluid.dygraph as dg import paddle.fluid as fluid from parakeet.g2p.text.symbols import symbols +from parakeet.models.transformer_tts.utils import * from parakeet.models.transformer_tts.post_convnet import PostConvNet from parakeet.models.fastspeech.length_regulator import LengthRegulator from parakeet.models.fastspeech.encoder import Encoder @@ -78,6 +80,10 @@ class FastSpeech(dg.Layer): def forward(self, character, text_pos, + enc_non_pad_mask, + dec_non_pad_mask, + enc_slf_attn_mask=None, + dec_slf_attn_mask=None, mel_pos=None, length_target=None, alpha=1.0): @@ -106,14 +112,20 @@ class FastSpeech(dg.Layer): dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. 
""" - encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder( - character, text_pos) + encoder_output, enc_slf_attn_list = self.encoder( + character, + text_pos, + enc_non_pad_mask, + slf_attn_mask=enc_slf_attn_mask) if fluid.framework._dygraph_tracer()._train_mode: length_regulator_output, duration_predictor_output = self.length_regulator( encoder_output, target=length_target, alpha=alpha) decoder_output, dec_slf_attn_list = self.decoder( - length_regulator_output, mel_pos) + length_regulator_output, + mel_pos, + dec_non_pad_mask, + slf_attn_mask=dec_slf_attn_mask) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output @@ -122,8 +134,18 @@ class FastSpeech(dg.Layer): else: length_regulator_output, decoder_pos = self.length_regulator( encoder_output, alpha=alpha) - decoder_output, _ = self.decoder(length_regulator_output, - decoder_pos) + slf_attn_mask = get_triu_tensor( + decoder_pos.numpy(), decoder_pos.numpy()).astype(np.float32) + slf_attn_mask = fluid.layers.cast( + dg.to_variable(slf_attn_mask == 0), np.float32) + slf_attn_mask = dg.to_variable(slf_attn_mask) + dec_non_pad_mask = fluid.layers.unsqueeze( + (decoder_pos != 0).astype(np.float32), [-1]) + decoder_output, _ = self.decoder( + length_regulator_output, + decoder_pos, + dec_non_pad_mask, + slf_attn_mask=slf_attn_mask) mel_output = self.mel_linear(decoder_output) mel_output_postnet = self.postnet(mel_output) + mel_output diff --git a/parakeet/models/fastspeech/fft_block.py b/parakeet/models/fastspeech/fft_block.py index f50f11a189d8194bf2bee5c9b0115d556753bbcb..0c0ed4fda024735691fc6c4ddf39ef29ffeb4f4a 100644 --- a/parakeet/models/fastspeech/fft_block.py +++ b/parakeet/models/fastspeech/fft_block.py @@ -46,7 +46,7 @@ class FFTBlock(dg.Layer): padding=padding, dropout=dropout) - def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): + def forward(self, enc_input, non_pad_mask, slf_attn_mask=None): """ Feed Forward Transformer block in FastSpeech. 
@@ -63,6 +63,7 @@ class FFTBlock(dg.Layer): """ output, slf_attn = self.slf_attn( enc_input, enc_input, enc_input, mask=slf_attn_mask) + output *= non_pad_mask output = self.pos_ffn(output) diff --git a/parakeet/models/fastspeech/length_regulator.py b/parakeet/models/fastspeech/length_regulator.py index 331597ab663de4ea5c66e2b2522d64bc87149a78..f6bc8037f032004f54bc4791cfce9b6611685f49 100644 --- a/parakeet/models/fastspeech/length_regulator.py +++ b/parakeet/models/fastspeech/length_regulator.py @@ -146,11 +146,17 @@ class DurationPredictor(dg.Layer): out = layers.transpose(encoder_output, [0, 2, 1]) out = self.conv1(out) out = layers.transpose(out, [0, 2, 1]) - out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout) + out = layers.dropout( + layers.relu(self.layer_norm1(out)), + self.dropout, + dropout_implementation='upscale_in_train') out = layers.transpose(out, [0, 2, 1]) out = self.conv2(out) out = layers.transpose(out, [0, 2, 1]) - out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout) + out = layers.dropout( + layers.relu(self.layer_norm2(out)), + self.dropout, + dropout_implementation='upscale_in_train') out = layers.relu(self.linear(out)) out = layers.squeeze(out, axes=[-1]) diff --git a/parakeet/models/fastspeech/utils.py b/parakeet/models/fastspeech/utils.py index 5e680f08b618b120e10ead6165c2557772f073e7..cfd6d47cb5005412ed3044a82bd032c811411f1e 100644 --- a/parakeet/models/fastspeech/utils.py +++ b/parakeet/models/fastspeech/utils.py @@ -18,7 +18,6 @@ def get_alignment(attn_probs, mel_lens, n_head): max_F = 0 assert attn_probs[0].shape[0] % n_head == 0 batch_size = int(attn_probs[0].shape[0] // n_head) - #max_attn = attn_probs[0].numpy()[0,batch_size] for i in range(len(attn_probs)): multi_attn = attn_probs[i].numpy() for j in range(n_head): @@ -28,7 +27,7 @@ def get_alignment(attn_probs, mel_lens, n_head): max_F = F max_attn = attn alignment = compute_duration(max_attn, mel_lens) - return alignment + return alignment, max_attn def score_F(attn): diff --git a/parakeet/models/transformer_tts/decoder.py b/parakeet/models/transformer_tts/decoder.py index 3d7adf15b878c3a3a75af80039a74e13d33e06f1..5b17a7a2f7674e5c76ba8776a8a6aa015f029d12 100644 --- a/parakeet/models/transformer_tts/decoder.py +++ b/parakeet/models/transformer_tts/decoder.py @@ -14,7 +14,7 @@ import math import paddle.fluid.dygraph as dg import paddle.fluid as fluid -from parakeet.modules.utils import * +from parakeet.models.transformer_tts.utils import * from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.models.transformer_tts.prenet import PreNet @@ -25,6 +25,7 @@ class Decoder(dg.Layer): def __init__(self, num_hidden, config, num_head=4): super(Decoder, self).__init__() self.num_hidden = num_hidden + self.num_head = num_head param = fluid.ParamAttr() self.alpha = self.create_parameter( shape=(1, ), @@ -98,30 +99,29 @@ class Decoder(dg.Layer): outputs_per_step=config['audio']['outputs_per_step'], use_cudnn=True) - def forward(self, key, value, query, c_mask, positional): + def forward(self, + key, + value, + query, + positional, + mask, + m_mask=None, + m_self_mask=None, + zero_mask=None): # get decoder mask with triangular matrix if fluid.framework._dygraph_tracer()._train_mode: - m_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask((positional == 0).astype(np.float32), - query) - triu_tensor = dg.to_variable( - get_triu_tensor(query.numpy(), query.numpy())).astype( - np.float32) - 
mask = mask + triu_tensor - mask = fluid.layers.cast(mask == 0, np.float32) - - # (batch_size, decoder_len, encoder_len) - zero_mask = get_attn_key_pad_mask( - layers.squeeze(c_mask, [-1]), query) + m_mask = layers.expand(m_mask, [self.num_head, 1, key.shape[1]]) + m_self_mask = layers.expand(m_self_mask, + [self.num_head, 1, query.shape[1]]) + mask = layers.expand(mask, [self.num_head, 1, 1]) + zero_mask = layers.expand(zero_mask, [self.num_head, 1, 1]) + else: - mask = get_triu_tensor(query.numpy(), - query.numpy()).astype(np.float32) - mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) - m_mask, zero_mask = None, None + m_mask, m_self_mask, zero_mask = None, None, None - # Decoder pre-network +# Decoder pre-network query = self.decoder_prenet(query) # Centered position @@ -132,7 +132,8 @@ class Decoder(dg.Layer): query = positional * self.alpha + query #positional dropout - query = fluid.layers.dropout(query, 0.1) + query = fluid.layers.dropout( + query, 0.1, dropout_implementation='upscale_in_train') # Attention decoder-decoder, encoder-decoder selfattn_list = list() @@ -141,12 +142,13 @@ class Decoder(dg.Layer): for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): query, attn_dec = selfattn( - query, query, query, mask=mask, query_mask=m_mask) + query, query, query, mask=mask, query_mask=m_self_mask) query, attn_dot = attn( key, value, query, mask=zero_mask, query_mask=m_mask) query = ffn(query) selfattn_list.append(attn_dec) attn_list.append(attn_dot) + # Mel linear projection mel_out = self.mel_linear(query) # Post Mel Network diff --git a/parakeet/models/transformer_tts/encoder.py b/parakeet/models/transformer_tts/encoder.py index 548ea8e4640f317b29de486b1d58f710d042d852..ef3821ff1667cf0029ac9c5f077b0ffe95a6c70d 100644 --- a/parakeet/models/transformer_tts/encoder.py +++ b/parakeet/models/transformer_tts/encoder.py @@ -23,6 +23,7 @@ class Encoder(dg.Layer): def __init__(self, embedding_size, num_hidden, num_head=4): super(Encoder, self).__init__() self.num_hidden = num_hidden + self.num_head = num_head param = fluid.ParamAttr(initializer=fluid.initializer.Constant( value=1.0)) self.alpha = self.create_parameter( @@ -31,7 +32,6 @@ class Encoder(dg.Layer): 1024, self.num_hidden, padding_idx=0) self.pos_emb = dg.Embedding( size=[1024, num_hidden], - padding_idx=0, param_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( self.pos_inp), @@ -56,13 +56,15 @@ class Encoder(dg.Layer): for i, layer in enumerate(self.ffns): self.add_sublayer("ffns_{}".format(i), layer) - def forward(self, x, positional): + def forward(self, x, positional, mask=None, query_mask=None): + if fluid.framework._dygraph_tracer()._train_mode: - query_mask = get_non_pad_mask(positional) - mask = get_attn_key_pad_mask(positional, x) + seq_len_key = x.shape[1] + query_mask = layers.expand(query_mask, + [self.num_head, 1, seq_len_key]) + mask = layers.expand(mask, [self.num_head, 1, 1]) else: query_mask, mask = None, None - # Encoder pre_network x = self.encoder_prenet(x) #(N,T,C) @@ -72,7 +74,7 @@ class Encoder(dg.Layer): x = positional * self.alpha + x #(N, T, C) # Positional dropout - x = layers.dropout(x, 0.1) + x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train') # Self attention encoder attentions = list() @@ -81,4 +83,4 @@ class Encoder(dg.Layer): x = ffn(x) attentions.append(attention) - return x, query_mask, attentions + return x, attentions diff --git a/parakeet/models/transformer_tts/encoderprenet.py 
b/parakeet/models/transformer_tts/encoderprenet.py index d7014240eb8066cee18a890f4f6d509d3d4a09f7..e953dab062c80d4e9218612981e65030a5fc0270 100644 --- a/parakeet/models/transformer_tts/encoderprenet.py +++ b/parakeet/models/transformer_tts/encoderprenet.py @@ -27,7 +27,10 @@ class EncoderPrenet(dg.Layer): self.num_hidden = num_hidden self.use_cudnn = use_cudnn self.embedding = dg.Embedding( - size=[len(symbols), embedding_size], padding_idx=None) + size=[len(symbols), embedding_size], + padding_idx=0, + param_attr=fluid.initializer.Normal( + loc=0.0, scale=1.0)) self.conv_list = [] k = math.sqrt(1 / embedding_size) self.conv_list.append( @@ -78,10 +81,14 @@ low=-k, high=k))) def forward(self, x): + x = self.embedding(x) #(batch_size, seq_len, embedding_size) x = layers.transpose(x, [0, 2, 1]) for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): - x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) + x = layers.dropout( + layers.relu(batch_norm(conv(x))), + 0.2, + dropout_implementation='upscale_in_train') x = layers.transpose(x, [0, 2, 1]) #(N,T,C) x = self.projection(x) diff --git a/parakeet/models/transformer_tts/post_convnet.py b/parakeet/models/transformer_tts/post_convnet.py index 8882e79687e0308633132737237f74b560920fd8..60e93824a501f3a9003ac6c89aaa7ce90ccf52da 100644 --- a/parakeet/models/transformer_tts/post_convnet.py +++ b/parakeet/models/transformer_tts/post_convnet.py @@ -108,11 +108,16 @@ class PostConvNet(dg.Layer): conv = self.conv_list[i] input = layers.dropout( - layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout) + layers.tanh(batch_norm(conv(input)[:, :, :len])), + self.dropout, + dropout_implementation='upscale_in_train') conv = self.conv_list[self.num_conv - 1] input = conv(input)[:, :, :len] if self.batchnorm_last: batch_norm = self.batch_norm_list[self.num_conv - 1] - input = layers.dropout(batch_norm(input), self.dropout) + input = layers.dropout( + batch_norm(input), + self.dropout, + dropout_implementation='upscale_in_train') output = layers.transpose(input, [0, 2, 1]) return output
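Note: every dropout call in this change now passes dropout_implementation='upscale_in_train'. Paddle's default ('downgrade_in_infer') leaves activations unscaled during training and multiplies them by (1 - p) at inference; 'upscale_in_train' instead rescales the kept units by 1/(1 - p) during training, so inference becomes a plain identity and activation magnitudes match between the two modes. A numpy sketch of the behaviour:

    import numpy as np

    def dropout_upscale_in_train(x, p, training, rng=np.random):
        # inference: identity, no (1 - p) rescaling required
        if not training:
            return x
        # training: zero units with probability p, upscale survivors by 1/(1 - p)
        keep = (rng.random_sample(x.shape) >= p).astype(x.dtype)
        return x * keep / (1.0 - p)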
""" - x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate) - x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate) + x = layers.dropout( + layers.relu(self.linear1(x)), + self.dropout_rate, + dropout_implementation='upscale_in_train') + x = layers.dropout( + layers.relu(self.linear2(x)), + self.dropout_rate, + dropout_implementation='upscale_in_train') return x diff --git a/parakeet/models/transformer_tts/transformer_tts.py b/parakeet/models/transformer_tts/transformer_tts.py index 1205c6b939efe48bca523824ae3aa3ce25894cce..a7fffbd38b04f17bb2b5392d1f4cb83183be3d6d 100644 --- a/parakeet/models/transformer_tts/transformer_tts.py +++ b/parakeet/models/transformer_tts/transformer_tts.py @@ -24,11 +24,29 @@ class TransformerTTS(dg.Layer): self.decoder = Decoder(config['hidden_size'], config) self.config = config - def forward(self, characters, mel_input, pos_text, pos_mel): - - key, c_mask, attns_enc = self.encoder(characters, pos_text) + def forward(self, + characters, + mel_input, + pos_text, + pos_mel, + dec_slf_mask, + enc_slf_mask=None, + enc_query_mask=None, + enc_dec_mask=None, + dec_query_slf_mask=None, + dec_query_mask=None): + key, attns_enc = self.encoder( + characters, pos_text, mask=enc_slf_mask, query_mask=enc_query_mask) mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder( - key, key, mel_input, c_mask, pos_mel) + key, + key, + mel_input, + pos_mel, + mask=dec_slf_mask, + zero_mask=enc_dec_mask, + m_self_mask=dec_query_slf_mask, + m_mask=dec_query_mask) + return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec diff --git a/parakeet/models/transformer_tts/utils.py b/parakeet/models/transformer_tts/utils.py index 22127446b463a9fd1f2407f29ceca6f2639ac2cc..4b525272ecaf1f1e5e55b4cfc05f55ff0a37ac3c 100644 --- a/parakeet/models/transformer_tts/utils.py +++ b/parakeet/models/transformer_tts/utils.py @@ -51,7 +51,9 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): def get_non_pad_mask(seq): - return layers.unsqueeze((seq != 0).astype(np.float32), [-1]) + mask = (seq != 0).astype(np.float32) + mask = np.expand_dims(mask, axis=-1) + return mask def get_attn_key_pad_mask(seq_k, seq_q): @@ -60,8 +62,22 @@ def get_attn_key_pad_mask(seq_k, seq_q): # Expand to fit the shape of key query attention matrix. len_q = seq_q.shape[1] padding_mask = (seq_k != 0).astype(np.float32) - padding_mask = layers.expand( - layers.unsqueeze(padding_mask, [1]), [1, len_q, 1]) + padding_mask = np.expand_dims(padding_mask, axis=1) + padding_mask = padding_mask.repeat([len_q], axis=1) + padding_mask = (padding_mask == 0).astype(np.float32) * (-2**32 + 1) + return padding_mask + + +def get_dec_attn_key_pad_mask(seq_k, seq_q): + ''' For masking out the padding part of key sequence. ''' + + # Expand to fit the shape of key query attention matrix. 
+ len_q = seq_q.shape[1] + padding_mask = (seq_k == 0).astype(np.float32) + padding_mask = np.expand_dims(padding_mask, axis=1) + triu_tensor = get_triu_tensor(seq_q, seq_q) + padding_mask = padding_mask.repeat([len_q], axis=1) + triu_tensor + padding_mask = (padding_mask != 0).astype(np.float32) * (-2**32 + 1) return padding_mask diff --git a/parakeet/modules/dynamic_gru.py b/parakeet/modules/dynamic_gru.py index 3a6602e3dddf2a18c69d0b9741ec2d6b3b5fe5e7..9e55688fedf3adb45dce67750ec9cd98ebf46cc0 100644 --- a/parakeet/modules/dynamic_gru.py +++ b/parakeet/modules/dynamic_gru.py @@ -53,11 +53,9 @@ class DynamicGRU(dg.Layer): if self.is_reverse: i = inputs.shape[1] - 1 - i input_ = inputs[:, i:i + 1, :] - input_ = layers.reshape( - input_, [-1, input_.shape[2]], inplace=False) + input_ = layers.reshape(input_, [-1, input_.shape[2]]) hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False) + hidden_ = layers.reshape(hidden, [-1, 1, hidden.shape[1]]) res.append(hidden_) if self.is_reverse: res = res[::-1] diff --git a/parakeet/modules/ffn.py b/parakeet/modules/ffn.py index 3fa8c16e9e97868d6df27f3b2fb3ff8b21d909be..fe39d3cec8a721191180cb31e919033f6dd935a8 100644 --- a/parakeet/modules/ffn.py +++ b/parakeet/modules/ffn.py @@ -71,7 +71,8 @@ class PositionwiseFeedForward(dg.Layer): x = self.w_2(layers.relu(self.w_1(x))) # dropout - x = layers.dropout(x, self.dropout) + x = layers.dropout( + x, self.dropout, dropout_implementation='upscale_in_train') x = layers.transpose(x, [0, 2, 1]) # residual connection diff --git a/parakeet/modules/modules.py b/parakeet/modules/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..72a8d2dfefb26ad67a269b32feb73cdf2d7ecba6 --- /dev/null +++ b/parakeet/modules/modules.py @@ -0,0 +1,610 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import fluid +import paddle.fluid.dygraph as dg + +import numpy as np + +from . import conv +from . 
import weight_norm + + +def FC(name_scope, + in_features, + size, + num_flatten_dims=1, + relu=False, + dropout=0.0, + epsilon=1e-30, + act=None, + is_test=False, + dtype="float32"): + """ + A special Linear layer; when it is used with dropout, the weight is + initialized as normal(0, std=np.sqrt((1-dropout) / in_features)). + """ + + # stds + if isinstance(in_features, int): + in_features = [in_features] + + stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features] + if relu: + stds = [std * np.sqrt(2.0) for std in stds] + + weight_inits = [ + fluid.initializer.NormalInitializer(scale=std) for std in stds + ] + bias_init = fluid.initializer.ConstantInitializer(0.0) + + # param attrs + weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits] + bias_attr = fluid.ParamAttr(initializer=bias_init) + + layer = weight_norm.FC(name_scope, + size, + num_flatten_dims=num_flatten_dims, + param_attr=weight_attrs, + bias_attr=bias_attr, + act=act, + dtype=dtype) + return layer + + +def Conv1D(name_scope, + in_channels, + num_filters, + filter_size=3, + dilation=1, + groups=None, + causal=False, + std_mul=1.0, + dropout=0.0, + use_cudnn=True, + act=None, + dtype="float32"): + """ + A special Conv1D layer; when it is used with dropout, the weight is + initialized as + normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features))). + """ + # std + std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels)) + weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std) + bias_init = fluid.initializer.ConstantInitializer(0.0) + + # param attrs + weight_attr = fluid.ParamAttr(initializer=weight_init) + bias_attr = fluid.ParamAttr(initializer=bias_init) + + layer = conv.Conv1D( + name_scope, + in_channels, + num_filters, + filter_size, + dilation, + groups=groups, + causal=causal, + param_attr=weight_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + return layer + + +def Embedding(name_scope, + num_embeddings, + embed_dim, + is_sparse=False, + is_distributed=False, + padding_idx=None, + std=0.01, + dtype="float32"): + # param attrs + weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=std)) + layer = dg.Embedding( + name_scope, (num_embeddings, embed_dim), + padding_idx=padding_idx, + param_attr=weight_attr, + dtype=dtype) + return layer
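The FC and Conv1D factories above share a variance-scaling initialization: the weight std shrinks with fan-in and with the dropout applied to the layer's input, and widens by sqrt(2) when a ReLU follows. A small sketch of the rule, mirroring the code above:

    import numpy as np

    def fc_weight_std(in_features, dropout=0.0, relu=False):
        # normal(0, sqrt((1 - dropout) / in_features)); variance doubled for ReLU
        std = np.sqrt((1.0 - dropout) / in_features)
        return std * np.sqrt(2.0) if relu else std

    def conv1d_weight_std(in_channels, filter_size, std_mul=1.0, dropout=0.0):
        # normal(0, sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)))
        return np.sqrt(std_mul * (1.0 - dropout) / (filter_size * in_channels))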
+ """ + + def __init__(self, + name_scope, + n_speakers, + speaker_dim, + in_channels, + num_filters, + filter_size, + dilation, + std_mul=4.0, + dropout=0.0, + causal=False, + residual=True, + dtype="float32"): + super(Conv1DGLU, self).__init__(name_scope, dtype=dtype) + + # conv spec + self.in_channels = in_channels + self.n_speakers = n_speakers + self.speaker_dim = speaker_dim + self.num_filters = num_filters + self.filter_size = filter_size + self.dilation = dilation + self.causal = causal + self.residual = residual + + # weight init and dropout + self.std_mul = std_mul + self.dropout = dropout + + if residual: + assert ( + in_channels == num_filters + ), "this block uses residual connection"\ + "the input_channes should equals num_filters" + + self.conv = Conv1D( + self.full_name(), + in_channels, + 2 * num_filters, + filter_size, + dilation, + causal=causal, + std_mul=std_mul, + dropout=dropout, + dtype=dtype) + + if n_speakers > 1: + assert (speaker_dim is not None + ), "speaker embed should not be null in multi-speaker case" + self.fc = Conv1D( + self.full_name(), + speaker_dim, + num_filters, + filter_size=1, + dilation=1, + causal=False, + act="softsign", + dtype=dtype) + + def forward(self, x, speaker_embed_bc1t=None): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU + layer, where B means batch_size, C_in means the input channels + T means input time steps. + speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded + speaker embed, where C_sp means speaker embedding size. Note + that when using residual connection, the Conv1DGLU does not + change the number of channels, so out channels equals input + channels. + + Returns: + x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where + C_out means the output channels of Conv1DGLU. 
+ """ + + residual = x + x = fluid.layers.dropout(x, self.dropout) + x = self.conv(x) + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + if speaker_embed_bc1t is not None: + sp = self.fc(speaker_embed_bc1t) + content = content + sp + + # glu + x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) + + if self.residual: + x = fluid.layers.scale(x + residual, np.sqrt(0.5)) + return x + + def add_input(self, x, speaker_embed_bc11=None): + """ + Inputs: + x: shape(B, num_filters, 1, time_steps) + speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps) + + Outputs: + out: shape(B, num_filters, 1, time_steps), where time_steps = 1 + """ + + residual = x + + # add step input and produce step output + x = fluid.layers.dropout(x, self.dropout) + x = self.conv.add_input(x) + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + if speaker_embed_bc11 is not None: + sp = self.fc(speaker_embed_bc11) + content = content + sp + + x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) + + if self.residual: + x = fluid.layers.scale(x + residual, np.sqrt(0.5)) + return x + + +def Conv1DTranspose(name_scope, + in_channels, + num_filters, + filter_size, + padding=0, + stride=1, + dilation=1, + groups=None, + std_mul=1.0, + dropout=0.0, + use_cudnn=True, + act=None, + dtype="float32"): + std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size)) + weight_init = fluid.initializer.NormalInitializer(scale=std) + weight_attr = fluid.ParamAttr(initializer=weight_init) + bias_init = fluid.initializer.ConstantInitializer(0.0) + bias_attr = fluid.ParamAttr(initializer=bias_init) + layer = conv.Conv1DTranspose( + name_scope, + in_channels, + num_filters, + filter_size, + padding=padding, + stride=stride, + dilation=dilation, + groups=groups, + param_attr=weight_attr, + bias_attr=bias_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + return layer + + +def compute_position_embedding(rad): + # rad is a transposed radius, shape(embed_dim, n_vocab) + embed_dim, n_vocab = rad.shape + + even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32")) + odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32")) + + even_rads = fluid.layers.gather(rad, even_dims) + odd_rads = fluid.layers.gather(rad, odd_dims) + + sines = fluid.layers.sin(even_rads) + cosines = fluid.layers.cos(odd_rads) + + temp = fluid.layers.scatter(rad, even_dims, sines) + out = fluid.layers.scatter(temp, odd_dims, cosines) + out = fluid.layers.transpose(out, perm=[1, 0]) + return out + + +def position_encoding_init(n_position, + d_pos_vec, + position_rate=1.0, + sinusoidal=True): + """ Init the sinusoid position encoding table """ + + # keep idx 0 for padding token position encoding zero vector + position_enc = np.array([[ + position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) + for i in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + + if sinusoidal: + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + + return position_enc + + +class PositionEmbedding(dg.Layer): + def __init__(self, + name_scope, + n_position, + d_pos_vec, + position_rate=1.0, + is_sparse=False, + is_distributed=False, + param_attr=None, + max_norm=None, + padding_idx=None, + dtype="float32"): + super(PositionEmbedding, self).__init__(name_scope, dtype=dtype) + self.embed = dg.Embedding( + self.full_name(), + size=(n_position, d_pos_vec), + 
is_sparse=is_sparse, + is_distributed=is_distributed, + padding_idx=None, + param_attr=param_attr, + dtype=dtype) + self.set_weight( + position_encoding_init( + n_position, + d_pos_vec, + position_rate=position_rate, + sinusoidal=False).astype(dtype)) + + self._is_sparse = is_sparse + self._is_distributed = is_distributed + self._remote_prefetch = self._is_sparse and (not self._is_distributed) + if self._remote_prefetch: + assert self._is_sparse is True and self._is_distributed is False + + self._padding_idx = (-1 if padding_idx is None else padding_idx if + padding_idx >= 0 else (n_position + padding_idx)) + self._position_rate = position_rate + self._max_norm = max_norm + self._dtype = dtype + + def set_weight(self, array): + assert self.embed._w.shape == list(array.shape), "shape does not match" + self.embed._w._ivar.value().get_tensor().set( + array, fluid.framework._current_expected_place()) + + def forward(self, indices, speaker_position_rate=None): + """ + Args: + indices (Variable): Shape (B, T, 1), dtype: int64, position + indices, where B means the batch size, T means the time steps. + speaker_position_rate (Variable | float, optional), position + rate. It can be a float point number or a Variable with + shape (1,), then this speaker_position_rate is used for every + example. It can also be a Variable with shape (B, 1), which + contains a speaker position rate for each speaker. + Returns: + out (Variable): Shape(B, C_pos), position embedding, where C_pos + means position embedding size. + """ + rad = fluid.layers.transpose(self.embed._w, perm=[1, 0]) + batch_size = indices.shape[0] + + if speaker_position_rate is None: + weight = compute_position_embedding(rad) + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="lookup_table", + inputs={"Ids": indices, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + "is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": + self._padding_idx, # special value for lookup table op + }) + return out + + elif (np.isscalar(speaker_position_rate) or + isinstance(speaker_position_rate, fluid.framework.Variable) and + speaker_position_rate.shape == [1, 1]): + # # make a weight + # scale the weight (the operand for sin & cos) + if np.isscalar(speaker_position_rate): + scaled_rad = fluid.layers.scale(rad, speaker_position_rate) + else: + scaled_rad = fluid.layers.elementwise_mul( + rad, speaker_position_rate[0]) + weight = compute_position_embedding(scaled_rad) + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="lookup_table", + inputs={"Ids": indices, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + "is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": + self._padding_idx, # special value for lookup table op + }) + return out + + elif np.prod(speaker_position_rate.shape) > 1: + assert speaker_position_rate.shape == [batch_size, 1] + outputs = [] + for i in range(batch_size): + rate = speaker_position_rate[i] # rate has shape [1] + scaled_rad = fluid.layers.elementwise_mul(rad, rate) + weight = compute_position_embedding(scaled_rad) + out = self._helper.create_variable_for_type_inference( + self._dtype) + sequence = indices[i] + self._helper.append_op( + type="lookup_table", + inputs={"Ids": sequence, + "W": weight}, + outputs={"Out": out}, + attrs={ + "is_sparse": self._is_sparse, + 
"is_distributed": self._is_distributed, + "remote_prefetch": self._remote_prefetch, + "padding_idx": -1, + }) + outputs.append(out) + out = fluid.layers.stack(outputs) + return out + else: + raise Exception("Then you can just use position rate at init") + + +class Conv1D_GU(dg.Layer): + def __init__(self, + name_scope, + conditioner_dim, + in_channels, + num_filters, + filter_size, + dilation, + causal=False, + residual=True, + dtype="float32"): + super(Conv1D_GU, self).__init__(name_scope, dtype=dtype) + + self.conditioner_dim = conditioner_dim + self.in_channels = in_channels + self.num_filters = num_filters + self.filter_size = filter_size + self.dilation = dilation + self.causal = causal + self.residual = residual + + if residual: + assert ( + in_channels == num_filters + ), "this block uses residual connection"\ + "the input_channels should equals num_filters" + + self.conv = Conv1D( + self.full_name(), + in_channels, + 2 * num_filters, + filter_size, + dilation, + causal=causal, + dtype=dtype) + + self.fc = Conv1D( + self.full_name(), + conditioner_dim, + 2 * num_filters, + filter_size=1, + dilation=1, + causal=False, + dtype=dtype) + + def forward(self, x, skip=None, conditioner=None): + """ + Args: + x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU + layer, where B means batch_size, C_in means the input channels + T means input time steps. + skip (Variable): Shape(B, C_in, 1, T), skip connection. + conditioner (Variable): Shape(B, C_con, 1, T), expanded mel + conditioner, where C_con is conditioner hidden dim which + equals the num of mel bands. Note that when using residual + connection, the Conv1D_GU does not change the number of + channels, so out channels equals input channels. + Returns: + x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where + C_out means the output channels of Conv1D_GU. + skip (Variable): Shape(B, C_out, 1, T), skip connection. + """ + residual = x + x = self.conv(x) + + if conditioner is not None: + cond_bias = self.fc(conditioner) + x += cond_bias + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + # Gated Unit. + x = fluid.layers.elementwise_mul( + fluid.layers.sigmoid(gate), fluid.layers.tanh(content)) + + if skip is None: + skip = x + else: + skip = fluid.layers.scale(skip + x, np.sqrt(0.5)) + + if self.residual: + x = fluid.layers.scale(residual + x, np.sqrt(0.5)) + + return x, skip + + def add_input(self, x, skip=None, conditioner=None): + """ + Inputs: + x: shape(B, num_filters, 1, time_steps) + skip: shape(B, num_filters, 1, time_steps), skip connection + conditioner: shape(B, conditioner_dim, 1, time_steps) + Outputs: + x: shape(B, num_filters, 1, time_steps), where time_steps = 1 + skip: skip connection, same shape as x + """ + residual = x + + # add step input and produce step output + x = self.conv.add_input(x) + + if conditioner is not None: + cond_bias = self.fc(conditioner) + x += cond_bias + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + # Gated Unit. 
+ + def add_input(self, x, skip=None, conditioner=None): + """ + Inputs: + x: shape(B, num_filters, 1, time_steps) + skip: shape(B, num_filters, 1, time_steps), skip connection + conditioner: shape(B, conditioner_dim, 1, time_steps) + Outputs: + x: shape(B, num_filters, 1, time_steps), where time_steps = 1 + skip: skip connection, same shape as x + """ + residual = x + + # add step input and produce step output + x = self.conv.add_input(x) + + if conditioner is not None: + cond_bias = self.fc(conditioner) + x += cond_bias + + content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) + + # Gated Unit. + x = fluid.layers.elementwise_mul( + fluid.layers.sigmoid(gate), fluid.layers.tanh(content)) + + if skip is None: + skip = x + else: + skip = fluid.layers.scale(skip + x, np.sqrt(0.5)) + + if self.residual: + x = fluid.layers.scale(residual + x, np.sqrt(0.5)) + + return x, skip + + +def Conv2DTranspose(name_scope, + num_filters, + filter_size, + padding=0, + stride=1, + dilation=1, + use_cudnn=True, + act=None, + dtype="float32"): + val = 1.0 / (filter_size[0] * filter_size[1]) + weight_init = fluid.initializer.ConstantInitializer(val) + weight_attr = fluid.ParamAttr(initializer=weight_init) + + layer = weight_norm.Conv2DTranspose( + name_scope, + num_filters, + filter_size=filter_size, + padding=padding, + stride=stride, + dilation=dilation, + param_attr=weight_attr, + use_cudnn=use_cudnn, + act=act, + dtype=dtype) + + return layer diff --git a/parakeet/modules/multihead_attention.py b/parakeet/modules/multihead_attention.py index 89783b987bc391b7669f5f543b9085668c821c99..624d3ae6ecd8419af16769a28880239774bd2758 100644 --- a/parakeet/modules/multihead_attention.py +++ b/parakeet/modules/multihead_attention.py @@ -78,17 +78,15 @@ class ScaledDotProductAttention(dg.Layer): """ # Compute attention score attention = layers.matmul( - query, key, transpose_y=True) #transpose the last dim in y - attention = attention / math.sqrt(self.d_key) + query, key, transpose_y=True, alpha=self.d_key + **-0.5) #transpose the last dim in y # Mask key to ignore padding if mask is not None: - attention = attention * mask - mask = (mask == 0).astype(np.float32) * (-2**32 + 1) attention = attention + mask attention = layers.softmax(attention) - attention = layers.dropout(attention, dropout) + attention = layers.dropout( + attention, dropout, dropout_implementation='upscale_in_train') # Mask query to ignore padding if query_mask is not None: @@ -142,17 +140,11 @@ result (Variable), Shape(B, T, C), the result of multihead attention. attention (Variable), Shape(n_head * B, T, C), the attention of key. """ + batch_size = key.shape[0] seq_len_key = key.shape[1] seq_len_query = query_input.shape[1] - # repeat masks h times - if query_mask is not None: - query_mask = layers.expand(query_mask, - [self.num_head, 1, seq_len_key]) - if mask is not None: - mask = layers.expand(mask, (self.num_head, 1, 1)) - # Make multihead attention # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) key = layers.reshape( @@ -176,6 +168,6 @@ result, attention = self.scal_attn( key, value, query, mask=mask, query_mask=query_mask) # concat all multihead result result = layers.reshape( result, [self.num_head, batch_size, seq_len_query, self.d_q]) @@ -184,7 +188,10 @@ [batch_size, seq_len_query, -1]) if self.is_concat: result = layers.concat([query_input, result], axis=-1) - result = layers.dropout(self.fc(result), self.dropout) + result = layers.dropout( + self.fc(result), + self.dropout, + dropout_implementation='upscale_in_train') result = result + query_input result = self.layer_norm(result)
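Taken together, the attention changes standardize on one additive-mask convention: masks are precomputed in data.py as 0 where attention is allowed and -2**32 + 1 where it is not, expanded per head in the encoder and decoder, and simply added to the logits, while the 1/sqrt(d_key) scaling is folded into the matmul through its alpha argument. A numpy sketch of the resulting kernel (names and shapes are illustrative):

    import numpy as np

    def scaled_dot_product_attention(q, k, v, additive_mask=None, d_key=64):
        # q, k, v: (B * n_head, T, d); alpha=d_key**-0.5 fuses the scaling
        scores = q @ k.transpose(0, 2, 1) * d_key**-0.5
        if additive_mask is not None:
            scores = scores + additive_mask  # -2**32 + 1 drives the weight to ~0
        scores = scores - scores.max(axis=-1, keepdims=True)  # stable softmax
        weights = np.exp(scores)
        weights = weights / weights.sum(axis=-1, keepdims=True)
        return weights @ v, weights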