提交 8e86389e 编写于 作者: L liuyibing01

Merge branch 'master' into 'master'

Modified data.py of TransformerTTS

See merge request !30
...@@ -3,8 +3,8 @@ audio: ...@@ -3,8 +3,8 @@ audio:
n_fft: 2048 n_fft: 2048
sr: 22050 sr: 22050
preemphasis: 0.97 preemphasis: 0.97
hop_length: 275 hop_length: 256
win_length: 1102 win_length: 1024
power: 1.2 power: 1.2
min_level_db: -100 min_level_db: -100
ref_level_db: 20 ref_level_db: 20
......
...@@ -52,6 +52,12 @@ def add_config_options_to_parser(parser): ...@@ -52,6 +52,12 @@ def add_config_options_to_parser(parser):
type=int, type=int,
default=0, default=0,
help="use data parallel or not during training.") help="use data parallel or not during training.")
parser.add_argument(
'--alpha',
type=float,
default=1.0,
help="The hyperparameter to determine the length of the expanded sequence \
mel, thereby controlling the voice speed.")
parser.add_argument( parser.add_argument(
'--data_path', '--data_path',
......
...@@ -24,6 +24,7 @@ import paddle.fluid.dygraph as dg ...@@ -24,6 +24,7 @@ import paddle.fluid.dygraph as dg
from parakeet.g2p.en import text_to_sequence from parakeet.g2p.en import text_to_sequence
from parakeet import audio from parakeet import audio
from parakeet.models.fastspeech.fastspeech import FastSpeech from parakeet.models.fastspeech.fastspeech import FastSpeech
from parakeet.models.transformer_tts.utils import *
def load_checkpoint(step, model_path): def load_checkpoint(step, model_path):
...@@ -59,12 +60,26 @@ def synthesis(text_input, args): ...@@ -59,12 +60,26 @@ def synthesis(text_input, args):
model.eval() model.eval()
text = np.asarray(text_to_sequence(text_input)) text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text), [0]) text = np.expand_dims(text, axis=0)
pos_text = np.arange(1, text.shape[1] + 1) pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) pos_text = np.expand_dims(pos_text, axis=0)
enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32)
enc_slf_attn_mask = get_attn_key_pad_mask(pos_text,
text).astype(np.float32)
text = dg.to_variable(text)
pos_text = dg.to_variable(pos_text)
enc_non_pad_mask = dg.to_variable(enc_non_pad_mask)
enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask)
mel_output, mel_output_postnet = model( mel_output, mel_output_postnet = model(
text, pos_text, alpha=args.alpha) text,
pos_text,
alpha=args.alpha,
enc_non_pad_mask=enc_non_pad_mask,
enc_slf_attn_mask=enc_slf_attn_mask,
dec_non_pad_mask=None,
dec_slf_attn_mask=None)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'], sample_rate=cfg['audio']['sr'],
......
...@@ -21,6 +21,7 @@ from parse import add_config_options_to_parser ...@@ -21,6 +21,7 @@ from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
from ruamel import yaml from ruamel import yaml
from tqdm import tqdm from tqdm import tqdm
from matplotlib import cm
from collections import OrderedDict from collections import OrderedDict
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
...@@ -66,12 +67,12 @@ def main(args): ...@@ -66,12 +67,12 @@ def main(args):
with dg.guard(place): with dg.guard(place):
with fluid.unique_name.guard(): with fluid.unique_name.guard():
transformerTTS = TransformerTTS(cfg) transformer_tts = TransformerTTS(cfg)
model_dict, _ = load_checkpoint( model_dict, _ = load_checkpoint(
str(args.transformer_step), str(args.transformer_step),
os.path.join(args.transtts_path, "transformer")) os.path.join(args.transtts_path, "transformer"))
transformerTTS.set_dict(model_dict) transformer_tts.set_dict(model_dict)
transformerTTS.eval() transformer_tts.eval()
model = FastSpeech(cfg) model = FastSpeech(cfg)
model.train() model.train()
...@@ -100,13 +101,33 @@ def main(args): ...@@ -100,13 +101,33 @@ def main(args):
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d' % epoch) pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data (character, mel, mel_input, pos_text, pos_mel, text_length,
mel_lens, enc_slf_mask, enc_query_mask, dec_slf_mask,
enc_dec_mask, dec_query_slf_mask, dec_query_mask) = data
_, _, attn_probs, _, _, _ = transformerTTS( _, _, attn_probs, _, _, _ = transformer_tts(
character, mel_input, pos_text, pos_mel) character,
alignment = dg.to_variable( mel_input,
get_alignment(attn_probs, mel_lens, cfg[ pos_text,
'transformer_head'])).astype(np.float32) pos_mel,
dec_slf_mask=dec_slf_mask,
enc_slf_mask=enc_slf_mask,
enc_query_mask=enc_query_mask,
enc_dec_mask=enc_dec_mask,
dec_query_slf_mask=dec_query_slf_mask,
dec_query_mask=dec_query_mask)
alignment, max_attn = get_alignment(attn_probs, mel_lens,
cfg['transformer_head'])
alignment = dg.to_variable(alignment).astype(np.float32)
if local_rank == 0 and global_step % 5 == 1:
x = np.uint8(
cm.viridis(max_attn[8, :mel_lens.numpy()[8]]) * 255)
writer.add_image(
'Attention_%d_0' % global_step,
x,
0,
dataformats="HWC")
global_step += 1 global_step += 1
...@@ -115,7 +136,11 @@ def main(args): ...@@ -115,7 +136,11 @@ def main(args):
character, character,
pos_text, pos_text,
mel_pos=pos_mel, mel_pos=pos_mel,
length_target=alignment) length_target=alignment,
enc_non_pad_mask=enc_query_mask,
enc_slf_attn_mask=enc_slf_mask,
dec_non_pad_mask=dec_query_slf_mask,
dec_slf_attn_mask=dec_slf_mask)
mel_output, mel_output_postnet, duration_predictor_output, _, _ = result mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
mel_loss = layers.mse_loss(mel_output, mel) mel_loss = layers.mse_loss(mel_output, mel)
mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel) mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
......
# train model # train model
# if you wish to resume from an existing model, uncomment --checkpoint_path and --fastspeech_step # if you wish to resume from an existing model, uncomment --checkpoint_path and --fastspeech_step
CUDA_VISIBLE_DEVICES=0\ export CUDA_VISIBLE_DEVICES=0
python -u train.py \ python -u train.py \
--batch_size=32 \ --batch_size=32 \
--epochs=10000 \ --epochs=10000 \
......
...@@ -8,4 +8,7 @@ audio: ...@@ -8,4 +8,7 @@ audio:
power: 1.2 power: 1.2
min_level_db: -100 min_level_db: -100
ref_level_db: 20 ref_level_db: 20
outputs_per_step: 1 outputs_per_step: 1
\ No newline at end of file
hidden_size: 256
embedding_size: 512
\ No newline at end of file
...@@ -23,7 +23,8 @@ from parakeet import audio ...@@ -23,7 +23,8 @@ from parakeet import audio
from parakeet.data.sampler import * from parakeet.data.sampler import *
from parakeet.data.datacargo import DataCargo from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset from parakeet.data.dataset import DatasetMixin, TransformDataset, CacheDataset
from parakeet.models.transformer_tts.utils import *
class LJSpeechLoader: class LJSpeechLoader:
...@@ -40,6 +41,8 @@ class LJSpeechLoader: ...@@ -40,6 +41,8 @@ class LJSpeechLoader:
metadata = LJSpeechMetaData(LJSPEECH_ROOT) metadata = LJSpeechMetaData(LJSPEECH_ROOT)
transformer = LJSpeech(config) transformer = LJSpeech(config)
dataset = TransformDataset(metadata, transformer) dataset = TransformDataset(metadata, transformer)
dataset = CacheDataset(dataset)
sampler = DistributedSampler( sampler = DistributedSampler(
len(metadata), nranks, rank, shuffle=shuffle) len(metadata), nranks, rank, shuffle=shuffle)
...@@ -196,8 +199,18 @@ def batch_examples(batch): ...@@ -196,8 +199,18 @@ def batch_examples(batch):
SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels) SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1)) #(B,T,num_mels)
mel_inputs = np.transpose( mel_inputs = np.transpose(
SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels) SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1)) #(B,T,num_mels)
enc_slf_mask = get_attn_key_pad_mask(pos_texts, texts).astype(np.float32)
enc_query_mask = get_non_pad_mask(pos_texts).astype(np.float32)
dec_slf_mask = get_dec_attn_key_pad_mask(pos_mels,
mel_inputs).astype(np.float32)
enc_dec_mask = get_attn_key_pad_mask(enc_query_mask[:, :, 0],
mel_inputs).astype(np.float32)
dec_query_slf_mask = get_non_pad_mask(pos_mels).astype(np.float32)
dec_query_mask = get_non_pad_mask(pos_mels).astype(np.float32)
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
np.array(mel_lens)) np.array(mel_lens), enc_slf_mask, enc_query_mask, dec_slf_mask,
enc_dec_mask, dec_query_slf_mask, dec_query_mask)
def batch_examples_vocoder(batch): def batch_examples_vocoder(batch):
......
...@@ -16,6 +16,7 @@ from scipy.io.wavfile import write ...@@ -16,6 +16,7 @@ from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence from parakeet.g2p.en import text_to_sequence
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from matplotlib import cm
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from ruamel import yaml from ruamel import yaml
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -25,6 +26,7 @@ import argparse ...@@ -25,6 +26,7 @@ import argparse
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
from pprint import pprint from pprint import pprint
from collections import OrderedDict from collections import OrderedDict
from parakeet.models.transformer_tts.utils import *
from parakeet import audio from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
...@@ -78,14 +80,18 @@ def synthesis(text_input, args): ...@@ -78,14 +80,18 @@ def synthesis(text_input, args):
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0]) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
pbar = tqdm(range(args.max_len)) pbar = tqdm(range(args.max_len))
for i in pbar: for i in pbar:
dec_slf_mask = get_triu_tensor(
mel_input.numpy(), mel_input.numpy()).astype(np.float32)
dec_slf_mask = fluid.layers.cast(
dg.to_variable(dec_slf_mask == 0), np.float32)
pos_mel = np.arange(1, mel_input.shape[1] + 1) pos_mel = np.arange(1, mel_input.shape[1] + 1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0]) pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
text, mel_input, pos_text, pos_mel) text, mel_input, pos_text, pos_mel, dec_slf_mask)
mel_input = fluid.layers.concat( mel_input = fluid.layers.concat(
[mel_input, postnet_pred[:, -1:, :]], axis=1) [mel_input, postnet_pred[:, -1:, :]], axis=1)
mag_pred = model_vocoder(postnet_pred) mag_pred = model_vocoder(postnet_pred)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
...@@ -111,6 +117,33 @@ def synthesis(text_input, args): ...@@ -111,6 +117,33 @@ def synthesis(text_input, args):
wav = _ljspeech_processor.inv_spectrogram( wav = _ljspeech_processor.inv_spectrogram(
fluid.layers.transpose( fluid.layers.transpose(
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy()) fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
global_step = 0
for i, prob in enumerate(attn_probs):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
writer.add_image(
'Attention_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_enc):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
writer.add_image(
'Attention_enc_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
for i, prob in enumerate(attn_dec):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
writer.add_image(
'Attention_dec_%d_0' % global_step,
x,
i * 4 + j,
dataformats="HWC")
writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(args.sample_path): if not os.path.exists(args.sample_path):
os.mkdir(args.sample_path) os.mkdir(args.sample_path)
...@@ -124,4 +157,6 @@ if __name__ == '__main__': ...@@ -124,4 +157,6 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Synthesis model") parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
args = parser.parse_args() args = parser.parse_args()
synthesis("Transformer model is so fast!", args) synthesis(
"They emphasized the necessity that the information now being furnished be handled with judgment and care.",
args)
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
# train model # train model
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
python -u synthesis.py \ python -u synthesis.py \
--max_len=50 \ --max_len=600 \
--transformer_step=160000 \ --transformer_step=160000 \
--vocoder_step=70000 \ --vocoder_step=90000 \
--use_gpu=1 --use_gpu=1 \
--checkpoint_path='./checkpoint' \ --checkpoint_path='./checkpoint' \
--log_dir='./log' \ --log_dir='./log' \
--sample_path='./sample' \ --sample_path='./sample' \
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import os import os
from tqdm import tqdm from tqdm import tqdm
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from pathlib import Path #from pathlib import Path
from collections import OrderedDict from collections import OrderedDict
import argparse import argparse
from parse import add_config_options_to_parser from parse import add_config_options_to_parser
...@@ -89,21 +89,31 @@ def main(args): ...@@ -89,21 +89,31 @@ def main(args):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d' % epoch) pbar.set_description('Processing at epoch %d' % epoch)
character, mel, mel_input, pos_text, pos_mel, text_length, _ = data character, mel, mel_input, pos_text, pos_mel, text_length, _, enc_slf_mask, enc_query_mask, dec_slf_mask, enc_dec_mask, dec_query_slf_mask, dec_query_mask = data
global_step += 1 global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
character, mel_input, pos_text, pos_mel)
label = (pos_mel == 0).astype(np.float32) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
character,
mel_input,
pos_text,
pos_mel,
dec_slf_mask=dec_slf_mask,
enc_slf_mask=enc_slf_mask,
enc_query_mask=enc_query_mask,
enc_dec_mask=enc_dec_mask,
dec_query_slf_mask=dec_query_slf_mask,
dec_query_mask=dec_query_mask)
mel_loss = layers.mean( mel_loss = layers.mean(
layers.abs(layers.elementwise_sub(mel_pred, mel))) layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean( post_mel_loss = layers.mean(
layers.abs(layers.elementwise_sub(postnet_pred, mel))) layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss loss = mel_loss + post_mel_loss
# Note: When the stop token loss was used, training did not work. # Note: When the stop token loss was used, training did not work.
if args.stop_token: if args.stop_token:
label = (pos_mel == 0).astype(np.float32)
stop_loss = cross_entropy(stop_preds, label) stop_loss = cross_entropy(stop_preds, label)
loss = loss + stop_loss loss = loss + stop_loss
......
# train model # train model
# if you wish to resume from an existing model, uncomment --checkpoint_path and --transformer_step # if you wish to resume from an existing model, uncomment --checkpoint_path and --transformer_step
CUDA_VISIBLE_DEVICES=0 \ export CUDA_VISIBLE_DEVICES=2
python -u train_transformer.py \ python -u train_transformer.py \
--batch_size=32 \ --batch_size=32 \
--epochs=10000 \ --epochs=10000 \
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import six import six
import numpy as np import numpy as np
from tqdm import tqdm
class DatasetMixin(object): class DatasetMixin(object):
......
...@@ -32,6 +32,7 @@ class Decoder(dg.Layer): ...@@ -32,6 +32,7 @@ class Decoder(dg.Layer):
super(Decoder, self).__init__() super(Decoder, self).__init__()
n_position = len_max_seq + 1 n_position = len_max_seq + 1
self.n_head = n_head
self.pos_inp = get_sinusoid_encoding_table( self.pos_inp = get_sinusoid_encoding_table(
n_position, d_model, padding_idx=0) n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding( self.position_enc = dg.Embedding(
...@@ -55,7 +56,7 @@ class Decoder(dg.Layer): ...@@ -55,7 +56,7 @@ class Decoder(dg.Layer):
for i, layer in enumerate(self.layer_stack): for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer) self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, enc_seq, enc_pos): def forward(self, enc_seq, enc_pos, non_pad_mask, slf_attn_mask=None):
""" """
Decoder layer of FastSpeech. Decoder layer of FastSpeech.
...@@ -69,10 +70,7 @@ class Decoder(dg.Layer): ...@@ -69,10 +70,7 @@ class Decoder(dg.Layer):
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
""" """
dec_slf_attn_list = [] dec_slf_attn_list = []
slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1])
# -- Prepare masks
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
non_pad_mask = get_non_pad_mask(enc_pos)
# -- Forward # -- Forward
dec_output = enc_seq + self.position_enc(enc_pos) dec_output = enc_seq + self.position_enc(enc_pos)
......
...@@ -32,14 +32,17 @@ class Encoder(dg.Layer): ...@@ -32,14 +32,17 @@ class Encoder(dg.Layer):
dropout=0.1): dropout=0.1):
super(Encoder, self).__init__() super(Encoder, self).__init__()
n_position = len_max_seq + 1 n_position = len_max_seq + 1
self.n_head = n_head
self.src_word_emb = dg.Embedding( self.src_word_emb = dg.Embedding(
size=[n_src_vocab, d_model], padding_idx=0) size=[n_src_vocab, d_model],
padding_idx=0,
param_attr=fluid.initializer.Normal(
loc=0.0, scale=1.0))
self.pos_inp = get_sinusoid_encoding_table( self.pos_inp = get_sinusoid_encoding_table(
n_position, d_model, padding_idx=0) n_position, d_model, padding_idx=0)
self.position_enc = dg.Embedding( self.position_enc = dg.Embedding(
size=[n_position, d_model], size=[n_position, d_model],
padding_idx=0,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer( initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp), self.pos_inp),
...@@ -58,7 +61,7 @@ class Encoder(dg.Layer): ...@@ -58,7 +61,7 @@ class Encoder(dg.Layer):
for i, layer in enumerate(self.layer_stack): for i, layer in enumerate(self.layer_stack):
self.add_sublayer('fft_{}'.format(i), layer) self.add_sublayer('fft_{}'.format(i), layer)
def forward(self, character, text_pos): def forward(self, character, text_pos, non_pad_mask, slf_attn_mask=None):
""" """
Encoder layer of FastSpeech. Encoder layer of FastSpeech.
...@@ -74,10 +77,7 @@ class Encoder(dg.Layer): ...@@ -74,10 +77,7 @@ class Encoder(dg.Layer):
enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list. enc_slf_attn_list (list<Variable>), Len(n_layers), Shape(B * n_head, text_T, text_T), the encoder self attention list.
""" """
enc_slf_attn_list = [] enc_slf_attn_list = []
# -- prepare masks slf_attn_mask = layers.expand(slf_attn_mask, [self.n_head, 1, 1])
# shape character (N, T)
slf_attn_mask = get_attn_key_pad_mask(seq_k=character, seq_q=character)
non_pad_mask = get_non_pad_mask(character)
# -- Forward # -- Forward
enc_output = self.src_word_emb(character) + self.position_enc( enc_output = self.src_word_emb(character) + self.position_enc(
...@@ -90,4 +90,4 @@ class Encoder(dg.Layer): ...@@ -90,4 +90,4 @@ class Encoder(dg.Layer):
slf_attn_mask=slf_attn_mask) slf_attn_mask=slf_attn_mask)
enc_slf_attn_list += [enc_slf_attn] enc_slf_attn_list += [enc_slf_attn]
return enc_output, non_pad_mask, enc_slf_attn_list return enc_output, enc_slf_attn_list
...@@ -12,9 +12,11 @@ ...@@ -12,9 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import math import math
import numpy as np
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols from parakeet.g2p.text.symbols import symbols
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts.post_convnet import PostConvNet from parakeet.models.transformer_tts.post_convnet import PostConvNet
from parakeet.models.fastspeech.length_regulator import LengthRegulator from parakeet.models.fastspeech.length_regulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder from parakeet.models.fastspeech.encoder import Encoder
...@@ -78,6 +80,10 @@ class FastSpeech(dg.Layer): ...@@ -78,6 +80,10 @@ class FastSpeech(dg.Layer):
def forward(self, def forward(self,
character, character,
text_pos, text_pos,
enc_non_pad_mask,
dec_non_pad_mask,
enc_slf_attn_mask=None,
dec_slf_attn_mask=None,
mel_pos=None, mel_pos=None,
length_target=None, length_target=None,
alpha=1.0): alpha=1.0):
...@@ -106,14 +112,20 @@ class FastSpeech(dg.Layer): ...@@ -106,14 +112,20 @@ class FastSpeech(dg.Layer):
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
""" """
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder( encoder_output, enc_slf_attn_list = self.encoder(
character, text_pos) character,
text_pos,
enc_non_pad_mask,
slf_attn_mask=enc_slf_attn_mask)
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
length_regulator_output, duration_predictor_output = self.length_regulator( length_regulator_output, duration_predictor_output = self.length_regulator(
encoder_output, target=length_target, alpha=alpha) encoder_output, target=length_target, alpha=alpha)
decoder_output, dec_slf_attn_list = self.decoder( decoder_output, dec_slf_attn_list = self.decoder(
length_regulator_output, mel_pos) length_regulator_output,
mel_pos,
dec_non_pad_mask,
slf_attn_mask=dec_slf_attn_mask)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
...@@ -122,8 +134,18 @@ class FastSpeech(dg.Layer): ...@@ -122,8 +134,18 @@ class FastSpeech(dg.Layer):
else: else:
length_regulator_output, decoder_pos = self.length_regulator( length_regulator_output, decoder_pos = self.length_regulator(
encoder_output, alpha=alpha) encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output, slf_attn_mask = get_triu_tensor(
decoder_pos) decoder_pos.numpy(), decoder_pos.numpy()).astype(np.float32)
slf_attn_mask = fluid.layers.cast(
dg.to_variable(slf_attn_mask == 0), np.float32)
slf_attn_mask = dg.to_variable(slf_attn_mask)
dec_non_pad_mask = fluid.layers.unsqueeze(
(decoder_pos != 0).astype(np.float32), [-1])
decoder_output, _ = self.decoder(
length_regulator_output,
decoder_pos,
dec_non_pad_mask,
slf_attn_mask=slf_attn_mask)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
......
...@@ -46,7 +46,7 @@ class FFTBlock(dg.Layer): ...@@ -46,7 +46,7 @@ class FFTBlock(dg.Layer):
padding=padding, padding=padding,
dropout=dropout) dropout=dropout)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): def forward(self, enc_input, non_pad_mask, slf_attn_mask=None):
""" """
Feed Forward Transformer block in FastSpeech. Feed Forward Transformer block in FastSpeech.
...@@ -63,6 +63,7 @@ class FFTBlock(dg.Layer): ...@@ -63,6 +63,7 @@ class FFTBlock(dg.Layer):
""" """
output, slf_attn = self.slf_attn( output, slf_attn = self.slf_attn(
enc_input, enc_input, enc_input, mask=slf_attn_mask) enc_input, enc_input, enc_input, mask=slf_attn_mask)
output *= non_pad_mask output *= non_pad_mask
output = self.pos_ffn(output) output = self.pos_ffn(output)
......
...@@ -146,11 +146,17 @@ class DurationPredictor(dg.Layer): ...@@ -146,11 +146,17 @@ class DurationPredictor(dg.Layer):
out = layers.transpose(encoder_output, [0, 2, 1]) out = layers.transpose(encoder_output, [0, 2, 1])
out = self.conv1(out) out = self.conv1(out)
out = layers.transpose(out, [0, 2, 1]) out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout) out = layers.dropout(
layers.relu(self.layer_norm1(out)),
self.dropout,
dropout_implementation='upscale_in_train')
out = layers.transpose(out, [0, 2, 1]) out = layers.transpose(out, [0, 2, 1])
out = self.conv2(out) out = self.conv2(out)
out = layers.transpose(out, [0, 2, 1]) out = layers.transpose(out, [0, 2, 1])
out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout) out = layers.dropout(
layers.relu(self.layer_norm2(out)),
self.dropout,
dropout_implementation='upscale_in_train')
out = layers.relu(self.linear(out)) out = layers.relu(self.linear(out))
out = layers.squeeze(out, axes=[-1]) out = layers.squeeze(out, axes=[-1])
......
...@@ -18,7 +18,6 @@ def get_alignment(attn_probs, mel_lens, n_head): ...@@ -18,7 +18,6 @@ def get_alignment(attn_probs, mel_lens, n_head):
max_F = 0 max_F = 0
assert attn_probs[0].shape[0] % n_head == 0 assert attn_probs[0].shape[0] % n_head == 0
batch_size = int(attn_probs[0].shape[0] // n_head) batch_size = int(attn_probs[0].shape[0] // n_head)
#max_attn = attn_probs[0].numpy()[0,batch_size]
for i in range(len(attn_probs)): for i in range(len(attn_probs)):
multi_attn = attn_probs[i].numpy() multi_attn = attn_probs[i].numpy()
for j in range(n_head): for j in range(n_head):
...@@ -28,7 +27,7 @@ def get_alignment(attn_probs, mel_lens, n_head): ...@@ -28,7 +27,7 @@ def get_alignment(attn_probs, mel_lens, n_head):
max_F = F max_F = F
max_attn = attn max_attn = attn
alignment = compute_duration(max_attn, mel_lens) alignment = compute_duration(max_attn, mel_lens)
return alignment return alignment, max_attn
def score_F(attn): def score_F(attn):
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import math import math
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.modules.utils import * from parakeet.models.transformer_tts.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.prenet import PreNet from parakeet.models.transformer_tts.prenet import PreNet
...@@ -25,6 +25,7 @@ class Decoder(dg.Layer): ...@@ -25,6 +25,7 @@ class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4): def __init__(self, num_hidden, config, num_head=4):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
self.num_head = num_head
param = fluid.ParamAttr() param = fluid.ParamAttr()
self.alpha = self.create_parameter( self.alpha = self.create_parameter(
shape=(1, ), shape=(1, ),
...@@ -98,30 +99,29 @@ class Decoder(dg.Layer): ...@@ -98,30 +99,29 @@ class Decoder(dg.Layer):
outputs_per_step=config['audio']['outputs_per_step'], outputs_per_step=config['audio']['outputs_per_step'],
use_cudnn=True) use_cudnn=True)
def forward(self, key, value, query, c_mask, positional): def forward(self,
key,
value,
query,
positional,
mask,
m_mask=None,
m_self_mask=None,
zero_mask=None):
# get decoder mask with triangular matrix # get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
m_mask = get_non_pad_mask(positional) m_mask = layers.expand(m_mask, [self.num_head, 1, key.shape[1]])
mask = get_attn_key_pad_mask((positional == 0).astype(np.float32), m_self_mask = layers.expand(m_self_mask,
query) [self.num_head, 1, query.shape[1]])
triu_tensor = dg.to_variable( mask = layers.expand(mask, [self.num_head, 1, 1])
get_triu_tensor(query.numpy(), query.numpy())).astype( zero_mask = layers.expand(zero_mask, [self.num_head, 1, 1])
np.float32)
mask = mask + triu_tensor
mask = fluid.layers.cast(mask == 0, np.float32)
# (batch_size, decoder_len, encoder_len)
zero_mask = get_attn_key_pad_mask(
layers.squeeze(c_mask, [-1]), query)
else: else:
mask = get_triu_tensor(query.numpy(), m_mask, m_self_mask, zero_mask = None, None, None
query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
# Decoder pre-network # Decoder pre-network
query = self.decoder_prenet(query) query = self.decoder_prenet(query)
# Centered position # Centered position
...@@ -132,7 +132,8 @@ class Decoder(dg.Layer): ...@@ -132,7 +132,8 @@ class Decoder(dg.Layer):
query = positional * self.alpha + query query = positional * self.alpha + query
#positional dropout #positional dropout
query = fluid.layers.dropout(query, 0.1) query = fluid.layers.dropout(
query, 0.1, dropout_implementation='upscale_in_train')
# Attention decoder-decoder, encoder-decoder # Attention decoder-decoder, encoder-decoder
selfattn_list = list() selfattn_list = list()
...@@ -141,12 +142,13 @@ class Decoder(dg.Layer): ...@@ -141,12 +142,13 @@ class Decoder(dg.Layer):
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
self.ffns): self.ffns):
query, attn_dec = selfattn( query, attn_dec = selfattn(
query, query, query, mask=mask, query_mask=m_mask) query, query, query, mask=mask, query_mask=m_self_mask)
query, attn_dot = attn( query, attn_dot = attn(
key, value, query, mask=zero_mask, query_mask=m_mask) key, value, query, mask=zero_mask, query_mask=m_mask)
query = ffn(query) query = ffn(query)
selfattn_list.append(attn_dec) selfattn_list.append(attn_dec)
attn_list.append(attn_dot) attn_list.append(attn_dot)
# Mel linear projection # Mel linear projection
mel_out = self.mel_linear(query) mel_out = self.mel_linear(query)
# Post Mel Network # Post Mel Network
......
...@@ -23,6 +23,7 @@ class Encoder(dg.Layer): ...@@ -23,6 +23,7 @@ class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, num_head=4): def __init__(self, embedding_size, num_hidden, num_head=4):
super(Encoder, self).__init__() super(Encoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
self.num_head = num_head
param = fluid.ParamAttr(initializer=fluid.initializer.Constant( param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
value=1.0)) value=1.0))
self.alpha = self.create_parameter( self.alpha = self.create_parameter(
...@@ -31,7 +32,6 @@ class Encoder(dg.Layer): ...@@ -31,7 +32,6 @@ class Encoder(dg.Layer):
1024, self.num_hidden, padding_idx=0) 1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding( self.pos_emb = dg.Embedding(
size=[1024, num_hidden], size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer( initializer=fluid.initializer.NumpyArrayInitializer(
self.pos_inp), self.pos_inp),
...@@ -56,13 +56,15 @@ class Encoder(dg.Layer): ...@@ -56,13 +56,15 @@ class Encoder(dg.Layer):
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)
def forward(self, x, positional): def forward(self, x, positional, mask=None, query_mask=None):
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
query_mask = get_non_pad_mask(positional) seq_len_key = x.shape[1]
mask = get_attn_key_pad_mask(positional, x) query_mask = layers.expand(query_mask,
[self.num_head, 1, seq_len_key])
mask = layers.expand(mask, [self.num_head, 1, 1])
else: else:
query_mask, mask = None, None query_mask, mask = None, None
# Encoder pre_network # Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C) x = self.encoder_prenet(x) #(N,T,C)
...@@ -72,7 +74,7 @@ class Encoder(dg.Layer): ...@@ -72,7 +74,7 @@ class Encoder(dg.Layer):
x = positional * self.alpha + x #(N, T, C) x = positional * self.alpha + x #(N, T, C)
# Positional dropout # Positional dropout
x = layers.dropout(x, 0.1) x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train')
# Self attention encoder # Self attention encoder
attentions = list() attentions = list()
...@@ -81,4 +83,4 @@ class Encoder(dg.Layer): ...@@ -81,4 +83,4 @@ class Encoder(dg.Layer):
x = ffn(x) x = ffn(x)
attentions.append(attention) attentions.append(attention)
return x, query_mask, attentions return x, attentions
...@@ -27,7 +27,10 @@ class EncoderPrenet(dg.Layer): ...@@ -27,7 +27,10 @@ class EncoderPrenet(dg.Layer):
self.num_hidden = num_hidden self.num_hidden = num_hidden
self.use_cudnn = use_cudnn self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( self.embedding = dg.Embedding(
size=[len(symbols), embedding_size], padding_idx=None) size=[len(symbols), embedding_size],
padding_idx=0,
param_attr=fluid.initializer.Normal(
loc=0.0, scale=1.0))
self.conv_list = [] self.conv_list = []
k = math.sqrt(1 / embedding_size) k = math.sqrt(1 / embedding_size)
self.conv_list.append( self.conv_list.append(
...@@ -78,10 +81,14 @@ class EncoderPrenet(dg.Layer): ...@@ -78,10 +81,14 @@ class EncoderPrenet(dg.Layer):
low=-k, high=k))) low=-k, high=k)))
def forward(self, x): def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embending_size) x = self.embedding(x) #(batch_size, seq_len, embending_size)
x = layers.transpose(x, [0, 2, 1]) x = layers.transpose(x, [0, 2, 1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2) x = layers.dropout(
layers.relu(batch_norm(conv(x))),
0.2,
dropout_implementation='upscale_in_train')
x = layers.transpose(x, [0, 2, 1]) #(N,T,C) x = layers.transpose(x, [0, 2, 1]) #(N,T,C)
x = self.projection(x) x = self.projection(x)
......
...@@ -108,11 +108,16 @@ class PostConvNet(dg.Layer): ...@@ -108,11 +108,16 @@ class PostConvNet(dg.Layer):
conv = self.conv_list[i] conv = self.conv_list[i]
input = layers.dropout( input = layers.dropout(
layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout) layers.tanh(batch_norm(conv(input)[:, :, :len])),
self.dropout,
dropout_implementation='upscale_in_train')
conv = self.conv_list[self.num_conv - 1] conv = self.conv_list[self.num_conv - 1]
input = conv(input)[:, :, :len] input = conv(input)[:, :, :len]
if self.batchnorm_last: if self.batchnorm_last:
batch_norm = self.batch_norm_list[self.num_conv - 1] batch_norm = self.batch_norm_list[self.num_conv - 1]
input = layers.dropout(batch_norm(input), self.dropout) input = layers.dropout(
batch_norm(input),
self.dropout,
dropout_implementation='upscale_in_train')
output = layers.transpose(input, [0, 2, 1]) output = layers.transpose(input, [0, 2, 1])
return output return output
...@@ -56,6 +56,12 @@ class PreNet(dg.Layer): ...@@ -56,6 +56,12 @@ class PreNet(dg.Layer):
Returns: Returns:
x (Variable), Shape(B, T, C), the result after pernet. x (Variable), Shape(B, T, C), the result after pernet.
""" """
x = layers.dropout(layers.relu(self.linear1(x)), self.dropout_rate) x = layers.dropout(
x = layers.dropout(layers.relu(self.linear2(x)), self.dropout_rate) layers.relu(self.linear1(x)),
self.dropout_rate,
dropout_implementation='upscale_in_train')
x = layers.dropout(
layers.relu(self.linear2(x)),
self.dropout_rate,
dropout_implementation='upscale_in_train')
return x return x
...@@ -24,11 +24,29 @@ class TransformerTTS(dg.Layer): ...@@ -24,11 +24,29 @@ class TransformerTTS(dg.Layer):
self.decoder = Decoder(config['hidden_size'], config) self.decoder = Decoder(config['hidden_size'], config)
self.config = config self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel): def forward(self,
characters,
key, c_mask, attns_enc = self.encoder(characters, pos_text) mel_input,
pos_text,
pos_mel,
dec_slf_mask,
enc_slf_mask=None,
enc_query_mask=None,
enc_dec_mask=None,
dec_query_slf_mask=None,
dec_query_mask=None):
key, attns_enc = self.encoder(
characters, pos_text, mask=enc_slf_mask, query_mask=enc_query_mask)
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder( mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
key, key, mel_input, c_mask, pos_mel) key,
key,
mel_input,
pos_mel,
mask=dec_slf_mask,
zero_mask=enc_dec_mask,
m_self_mask=dec_query_slf_mask,
m_mask=dec_query_mask)
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
...@@ -51,7 +51,9 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): ...@@ -51,7 +51,9 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
def get_non_pad_mask(seq): def get_non_pad_mask(seq):
return layers.unsqueeze((seq != 0).astype(np.float32), [-1]) mask = (seq != 0).astype(np.float32)
mask = np.expand_dims(mask, axis=-1)
return mask
def get_attn_key_pad_mask(seq_k, seq_q): def get_attn_key_pad_mask(seq_k, seq_q):
...@@ -60,8 +62,22 @@ def get_attn_key_pad_mask(seq_k, seq_q): ...@@ -60,8 +62,22 @@ def get_attn_key_pad_mask(seq_k, seq_q):
# Expand to fit the shape of key query attention matrix. # Expand to fit the shape of key query attention matrix.
len_q = seq_q.shape[1] len_q = seq_q.shape[1]
padding_mask = (seq_k != 0).astype(np.float32) padding_mask = (seq_k != 0).astype(np.float32)
padding_mask = layers.expand( padding_mask = np.expand_dims(padding_mask, axis=1)
layers.unsqueeze(padding_mask, [1]), [1, len_q, 1]) padding_mask = padding_mask.repeat([len_q], axis=1)
padding_mask = (padding_mask == 0).astype(np.float32) * (-2**32 + 1)
return padding_mask
def get_dec_attn_key_pad_mask(seq_k, seq_q):
''' For masking out the padding part of key sequence. '''
# Expand to fit the shape of key query attention matrix.
len_q = seq_q.shape[1]
padding_mask = (seq_k == 0).astype(np.float32)
padding_mask = np.expand_dims(padding_mask, axis=1)
triu_tensor = get_triu_tensor(seq_q, seq_q)
padding_mask = padding_mask.repeat([len_q], axis=1) + triu_tensor
padding_mask = (padding_mask != 0).astype(np.float32) * (-2**32 + 1)
return padding_mask return padding_mask
......
...@@ -53,11 +53,9 @@ class DynamicGRU(dg.Layer): ...@@ -53,11 +53,9 @@ class DynamicGRU(dg.Layer):
if self.is_reverse: if self.is_reverse:
i = inputs.shape[1] - 1 - i i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :] input_ = inputs[:, i:i + 1, :]
input_ = layers.reshape( input_ = layers.reshape(input_, [-1, input_.shape[2]])
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden) hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = layers.reshape( hidden_ = layers.reshape(hidden, [-1, 1, hidden.shape[1]])
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_) res.append(hidden_)
if self.is_reverse: if self.is_reverse:
res = res[::-1] res = res[::-1]
......
...@@ -71,7 +71,8 @@ class PositionwiseFeedForward(dg.Layer): ...@@ -71,7 +71,8 @@ class PositionwiseFeedForward(dg.Layer):
x = self.w_2(layers.relu(self.w_1(x))) x = self.w_2(layers.relu(self.w_1(x)))
# dropout # dropout
x = layers.dropout(x, self.dropout) x = layers.dropout(
x, self.dropout, dropout_implementation='upscale_in_train')
x = layers.transpose(x, [0, 2, 1]) x = layers.transpose(x, [0, 2, 1])
# residual connection # residual connection
......
此差异已折叠。
...@@ -78,17 +78,15 @@ class ScaledDotProductAttention(dg.Layer): ...@@ -78,17 +78,15 @@ class ScaledDotProductAttention(dg.Layer):
""" """
# Compute attention score # Compute attention score
attention = layers.matmul( attention = layers.matmul(
query, key, transpose_y=True) #transpose the last dim in y query, key, transpose_y=True, alpha=self.d_key
attention = attention / math.sqrt(self.d_key) **-0.5) #transpose the last dim in y
# Mask key to ignore padding # Mask key to ignore padding
if mask is not None: if mask is not None:
attention = attention * mask
mask = (mask == 0).astype(np.float32) * (-2**32 + 1)
attention = attention + mask attention = attention + mask
attention = layers.softmax(attention) attention = layers.softmax(attention)
attention = layers.dropout(attention, dropout) attention = layers.dropout(
attention, dropout, dropout_implementation='upscale_in_train')
# Mask query to ignore padding # Mask query to ignore padding
if query_mask is not None: if query_mask is not None:
...@@ -142,17 +140,11 @@ class MultiheadAttention(dg.Layer): ...@@ -142,17 +140,11 @@ class MultiheadAttention(dg.Layer):
result (Variable), Shape(B, T, C), the result of mutihead attention. result (Variable), Shape(B, T, C), the result of mutihead attention.
attention (Variable), Shape(n_head * B, T, C), the attention of key. attention (Variable), Shape(n_head * B, T, C), the attention of key.
""" """
batch_size = key.shape[0] batch_size = key.shape[0]
seq_len_key = key.shape[1] seq_len_key = key.shape[1]
seq_len_query = query_input.shape[1] seq_len_query = query_input.shape[1]
# repeat masks h times
if query_mask is not None:
query_mask = layers.expand(query_mask,
[self.num_head, 1, seq_len_key])
if mask is not None:
mask = layers.expand(mask, (self.num_head, 1, 1))
# Make multihead attention # Make multihead attention
# key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn) # key & value.shape = (batch_size, seq_len, feature)(feature = num_head * num_hidden_per_attn)
key = layers.reshape( key = layers.reshape(
...@@ -176,6 +168,18 @@ class MultiheadAttention(dg.Layer): ...@@ -176,6 +168,18 @@ class MultiheadAttention(dg.Layer):
result, attention = self.scal_attn( result, attention = self.scal_attn(
key, value, query, mask=mask, query_mask=query_mask) key, value, query, mask=mask, query_mask=query_mask)
key = layers.reshape(
layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
value = layers.reshape(
layers.transpose(value, [2, 0, 1, 3]),
[-1, seq_len_key, self.d_k])
query = layers.reshape(
layers.transpose(query, [2, 0, 1, 3]),
[-1, seq_len_query, self.d_q])
result, attention = self.scal_attn(
key, value, query, mask=mask, query_mask=query_mask)
# concat all multihead result # concat all multihead result
result = layers.reshape( result = layers.reshape(
result, [self.num_head, batch_size, seq_len_query, self.d_q]) result, [self.num_head, batch_size, seq_len_query, self.d_q])
...@@ -184,7 +188,10 @@ class MultiheadAttention(dg.Layer): ...@@ -184,7 +188,10 @@ class MultiheadAttention(dg.Layer):
[batch_size, seq_len_query, -1]) [batch_size, seq_len_query, -1])
if self.is_concat: if self.is_concat:
result = layers.concat([query_input, result], axis=-1) result = layers.concat([query_input, result], axis=-1)
result = layers.dropout(self.fc(result), self.dropout) result = layers.dropout(
self.fc(result),
self.dropout,
dropout_implementation='upscale_in_train')
result = result + query_input result = result + query_input
result = self.layer_norm(result) result = self.layer_norm(result)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册