# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.models.transformer_tts.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet


class Encoder(dg.Layer):
    def __init__(self, embedding_size, num_hidden, num_head=4, n_layers=3):
        """Encoder of TransformerTTS.

        Args:
            embedding_size (int): the size of the position embedding.
            num_hidden (int): the size of the hidden layers in the network.
            num_head (int, optional): the number of heads in multihead attention. Defaults to 4.
            n_layers (int, optional): the number of multihead attention layers. Defaults to 3.
        """
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head

        # Learnable scale for the positional encoding, initialized to 1.0.
        param = fluid.ParamAttr(
            initializer=fluid.initializer.Constant(value=1.0))
        self.alpha = self.create_parameter(
            shape=(1, ), attr=param, dtype='float32')

        # Fixed (non-trainable) sinusoid position embedding table.
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))

        self.encoder_prenet = EncoderPrenet(
            embedding_size=embedding_size,
            num_hidden=num_hidden,
            use_cudnn=True)

        # Stack of multihead self-attention layers, each followed by a
        # position-wise feed-forward network.
        self.layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.ffns = [
            PositionwiseFeedForward(
                num_hidden,
                num_hidden * num_head,
                filter_size=1,
                use_cudnn=True) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)

    def forward(self, x, positional, mask=None, query_mask=None):
        """
        Encode text sequence.

        Args:
            x (Variable): shape(B, T_text), dtype int64, the input character ids,
                where T_text means the timesteps of the input text.
            positional (Variable): shape(B, T_text), dtype int64, the positions of the input characters.
            mask (Variable, optional): shape(B, T_text, T_text), dtype int64, the mask of encoder self attention. Defaults to None.
            query_mask (Variable, optional): shape(B, T_text, 1), dtype int64, the query mask of encoder self attention. Defaults to None.

        Returns:
            x (Variable): shape(B, T_text, C), the encoder output.
            attentions (list[Variable]): len(n_layers), the encoder self attention list.
        """
        if fluid.framework._dygraph_tracer()._train_mode:
            # Broadcast the masks over the attention heads.
            seq_len_key = x.shape[1]
            query_mask = layers.expand(query_mask,
                                       [self.num_head, 1, seq_len_key])
            mask = layers.expand(mask, [self.num_head, 1, 1])
        else:
            query_mask, mask = None, None

        # Encoder prenet
        x = self.encoder_prenet(x)

        # Add the scaled positional encoding
        positional = self.pos_emb(positional)
        x = positional * self.alpha + x

        # Positional dropout
        x = layers.dropout(x, 0.1, dropout_implementation='upscale_in_train')

        # Self attention encoder
        attentions = list()
        for layer, ffn in zip(self.layers, self.ffns):
            x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, attentions