# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle import nn, ParamAttr
from paddle.nn import functional as F

gradient_clip = 10


class WrapEncoderForFeature(nn.Layer):
    """
    conv feature embedder + encoder
    """

    def __init__(self,
                 src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 bos_idx=0):
        super(WrapEncoderForFeature, self).__init__()
        self.prepare_encoder = PrepareEncoder(
            src_vocab_size,
            d_model,
            max_length,
            prepostprocess_dropout,
            bos_idx=bos_idx,
            word_emb_param_name="src_word_emb_table")
        self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model,
                               d_inner_hid, prepostprocess_dropout,
                               attention_dropout, relu_dropout, preprocess_cmd,
                               postprocess_cmd)

    def forward(self, enc_inputs):
        conv_features, src_pos, src_slf_attn_bias = enc_inputs
        enc_input = self.prepare_encoder(conv_features, src_pos)
        enc_output = self.encoder(enc_input, src_slf_attn_bias)
        return enc_output


class WrapEncoder(nn.Layer):
    """
    embedder + encoder
    """

    def __init__(self,
                 src_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 bos_idx=0):
        super(WrapEncoder, self).__init__()
        self.prepare_decoder = PrepareDecoder(
            src_vocab_size,
            d_model,
            max_length,
            prepostprocess_dropout,
            bos_idx=bos_idx)
        self.encoder = Encoder(n_layer, n_head, d_key, d_value, d_model,
                               d_inner_hid, prepostprocess_dropout,
                               attention_dropout, relu_dropout, preprocess_cmd,
                               postprocess_cmd)

    def forward(self, enc_inputs):
        src_word, src_pos, src_slf_attn_bias = enc_inputs
        enc_input = self.prepare_decoder(src_word, src_pos)
        enc_output = self.encoder(enc_input, src_slf_attn_bias)
        return enc_output


class Encoder(nn.Layer):
    """
    encoder
    """

    def __init__(self,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da"):
        super(Encoder, self).__init__()

        self.encoder_layers = list()
        for i in range(n_layer):
            self.encoder_layers.append(
                self.add_sublayer(
                    "layer_%d" % i,
                    EncoderLayer(n_head, d_key, d_value, d_model, d_inner_hid,
                                 prepostprocess_dropout, attention_dropout,
                                 relu_dropout, preprocess_cmd,
                                 postprocess_cmd)))
        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
                                             prepostprocess_dropout)

    def forward(self, enc_input, attn_bias):
        for encoder_layer in self.encoder_layers:
            enc_output = encoder_layer(enc_input, attn_bias)
            enc_input = enc_output
        enc_output = self.processer(enc_output)
        return enc_output
preprocess_cmd="n", postprocess_cmd="da"): super(EncoderLayer, self).__init__() self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, prepostprocess_dropout) self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, attention_dropout) self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, prepostprocess_dropout) self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, prepostprocess_dropout) self.ffn = FFN(d_inner_hid, d_model, relu_dropout) self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, prepostprocess_dropout) def forward(self, enc_input, attn_bias): attn_output = self.self_attn( self.preprocesser1(enc_input), None, None, attn_bias) attn_output = self.postprocesser1(attn_output, enc_input) ffn_output = self.ffn(self.preprocesser2(attn_output)) ffn_output = self.postprocesser2(ffn_output, attn_output) return ffn_output class MultiHeadAttention(nn.Layer): """ Multi-Head Attention """ def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.): super(MultiHeadAttention, self).__init__() self.n_head = n_head self.d_key = d_key self.d_value = d_value self.d_model = d_model self.dropout_rate = dropout_rate self.q_fc = paddle.nn.Linear( in_features=d_model, out_features=d_key * n_head, bias_attr=False) self.k_fc = paddle.nn.Linear( in_features=d_model, out_features=d_key * n_head, bias_attr=False) self.v_fc = paddle.nn.Linear( in_features=d_model, out_features=d_value * n_head, bias_attr=False) self.proj_fc = paddle.nn.Linear( in_features=d_value * n_head, out_features=d_model, bias_attr=False) def _prepare_qkv(self, queries, keys, values, cache=None): if keys is None: # self-attention keys, values = queries, queries static_kv = False else: # cross-attention static_kv = True q = self.q_fc(queries) q = paddle.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) if cache is not None and static_kv and "static_k" in cache: # for encoder-decoder attention in inference and has cached k = cache["static_k"] v = cache["static_v"] else: k = self.k_fc(keys) v = self.v_fc(values) k = paddle.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) k = paddle.transpose(x=k, perm=[0, 2, 1, 3]) v = paddle.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) v = paddle.transpose(x=v, perm=[0, 2, 1, 3]) if cache is not None: if static_kv and not "static_k" in cache: # for encoder-decoder attention in inference and has not cached cache["static_k"], cache["static_v"] = k, v elif not static_kv: # for decoder self-attention in inference cache_k, cache_v = cache["k"], cache["v"] k = paddle.concat([cache_k, k], axis=2) v = paddle.concat([cache_v, v], axis=2) cache["k"], cache["v"] = k, v return q, k, v def forward(self, queries, keys, values, attn_bias, cache=None): # compute q ,k ,v keys = queries if keys is None else keys values = keys if values is None else values q, k, v = self._prepare_qkv(queries, keys, values, cache) # scale dot product attention product = paddle.matmul(x=q, y=k, transpose_y=True) product = product * self.d_model**-0.5 if attn_bias is not None: product += attn_bias weights = F.softmax(product) if self.dropout_rate: weights = F.dropout( weights, p=self.dropout_rate, mode="downscale_in_infer") out = paddle.matmul(weights, v) # combine heads out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.proj_fc(out) return out class PrePostProcessLayer(nn.Layer): """ PrePostProcessLayer """ def 
    def __init__(self, process_cmd, d_model, dropout_rate):
        super(PrePostProcessLayer, self).__init__()
        self.process_cmd = process_cmd
        self.functors = []
        for cmd in self.process_cmd:
            if cmd == "a":  # add residual connection
                self.functors.append(
                    lambda x, y: x + y if y is not None else x)
            elif cmd == "n":  # add layer normalization
                self.functors.append(
                    self.add_sublayer(
                        "layer_norm_%d" % len(
                            self.sublayers(include_sublayers=False)),
                        paddle.nn.LayerNorm(
                            normalized_shape=d_model,
                            weight_attr=fluid.ParamAttr(
                                initializer=fluid.initializer.Constant(1.)),
                            bias_attr=fluid.ParamAttr(
                                initializer=fluid.initializer.Constant(0.)))))
            elif cmd == "d":  # add dropout
                self.functors.append(lambda x: F.dropout(
                    x, p=dropout_rate, mode="downscale_in_infer")
                                     if dropout_rate else x)

    def forward(self, x, residual=None):
        for i, cmd in enumerate(self.process_cmd):
            if cmd == "a":
                x = self.functors[i](x, residual)
            else:
                x = self.functors[i](x)
        return x


class PrepareEncoder(nn.Layer):
    def __init__(self,
                 src_vocab_size,
                 src_emb_dim,
                 src_max_len,
                 dropout_rate=0,
                 bos_idx=0,
                 word_emb_param_name=None,
                 pos_enc_param_name=None):
        super(PrepareEncoder, self).__init__()
        self.src_emb_dim = src_emb_dim
        self.src_max_len = src_max_len
        self.emb = paddle.nn.Embedding(
            num_embeddings=self.src_max_len,
            embedding_dim=self.src_emb_dim,
            sparse=True)
        self.dropout_rate = dropout_rate

    def forward(self, src_word, src_pos):
        # src_word is already a dense conv feature, so it is only cast and
        # scaled; only the positions go through an embedding lookup
        src_word_emb = src_word
        src_word_emb = fluid.layers.cast(src_word_emb, 'float32')
        src_word_emb = paddle.scale(
            x=src_word_emb, scale=self.src_emb_dim**0.5)
        src_pos = paddle.squeeze(src_pos, axis=-1)
        src_pos_enc = self.emb(src_pos)
        src_pos_enc.stop_gradient = True
        enc_input = src_word_emb + src_pos_enc
        if self.dropout_rate:
            out = F.dropout(
                x=enc_input, p=self.dropout_rate, mode="downscale_in_infer")
        else:
            out = enc_input
        return out


class PrepareDecoder(nn.Layer):
    def __init__(self,
                 src_vocab_size,
                 src_emb_dim,
                 src_max_len,
                 dropout_rate=0,
                 bos_idx=0,
                 word_emb_param_name=None,
                 pos_enc_param_name=None):
        super(PrepareDecoder, self).__init__()
        self.src_emb_dim = src_emb_dim
        """
        self.emb0 = Embedding(num_embeddings=src_vocab_size,
                              embedding_dim=src_emb_dim)
        """
        self.emb0 = paddle.nn.Embedding(
            num_embeddings=src_vocab_size,
            embedding_dim=self.src_emb_dim,
            padding_idx=bos_idx,
            weight_attr=paddle.ParamAttr(
                name=word_emb_param_name,
                initializer=nn.initializer.Normal(0., src_emb_dim**-0.5)))
        self.emb1 = paddle.nn.Embedding(
            num_embeddings=src_max_len,
            embedding_dim=self.src_emb_dim,
            weight_attr=paddle.ParamAttr(name=pos_enc_param_name))
        self.dropout_rate = dropout_rate

    def forward(self, src_word, src_pos):
        src_word = fluid.layers.cast(src_word, 'int64')
        src_word = paddle.squeeze(src_word, axis=-1)
        src_word_emb = self.emb0(src_word)
        src_word_emb = paddle.scale(
            x=src_word_emb, scale=self.src_emb_dim**0.5)
        src_pos = paddle.squeeze(src_pos, axis=-1)
        src_pos_enc = self.emb1(src_pos)
        src_pos_enc.stop_gradient = True
        enc_input = src_word_emb + src_pos_enc
        if self.dropout_rate:
            out = F.dropout(
                x=enc_input, p=self.dropout_rate, mode="downscale_in_infer")
        else:
            out = enc_input
        return out


class FFN(nn.Layer):
    """
    Feed-Forward Network
    """

    def __init__(self, d_inner_hid, d_model, dropout_rate):
        super(FFN, self).__init__()
        self.dropout_rate = dropout_rate
        self.fc1 = paddle.nn.Linear(
            in_features=d_model, out_features=d_inner_hid)
        self.fc2 = paddle.nn.Linear(
            in_features=d_inner_hid, out_features=d_model)

    def forward(self, x):
        hidden = self.fc1(x)
        hidden = F.relu(hidden)
        if self.dropout_rate:
            hidden = F.dropout(
                hidden, p=self.dropout_rate, mode="downscale_in_infer")
mode="downscale_in_infer") out = self.fc2(hidden) return out