From f735102eab15d39ba23c91cc440cf857fca4282c Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Tue, 26 Mar 2019 13:51:00 +0800 Subject: [PATCH] add layer norm to Layers, add transformer test in imperative mode (#16092) * add layer norm to Layers, add transformer prepare encoding * little change * finish encoder part * add decoder part * finish model part * add test case and part of data feed * add transformer test * add to_parameter, add remove in set_attr * test=develop, fix pos encoding bug, create_parameter with stantard name * test=develop, rm dropout test in imperative * test=develop, fix cpu error * test=develop, fix minize bug * test=develop, fix one hot not stop gradient * test=develop, fix one hot not stop gradient * test=develop, refine parameter name * test=develop, fix transformer test in imperative mode * test=develop, fix transformer test in imperative mode * test=develop, fix boost and mkl download error * test=develop, fix boost and mkl download error * test=develop, fix ci and refine code * test=develop, fix ci and refine code --- paddle/fluid/imperative/layer.cc | 3 + paddle/fluid/imperative/tracer.cc | 1 + python/paddle/dataset/wmt14.py | 2 +- python/paddle/fluid/imperative/base.py | 4 +- .../fluid/imperative/layer_object_helper.py | 1 + python/paddle/fluid/imperative/layers.py | 7 + python/paddle/fluid/imperative/nn.py | 132 +- python/paddle/fluid/layer_helper_base.py | 4 +- python/paddle/fluid/layers/nn.py | 3 +- python/paddle/fluid/optimizer.py | 15 +- .../unittests/test_imperative_transformer.py | 1096 +++++++++++++++++ .../fluid/tests/unittests/test_layers.py | 28 + 12 files changed, 1279 insertions(+), 17 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_transformer.py diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 3d1de95f58d..036d2a50a4a 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -315,6 +315,9 @@ std::map> OpBase::ApplyGrad() { for (size_t i = 0; i < outputs.size(); ++i) { framework::Variable* grad = outputs[i]->var_; framework::Variable* orig_grad = origin_outputs[i]->var_; + VLOG(3) << "AddTo Called with orig_grad is: " + << origin_outputs[i]->name_ << " Grad to be added is " + << outputs[i]->name_; AddTo(grad, orig_grad, place_); delete grad; } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0cfdea030eb..7c9d0af3ecd 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -277,6 +277,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, VarBase* var = current_vars_map[var_it->second]; InitGrad(var, prepared_op.GetDeviceContext()); grad_out_vars.push_back(var->grads_); + VLOG(3) << "grads output var name: " << var->name_; } } } diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index adc0c1aac80..450f159f9d1 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -15,7 +15,7 @@ WMT14 dataset. The original WMT14 dataset is too large and a small set of data for set is provided. This module will download dataset from -http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and +http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and parse training set and test set into paddle reader creators. 
""" diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 174f138bfa2..d619c09b1bd 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -44,7 +44,7 @@ def guard(place=None): yield -def to_variable(value, block=None): +def to_variable(value, block=None, name=None): if isinstance(value, np.ndarray): assert enabled(), "to_variable could only be called in imperative mode" @@ -53,7 +53,7 @@ def to_variable(value, block=None): py_var = framework.Variable( block, type=core.VarDesc.VarType.LOD_TENSOR, - name=None, + name=name, shape=value.shape, dtype=value.dtype) var = py_var._ivar.value() diff --git a/python/paddle/fluid/imperative/layer_object_helper.py b/python/paddle/fluid/imperative/layer_object_helper.py index 6afffe3636d..0dac99a4918 100644 --- a/python/paddle/fluid/imperative/layer_object_helper.py +++ b/python/paddle/fluid/imperative/layer_object_helper.py @@ -105,6 +105,7 @@ class LayerObjectHelper(LayerHelperBase): Returns dtype of the input """ + inputs_in = inputs_in if (inputs_in is not None) else [] inputs = self._multiple_input(inputs_in) dtype = None for each in inputs: diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index a8a1aac8b0c..e64667f7f46 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -17,10 +17,12 @@ import contextlib import sys import numpy as np import collections +import six from .. import unique_name from paddle.fluid import core from .layer_object_helper import LayerObjectHelper from paddle.fluid import framework +from ..param_attr import ParamAttr __all__ = ['Layer', 'PyLayer'] @@ -72,6 +74,10 @@ class Layer(core.Layer): Returns created parameter Variable. """ + if isinstance(attr, ParamAttr) and (attr.name is not None): + attr.name = ".".join([self._full_name, attr.name]) + elif isinstance(attr, six.string_types): + attr = ".".join([self._full_name, attr]) return self._helper.create_parameter(attr, shape, dtype, is_bias, default_initializer) @@ -164,6 +170,7 @@ class Layer(core.Layer): the sublayer passed in. """ assert isinstance(sublayer, core.Layer) + self._sub_layers[name] = sublayer return sublayer diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 604ff753491..9cbfc3e3eb7 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -20,10 +20,12 @@ from .. import core from ..layers import utils from . import layers from ..framework import Variable, OpProtoHolder +from ..layers import layer_function_generator from ..param_attr import ParamAttr from ..initializer import Normal, Constant - -__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit'] +__all__ = [ + 'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm' +] class Conv2D(layers.Layer): @@ -438,7 +440,6 @@ class Embedding(layers.Layer): self._size = size self._is_sparse = is_sparse self._is_distributed = is_distributed - self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( size[0] + padding_idx) @@ -471,6 +472,131 @@ class Embedding(layers.Layer): return out +class LayerNorm(layers.Layer): + def __init__(self, + name_scope, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None): + """ + ${comment} + + The formula is as follows: + + .. 
math:: + + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i + + \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2} + + h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) + + * :math:`a`: the vector representation of the summed inputs to the neurons + in that layer. + + * :math:`H`: the number of hidden units in a layers + + * :math:`g`: the trainable scale parameter. + + * :math:`b`: the trainable bias parameter. + + Args: + input(Variable): The input tensor variable. + scale(bool): Whether to learn the adaptive gain :math:`g` after + normalization. Default True. + shift(bool): Whether to learn the adaptive bias :math:`b` after + normalization. Default True. + begin_norm_axis(int): The normalization will be performed along + dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. + Default 1. + epsilon(float): The small value added to the variance to prevent + division by zero. Default 1e-05. + param_attr(ParamAttr|None): The parameter attribute for the learnable + gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is + omitted. If :attr:`scale` is True and :attr:`param_attr` is None, + a default :code:`ParamAttr` would be added as scale. The + :attr:`param_attr` is initialized as 1 if it is added. Default None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is + omitted. If :attr:`shift` is True and :attr:`param_attr` is None, + a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default None. + act(str): Activation to be applied to the output of layer normalizaiton. + Default None. + Returns: + ${y_comment} + + Examples: + + >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) + """ + + super(LayerNorm, self).__init__(name_scope) + self._scale = scale + self._shift = shift + self._begin_norm_axis = begin_norm_axis + self._epsilon = epsilon + self._param_attr = param_attr + self._bias_attr = bias_attr + self._act = act + + def _build_once(self, input): + self._dtype = self._helper.input_dtype(input) + input_shape = input.shape + param_shape = [ + reduce(lambda x, y: x * y, input_shape[self._begin_norm_axis:]) + ] + if self._scale: + self._scale_w = self.create_parameter( + attr=self._param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + if self._shift: + assert self._bias_attr is not False + self._bias_w = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + inputs = dict() + inputs['X'] = input + if self._scale: + inputs['Scale'] = self._scale_w + if self._shift: + inputs['Bias'] = self._bias_w + # create output + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + layer_norm_out = self._helper.create_variable_for_type_inference( + self._dtype) + + self._helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={ + "epsilon": self._epsilon, + "begin_norm_axis": self._begin_norm_axis + }) + + return self._helper.append_activation(layer_norm_out) + + class GRUUnit(layers.Layer): """ **GRU unit layer** diff --git a/python/paddle/fluid/layer_helper_base.py 
b/python/paddle/fluid/layer_helper_base.py index 3504cb79351..a68160d797b 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -268,11 +268,9 @@ class LayerHelperBase(object): """ # Deepcopy the attr so that parameters can be shared in program attr = copy.deepcopy(attr) - if attr is None: - attr = ParamAttr._to_attr(attr) + attr = ParamAttr._to_attr(attr) if not attr: return None - assert isinstance(attr, ParamAttr) suffix = 'b' if is_bias else 'w' if attr.name is None: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c4e6053fec0..713aedb81e4 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6206,7 +6206,8 @@ def one_hot(input, depth): type="one_hot", inputs={'X': input}, attrs={'depth': depth}, - outputs={'Out': one_hot_out}) + outputs={'Out': one_hot_out}, + stop_gradient=True) return one_hot_out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 8918886a804..505d9572a64 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -397,13 +397,14 @@ class Optimizer(object): for param in parameters: if not param.trainable: continue - # create gradient variable - grad_var = Variable( - block=loss.block, - name=param._ivar._grad_name(), - stop_gradient=True, - ivar=param._ivar._grad_ivar()) - params_grads.append((param, grad_var)) + if param._ivar._grad_ivar() is not None: + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True, + ivar=param._ivar._grad_ivar()) + params_grads.append((param, grad_var)) with program_guard(framework.default_main_program(), framework.default_startup_program()): optimize_ops = self._create_optimization_pass(params_grads) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py new file mode 100644 index 00000000000..b06d3e88940 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py @@ -0,0 +1,1096 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid +from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard +from test_imperative_base import new_program_scope +from paddle.fluid import core +import numpy as np +import six +np.set_printoptions(suppress=True) + + +# Copy from models +class TrainTaskConfig(object): + # support both CPU and GPU now. + use_gpu = True + # the epoch number to train. + pass_num = 30 + # the number of sequences contained in a mini-batch. + # deprecated, set batch_size in args. + batch_size = 32 + # the hyper parameters for Adam optimizer. + # This static learning_rate will be multiplied to the LearningRateScheduler + # derived learning rate the to get the final learning rate. 
+ learning_rate = 2.0 + beta1 = 0.9 + beta2 = 0.997 + eps = 1e-9 + # the parameters for learning rate scheduling. + warmup_steps = 8000 + # the weight used to mix up the ground-truth distribution and the fixed + # uniform distribution in label smoothing when training. + # Set this as zero if label smoothing is not wanted. + label_smooth_eps = 0.1 + # the directory for saving trained models. + model_dir = "trained_models" + # the directory for saving checkpoints. + ckpt_dir = "trained_ckpts" + # the directory for loading checkpoint. + # If provided, continue training from the checkpoint. + ckpt_path = None + # the parameter to initialize the learning rate scheduler. + # It should be provided if use checkpoints, since the checkpoint doesn't + # include the training step counter currently. + start_step = 0 + # the frequency to save trained models. + save_freq = 10000 + + +class InferTaskConfig(object): + use_gpu = True + # the number of examples in one run for sequence generation. + batch_size = 10 + # the parameters for beam search. + beam_size = 5 + max_out_len = 256 + # the number of decoded sentences to output. + n_best = 1 + # the flags indicating whether to output the special tokens. + output_bos = False + output_eos = False + output_unk = True + # the directory for loading the trained model. + model_path = "trained_models/pass_1.infer.model" + + +class ModelHyperParams(object): + # These following five vocabularies related configurations will be set + # automatically according to the passed vocabulary path and special tokens. + # size of source word dictionary. + src_vocab_size = 10000 + # size of target word dictionay + trg_vocab_size = 10000 + # index for token + bos_idx = 0 + # index for token + eos_idx = 1 + # index for token + unk_idx = 2 + # max length of sequences deciding the size of position encoding table. + max_length = 4 + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 2048 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + n_layer = 6 + # dropout rates of different modules. + prepostprocess_dropout = 0.1 + attention_dropout = 0.1 + relu_dropout = 0.1 + # to process before each sub-layer + preprocess_cmd = "n" # layer normalization + # to process after each sub-layer + postprocess_cmd = "da" # dropout + residual connection + # random seed used in dropout for CE. + dropout_seed = 1 + # the flag indicating whether to share embedding and softmax weights. + # vocabularies in source and target should be same for weight sharing. + weight_sharing = True + + +def merge_cfg_from_list(cfg_list, g_cfgs): + """ + Set the above global configurations using the cfg_list. + """ + assert len(cfg_list) % 2 == 0 + for key, value in zip(cfg_list[0::2], cfg_list[1::2]): + for g_cfg in g_cfgs: + if hasattr(g_cfg, key): + try: + value = eval(value) + except Exception: # for file path + pass + setattr(g_cfg, key, value) + break + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. 
+ """ + channels = d_pos_vec + position = np.arange(n_position) + num_timescales = channels // 2 + log_timescale_increment = (np.log(float(1e4) / float(1)) / + (num_timescales - 1)) + inv_timescales = np.exp(np.arange( + num_timescales)) * -log_timescale_increment + scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales, + 0) + signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1) + signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant') + position_enc = signal + return position_enc.astype("float32") + + +def create_data(is_static=False): + if is_static: + return [ + src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np, + trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np, + lbl_weight_np + ] + else: + enc_inputs = [ + to_variable(src_word_np), to_variable(src_pos_np), + to_variable(src_slf_attn_bias_np) + ] + dec_inputs = [ + to_variable(trg_word_np), to_variable(trg_pos_np), + to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np) + ] + label = to_variable(lbl_word_np) + weight = to_variable(lbl_weight_np) + return enc_inputs, dec_inputs, label, weight + + +def create_feed_dict_list(data, init=False): + if init: + data_input_names = encoder_data_input_fields + \ + decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names + else: + data_input_names = encoder_data_input_fields + \ + decoder_data_input_fields[:-1] + label_data_input_fields + feed_dict_list = dict() + for i in range(len(data_input_names)): + feed_dict_list[data_input_names[i]] = data[i] + return feed_dict_list + + +def make_all_inputs(input_fields): + """ + Define the input data layers for the transformer model. + """ + inputs = [] + for input_field in input_fields: + input_var = fluid.layers.data( + name=input_field, + shape=input_descs[input_field][0], + dtype=input_descs[input_field][1], + lod_level=input_descs[input_field][2] + if len(input_descs[input_field]) == 3 else 0, + append_batch_size=False) + inputs.append(input_var) + return inputs + + +# The placeholder for batch_size in compile time. Must be -1 currently to be +# consistent with some ops' infer-shape output in compile time, such as the +# sequence_expand op used in beamsearch decoder. +batch_size = 32 +# The placeholder for squence length in compile time. +seq_len = ModelHyperParams.max_length +# Here list the data shapes and data types of all inputs. +# The shapes here act as placeholder and are set to pass the infer-shape in +# compile time. +input_descs = { + # The actual data shape of src_word is: + # [batch_size, max_src_len_in_batch, 1] + "src_word": [(batch_size, seq_len, 1), "int64", 2], + # The actual data shape of src_pos is: + # [batch_size, max_src_len_in_batch, 1] + "src_pos": [(batch_size, seq_len, 1), "int64"], + # This input is used to remove attention weights on paddings in the + # encoder. + # The actual data shape of src_slf_attn_bias is: + # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch] + "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # The actual data shape of trg_word is: + # [batch_size, max_trg_len_in_batch, 1] + "trg_word": [(batch_size, seq_len, 1), "int64", + 2], # lod_level is only used in fast decoder. + # The actual data shape of trg_pos is: + # [batch_size, max_trg_len_in_batch, 1] + "trg_pos": [(batch_size, seq_len, 1), "int64"], + # This input is used to remove attention weights on paddings and + # subsequent words in the decoder. 
+ # The actual data shape of trg_slf_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch] + "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used to remove attention weights on paddings of the source + # input in the encoder-decoder attention. + # The actual data shape of trg_src_attn_bias is: + # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch] + "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len, + seq_len), "float32"], + # This input is used in independent decoder program for inference. + # The actual data shape of enc_output is: + # [batch_size, max_src_len_in_batch, d_model] + "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"], + # The actual data shape of label_word is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_word": [(batch_size * seq_len, 1), "int64"], + # This input is used to mask out the loss of paddding tokens. + # The actual data shape of label_weight is: + # [batch_size * max_trg_len_in_batch, 1] + "lbl_weight": [(batch_size * seq_len, 1), "float32"], + # This input is used in beam-search decoder. + "init_score": [(batch_size, 1), "float32", 2], + # This input is used in beam-search decoder for the first gather + # (cell states updation) + "init_idx": [(batch_size, ), "int32"], +} + +# Names of word embedding table which might be reused for weight sharing. +word_emb_param_names = ( + "src_word_emb_table", + "trg_word_emb_table", ) +# Names of position encoding table which will be initialized externally. +pos_enc_param_names = ( + "src_pos_enc_table", + "trg_pos_enc_table", ) +# separated inputs for different usages. +encoder_data_input_fields = ( + "src_word", + "src_pos", + "src_slf_attn_bias", ) +decoder_data_input_fields = ( + "trg_word", + "trg_pos", + "trg_slf_attn_bias", + "trg_src_attn_bias", + "enc_output", ) +label_data_input_fields = ( + "lbl_word", + "lbl_weight", ) +# In fast decoder, trg_pos (only containing the current time step) is generated +# by ops and trg_slf_attn_bias is not needed. 
+fast_decoder_data_input_fields = ( + "trg_word", + "init_score", + "init_idx", + "trg_src_attn_bias", ) +# if we use py_reader +use_py_reader = False + +# if we run sync mode +sync = False + +# how many batches we use +batch_num = 2 + +np.random.seed = 1 +src_word_np = np.random.randint( + 1, + ModelHyperParams.src_vocab_size - 1, + size=(batch_size, seq_len, 1), + dtype='int64') +src_pos_np = np.random.randint( + 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') +src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, + seq_len, seq_len).astype('float32') + +trg_word_np = np.random.randint( + 1, + ModelHyperParams.src_vocab_size - 1, + size=(batch_size, seq_len, 1), + dtype='int64') +trg_pos_np = np.random.randint( + 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') +trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, + seq_len, seq_len).astype('float32') +trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, + seq_len, seq_len).astype('float32') + +lbl_word_np = np.random.randint( + 1, + ModelHyperParams.src_vocab_size - 1, + size=(batch_size * seq_len, 1), + dtype='int64') +lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32') + +# np.random.seed = 1 +# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64') +# src_pos_np = np.random.randint( +# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') +# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, +# seq_len, seq_len).astype('float32') +# +# trg_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64') +# trg_pos_np = np.random.randint( +# 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') +# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, +# seq_len, seq_len).astype('float32') +# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, +# seq_len, seq_len).astype('float32') +# +# lbl_word_np = np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64') +# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32') +# +pos_inp1 = position_encoding_init(ModelHyperParams.max_length, + ModelHyperParams.d_model) +pos_inp2 = position_encoding_init(ModelHyperParams.max_length, + ModelHyperParams.d_model) + + +class PrePostProcessLayer(Layer): + def __init__(self, name_scope, process_cmd, shape_len=None): + super(PrePostProcessLayer, self).__init__(name_scope) + for cmd in process_cmd: + if cmd == "n": + self._layer_norm = LayerNorm( + name_scope=self.full_name(), + begin_norm_axis=shape_len - 1, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(0.))) + + def forward(self, prev_out, out, process_cmd, dropout_rate=0.): + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = self._layer_norm(out) + elif cmd == "d": # add dropout + if dropout_rate: + out = fluid.layers.dropout( + out, + dropout_prob=dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + return out + + +class PositionwiseFeedForwardLayer(Layer): + def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate): + super(PositionwiseFeedForwardLayer, self).__init__(name_scope) + self._i2h = FC(name_scope=self.full_name(), + size=d_inner_hid, + num_flatten_dims=2, + act="relu") + self._h2o = 
FC(name_scope=self.full_name(), + size=d_hid, + num_flatten_dims=2) + self._dropout_rate = dropout_rate + + def forward(self, x): + hidden = self._i2h(x) + if self._dropout_rate: + hidden = fluid.layers.dropout( + hidden, + dropout_prob=self._dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + out = self._h2o(hidden) + return out + + +class MultiHeadAttentionLayer(Layer): + def __init__(self, + name_scope, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0., + cache=None, + gather_idx=None, + static_kv=False): + super(MultiHeadAttentionLayer, self).__init__(name_scope) + self._n_head = n_head + self._d_key = d_key + self._d_value = d_value + self._d_model = d_model + self._dropout_rate = dropout_rate + self._q_fc = FC(name_scope=self.full_name(), + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + self._k_fc = FC(name_scope=self.full_name(), + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + self._v_fc = FC(name_scope=self.full_name(), + size=d_value * n_head, + bias_attr=False, + num_flatten_dims=2) + self._proj_fc = FC(name_scope=self.full_name(), + size=self._d_model, + bias_attr=False, + num_flatten_dims=2) + + def forward(self, queries, keys, values, attn_bias): + # compute q ,k ,v + keys = queries if keys is None else keys + values = keys if values is None else values + + q = self._q_fc(queries) + k = self._k_fc(keys) + v = self._v_fc(values) + + # split head + reshaped_q = fluid.layers.reshape( + x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False) + transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + reshaped_k = fluid.layers.reshape( + x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False) + transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) + reshaped_v = fluid.layers.reshape( + x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False) + transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) + + #scale dot product attention + product = fluid.layers.matmul( + x=transpose_q, + y=transpose_k, + transpose_y=True, + alpha=self._d_model**-0.5) + if attn_bias: + product += attn_bias + weights = fluid.layers.softmax(product) + if self._dropout_rate: + weights_droped = fluid.layers.dropout( + weights, + dropout_prob=self._dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) + out = fluid.layers.matmul(weights_droped, transpose_v) + else: + out = fluid.layers.matmul(weights, transpose_v) + + # combine heads + if len(out.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) + final_out = fluid.layers.reshape( + x=trans_x, + shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], + inplace=False) + + # fc to output + proj_out = self._proj_fc(final_out) + return proj_out + + +class EncoderSubLayer(Layer): + def __init__(self, + name_scope, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderSubLayer, self).__init__(name_scope) + self._preprocess_cmd = preprocess_cmd + self._postprocess_cmd = postprocess_cmd + self._prepostprocess_dropout = prepostprocess_dropout + + self._preprocess_layer = PrePostProcessLayer(self.full_name(), + self._preprocess_cmd, 3) + self._multihead_attention_layer = MultiHeadAttentionLayer( + self.full_name(), d_key, d_value, d_model, n_head, + attention_dropout) + self._postprocess_layer = PrePostProcessLayer( + self.full_name(), 
self._postprocess_cmd, None) + self._preprocess_layer2 = PrePostProcessLayer(self.full_name(), + self._preprocess_cmd, 3) + self._positionwise_feed_forward = PositionwiseFeedForwardLayer( + self.full_name(), d_inner_hid, d_model, relu_dropout) + self._postprocess_layer2 = PrePostProcessLayer( + self.full_name(), self._postprocess_cmd, None) + + def forward(self, enc_input, attn_bias): + pre_process_multihead = self._preprocess_layer( + None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout) + attn_output = self._multihead_attention_layer(pre_process_multihead, + None, None, attn_bias) + attn_output = self._postprocess_layer(enc_input, attn_output, + self._postprocess_cmd, + self._prepostprocess_dropout) + pre_process2_output = self._preprocess_layer2( + None, attn_output, self._preprocess_cmd, + self._prepostprocess_dropout) + ffd_output = self._positionwise_feed_forward(pre_process2_output) + return self._postprocess_layer2(attn_output, ffd_output, + self._postprocess_cmd, + self._prepostprocess_dropout) + + +class EncoderLayer(Layer): + def __init__(self, + name_scope, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd="n", + postprocess_cmd="da"): + + super(EncoderLayer, self).__init__(name_scope) + self._preprocess_cmd = preprocess_cmd + self._encoder_sublayers = list() + self._prepostprocess_dropout = prepostprocess_dropout + self._n_layer = n_layer + self._preprocess_layer = PrePostProcessLayer(self.full_name(), + self._preprocess_cmd, 3) + for i in range(n_layer): + self._encoder_sublayers.append( + self.add_sublayer( + 'esl_%d' % i, + EncoderSubLayer( + self.full_name(), n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd))) + + def forward(self, enc_input, attn_bias): + for i in range(self._n_layer): + enc_output = self._encoder_sublayers[i](enc_input, attn_bias) + enc_input = enc_output + + return self._preprocess_layer(None, enc_output, self._preprocess_cmd, + self._prepostprocess_dropout) + + +class PrepareEncoderDecoderLayer(Layer): + def __init__(self, + name_scope, + src_vocab_size, + src_emb_dim, + src_max_len, + dropout_rate, + word_emb_param_name=None, + pos_enc_param_name=None): + super(PrepareEncoderDecoderLayer, self).__init__(name_scope) + self._src_max_len = src_max_len + self._src_emb_dim = src_emb_dim + self._src_vocab_size = src_vocab_size + self._dropout_rate = dropout_rate + self._input_emb = Embedding( + name_scope=self.full_name(), + size=[src_vocab_size, src_emb_dim], + padding_idx=0, + param_attr=fluid.ParamAttr( + name=word_emb_param_name, + initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5))) + + if pos_enc_param_name is pos_enc_param_names[0]: + pos_inp = pos_inp1 + else: + pos_inp = pos_inp2 + self._pos_emb = Embedding( + name_scope=self.full_name(), + size=[self._src_max_len, src_emb_dim], + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, + initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), + trainable=False)) + + # use in imperative_mode to fit different length batch + # self._pos_emb._w = to_variable( + # position_encoding_init(self._src_max_len, self._src_emb_dim)) + + def forward(self, src_word, src_pos): + src_word_emb = self._input_emb(src_word) + src_word_emb = fluid.layers.scale( + x=src_word_emb, scale=self._src_emb_dim**0.5) + # # TODO change this to fit dynamic length input + src_pos_emb = self._pos_emb(src_pos) + 
src_pos_emb.stop_gradient = True + enc_input = src_word_emb + src_pos_emb + return fluid.layers.dropout( + enc_input, + dropout_prob=self._dropout_rate, + seed=ModelHyperParams.dropout_seed, + is_test=False) if self._dropout_rate else enc_input + + +class WrapEncoderLayer(Layer): + def __init__(self, name_cope, src_vocab_size, max_length, n_layer, n_head, + d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, + postprocess_cmd, weight_sharing): + """ + The wrapper assembles together all needed layers for the encoder. + """ + super(WrapEncoderLayer, self).__init__(name_cope) + + self._prepare_encoder_layer = PrepareEncoderDecoderLayer( + self.full_name(), + src_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + word_emb_param_name=word_emb_param_names[0], + pos_enc_param_name=pos_enc_param_names[0]) + self._encoder = EncoderLayer( + self.full_name(), n_layer, n_head, d_key, d_value, d_model, + d_inner_hid, prepostprocess_dropout, attention_dropout, + relu_dropout, preprocess_cmd, postprocess_cmd) + + def forward(self, enc_inputs): + src_word, src_pos, src_slf_attn_bias = enc_inputs + enc_input = self._prepare_encoder_layer(src_word, src_pos) + enc_output = self._encoder(enc_input, src_slf_attn_bias) + return enc_output + + +class DecoderSubLayer(Layer): + def __init__(self, + name_scope, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None, + gather_idx=None): + super(DecoderSubLayer, self).__init__(name_scope) + self._postprocess_cmd = postprocess_cmd + self._preprocess_cmd = preprocess_cmd + self._prepostprcess_dropout = prepostprocess_dropout + self._pre_process_layer = PrePostProcessLayer(self.full_name(), + preprocess_cmd, 3) + self._multihead_attention_layer = MultiHeadAttentionLayer( + self.full_name(), + d_key, + d_value, + d_model, + n_head, + attention_dropout, + cache=cache, + gather_idx=gather_idx) + self._post_process_layer = PrePostProcessLayer(self.full_name(), + postprocess_cmd, None) + self._pre_process_layer2 = PrePostProcessLayer(self.full_name(), + preprocess_cmd, 3) + self._multihead_attention_layer2 = MultiHeadAttentionLayer( + self.full_name(), + d_key, + d_value, + d_model, + n_head, + attention_dropout, + cache=cache, + gather_idx=gather_idx, + static_kv=True) + self._post_process_layer2 = PrePostProcessLayer(self.full_name(), + postprocess_cmd, None) + self._pre_process_layer3 = PrePostProcessLayer(self.full_name(), + preprocess_cmd, 3) + self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer( + self.full_name(), d_inner_hid, d_model, relu_dropout) + self._post_process_layer3 = PrePostProcessLayer(self.full_name(), + postprocess_cmd, None) + + def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias): + pre_process_rlt = self._pre_process_layer( + None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout) + slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None, + None, slf_attn_bias) + slf_attn_output_pp = self._post_process_layer( + dec_input, slf_attn_output, self._postprocess_cmd, + self._prepostprcess_dropout) + pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp, + self._preprocess_cmd, + self._prepostprcess_dropout) + enc_attn_output_pp = self._multihead_attention_layer2( + pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias) + enc_attn_output = self._post_process_layer2( + slf_attn_output, 
enc_attn_output_pp, self._postprocess_cmd, + self._prepostprcess_dropout) + pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output, + self._preprocess_cmd, + self._prepostprcess_dropout) + ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3) + dec_output = self._post_process_layer3(enc_attn_output, ffd_output, + self._postprocess_cmd, + self._prepostprcess_dropout) + return dec_output + + +class DecoderLayer(Layer): + def __init__(self, + name_scope, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=None, + gather_idx=None): + super(DecoderLayer, self).__init__(name_scope) + self._pre_process_layer = PrePostProcessLayer(self.full_name(), + preprocess_cmd, 3) + self._decoder_sub_layers = list() + self._n_layer = n_layer + self._preprocess_cmd = preprocess_cmd + self._prepostprocess_dropout = prepostprocess_dropout + for i in range(n_layer): + self._decoder_sub_layers.append( + self.add_sublayer( + 'dsl_%d' % i, + DecoderSubLayer( + self.full_name(), + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + cache=None if caches is None else caches[i], + gather_idx=gather_idx))) + + def forward(self, dec_input, enc_output, dec_slf_attn_bias, + dec_enc_attn_bias): + for i in range(self._n_layer): + tmp_dec_output = self._decoder_sub_layers[i]( + dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias) + dec_input = tmp_dec_output + + dec_output = self._pre_process_layer(None, tmp_dec_output, + self._preprocess_cmd, + self._prepostprocess_dropout) + return dec_output + + +class WrapDecoderLayer(Layer): + def __init__(self, + name_scope, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + caches=None, + gather_idx=None): + """ + The wrapper assembles together all needed layers for the encoder. + """ + super(WrapDecoderLayer, self).__init__(name_scope) + + self._prepare_decoder_layer = PrepareEncoderDecoderLayer( + self.full_name(), + trg_vocab_size, + d_model, + max_length, + prepostprocess_dropout, + word_emb_param_name=word_emb_param_names[1], + pos_enc_param_name=pos_enc_param_names[1]) + self._decoder_layer = DecoderLayer( + self.full_name(), + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + caches=caches, + gather_idx=gather_idx) + self._weight_sharing = weight_sharing + if not weight_sharing: + self._fc = FC(self.full_name(), + size=trg_vocab_size, + bias_attr=False) + + def forward(self, dec_inputs=None, enc_output=None): + trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs + dec_input = self._prepare_decoder_layer(trg_word, trg_pos) + dec_output = self._decoder_layer(dec_input, enc_output, + trg_slf_attn_bias, trg_src_attn_bias) + + dec_output_reshape = fluid.layers.reshape( + dec_output, shape=[-1, dec_output.shape[-1]], inplace=False) + + if self._weight_sharing: + predict = fluid.layers.matmul( + x=dec_output_reshape, + y=self._prepare_decoder_layer._input_emb._w, + transpose_y=True) + else: + predict = self._fc(dec_output_reshape) + + if dec_inputs is None: + # Return probs for independent decoder program. 
+ predict_out = fluid.layers.softmax(predict) + return predict_out + return predict + + +class TransFormer(Layer): + def __init__(self, + name_scope, + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + label_smooth_eps, + use_py_reader=False, + is_test=False): + super(TransFormer, self).__init__(name_scope) + self._label_smooth_eps = label_smooth_eps + self._trg_vocab_size = trg_vocab_size + if weight_sharing: + assert src_vocab_size == trg_vocab_size, ( + "Vocabularies in source and target should be same for weight sharing." + ) + self._wrap_encoder_layer = WrapEncoderLayer( + self.full_name(), src_vocab_size, max_length, n_layer, n_head, + d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, + weight_sharing) + self._wrap_decoder_layer = WrapDecoderLayer( + self.full_name(), trg_vocab_size, max_length, n_layer, n_head, + d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, + attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, + weight_sharing) + + if weight_sharing: + self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w + + def forward(self, enc_inputs, dec_inputs, label, weights): + enc_output = self._wrap_encoder_layer(enc_inputs) + predict = self._wrap_decoder_layer(dec_inputs, enc_output) + if self._label_smooth_eps: + label_out = fluid.layers.label_smooth( + label=fluid.layers.one_hot( + input=label, depth=self._trg_vocab_size), + epsilon=self._label_smooth_eps) + + cost = fluid.layers.softmax_with_cross_entropy( + logits=predict, + label=label_out, + soft_label=True if self._label_smooth_eps else False) + weighted_cost = cost * weights + sum_cost = fluid.layers.reduce_sum(weighted_cost) + token_num = fluid.layers.reduce_sum(weights) + token_num.stop_gradient = True + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num + + +class TestImperativeTransformer(unittest.TestCase): + def test_transformer_float32(self): + seed = 90 + with guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + transformer = TransFormer( + 'transformer', + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, + ModelHyperParams.n_head, + ModelHyperParams.d_key, + ModelHyperParams.d_value, + ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, + ModelHyperParams.prepostprocess_dropout, + ModelHyperParams.attention_dropout, + ModelHyperParams.relu_dropout, + ModelHyperParams.preprocess_cmd, + ModelHyperParams.postprocess_cmd, + ModelHyperParams.weight_sharing, + TrainTaskConfig.label_smooth_eps, + use_py_reader=use_py_reader, + is_test=False) + if sync: + lr_decay = fluid.layers.learning_rate_scheduler.noam_decay( + ModelHyperParams.d_model, TrainTaskConfig.warmup_steps) + with fluid.default_main_program()._lr_schedule_guard(): + learning_rate = lr_decay * TrainTaskConfig.learning_rate + optimizer = fluid.optimizer.Adam( + learning_rate=learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + else: + optimizer = fluid.optimizer.SGD(learning_rate=0.003) + dy_param_init = dict() + dy_param_updated = dict() + for i in range(batch_num): + 
enc_inputs, dec_inputs, label, weights = create_data() + dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( + enc_inputs, dec_inputs, label, weights) + if i == 0: + for param in transformer.parameters(): + dy_param_init[param.name] = param._numpy() + + dy_avg_cost._backward() + optimizer.minimize(dy_avg_cost) + transformer.clear_gradients() + if i == batch_num - 1: + for param in transformer.parameters(): + dy_param_updated[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + transformer = TransFormer( + 'transformer', + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, + ModelHyperParams.n_head, + ModelHyperParams.d_key, + ModelHyperParams.d_value, + ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, + ModelHyperParams.prepostprocess_dropout, + ModelHyperParams.attention_dropout, + ModelHyperParams.relu_dropout, + ModelHyperParams.preprocess_cmd, + ModelHyperParams.postprocess_cmd, + ModelHyperParams.weight_sharing, + TrainTaskConfig.label_smooth_eps, + use_py_reader=use_py_reader, + is_test=False) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + optimizer = fluid.optimizer.SGD(learning_rate=0.003) + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + all_inputs = make_all_inputs(data_input_names) + enc_inputs_len = len(encoder_data_input_fields) + dec_inputs_len = len(decoder_data_input_fields[:-1]) + enc_inputs = all_inputs[0:enc_inputs_len] + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + + dec_inputs_len] + label = all_inputs[-2] + weights = all_inputs[-1] + static_param_updated = dict() + static_param_init = dict() + static_param_name_list = list() + static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer( + enc_inputs, dec_inputs, label, weights) + + optimizer.minimize(static_avg_cost) + for param in transformer.parameters(): + static_param_name_list.append(param.name) + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + for i in range(len(static_param_name_list)): + static_param_init[static_param_name_list[i]] = out[i] + static_sum_cost_value = None + static_avg_cost_value = None + static_predict_value = None + static_token_num_value = None + for i in range(batch_num): + feed_dict = create_feed_dict_list(create_data(True)) + fetch_list = [ + static_sum_cost, static_avg_cost, static_predict, + static_token_num + ] + fetch_list.extend(static_param_name_list) + + out = exe.run(fluid.default_main_program(), + feed=feed_dict, + fetch_list=fetch_list) + static_sum_cost_value = out[0] + static_avg_cost_value = out[1] + static_predict_value = out[2] + static_token_num_value = out[3] + if i == batch_num - 1: + for k in range(4, len(out)): + static_param_updated[static_param_name_list[k - + 4]] = out[k] + + self.assertTrue( + np.allclose(static_avg_cost_value, dy_avg_cost._numpy())) + self.assertTrue( + np.allclose(static_sum_cost_value, dy_sum_cost._numpy())) + self.assertTrue( + np.allclose( + static_predict_value, dy_predict._numpy(), atol=1e-5)) + self.assertTrue( + np.allclose(static_token_num_value, dy_token_num._numpy())) + for key, value in six.iteritems(static_param_init): + self.assertTrue(np.allclose(value, dy_param_init[key])) + for key, value in six.iteritems(static_param_updated): + 
self.assertTrue( + np.allclose( + value, dy_param_updated[key], atol=1e-4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f343ed4e87e..1429d24107e 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -70,6 +70,34 @@ class LayerTest(unittest.TestCase): class TestLayer(LayerTest): + def test_layer_norm(self): + inp = np.ones([3, 32, 32], dtype='float32') + with self.static_graph(): + t = layers.data( + name='data', + shape=[3, 32, 32], + dtype='float32', + append_batch_size=False) + ret = layers.layer_norm(t) + static_ret = self.get_static_graph_result( + feed={'data': inp}, fetch_list=[ret])[0] + with self.static_graph(): + t = layers.data( + name='data', + shape=[3, 32, 32], + dtype='float32', + append_batch_size=False) + lm = nn.LayerNorm('layer_norm') + ret = lm(t) + static_ret2 = self.get_static_graph_result( + feed={'data': inp}, fetch_list=[ret])[0] + with self.dynamic_graph(): + lm = nn.LayerNorm('layer_norm') + dy_ret = lm(base.to_variable(inp)) + + self.assertTrue(np.allclose(static_ret, static_ret2)) + self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2)) + def test_relu(self): with self.static_graph(): t = layers.data(name='t', shape=[3, 3], dtype='float32') -- GitLab
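
For reference, below is a minimal dygraph usage sketch of the `LayerNorm` layer this patch adds to `paddle.fluid.imperative.nn`, following the same pattern as the new `test_layer_norm` case in `test_layers.py`. This is not part of the patch; it assumes a 2019-era Paddle build that already contains this change (where `LayerNorm`, `guard`, and `to_variable` are importable from `paddle.fluid.imperative`, as in the new transformer test).

```python
# Illustrative sketch only (not part of the patch): exercise the imperative
# LayerNorm added above, mirroring the dynamic-graph branch of test_layer_norm.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.imperative import LayerNorm, guard, to_variable

inp = np.ones([3, 32, 32], dtype='float32')

with guard():  # enter imperative (dygraph) mode
    # Normalize over all dimensions from begin_norm_axis onward; the learnable
    # scale and bias parameters are created lazily in _build_once on first call.
    ln = LayerNorm('layer_norm', begin_norm_axis=1)
    dy_out = ln(to_variable(inp))
    print(dy_out._numpy().shape)  # expected: (3, 32, 32)
```

The unit test in `test_layers.py` asserts that this dygraph result matches the static-graph `fluid.layers.layer_norm` output via `np.allclose`, which is the parity check this sketch is modeled on.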