diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst index 6e813ab1a820d068ea3e54cad6178f1cf928eadc..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644 --- a/doc/api/v2/config/networks.rst +++ b/doc/api/v2/config/networks.rst @@ -125,3 +125,8 @@ simple_attention :members: simple_attention :noindex: +dot_product_attention +--------------------- +.. automodule:: paddle.v2.networks + :members: dot_product_attention + :noindex: diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index dcf0acb5a2cc7698625a2e254d4c12a32bc9631d..dbadc352a4ccd7483bf67e1025c212f514e32a24 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -21,6 +21,10 @@ limitations under the License. */ #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" +#ifdef PADDLE_USE_MKLDNN +#include "paddle/gserver/layers/MKLDNNLayer.h" +#endif + #ifndef PADDLE_MOBILE_INFERENCE #include "MultiNetwork.h" #include "RecurrentGradientMachine.h" @@ -300,6 +304,17 @@ void NeuralNetwork::backward(const UpdateCallback& callback) { } } +void NeuralNetwork::finish() { +#ifdef PADDLE_USE_MKLDNN + FOR_EACH_R(layer, layers_) { + MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); + if (dnnLayer) { + dnnLayer->convertWeightsToPaddle(); + } + } +#endif +} + Argument NeuralNetwork::getLayerOutput(const std::string& layerName) { return getLayer(layerName)->getOutput(); } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h index 56a1ec78460731554c9b47cf3f517f7654dc314f..16971883b42786e2cb48faafd2a7e95d45075ac8 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ b/paddle/gserver/gradientmachines/NeuralNetwork.h @@ -134,6 +134,9 @@ public: const std::string& getName() const { return subModelName_; } + /// some finish work, like convert the weight format of MKLDNNLayers + void finish() override; + protected: /** * The constructor of NeuralNetwork. diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 8b67a1ef4ffdd42559f8078873ed135751d56674..26810a648343d6203f7937740641325ae8ea6879 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -313,6 +313,7 @@ void MKLDNNConvLayer::resetOutValue( cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); CHECK(cvtOutVal_) << "should not be empty"; } else { + cpuOut->setData(output_.value->getData()); cpuOutVal_ = out; } // when output is cpu device, change the mkldnn output value and make them @@ -456,17 +457,18 @@ void MKLDNNConvLayer::resetOutGrad( MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); } else { const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; + // always share the same grad data of CPU output + // then the activation can get the right grad from output_.grad + output_.grad->setData(cpuOut->getData()); // same PrimitiveDesc with cpuInVal_ CHECK(cpuOutVal_); cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc()); // create reorder if primitive desc does not match if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc()); + out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); CHECK(cvtOutGrad_); } else { - // share the same data of CPU output - output_.grad->setData(cpuOut->getData()); out = cpuOutGrad_; } } diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 5f9923da769781287e39a3aaaf92248dfe09f225..4e2753eba2350d2c3df81b57fe98270a3c38cb24 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -46,6 +46,9 @@ protected: // backward also need reset after reset forward handle bool needResetBwd_; + // is output only mkldnn + bool outputOnlyMKLDNN_; + // mkldnn engine, stream and primivtives mkldnn::engine engine_; std::shared_ptr stream_; @@ -141,6 +144,9 @@ public: updateInputData(); } + if (!outputOnlyMKLDNN_) { + clearGrads(); + } stream_->submit(pipelineFwd_); } @@ -389,7 +395,8 @@ protected: CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) << "Only support other device is CPU yet"; } - return outputOtherDevice_.size() == 0; + outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; + return outputOnlyMKLDNN_; } /** @@ -398,6 +405,16 @@ protected: void setDevice(int id) { deviceId_ = id; } private: + /** + * clear all grad + */ + void clearGrads() { + output_.grad->zeroMem(); + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].grad->zeroMem(); + } + } + /** * Set deviceId of the params used in this layer. */ diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp index 5606aae80ce8e9a1e571d3c057c471b26a59d032..0e53e2d1b7e6691909955eeacd345981a9960ec6 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp +++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp @@ -146,6 +146,7 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) { cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); CHECK(cvtOutVal_) << "should not be emptry"; } else { + cpuOut->setData(output_.value->getData()); cpuOutVal_ = out; } output_.value = std::dynamic_pointer_cast(cpuOutVal_); @@ -213,15 +214,16 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) { MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); } else { const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; + // always share the same grad data of CPU output + // then the activation can get the right grad from output_.grad + output_.grad->setData(cpuOut->getData()); cpuOutGrad_ = MKLDNNMatrix::create( cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_); if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc()); + out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); CHECK(cvtOutGrad_) << "should not be emptry"; } else { - // share the same data of CPU output - output_.grad->setData(cpuOut->getData()); out = cpuOutGrad_; } } diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index fcee19415c13e9731bd47eb53bbff9b52cf6450b..329536afaf6d69676e8c39fdf8b6b8cb87ade5fa 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -26,7 +26,10 @@ if(WITH_MKLDNN) test_MKLDNN.cpp MKLDNNTester.cpp LayerGradUtil.cpp) - add_test(NAME test_MKLDNN COMMAND test_MKLDNN) + add_test(NAME test_MKLDNN + COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python + ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() ################ test_CRFLayerGrad #################### diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp index eaebdd671cfa1b37e5efe149588ca23fdc402a8e..3bf6a9e176cc1235aa5ddefcedd4253e6afc1342 100644 --- a/paddle/gserver/tests/MKLDNNTester.cpp +++ b/paddle/gserver/tests/MKLDNNTester.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #include "MKLDNNTester.h" #include "paddle/gserver/layers/MKLDNNBase.h" #include "paddle/gserver/layers/MKLDNNLayer.h" +#include "paddle/trainer/Trainer.h" namespace paddle { @@ -315,6 +316,7 @@ void MKLDNNTester::runOnce() { auto& value = para->getBuf(PARAMETER_VALUE); real lr = 1e-3; value->add(*grad, lr); + grad->zeroMem(); }; randomTopDiffs(); dnnLayer_->backward(updateCallback); @@ -411,4 +413,143 @@ void MKLDNNTester::run(const TestConfig& dnn, } } +void MKLDNNTester::initArgument(DataIn& data, + const std::string& configPath, + const size_t iter) { + TrainerConfigHelper config(configPath); + size_t batchSize = config.getOptConfig().batch_size(); + data.inArgs.resize(iter); + data.outGrads.resize(iter); + data.paraValues.clear(); + for (const auto& layer_name : config.getModelConfig().input_layer_names()) { + auto layer_config = std::find_if(config.getModelConfig().layers().begin(), + config.getModelConfig().layers().end(), + [=](const LayerConfig& layer_config) { + return layer_config.name() == layer_name; + }); + CHECK(layer_config != config.getModelConfig().layers().end()); + + size_t layerSize = layer_config->size(); + for (size_t i = 0; i < iter; ++i) { + Argument arg; + arg.value = Matrix::create(batchSize, layerSize, false, false); + arg.grad = Matrix::create(batchSize, layerSize, false, false); + arg.value->randomizeUniform(); + arg.value->add(-0.5); + arg.value->sigmoid(*arg.value); + arg.grad->zeroMem(); + arg.ids = VectorT::create(batchSize, false); + arg.ids->rand(layerSize); + generateSequenceStartPositions(batchSize, arg.sequenceStartPositions); + data.inArgs[i].push_back(arg); + } + } + + for (const auto& layer_name : config.getModelConfig().output_layer_names()) { + auto layer_config = std::find_if(config.getModelConfig().layers().begin(), + config.getModelConfig().layers().end(), + [=](const LayerConfig& layer_config) { + return layer_config.name() == layer_name; + }); + CHECK(layer_config != config.getModelConfig().layers().end()); + + size_t layerSize = layer_config->size(); + for (size_t i = 0; i < iter; ++i) { + MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false); + grad->randomizeUniform(); + data.outGrads[i].push_back(grad); + } + } + + for (const auto& para_config : config.getModelConfig().parameters()) { + VectorPtr value = Vector::create(para_config.size(), false); + value->randnorm(0, 2); + data.paraValues.push_back(value); + } +} + +void MKLDNNTester::getOutResult(const std::string& configPath, + DataIn& in, + DataOut& out, + bool use_mkldnn, + size_t iter) { + FLAGS_use_gpu = false; + FLAGS_use_mkldnn = use_mkldnn; + *ThreadLocalRand::getSeed() = 1; + srand(1); + + Trainer trainer; + auto config = std::make_shared(configPath); + trainer.init(config, false); + auto gradientMachine = trainer.getGradientMachine(); + std::vector parameters = gradientMachine->getParameters(); + for (size_t i = 0; i < in.paraValues.size(); i++) { + parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]); + } + UpdateCallback simpleUpdate = [](Parameter* para) { + auto& grad = para->getBuf(PARAMETER_GRADIENT); + auto& value = para->getBuf(PARAMETER_VALUE); + real lr = 1e-2; + value->add(*grad, lr); + grad->zeroMem(); + }; + + vector outArgs; + gradientMachine->start(); + out.outValues.clear(); + out.paraValues.clear(); + for (size_t i = 0; i < iter; ++i) { + VLOG(MKLDNN_TESTS) << "runing iteration " << i; + gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN); + // save forward result + for (size_t k = 0; k < outArgs.size(); k++) { + MatrixPtr value = Matrix::create(outArgs[k].value->getHeight(), + outArgs[k].value->getWidth(), + false, + false); + value->copyFrom(*outArgs[k].value); + out.outValues.push_back(value); + } + + // random backward input + for (size_t k = 0; k < outArgs.size(); k++) { + outArgs[k].grad->copyFrom(*in.outGrads[i][k]); + } + gradientMachine->backward(simpleUpdate); + } + gradientMachine->finish(); + + // save param value + for (size_t i = 0; i < in.paraValues.size(); i++) { + VectorPtr val = Vector::create( + parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false); + val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); + out.paraValues.push_back(val); + } +} + +void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) { + CHECK_EQ(ref.outValues.size(), dnn.outValues.size()); + CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size()); + for (size_t i = 0; i < ref.outValues.size(); i++) { + EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps); + } + for (size_t i = 0; i < ref.paraValues.size(); i++) { + EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps); + } +} + +void MKLDNNTester::runBranchesTest(const std::string& configPath, + size_t iter, + float eps) { + DataIn in; + initArgument(in, configPath, iter); + + DataOut outCpu, outDnn; + getOutResult(configPath, in, outCpu, false, iter); + getOutResult(configPath, in, outDnn, true, iter); + + compareResult(outCpu, outDnn, eps); +} + } // namespace paddle diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h index 171d176ee757f1164c38d86273bdf9e5aefeda06..51abfcb67e2ec35fe1b0179e742a7d18f08f8a2c 100644 --- a/paddle/gserver/tests/MKLDNNTester.h +++ b/paddle/gserver/tests/MKLDNNTester.h @@ -33,6 +33,17 @@ class MKLDNNTester { NUM = 2, // Number of total }; + struct DataIn { + std::vector> inArgs; + std::vector> outGrads; + std::vector paraValues; + }; + + struct DataOut { + std::vector outValues; + std::vector paraValues; + }; + protected: std::vector configs_; vector layerNames_; @@ -74,7 +85,17 @@ public: float epsilon = 1e-4, bool log = false, int level = MKLDNN_ALL); - void setLogLevel(int lvl) { lvl_ = lvl; } + static void runBranchesTest(const std::string& configPath, + size_t iter = 3, + float eps = 1e-4); + static void initArgument(DataIn& data, + const std::string& configPath, + size_t iter = 3); + static void getOutResult(const std::string& configPath, + DataIn& in, + DataOut& out, + bool use_mkldnn, + size_t iter = 3); private: void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize); @@ -101,8 +122,9 @@ private: void saveWgt(const vector& from, vector& to); void restoreWgt(const vector& from, vector& to); - double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2); - double compareVector(const VectorPtr& v1, const VectorPtr& v2); + static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2); + static double compareVector(const VectorPtr& v1, const VectorPtr& v2); + static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4); /** * Get delta percent @@ -111,11 +133,11 @@ private: * else return sum(abs(a-b)) / sum(abs(b)) * The return value should be smaller than eps when passing. */ - double getDelta(const real* d1, - const real* d2, - size_t len, - const float failRate = 1e-3, - const float thres = 0.1); + static double getDelta(const real* d1, + const real* d2, + size_t len, + const float failRate = 1e-3, + const float thres = 0.1); }; } // namespace paddle diff --git a/paddle/gserver/tests/mkldnn_branches_conv.conf b/paddle/gserver/tests/mkldnn_branches_conv.conf new file mode 100644 index 0000000000000000000000000000000000000000..2628509db43e6a5f69a4f5ea956bffdc2837e32a --- /dev/null +++ b/paddle/gserver/tests/mkldnn_branches_conv.conf @@ -0,0 +1,56 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=16) +channels = get_config_arg("channels", int, 2) + +def two_conv(input, group_name): + out1 = img_conv_layer(input=input, + name=group_name+'_conv1', + filter_size=1, + num_filters=channels, + padding=0, + shared_biases=True, + act=ReluActivation()) + + out2 = img_conv_layer(input=input, + name=group_name+'_conv2', + filter_size=3, + num_filters=channels, + padding=1, + shared_biases=True, + act=ReluActivation()) + return out1, out2 + +data = data_layer(name ="input", size=channels*16*16) + +conv = img_conv_layer(input=data, + num_channels=channels, + filter_size=3, + num_filters=channels, + padding=1, + shared_biases=True, + act=ReluActivation()) + +a1, a2 = two_conv(input=conv, group_name='a') + +concat = concat_layer(input=[a1, a2]) + +b1, b2 = two_conv(input=conv, group_name='b') + +addto = addto_layer(input=[b1, b2]) + +outputs([concat, addto]) diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index a70b2f17f4f1130322f3c50d244f70fdcf34468b..3571fbb9e335fc6652bdbfc3f9e35beabda5044f 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include #include "MKLDNNTester.h" @@ -40,12 +41,13 @@ DECLARE_bool(use_mkldnn); struct testFcDesc { int bs; int ic; - int oc; int ih, iw; // oh == ow == 1 + int oc; }; static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) { cfg.layerConfig.set_type("mkldnn_fc"); + cfg.layerConfig.set_active_type("relu"); cfg.layerConfig.set_size(pm.oc); cfg.inputDefs.push_back( {INPUT_DATA, @@ -86,6 +88,7 @@ struct testConvDesc { static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) { cfg.layerConfig.set_type("mkldnn_conv"); + cfg.layerConfig.set_active_type("relu"); cfg.layerConfig.set_num_filters(pm.oc); cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow); cfg.layerConfig.set_shared_biases(true); @@ -158,6 +161,7 @@ struct testPoolDesc { static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) { cfg.layerConfig.set_type("mkldnn_pool"); + cfg.layerConfig.set_active_type("relu"); cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow); cfg.inputDefs.push_back( {INPUT_DATA, @@ -244,13 +248,26 @@ TEST(MKLDNNActivation, Activations) { } } -// TODO(TJ): add branch test +DECLARE_string(config_args); +TEST(MKLDNNLayer, branches) { + std::vector cases = {"conv"}; + for (auto name : cases) { + std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf"; + for (auto channels : {2, 32}) { + std::ostringstream oss; + oss << "channels=" << channels; + FLAGS_config_args = oss.str(); + MKLDNNTester::runBranchesTest(config); + } + } +} int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); FLAGS_use_gpu = false; FLAGS_use_mkldnn = true; initMain(argc, argv); + initPython(argc, argv); FLAGS_thread_local_rand_use_global_seed = true; srand(1); return RUN_ALL_TESTS(); diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 93e8ac173e721d9623fce91f30ac4642d273caba..120c9d11a5ebaa72b94590e596fd4362c552f979 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -26,8 +26,9 @@ __all__ = [ 'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', - 'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool', - 'bidirectional_lstm', 'inputs', 'outputs' + 'simple_attention', 'dot_product_attention', 'simple_gru2', + 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs', + 'outputs' ] ###################################################### @@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence, compute attention weight. :type transform_param_attr: ParameterAttribute :return: a context vector + :rtype: LayerOutput """ assert encoded_proj.size == decoder_state.size proj_size = encoded_proj.size @@ -1396,6 +1398,88 @@ def simple_attention(encoded_sequence, input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name) +@wrap_name_default() +def dot_product_attention(encoded_sequence, + attended_sequence, + transformed_state, + softmax_param_attr=None, + name=None): + """ + Calculate and return a context vector with dot-product attention mechanism. + The dimension of the context vector equals to that of the attended_sequence. + + .. math:: + + a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j} + + e_{i,j} & = a(s_{i-1}, h_{j}) + + a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}} + + c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j} + + where :math:`h_{j}` is the jth element of encoded_sequence, + :math:`z_{j}` is the jth element of attended_sequence, + :math:`s_{i-1}` is transformed_state. + + The example usage is: + + .. code-block:: python + + context = dot_product_attention(encoded_sequence=enc_seq, + attended_sequence=att_seq, + transformed_state=state,) + + :param name: A prefix attached to the name of each layer that defined inside + the dot_product_attention. + :type name: basestring + :param softmax_param_attr: The parameter attribute of sequence softmax + that is used to produce attention weight. + :type softmax_param_attr: ParameterAttribute + :param encoded_sequence: The output hidden vectors of the encoder. + :type encoded_sequence: LayerOutput + :param attended_sequence: The attention weight is computed by a feed forward neural + network which has two inputs : decoder's transformed hidden + state of previous time step and encoder's output. + attended_sequence is the sequence to be attended. + :type attended_sequence: LayerOutput + :param transformed_state: The transformed hidden state of decoder in previous time step. + Since the dot-product operation will be performed on it and the + encoded_sequence, their dimensions must be equal. For flexibility, + we suppose transformations of the decoder's hidden state have been + done outside dot_product_attention and no more will be performed + inside. Then users can use either the original or transformed one. + :type transformed_state: LayerOutput + :return: The context vector. + :rtype: LayerOutput + """ + assert transformed_state.size == encoded_sequence.size + + expanded = expand_layer( + input=transformed_state, + expanded_as=encoded_sequence, + name='%s_expand' % name) + + m = linear_comb_layer( + weights=expanded, vectors=encoded_sequence, name='%s_dot-product') + + attention_weight = fc_layer( + input=m, + size=1, + act=SequenceSoftmaxActivation(), + param_attr=softmax_param_attr, + name="%s_softmax" % name, + bias_attr=False) + + scaled = scaling_layer( + weight=attention_weight, + input=attended_sequence, + name='%s_scaling' % name) + + return pooling_layer( + input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name) + + def inputs(layers, *args): """ Declare the inputs of network. The order of input should be as same as