diff --git a/Dockerfile b/Dockerfile index 136db772cc6a24b8084120fa6bab666bc1eda78e..150344a8116e2be9b5bab8e5fdcc9c37f4025020 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ - git python-pip python-dev openssh-server bison \ + git python-pip python-dev openssh-server bison libnccl-dev \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ diff --git a/doc/design/block.md b/doc/design/block.md index 7cbf0d55b1faeb2093ee7cf234d1c2ad1905885b..4066122c0e8dfa33776796c3d205ba5aec9e0f52 100644 --- a/doc/design/block.md +++ b/doc/design/block.md @@ -189,7 +189,7 @@ OpDesc { inputs = {0} // the index of x in vars of BlockDesc above outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above attrs { - "memories" : {1} // the index of h + "states" : {1} // the index of h "step_net" : } }; diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 774c7b021754be607cd895ca910583e992ed26a0..dbe76a8eaf134f7db08fb545297c8e4db68a7aab 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -19,15 +19,15 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope) proto_library(framework_proto SRCS framework.proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc glog) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index fb552fe3448b3f17e97e1262b5c9a0842f68f8b9..1ae7fb60f01e4925ceb310f661171eb231eb6c96 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -21,6 +21,7 @@ #include "paddle/framework/block_desc.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" #include "paddle/operators/recurrent_op.h" @@ -220,8 +221,7 @@ static std::unique_ptr BackwardRecursive( // process recurrent gradient op as a special operator. if (forwardOp.Type() == "recurrent") { // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), - // or - // this will result in infinite loop. + // or this will result in infinite loop. const auto& rnnop = *static_cast(&forwardOp); auto rnn_grad_op = @@ -231,6 +231,18 @@ static std::unique_ptr BackwardRecursive( // create stepnet's gradient op rnn_grad_op->set_stepnet( BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); + } else if (forwardOp.Type() == "dynamic_recurrent") { + // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), + // or this will result in infinite loop. + const auto& rnnop = + *static_cast(&forwardOp); + auto rnn_grad_op = + static_cast(grad_op.get()); + const auto& stepnet_op = + *static_cast(&rnnop.rnn.GetStepUnit()); + // create stepnet's gradient op + rnn_grad_op->rnn.SetStepUnit( + BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); } if (net->ops_.empty()) { // Current no aux op is added to network diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 21d4fdaf0680036a484ee4258e47c6c8854967c3..251e340e6ddcc17ba16bdcab63f2a8c907122eab 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -41,6 +41,19 @@ bool BlockDescBind::HasVar(const std::string &name) const { return vars_.find(name) != vars_.end(); } +VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { + auto it = vars_.find(name); + if (it == vars_.end()) { + return Parent() == kNoneBlockIndex ? nullptr + : ParentBlock()->FindVarRecursive(name); + } + return it->second.get(); +} + +bool BlockDescBind::HasVarRecursive(const std::string &name) const { + return FindVarRecursive(name) != nullptr; +} + std::vector BlockDescBind::AllVars() const { std::vector res; for (const auto &p : vars_) { @@ -97,7 +110,7 @@ void BlockDescBind::Flush() { } BlockDescBind *BlockDescBind::ParentBlock() const { - if (this->desc_->parent_idx() == -1) { + if (this->desc_->parent_idx() == kNoneBlockIndex) { return nullptr; } return prog_->Block(static_cast(this->desc_->parent_idx())); diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 7d1d33f6860aa90518abb379a5e9964d6029c6fa..c685050850dc25f346df49b5ce1d897974870460 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include "paddle/framework/op_desc.h" +#include "paddle/framework/proto_desc.h" #include "paddle/framework/var_desc.h" #include "paddle/platform/macros.h" @@ -56,6 +57,10 @@ class BlockDescBind { bool HasVar(const std::string &var_name) const; + VarDescBind *FindVarRecursive(const std::string &name_bytes) const; + + bool HasVarRecursive(const std::string &var_name) const; + std::set LocalVarNames() const { std::set var_names; for (auto &var : vars_) { diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index 2aa961f1407c44fb4d4a149c40b3dad5b243c354..3d023535ef6c49326481ec7edc2bfc9d7c0d4ffa 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -68,6 +68,7 @@ message OpProto { optional bool duplicable = 3 [ default = false ]; optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; } // AttrProto describes the C++ type Attribute. diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 5b7badf89c1714331bae9fc8cf94c8da2c66dbad..7c0ea0df7829883ccb36772634263cd33ff32e1d 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -25,31 +25,50 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { for (size_t i = level_begin; i < level_end; i++) { new_lod.emplace_back(in.at(i)); } + // transform the lowest level to absolute offset. + LoD abs_offset_lod = ToAbsOffset(in); + new_lod.back() = abs_offset_lod[level_end - 1]; return new_lod; } LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end) { - // slice the lod. - LoD new_lod; - new_lod.reserve(in.size() - level); - auto start = in.at(level)[elem_begin]; - auto end = in.at(level)[elem_end]; - - for (auto it = in.begin() + level; it != in.end(); it++) { - auto it_begin = std::find(it->begin(), it->end(), start); - auto it_end = std::find(it_begin, it->end(), end); - PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info"); - PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info"); - new_lod.emplace_back(it_begin, it_end + 1); - // reset offset if tensor is copyed and sliced. - std::transform(new_lod.back().begin(), new_lod.back().end(), - new_lod.back().begin(), - [start](int v) { return v - start; }); - PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD"); + PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_end, in[level].size()); + + LoD res; + res.resize(in.size() - level); + // copy the first level + res[0].assign(in[level].begin() + elem_begin, + in[level].begin() + elem_end + 1); + for (size_t lvl = 1; lvl < res.size(); lvl++) { + const auto& in_level = in[level + lvl]; + const auto& above_level = res[lvl - 1]; + auto& out_level = res[lvl]; + out_level.assign(in_level.begin() + above_level.front(), + in_level.begin() + above_level.back() + 1); } - PADDLE_ENFORCE_LE(new_lod.size(), in.size()); - return new_lod; + for (size_t lvl = 0; lvl < res.size(); lvl++) { + // to make the first offset equals 0, all the elements minus the first + // element + size_t front = res[lvl].front(); + for (auto& ele : res[lvl]) { + ele -= front; + } + } + return res; +} + +LoD ToAbsOffset(const LoD& in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (int level = result.size() - 2; level >= 0; level--) { + for (auto& ele : result[level]) { + ele = result[level + 1][ele]; + } + } + return result; } bool operator==(const LoD& a, const LoD& b) { @@ -75,17 +94,7 @@ bool operator==(const LoD& a, const LoD& b) { size_t LoDTensor::NumElements(size_t level, size_t idx) const { PADDLE_ENFORCE_LT(level, NumLevels()); PADDLE_ENFORCE_LT(idx, NumElements(level)); - // the last level of LoD, just return number of records in Tensor - if (level == NumLevels() - 1) { - return lod_[level][idx + 1] - lod_[level][idx]; - } - // high level of LoD, and there is another lower level, return number of - // lower-level elements - auto tmp = SliceInLevel(lod_, level, idx, idx + 1); - PADDLE_ENFORCE_GE(tmp.size(), 2); - // there is a 0 as a placeholder stored in LoD, so the number of elements - // equals lod.size() - 1 - return tmp[1].size() - 1; + return lod_[level][idx + 1] - lod_[level][idx]; } void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) { diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 3d893baa35391d38372c735ad62576f3dc35a99b..dec59a5750ab24244a013282b4547fb18d4991ac 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -39,23 +39,36 @@ using Vector = thrust::host_vector< #endif /* - * 3-level LoD stores + * LoD is short for Level of Details. * - * 0 10 20 - * 0 5 10 15 20 - * 0 2 5 7 10 12 15 20 - * - * - in a level, each element indicates offset in the underlying Tensor + * - in a level, each element indicates relative offset of the lower level * - the first element should be 0 and that indicates that this sequence start * from 0 * - each sequence's begin and end(no-inclusive) is level[id, id+1] + * + * For example: + * 3-level LoD stores + * + * 0 2 3 + * 0 2 4 7 + * 0 2 5 7 10 12 15 20 */ using LoD = std::vector>; +/* + * Slice levels from a LoD. + * NOTE the lowest level should always be the absolute offsets of the underlying + * tensor instances. So if higher layers are sliced without the lowest level, + * the lower level of the sliced LoD will be transformed to the absolute offset. + */ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end); LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, size_t elem_end); +/* + * Transform an LoD from relative offsets to absolute offsets. + */ +LoD ToAbsOffset(const LoD& in); bool operator==(const LoD& a, const LoD& b); diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc index 44f09f584fb752d7003baa804979f3bb5cd9d651..e1e15abecf5534fb4fd94f7e2b65230c74d175de 100644 --- a/paddle/framework/lod_tensor_test.cc +++ b/paddle/framework/lod_tensor_test.cc @@ -30,8 +30,8 @@ class LoDTensorTester : public ::testing::Test { // 0 5 10 15 20 // 0 2 5 7 10 12 15 20 LoD lod; - lod.push_back(std::vector{0, 10, 20}); - lod.push_back(std::vector{0, 5, 10, 15, 20}); + lod.push_back(std::vector{0, 2, 3}); + lod.push_back(std::vector{0, 2, 5, 8}); lod.push_back(std::vector{0, 2, 5, 7, 10, 12, 15, 17, 20}); ASSERT_EQ(lod.size(), 3UL); @@ -52,14 +52,14 @@ TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); } TEST_F(LoDTensorTester, NumElements) { ASSERT_EQ(lod_tensor_.NumElements(0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(1), 4UL); + ASSERT_EQ(lod_tensor_.NumElements(1), 3UL); ASSERT_EQ(lod_tensor_.NumElements(2), 8UL); } TEST_F(LoDTensorTester, NumElements2) { ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL); - ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL); + ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL); + ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL); } TEST_F(LoDTensorTester, ShrinkLevels) { @@ -68,17 +68,16 @@ TEST_F(LoDTensorTester, ShrinkLevels) { LoDTensor new_lod_tensor = lod_tensor_; new_lod_tensor.ShrinkLevels(level, level + 1); ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level)); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); } // shrink 2 level for (size_t level = 0; level < 2UL; ++level) { LoDTensor new_lod_tensor = lod_tensor_; new_lod_tensor.ShrinkLevels(level, level + 2); + // the lowest level's last element should be the tensor's batch_size. + ASSERT_EQ(new_lod_tensor.lod().back().back(), + lod_tensor_.lod().back().back()); ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level)); - ASSERT_EQ(new_lod_tensor.NumElements(1), - lod_tensor_.NumElements(level + 1)); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); } } @@ -86,19 +85,19 @@ TEST_F(LoDTensorTester, ShrinkLevels) { TEST_F(LoDTensorTester, ShrinkInLevel) { size_t level = 0; LoDTensor new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 0, 2); + new_lod_tensor.ShrinkInLevel(level, 0, 1); EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL); - EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL); - EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL); - EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL); + EXPECT_EQ(new_lod_tensor.NumElements(0), 1UL); + EXPECT_EQ(new_lod_tensor.NumElements(1), 2UL); + EXPECT_EQ(new_lod_tensor.NumElements(2), 5UL); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); level = 1; new_lod_tensor = lod_tensor_; - new_lod_tensor.ShrinkInLevel(level, 0, 2); + new_lod_tensor.ShrinkInLevel(level, 1, 2); ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL); - ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL); + ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL); + ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL); ASSERT_EQ(new_lod_tensor.data(), lod_tensor_.data()); } diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h index a134befd90a1eaeff6f6ea62f11412df63cdc394..44e8ab16895cc604f85bb83e240eab55739f8ba0 100644 --- a/paddle/framework/op_proto_maker.h +++ b/paddle/framework/op_proto_maker.h @@ -44,6 +44,11 @@ class OpProtoAndCheckerMaker { var_->set_intermediate(true); return *this; } + + VariableBuilder& AsDispensable() { + var_->set_dispensable(true); + return *this; + } }; VariableBuilder AddInput(const std::string& name, const std::string& comment); diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 2fca816f353635d3bff184323755961ee82fbb68..a67625fa88fd2fbe4db43241ee824519ceac7017 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -252,5 +252,20 @@ std::ostream& operator<<(std::ostream& os, return os; } +bool OpSupportGPU(const std::string& op_type) { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it == all_kernels.end()) { + // All control operator must support GPU + return true; + } + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + return false; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 12cd307297d010201a47e048089ed7be0db52647..0d0304ac9e13089ef533b0a47f0ec989c8fd7078 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -327,37 +327,47 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool HasInput(const std::string& name) const override { const std::vector& input_names = op_.Input(name); auto length = input_names.size(); + if (length == 0) { + return false; + } PADDLE_ENFORCE_EQ(length, 1UL, "Input(%s) should have only one value, " "but it have %d now", name, length); - return block_.HasVar(input_names[0]); + return block_.HasVarRecursive(input_names[0]); } bool HasOutput(const std::string& name) const override { const std::vector& output_names = op_.Output(name); auto length = output_names.size(); + if (length == 0) { + return false; + } PADDLE_ENFORCE_EQ(length, 1UL, "Output(%s) should have only one value, " "but it have %d now", name, length); - return block_.HasVar(output_names[0]); + return block_.HasVarRecursive(output_names[0]); } bool HasInputs(const std::string& name) const override { const std::vector& input_names = op_.Input(name); - PADDLE_ENFORCE(!input_names.empty(), "Inputs(%s) length is 0", name); + if (input_names.empty()) { + return false; + } for (auto& input : input_names) { - if (!block_.HasVar(input)) return false; + if (!block_.HasVarRecursive(input)) return false; } return true; } bool HasOutputs(const std::string& name) const override { const std::vector& output_names = op_.Output(name); - PADDLE_ENFORCE(!output_names.empty(), "Inputs(%s) length is 0", name); + if (output_names.empty()) { + return false; + } for (auto& output : output_names) { - if (!block_.HasVar(output)) return false; + if (!block_.HasVarRecursive(output)) return false; } return true; } @@ -404,11 +414,11 @@ class CompileTimeInferShapeContext : public InferShapeContext { private: DDim GetDim(const std::string& name) const override { - return framework::make_ddim(block_.FindVar(name)->Shape()); + return framework::make_ddim(block_.FindVarRecursive(name)->Shape()); } void SetDim(const std::string& name, const DDim& dim) override { - block_.FindVar(name)->SetShape(framework::vectorize(dim)); + block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim)); } const OpDescBind& op_; @@ -421,13 +431,27 @@ class RuntimeInferShapeContext : public InferShapeContext { : op_(op), scope_(scope) {} bool HasInput(const std::string& name) const override { - auto ipt = op_.Input(name); + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs", + name); + auto ipt = ins[0]; auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); return var != nullptr; } bool HasOutput(const std::string& name) const override { - auto ipt = op_.Output(name); + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs", + name); + auto ipt = outs[0]; auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); return var != nullptr; } @@ -649,5 +673,7 @@ class OperatorWithKernel : public OperatorBase { std::ostream& operator<<(std::ostream& os, const OperatorWithKernel::OpKernelKey& kernel_key); +extern bool OpSupportGPU(const std::string& op_type); + } // namespace framework } // namespace paddle diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc index e2349cefe09a6c1e0b11f77775426fe5c7594c9d..8e99bba81117c9cc50227122527d6ab9a421c251 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -35,8 +35,8 @@ ProgramDesc *ProgramDescBind::Proto() { ProgramDescBind::ProgramDescBind() { auto *block = prog_.mutable_blocks()->Add(); - block->set_idx(0); - block->set_parent_idx(-1); + block->set_idx(kRootBlockIndex); + block->set_parent_idx(kNoneBlockIndex); blocks_.emplace_back(new BlockDescBind(this, block)); } diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h index 20cc1a2325ffd6f8ef52879a749f106c268376d4..dc4cd7cc735b5e4e3466d9b82dc5eb8647c80ef9 100644 --- a/paddle/framework/program_desc.h +++ b/paddle/framework/program_desc.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/framework/framework.pb.h" +#include "paddle/framework/proto_desc.h" #include "paddle/platform/macros.h" namespace paddle { diff --git a/paddle/framework/proto_desc.h b/paddle/framework/proto_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..fa01224fefce50eb3688ff407f0a7c948c5b7cfc --- /dev/null +++ b/paddle/framework/proto_desc.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace framework { + +// The Index of first Block in Program. also called root block. +constexpr int kRootBlockIndex = 0; +// The Parent Index of root Block, this block does not exist. +constexpr int kNoneBlockIndex = -1; + +} // namespace framework +} // namespace paddle diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp index 18c5638100065109fba1f0647a1c5f91256f7b9d..f3ccd68160859795f28a40f8d0d4032adb289ccf 100644 --- a/paddle/gserver/activations/MKLDNNActivation.cpp +++ b/paddle/gserver/activations/MKLDNNActivation.cpp @@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) { copyInVal_ = nullptr; if (act.grad && algo == algorithm::eltwise_tanh) { // tanh need save src input for backward - inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc()); + inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc()); copyInVal_ = std::make_shared(*val_, *inVal_); CHECK(copyInVal_) << "should not be emptry"; pipelineFwd_.push_back(*copyInVal_); @@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) { algorithm algo = getAlgo(this->getName()); float alpha = getBwdAlpha(); float beta = getBeta(); - grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc()); + grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad); auto eng = CPUEngine::Instance().getEngine(); auto bwdDesc = eltwise_bwd::desc( algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta); @@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) { int ic = cnt_ / bs / ih / iw; CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw); val_ = MKLDNNMatrix::create( - act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_); + {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value); CHECK(val_); val_->downSpatial(); } diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h index 4c0234e7b3a91053596c32cea581fa5d1e26b9d5..af02a37cad668708f77ecf423549a8ec993e54fb 100644 --- a/paddle/gserver/layers/MKLDNNBase.h +++ b/paddle/gserver/layers/MKLDNNBase.h @@ -21,8 +21,8 @@ namespace paddle { typedef enum { MKLDNN_BASE = 1, // basical info of MKLDNN MKLDNN_TESTS = 1, // gtest info of MKLDNN - MKLDNN_SIZES = 2, // size info of MKLDNN - MKLDNN_FMTS = 3, // format info of MKLDNN + MKLDNN_FMTS = 2, // format info of MKLDNN + MKLDNN_SIZES = 3, // size info of MKLDNN MKLDNN_ALL = 4, // show all info of MKLDNN } MKLDNN_LOG_LEVEL; diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 26810a648343d6203f7937740641325ae8ea6879..83f4e4e6151d727b3e6cf367bb7ecae55dd7df73 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector& pipeline, resetFwdBuffers(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNConvLayer::resetBwd(std::vector& pipeline, @@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector& pipeline, resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNConvLayer::updateInputData() { - cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) { @@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(pd); - resetInValue(pd, in); + resetInValue( + in, std::make_shared(pd->src_primitive_desc())); + + resetOutValue(out, pd->dst_primitive_desc()); - resetWgtBiasValue(pd, wgt, bias); + resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - resetOutValue(pd, out); + if (biases_ && biases_->getW()) { + resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); + } else { + bias = nullptr; + } } void MKLDNNConvLayer::resetFwdPipeline( @@ -225,104 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtInVal_) { - pipeline.push_back(*cvtInVal_); - } - if (bias) { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out)); } else { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out)); } pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } -} - -void MKLDNNConvLayer::resetInValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& in) { - const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc()); - - // create buffer and reorder if input value do not match - cpuInVal_ = nullptr; - cvtInVal_ = nullptr; - - MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast(inMat); - CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr); - if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - if (dnnIn) { - if (dnnIn->getFormat() == format::nc) { - CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format"; - // create a new one with nchw format and same data - memory::dims inDims = memory::dims{bs_, ic_, 1, 1}; - dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - } - if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - cpuInVal_ = dnnIn; - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) { - // create new mkldnn matrix - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - in = cpuInVal_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasValue( - std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc()); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc()) - : nullptr; -} - -void MKLDNNConvLayer::resetOutValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc()); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) { - out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be empty"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - // when output is cpu device, change the mkldnn output value and make them - // share the same data. Then if next layer use inputlayer->getOuputValue() - // to achieve the input value, it will get the right data. - output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(out); } void MKLDNNConvLayer::resetBwdWgtPD( @@ -331,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD( loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); // create backward weight using input, output and weight value memory desc - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; CHECK(wgtVal_) << "Should have weight value"; algorithm algo = algorithm::convolution_direct; padding_kind padKind = padding_kind::zero; @@ -372,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD( memory::dims wgtDims, biasDims, strides, dilations, padL, padR; loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; // create backward data using input and output value memory desc // but using weight memory desc with any format auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct, @@ -399,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(wgtPD); - resetOutGrad(wgtPD, out); + resetOutGrad(out, wgtPD->diff_dst_primitive_desc()); - resetWgtBiasGrad(wgtPD, wgt, bias); + resetWithMatrix( + wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc()); + CHECK(wgtVal_ != nullptr && + wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) + << "primitive desc of weight grad and value should be equal"; - resetInGrad(dataPD, in); + bias = nullptr; + if (biases_ && biases_->getWGrad()) { + resetWithMatrix( + bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc()); + CHECK(bias && biasVal_ && + bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) + << "primitive desc of bias grad should equal the bias value"; + } + if (dataPD == nullptr) { + return; + } + resetInGrad(in, dataPD->diff_src_primitive_desc()); resetWgtValBwdData(dataPD, wgtValBwdData_); } @@ -416,10 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); - } - + CHECK(inVal_); // add bwdWgt handle if (bias) { bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias)); @@ -431,99 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline( if (dataPD == nullptr) { return; } - if (cvtWgtVal_) { pipeline.push_back(*cvtWgtVal_); } - // add bwdData handle CHECK(wgtValBwdData_) << "Should have weight memory"; bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in)); pipeline.push_back(*bwdData_); - - if (cvtInGrad_) { - pipeline.push_back(*cvtInGrad_); - } -} - -void MKLDNNConvLayer::resetOutGrad( - std::shared_ptr& wgtPD, MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_ != nullptr && - outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc()) - << "primitive desc of out grad and value should be equal"; - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); - // same PrimitiveDesc with cpuInVal_ - CHECK(cpuOutVal_); - cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc()); - // create reorder if primitive desc does not match - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_); - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasGrad( - std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getWGrad(), - wgtPD->diff_weights_primitive_desc()); - CHECK(nullptr != wgtVal_ && - wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) - << "primitive desc of weight grad and value should be equal"; - VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat(); - - bias = nullptr; - if (biasVal_ == nullptr) { - return; - } - bias = MKLDNNMatrix::create(biases_->getWGrad(), - wgtPD->diff_bias_primitive_desc()); - CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) - << "primitive desc of bias grad should equal the bias value"; -} - -void MKLDNNConvLayer::resetInGrad( - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in) { - in = nullptr; - cpuInGrad_ = nullptr; - cvtInGrad_ = nullptr; - if (dataPD == nullptr) { - return; - } - - if (inputIsOnlyMKLDNN()) { - MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc()); - CHECK(nullptr != inVal_ && - in->getPrimitiveDesc() == inVal_->getPrimitiveDesc()) - << "primitive desc of input grad and value should be equal"; - } else { - const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE); - // same PrimitiveDesc with cpuInVal_ - CHECK(cpuInVal_); - cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc()); - in = cpuInGrad_; - // create reorder if PrimitiveDesc does not match - if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) { - in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE), - dataPD->diff_src_primitive_desc()); - cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_); - CHECK(cvtInGrad_); - } - } } void MKLDNNConvLayer::resetWgtValBwdData( @@ -537,8 +370,7 @@ void MKLDNNConvLayer::resetWgtValBwdData( // since the primitive_desc would be different with wgtVal_ CHECK(wgtVal_) << "should have weight value"; if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) { - wgtValBwdData_ = - MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc()); + wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc()); cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_); CHECK(cvtWgtVal_); } else { diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h index f84f2f737c47a1b8adc2b83360a0396ffbc6ae24..1fed0e1c6565b763a3ee73a0853f560ddfbd44c6 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.h +++ b/paddle/gserver/layers/MKLDNNConvLayer.h @@ -48,17 +48,6 @@ protected: // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuInVal_; - MKLDNNMatrixPtr cpuInGrad_; - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtInVal_; - std::shared_ptr cvtInGrad_; - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // whether the weight has been init bool hasInitedWgt_; @@ -94,8 +83,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -109,26 +96,6 @@ public: << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_; } - void printValueFormatFlow() override { - if (cpuInVal_) { - VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>"; - } - MKLDNNLayer::printValueFormatFlow(); - if (cpuOutVal_) { - VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat(); - } - } - - void printGradFormatFlow() override { - if (cpuInGrad_) { - VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<"; - } - MKLDNNLayer::printGradFormatFlow(); - if (cpuOutGrad_) { - VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat(); - } - } - protected: /** * load the dims settings of this conv @@ -162,23 +129,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of input value - */ - void resetInValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& in); - /** - * reset MKLDNNMatrix of weight and bias value - */ - void resetWgtBiasValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of output value - */ - void resetOutValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& out); - /** * reset the backward weight primitive descriptor. */ @@ -207,22 +157,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of output grad - */ - void resetOutGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of weight and bias grad - */ - void resetWgtBiasGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of input grad - */ - void resetInGrad(std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in); /** * reset MKLDNNMatrix of weight value for backward data * since the primitive_desc would be different with wgtVal_ diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index cf19a155681f3a1ceb20af67245c8f2b8fa8fa73..d82063a7130ca928ba042e210eb216f90c7207cd 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo; wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); hasInitedWgt_ = true; } @@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo; wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); } @@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNFcLayer::resetBwd(std::vector& pipeline, @@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector& pipeline, resetBwdDataPD(bwdDataPD, in, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNFcLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) { @@ -139,51 +131,30 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { resetInValue(in); - - resetWgtBiasValue(wgt, bias); - - resetOutValue(out); -} - -void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } + CHECK(in); in->downSpatial(); -} -void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { + auto outPD = + MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); + resetOutValue(out, outPD); + format wgtFmt = format::oihw; - if (inVal_->getFormat() == format::nChw8c) { + if (in->getFormat() == format::nChw8c) { wgtFmt = format::oIhw8i; - } else if (inVal_->getFormat() == format::nChw16c) { + } else if (in->getFormat() == format::nChw16c) { wgtFmt = format::oIhw16i; } - wgt = MKLDNNMatrix::create( - weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_); + auto wgtPD = + MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_); + resetWithMatrix(wgt, weight_->getW(), wgtPD); wgt->downSpatial(); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_) - : nullptr; -} -void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_); - if (!outputIsOnlyMKLDNN()) { - // fc cpu output value do not need create convert, just share data - getOutput(CPU_DEVICE).value->setData(out->getData()); + if (biases_ && biases_->getW()) { + auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); + resetWithMatrix(bias, biases_->getW(), biasPD); + } else { + bias = nullptr; } - output_.value = std::dynamic_pointer_cast(out); } void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, @@ -219,7 +190,6 @@ void MKLDNNFcLayer::resetFwdPipeline( } else { fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out)); } - pipeline.push_back(*fwd_); } @@ -227,44 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetWgtBiasGrad(wgt, bias); - - resetInGrad(in); -} - -void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - output_.grad->setData(cpuOut->getData()); - out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc()); - } -} + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); -void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { CHECK(wgtVal_); - wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); + resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); - bias = nullptr; - if (biasVal_ == nullptr) { - return; - } - bias = - MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc()); -} - -void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; + if (biasVal_) { + resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); + } else { + bias = nullptr; } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); } void MKLDNNFcLayer::resetBwdWgtPD( diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h index c76878aafab7e986d2bf478eaba02f2f0aced293..ee861763ff3dc10ddb4c119358b80dbe1614aecb 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.h +++ b/paddle/gserver/layers/MKLDNNFcLayer.h @@ -66,8 +66,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -84,9 +82,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr wgt, @@ -109,9 +104,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdWgtPD(std::shared_ptr& pd, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6bb19976b5552fcd2e420f03de45c77a90ffb9d2 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLayer.cpp @@ -0,0 +1,333 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNLayer.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +bool MKLDNNLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." + << "Please set WITH_MKLDNN=ON " + << "and set use_mkldnn=True"; + CHECK(!useGpu_) << "Do not support GPU yet"; + + // set device id before Layer::init + setDevice(MKLDNN_DEVICE); + // change param device to MKLDNN device + setParamsDevice(MKLDNN_DEVICE, parameterMap); + if (!Layer::init(layerMap, parameterMap)) { + return false; + } + setOutputMap(); + checkCPUOutputsNumber(); + + stream_.reset(new MKLDNNStream()); + engine_ = CPUEngine::Instance().getEngine(); + return true; +} + +void MKLDNNLayer::forward(PassType passType) { + passType_ = passType; + + { + REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); + CHECK(!inputLayers_.empty()); + copySeqInfoToOutputs(); + size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt(); + if (inputElemenCnt_ != elemenCnt) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; + // reset when input total sizes changed, not only the batchsize + inputElemenCnt_ = elemenCnt; + pipelineFwd_.clear(); + reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); + // all cpu device output grad or value share output's + shareCPUDevice(); + resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); + // MKLDNNLayer output value should be MKLDNNMatrix + // so external output value is necessary. + // Then external input value is not necessary, + // since input may be mkldnn internal buffer. + CHECK(extOutVal_) << "external output value is necessary"; + output_.value = std::dynamic_pointer_cast(extOutVal_); + CHECK(inVal_ && outVal_) << "internal memories are necessary"; + if (cvtInVal_) { + pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_); + } + if (cvtOutVal_) { + pipelineFwd_.push_back(*cvtOutVal_); + } + convertWeightsFromPaddle(); + printSizeInfo(); + printValueFormat(); + needResetBwd_ = true; + } + + if (inputLayers_[0]->getType() == "data") { + // Update input value data when input layer is "data" type, + // since the input value data address might be changed. + CHECK(extInVal_); + extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); + } + + if (!outputOnlyMKLDNN_) { + clearGrads(); + } + stream_->submit(pipelineFwd_); + } + { + REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); + forwardActivation(); + } +} + +void MKLDNNLayer::backward(const UpdateCallback& callback) { + if (needResetBwd_) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; + pipelineBwd_.clear(); + pipelineMergeGrad_.clear(); + mergeGrad_ = nullptr; + resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); + // external output grad is not necessary + // since output may be mkldnn internal buffer or merge them directly. + CHECK(outGrad_) << "internal output grad is necessary"; + if (extOutGrad_) { + CHECK_EQ(extOutGrad_->getData(), output_.grad->getData()) + << "the external buffer should share the same data with output_.grad"; + } + if (cvtOutGrad_) { + pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_); + } + if (cvtInGrad_) { + pipelineBwd_.push_back(*cvtInGrad_); + } + printGradFormat(); + needResetBwd_ = false; + } + + // merge grad must before backward activation + if (mergeGrad_) { + REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str()); + stream_->submit(pipelineMergeGrad_); + } + { + REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); + backwardActivation(); + } + { + REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); + stream_->submit(pipelineBwd_); + } + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + updateWeights(callback); + } +} + +void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { + const Argument& input = inputLayers_[0]->getOutput(); + batchsize = input.getBatchSize(); + int h = input.getFrameHeight(); + int w = input.getFrameWidth(); + if (h != 0) { + height = h; + } + if (w != 0) { + width = w; + } +} + +void MKLDNNLayer::reshapeOutput(size_t height, size_t width) { + output_.setFrameHeight(height); + output_.setFrameWidth(width); + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].setFrameHeight(height); + outputOtherDevice_[i].setFrameWidth(width); + } +} + +void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn, + const MatrixPtr& mat, + memory::primitive_desc pd) { + dnn = nullptr; + if (mat == nullptr) { + return; + } + dnn = MKLDNNMatrix::create(pd, mat); +} + +void MKLDNNLayer::resetInValue( + MKLDNNMatrixPtr& in, const std::shared_ptr& intPD) { + cvtInVal_ = nullptr; + extInVal_ = nullptr; + in = nullptr; + CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); + auto extPD = MKLDNNMatrix::createPrimitiveDesc( + {bs_, ic_, ih_, iw_}, format::nchw, engine_); + const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); + in = std::dynamic_pointer_cast(inMat); + CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); + if (in == nullptr || in->getFormat() == format::nc) { + in = MKLDNNMatrix::create(extPD, inMat); + } + extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr; + if (in->getFormat() == format::nc) { + CHECK(ih_ == 1 && iw_ == 1); + } + if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) { + return; + } + // need create reorder + in = MKLDNNMatrix::create(*intPD); + extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat); + cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); + CHECK(cvtInVal_) << "should not be emptry"; +} + +void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out, + memory::primitive_desc intPD) { + cvtOutVal_ = nullptr; + out = MKLDNNMatrix::create(intPD, output_.value); + extOutVal_ = out; + if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) { + return; + } + // need create reorder + CHECK_GT(bs_ * oc_ * oh_ * ow_, 0); + extOutVal_ = MKLDNNMatrix::create( + memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value); + out = MKLDNNMatrix::create(intPD); + cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_); + CHECK(cvtOutVal_) << "should not be empty"; +} + +void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in, + memory::primitive_desc intPD) { + cvtInGrad_ = nullptr; + extInGrad_ = nullptr; + in = nullptr; + LayerPtr& input = inputLayers_[0]; + if (input->getOutputGrad() == nullptr) { + // no need input grad + return; + } + CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1) + << "only support input is MKLDNN layer or only have one output layer"; + // when input is a mkldnn branch node, + // this layer will save input grad to a internal buffer, + // and the mkldnn input layer will merge them to actual prev->output_.grad + const MatrixPtr& inMat = + input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr; + in = MKLDNNMatrix::create(intPD, inMat); + Argument& arg = input->getOutput(this->getName()); + arg.grad = std::dynamic_pointer_cast(in); + CHECK(inVal_); + CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must equal"; + if (inputIsOnlyMKLDNN()) { + return; + } + + extInGrad_ = in; + if (isPaddleFormat(extInGrad_->getFormat())) { + return; + } + // need create reorder + // TODO(TJ): add macro definition to simplify it + CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) + << "should have external input value and the format must be nchw(nc)"; + extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); + CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD) + << "should have internal input value and primitive desc must equal"; + in = MKLDNNMatrix::create(intPD); + cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_); + CHECK(cvtInGrad_); +} + +void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out, + memory::primitive_desc intPD) { + cvtOutGrad_ = nullptr; + extOutGrad_ = nullptr; + out = nullptr; + MatrixPtr& outMat = output_.grad; + out = MKLDNNMatrix::create(intPD, outMat); + resetMergeGrad(out); + if (outputIsOnlyMKLDNN()) { + return; + } + CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device"; + extOutGrad_ = out; + if (isPaddleFormat(extOutGrad_->getFormat())) { + return; + } + // need create reorder + CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat())) + << "should have external output value and the format must be nchw(nc)"; + extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat); + CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD) + << "should have internal output value and primitive desc must equal"; + out = MKLDNNMatrix::create(intPD); + cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out); + CHECK(cvtOutGrad_); +} + +void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { + mergeGrad_ = nullptr; + pipelineMergeGrad_.clear(); + if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) { + // do not merge when output is not all MKLDNN or only one output + return; + } + CHECK(out) << "should have reset internal ouput grad"; + std::vector scales(outputMap_.size(), 1.0); + std::vector srcPDs; + std::vector srcs; + for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) { + MKLDNNMatrixPtr src = + std::dynamic_pointer_cast(it->second->grad); + CHECK(src) << "should be MKLDNNMatrix"; + auto srcDims = src->getDims(); + auto dstDims = out->getDims(); + CHECK_EQ(srcDims.size(), dstDims.size()); + for (size_t i = 0; i < srcDims.size(); ++i) { + CHECK_EQ(srcDims[i], dstDims[i]); + } + VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first + << ", format " << src->getFormat(); + srcPDs.push_back(src->getPrimitiveDesc()); + srcs.push_back(*src); + } + + // TODO(TJ): remove me when mkldnn sum support different formats + for (size_t i = 1; i < srcPDs.size(); ++i) { + CHECK(srcPDs[0] == srcPDs[i]); + } + tmpOutGrad_ = out; + tmpCvt_ = nullptr; + if (out->getPrimitiveDesc() != srcPDs[0]) { + tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]); + tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); + CHECK(tmpCvt_); + pipelineMergeGrad_.push_back(*tmpCvt_); + } + + auto sumPD = + sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs); + mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_)); + pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 4e2753eba2350d2c3df81b57fe98270a3c38cb24..9b54c95b55cc9b503de5ff527ac983eb4752ddb0 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ b/paddle/gserver/layers/MKLDNNLayer.h @@ -58,11 +58,31 @@ protected: std::vector pipelineFwd_; std::vector pipelineBwd_; - // MKLDNNMatrixPtr with internal format + /* Value and grad are seperated as internal and external buffers. + * Each MKLDNNLayer must init or reset internal buffer at least, + * and the external buffer format is always nchw of nc(when h==w==1), + * which is the same format as paddle. + * The output_.value and output_.grad always save the external data, + * when mixed with cpu device. + * When all layers are mkldnn layers, they could save internal data. + */ + // below MKLDNNMatrix buffers are all internal buffers MKLDNNMatrixPtr inVal_; MKLDNNMatrixPtr inGrad_; MKLDNNMatrixPtr outVal_; MKLDNNMatrixPtr outGrad_; + // below are external value and grad + MKLDNNMatrixPtr extInVal_; + MKLDNNMatrixPtr extInGrad_; + MKLDNNMatrixPtr extOutVal_; + MKLDNNMatrixPtr extOutGrad_; + // convert handle between external and internal buffers + std::shared_ptr cvtInVal_; + std::shared_ptr cvtInGrad_; + std::shared_ptr cvtOutVal_; + std::shared_ptr cvtOutGrad_; + + // weight and bias are always internal buffers MKLDNNMatrixPtr wgtVal_; MKLDNNMatrixPtr wgtGrad_; MKLDNNMatrixPtr biasVal_; @@ -91,6 +111,7 @@ public: oh_(0), ow_(0), needResetBwd_(true), + outputOnlyMKLDNN_(false), engine_(mkldnn::engine::cpu, 0), stream_(nullptr), fwd_(nullptr), @@ -99,92 +120,9 @@ public: ~MKLDNNLayer() {} - virtual bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." - << "Please set WITH_MKLDNN=ON " - << "and set use_mkldnn=True"; - CHECK(!useGpu_) << "Do not support GPU yet"; - - // set device id before Layer::init - setDevice(MKLDNN_DEVICE); - // change param device to MKLDNN device - setParamsDevice(MKLDNN_DEVICE, parameterMap); - if (!Layer::init(layerMap, parameterMap)) { - return false; - } - setOutputMap(); - checkCPUOutputsNumber(); - - stream_.reset(new MKLDNNStream()); - engine_ = CPUEngine::Instance().getEngine(); - return true; - } - - void forward(PassType passType) override { - passType_ = passType; - - { - REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); - CHECK(!inputLayers_.empty()); - copySeqInfoToOutputs(); - size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt(); - if (inputElemenCnt_ != elemenCnt) { - VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; - // reset when input total sizes changed, not only the batchsize - inputElemenCnt_ = elemenCnt; - pipelineFwd_.clear(); - reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); - resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); - convertWeightsFromPaddle(); - needResetBwd_ = true; - } - - if (inputLayers_[0]->getType() == "data") { - updateInputData(); - } - - if (!outputOnlyMKLDNN_) { - clearGrads(); - } - stream_->submit(pipelineFwd_); - } - - /* activation */ { - REGISTER_TIMER_INFO("FwActTimer", getName().c_str()); - forwardActivation(); - } - } - - void backward(const UpdateCallback& callback) override { - if (needResetBwd_) { - VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; - pipelineBwd_.clear(); - pipelineMergeGrad_.clear(); - mergeGrad_ = nullptr; - resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); - needResetBwd_ = false; - } - - // merge grad must before backward activation - if (mergeGrad_) { - REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str()); - stream_->submit(pipelineMergeGrad_); - } - { - REGISTER_TIMER_INFO("BpActTimer", getName().c_str()); - backwardActivation(); - } - { - REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); - stream_->submit(pipelineBwd_); - } - - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - updateWeights(callback); - } - } + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + virtual void forward(PassType passType); + virtual void backward(const UpdateCallback& callback); /** * reshape the input image sizes @@ -195,7 +133,7 @@ public: int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0; /** - * reset the mkldnn forward primitve and memory + * reset the mkldnn forward primitve and memories * only would be called when input size changes */ virtual void resetFwd(std::vector& pipeline, @@ -205,7 +143,7 @@ public: MKLDNNMatrixPtr& out) = 0; /** - * reset the mkldnn backward primitve and memory for mkldnn fc + * reset the mkldnn backward primitve and memories * only would be called when needed */ virtual void resetBwd(std::vector& pipeline, @@ -214,12 +152,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) = 0; - /** - * Update input value data when input layer is "data" type. - * Since the input value data address might be changed. - */ - virtual void updateInputData() {} - /** * Update weights and biases if necessary. */ @@ -246,131 +178,78 @@ protected: /** * reshape the input image sizes and input batchsize */ - virtual void reshapeInput(int& batchsize, int& height, int& width) { - const Argument& input = inputLayers_[0]->getOutput(); - batchsize = input.getBatchSize(); - int h = input.getFrameHeight(); - int w = input.getFrameWidth(); - if (h != 0) { - height = h; - } - if (w != 0) { - width = w; - } - } + void reshapeInput(int& batchsize, int& height, int& width); /** * reshape output image sizes */ - virtual void reshapeOutput(size_t height, size_t width) { - output_.setFrameHeight(height); - output_.setFrameWidth(width); - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].setFrameHeight(height); - outputOtherDevice_[i].setFrameWidth(width); - } - } + void reshapeOutput(size_t height, size_t width); /** - * reset the output grad matrix from primitive desc. - * and reset the merge grad primitive if needed. - * note: when this layer has serval outputs, - * it could not be mixed with cpu device, - * since it can not get memory desc from cpu device. + * reset MKLDNNMatrix from Matrix and internal primitive desc. + * reset nullptr if matrix or primitive desc is empty */ - virtual void resetOutGrad(MKLDNNMatrixPtr& out, - mkldnn::memory::primitive_desc pd) { - CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet"; - mergeGrad_ = nullptr; - pipelineMergeGrad_.clear(); - out = MKLDNNMatrix::create(output_.grad, pd); - if (outputMap_.size() <= 1) { - return; - } - std::vector scales(outputMap_.size(), 1.0); - std::vector srcPDs; - std::vector srcs; - for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) { - MKLDNNMatrixPtr src = - std::dynamic_pointer_cast(it->second->grad); - VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first; - CHECK(src) << "should be MKLDNNMatrix"; - auto srcDims = src->getDims(); - auto dstDims = out->getDims(); - CHECK_EQ(srcDims.size(), dstDims.size()); - for (size_t i = 0; i < srcDims.size(); ++i) { - CHECK_EQ(srcDims[i], dstDims[i]); - } - srcPDs.push_back(src->getPrimitiveDesc()); - srcs.push_back(*src); - } + void resetWithMatrix(MKLDNNMatrixPtr& dnn, + const MatrixPtr& mat, + mkldnn::memory::primitive_desc pd); - // TODO(TJ): remove me when mkldnn sum support different formats - for (size_t i = 1; i < srcPDs.size(); ++i) { - CHECK(srcPDs[0] == srcPDs[i]); - } - tmpOutGrad_ = nullptr; - tmpCvt_ = nullptr; - if (out->getPrimitiveDesc() != srcPDs[0]) { - tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]); - tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out); - CHECK(tmpCvt_); - pipelineMergeGrad_.push_back(*tmpCvt_); - } else { - tmpOutGrad_ = out; - } + /** + * reset input value from input MKLDNNMatrix and internal primitive desc. + * reset both internal and external buffer and create reorder if necessary. + */ + void resetInValue( + MKLDNNMatrixPtr& in, + const std::shared_ptr& intPD = nullptr); - auto sumPD = mkldnn::sum::primitive_desc( - tmpOutGrad_->getMemoryDesc(), scales, srcPDs); - mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_)); - pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); - } + /** + * reset output value from internal primitive desc. + * reset both internal and external buffer and create reorder if necessary. + */ + void resetOutValue(MKLDNNMatrixPtr& out, + mkldnn::memory::primitive_desc intPD); /** - * reset input grad from primitive desc. - * this function is avaiable for input is only mkldnn - * or input do not care cpu device + * reset input grad from internal primitive desc. + * reset both internal and external buffer and create reorder if necessary. */ - virtual void resetInGrad(MKLDNNMatrixPtr& in, - mkldnn::memory::primitive_desc pd) { - LayerPtr& input = inputLayers_[0]; - const MatrixPtr& grad = - input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad; - in = MKLDNNMatrix::create(grad, pd); - Argument& arg = input->getOutput(this->getName()); - arg.grad = std::dynamic_pointer_cast(in); - } + void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD); /** - * print info about sizes + * reset output grad from internal primitive desc. + * merge grad if necessary. + * reset both internal and external buffer and create reorder if necessary. + * note: about merge grad, when this layer has several outputs, + * it could not be mixed with cpu device, + * since it can not get memory desc from cpu device. */ - virtual void printSizeInfo() { - VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_ - << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_ - << ", oh: " << oh_ << ", ow: " << ow_; - } + void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD); /** - * Print the mkldnn memory format flow of value + * reset the merge grad primitive if necessary. + * note: do not support the grads mixed with cpu device, + * since it can not get memory desc from cpu device. */ - virtual void printValueFormatFlow() { - if (inVal_ && outVal_) { - VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> " - << outVal_->getFormat(); - } - } + void resetMergeGrad(MKLDNNMatrixPtr& out); +protected: /** - * Print the mkldnn memory format flow of grad + * Set deviceId of this layer. */ - virtual void printGradFormatFlow() { - if (inGrad_ && outGrad_) { - VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< " - << outGrad_->getFormat(); + void setDevice(int id) { deviceId_ = id; } + + /** + * check the format is nchw or nc, + * which is supported by Paddle default memory layout + */ + bool isPaddleFormat(mkldnn::memory::format fmt) { + if (fmt == mkldnn::memory::format::nchw || + fmt == mkldnn::memory::format::nc) { + return true; + } else { + return false; } } -protected: /** * If input only has MKLDNN device. * Otherwise, only support the previous layer using CPU device. @@ -380,7 +259,6 @@ protected: if (prevDevice == MKLDNN_DEVICE) { return true; } else { - // do not support GPU yet CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; return false; } @@ -400,9 +278,61 @@ protected: } /** - * Set deviceId of this layer. + * print info about sizes */ - void setDevice(int id) { deviceId_ = id; } + virtual void printSizeInfo() { + VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_ + << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_ + << ", oh: " << oh_ << ", ow: " << ow_; + } + + /** + * print the mkldnn memory format of value + */ + virtual void printValueFormat() { + if (extInVal_) { + VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> "; + } + if (inVal_) { + VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>"; + } + if (outVal_) { + VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "; + } + if (extOutVal_) { + VLOG(MKLDNN_FMTS) << extOutVal_->getFormat(); + } + if (wgtVal_) { + VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat(); + } + if (biasVal_) { + VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat(); + } + } + + /** + * print the mkldnn memory format of grad + */ + virtual void printGradFormat() { + if (extOutGrad_) { + VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat(); + } + if (outGrad_) { + VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; + } + if (inGrad_) { + VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; + } + if (extInGrad_) { + VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; + } + if (wgtGrad_) { + VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat(); + } + if (biasGrad_) { + VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat(); + } + } private: /** @@ -449,6 +379,19 @@ private: } } + /** + * if have cpu device, share value and grad data with output_ + */ + void shareCPUDevice() { + if (outputIsOnlyMKLDNN()) { + return; + } + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].value = output_.value; + outputOtherDevice_[i].grad = output_.grad; + } + } + /** * Check the cpu device number of outputOtherDevice_. * should have only one at most. diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp index 0e53e2d1b7e6691909955eeacd345981a9960ec6..6e89260f49979d4edb4da138507a73dc2bf120de 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp +++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp @@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, out); resetFwdPipeline(pipeline, fwdPD_, in, out); - - printValueFormatFlow(); } void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, @@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, resetBwdPD(pd, in, out); resetBwdPipeline(pipeline, pd, in, out); - - printGradFormatFlow(); -} - -void MKLDNNPoolLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { resetInValue(in); - resetOutValue(out); -} - -void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } -} - -void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) { - CHECK(inVal_) << "Should reset input value first"; memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - out = MKLDNNMatrix::create( - output_.value, outDims, inVal_->getFormat(), engine_); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be emptry"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(outVal_); + CHECK(in); + auto outPD = + MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_); + resetOutValue(out, outPD); } void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out) { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; @@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline( ? std::make_shared(pool_fwd(*pd, *in, *out, *workspace_)) : std::make_shared(pool_fwd(*pd, *in, *out)); pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } } void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetInGrad(in); -} -void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); - cpuOutGrad_ = MKLDNNMatrix::create( - cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_); - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_) << "should not be emptry"; - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); } void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; memory::dims padR = getPaddingR(); - CHECK(in); CHECK(out); auto bwdDesc = pool_bwd::desc(poolAlgo_, in->getMemoryDesc(), @@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline( std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); + if (pd == nullptr) { + return; } bwdData_ = diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h index 891e15a7efcdd2e54f61352efc1ba7345b91c76b..c5ec87828bfb28b4502b4ec6b47287089c514204 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.h +++ b/paddle/gserver/layers/MKLDNNPoolLayer.h @@ -38,13 +38,6 @@ protected: // pooling_avg or pooling_max mkldnn::algorithm poolAlgo_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/ @@ -74,8 +67,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void printSizeInfo() override { MKLDNNLayer::printSizeInfo(); VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_ @@ -90,8 +81,6 @@ protected: * reset pipeline. */ void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out); @@ -106,8 +95,6 @@ protected: * reset pipeline. */ void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp index 3bf6a9e176cc1235aa5ddefcedd4253e6afc1342..0a19fe23336ea943cb8a572dc40f8c0fbbd7236a 100644 --- a/paddle/gserver/tests/MKLDNNTester.cpp +++ b/paddle/gserver/tests/MKLDNNTester.cpp @@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() { parameters_[REF][i]->randomize(); dnnValue->copyFrom(*refValue); - VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName(); + VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName(); printVector(dnnValue); } } @@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() { dataLayers_[REF][i]->getOutputValue()->randomizeUniform(); dataLayers_[DNN][i]->getOutputValue()->copyFrom( *(dataLayers_[REF][i]->getOutputValue())); - VLOG(lvl_) << "Input " << i << " data:"; + VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i; printMatrix(dataLayers_[REF][i]->getOutputValue()); } } @@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() { refLayer_->getOutputGrad()->randomizeUniform(); dnnLayer_->getOutput(CPU_DEVICE) .grad->copyFrom(*(refLayer_->getOutputGrad())); - VLOG(lvl_) << "Random Backward Input, TopDiff: "; + VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad"; printMatrix(refLayer_->getOutputGrad()); } void MKLDNNTester::checkForward() { - VLOG(MKLDNN_ALL) << "Check Forward"; + VLOG(MKLDNN_TESTS) << "Check Forward"; printTopDatas(); double delta = compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue()); @@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() { } void MKLDNNTester::checkBackwardData() { - VLOG(MKLDNN_ALL) << "Check Backward Data"; + VLOG(MKLDNN_TESTS) << "Check Backward Data"; // TODO(TJ): uncomment me when batch norm ready // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm"; for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad(); const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad(); - VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i; + VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i; printMatrix(dnnDiff); - VLOG(lvl_) << "Reference Backward Output BotDiff " << i; + VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i; printMatrix(refDiff); double delta = compareMatrix(dnnDiff, refDiff); @@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() { } void MKLDNNTester::checkBackwardWgts() { - VLOG(MKLDNN_ALL) << "Check Backward Weight"; + VLOG(MKLDNN_TESTS) << "Check Backward Weight"; CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); vector dnnWgts; // used to temply save mkldnn weights saveWgt(parameters_[DNN], dnnWgts); @@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() { for (size_t i = 0; i < parameters_[DNN].size(); ++i) { const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE); - VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName(); + VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value" + << parameters_[DNN][i]->getName(); printVector(dnn); - VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName(); + VLOG(MKLDNN_ALL) << "Reference Result: weight value " + << parameters_[REF][i]->getName(); printVector(ref); double delta = compareVector(dnn, ref); @@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() { } for (int n = 0; n < NUM; ++n) { - VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: "; + VLOG(MKLDNN_ALL) << testLayers_[n]->getType() + << " Forward Result: OutputValue"; printMatrix(testLayers_[n]->getOutputValue()); } } @@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) { std::ostringstream ostr; m->print(ostr); - VLOG(lvl_) << std::endl << ostr.str(); + VLOG(MKLDNN_ALL) << std::endl << ostr.str(); } void MKLDNNTester::printVector(const VectorPtr& v) { @@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) { std::ostringstream ostr; v->print(ostr, v->getSize()); - VLOG(lvl_) << std::endl << ostr.str(); + VLOG(MKLDNN_ALL) << std::endl << ostr.str(); } double MKLDNNTester::getDelta(const real* d1, @@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() { UpdateCallback updateCallback = [](Parameter* para) { auto& grad = para->getBuf(PARAMETER_GRADIENT); auto& value = para->getBuf(PARAMETER_VALUE); - real lr = 1e-3; + real lr = 1e-2; value->add(*grad, lr); grad->zeroMem(); }; @@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn, size_t batchSize, size_t inputImgH, size_t inputImgW, + bool printDetails, size_t iter, - float epsilon, - bool log, - int level) { + float epsilon) { CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 || dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) << "should be MKLDNN layer or MKLDNN activation"; @@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn, ih_ = inputImgH; iw_ = inputImgW; + log_ = printDetails; iter_ = iter; eps_ = epsilon; - log_ = log; - lvl_ = level; // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight reset(dnn, ref, batchSize); @@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath, void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) { CHECK_EQ(ref.outValues.size(), dnn.outValues.size()); CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size()); + VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size(); for (size_t i = 0; i < ref.outValues.size(); i++) { EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps); } + VLOG(MKLDNN_TESTS) << "compare param size: " << ref.outValues.size(); for (size_t i = 0; i < ref.paraValues.size(); i++) { EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps); } @@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath, float eps) { DataIn in; initArgument(in, configPath, iter); - DataOut outCpu, outDnn; + VLOG(MKLDNN_TESTS) << "runing cpu network"; getOutResult(configPath, in, outCpu, false, iter); + VLOG(MKLDNN_TESTS) << "runing mkldnn network"; getOutResult(configPath, in, outDnn, true, iter); compareResult(outCpu, outDnn, eps); diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h index 51abfcb67e2ec35fe1b0179e742a7d18f08f8a2c..c385d1c72717d120211f167b5c5eb9a557da3714 100644 --- a/paddle/gserver/tests/MKLDNNTester.h +++ b/paddle/gserver/tests/MKLDNNTester.h @@ -58,8 +58,6 @@ protected: size_t iter_; /// whether to print out the details bool log_; - /// vlog level to print the matrix details datas - int lvl_; /// epsilon float eps_; /// input image size, default 1 @@ -70,7 +68,6 @@ public: iter_ = iter; eps_ = epsilon; log_ = false; - lvl_ = MKLDNN_ALL; } ~MKLDNNTester() {} @@ -81,10 +78,9 @@ public: size_t batchSize, size_t inputImgH = 1, size_t inputImgW = 1, + bool printDetails = false, size_t iter = 3, - float epsilon = 1e-4, - bool log = false, - int level = MKLDNN_ALL); + float epsilon = 1e-4); static void runBranchesTest(const std::string& configPath, size_t iter = 3, float eps = 1e-4); diff --git a/paddle/gserver/tests/mkldnn_branches_fc.conf b/paddle/gserver/tests/mkldnn_branches_fc.conf new file mode 100644 index 0000000000000000000000000000000000000000..fb85425c2b63c7604d636e2b0c5d20d91fb5de1b --- /dev/null +++ b/paddle/gserver/tests/mkldnn_branches_fc.conf @@ -0,0 +1,58 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=16) +channels = get_config_arg("channels", int, 2) + +def two_fc(input, group_name): + out1 = fc_layer(input=input, + name=group_name+'_fc1', + size=channels, + bias_attr=False, + act=LinearActivation()) + + out2 = fc_layer(input=input, + name=group_name+'_fc2', + size=channels, + bias_attr=False, + act=LinearActivation()) + return out1, out2 + +data = data_layer(name ="input", size=channels*16*16) + +conv = img_conv_layer(input=data, + num_channels=channels, + filter_size=3, + num_filters=channels, + padding=1, + shared_biases=True, + act=LinearActivation()) + +pool = img_pool_layer(input=conv, + pool_size=3, + stride=2, + padding=1, + pool_type=AvgPooling()) + +a1, a2 = two_fc(input=pool, group_name='a') + +concat = concat_layer(input=[a1, a2]) + +b1, b2 = two_fc(input=pool, group_name='b') + +addto = addto_layer(input=[b1, b2]) + +outputs([concat, addto]) diff --git a/paddle/gserver/tests/mkldnn_branches_pool.conf b/paddle/gserver/tests/mkldnn_branches_pool.conf new file mode 100644 index 0000000000000000000000000000000000000000..ca17c74752ab0777a69f818d9f43275a6140cb4c --- /dev/null +++ b/paddle/gserver/tests/mkldnn_branches_pool.conf @@ -0,0 +1,60 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=16) +channels = get_config_arg("channels", int, 2) + +def two_pool(input, group_name): + out1 = img_pool_layer(input=input, + name=group_name+'_pool1', + pool_size=3, + stride=2, + padding=0, + pool_type=MaxPooling()) + + out2 = img_pool_layer(input=input, + name=group_name+'_pool2', + pool_size=5, + stride=2, + padding=1, + pool_type=MaxPooling()) + return out1, out2 + +data = data_layer(name ="input", size=channels*16*16) + +conv = img_conv_layer(input=data, + num_channels=channels, + filter_size=3, + num_filters=channels, + padding=1, + shared_biases=True, + act=LinearActivation()) + +pool = img_pool_layer(input=conv, + pool_size=3, + stride=1, + padding=1, + pool_type=AvgPooling()) + +a1, a2 = two_pool(input=pool, group_name='a') + +concat = concat_layer(input=[a1, a2]) + +b1, b2 = two_pool(input=pool, group_name='b') + +addto = addto_layer(input=[b1, b2]) + +outputs([concat, addto]) diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index 3571fbb9e335fc6652bdbfc3f9e35beabda5044f..6cb4ca5e08eab5b979e404c9e09dcfec11086c22 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -250,7 +250,7 @@ TEST(MKLDNNActivation, Activations) { DECLARE_string(config_args); TEST(MKLDNNLayer, branches) { - std::vector cases = {"conv"}; + std::vector cases = {"conv", "pool", "fc"}; for (auto name : cases) { std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf"; for (auto channels : {2, 32}) { diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index 0778bb63b7b3bca9b3d2647ca43dad72d783950a..21a8f73c3e650d4b3c3b86247594cd965f4ead35 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -18,7 +18,7 @@ using namespace mkldnn; // NOLINT namespace paddle { -MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) { +MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) { memory::desc md = pd.desc(); size_t ndims = md.data.ndims; int* dims = md.data.dims; @@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) { return std::make_shared(cpuMatrix, pd); } -MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, - memory::dims dims, +MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims, memory::format fmt, engine& eg, + MatrixPtr m, mkldnn::memory::data_type dtype) { - return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg)); + return create(createPrimitiveDesc(dims, fmt, eg, dtype), m); } std::shared_ptr MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src, diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index c843115eb9a5be50d6ff873f1510844228c9d89f..fe755d096da9713e39581a909e5d21aa93d69f0f 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -40,24 +40,37 @@ public: /** * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc */ - static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd); + static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd, + MatrixPtr m = nullptr); /** * Create MKLDNNMatrix from a MatrixPtr and memory details info */ static MKLDNNMatrixPtr create( - MatrixPtr m, mkldnn::memory::dims dims, mkldnn::memory::format fmt, mkldnn::engine& eg, + MatrixPtr m = nullptr, mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); + /** + * Create primitive descriptor. + * default with f32 dtype + */ + static mkldnn::memory::primitive_desc createPrimitiveDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt, + const mkldnn::engine& eg, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); + } + /** * Create Memory descriptor. * default with any format and f32 dtype */ static mkldnn::memory::desc createMemoryDesc( - const mkldnn::memory::dims& dims, + const mkldnn::memory::dims dims, const mkldnn::memory::format& fmt = mkldnn::memory::format::any, const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { return mkldnn::memory::desc(dims, dtype, fmt); diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 2d029394dd97a9c33c9c57fd3565345139cdff92..f80204c6833d6436f2cf21610beea45b36787eea 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -27,8 +27,8 @@ class ClipOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ClipOp should not be null."); auto x_dims = ctx->GetInputDim("X"); - auto max = Attr("max"); - auto min = Attr("min"); + auto max = ctx->Attrs().Get("max"); + auto min = ctx->Attrs().Get("min"); PADDLE_ENFORCE_LT(min, max, "max should be greater than min."); ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc index 62962be205c10458634411b060caa12890c5fdc9..a0b06ac1dc305bc899f9abaafcc980a6150ecda9 100644 --- a/paddle/operators/dynamic_recurrent_op.cc +++ b/paddle/operators/dynamic_recurrent_op.cc @@ -23,6 +23,7 @@ using framework::Scope; using framework::TensorArray; using framework::LoDTensor; using framework::Variable; +using framework::OperatorBase; using framework::DySeqMetaBatch; namespace detail { @@ -43,10 +44,9 @@ inline void CreateVariables(Scope& scope, * be reordered, but the RNN op should not change the `boot_state` as an input * variable's content. */ -template -inline void ReorderBootState(const DySeqMetaBatch& metas, - const LoDTensor& boot_state, LoDTensor* tensor, - const platform::Place& dst_place) { +inline void ReorderInitialState(const DySeqMetaBatch& metas, + const LoDTensor& boot_state, LoDTensor* tensor, + const platform::Place& dst_place) { for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { auto slice = tensor->Slice(seq_id, seq_id + 1); auto boot_slice = @@ -56,58 +56,60 @@ inline void ReorderBootState(const DySeqMetaBatch& metas, } } -} // namespace detail - -class DynamicRecurrentOpProtoAndCheckerMaker - : public framework::OpProtoAndCheckerMaker { - public: - DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - const auto& name = DynamicRecurrentOp::kArgName; - // inputs and outputs stored in proto - AddInput(name.inlinks, - "the inputs that need to be segmented for each step.") - .AsDuplicable(); - AddInput(name.boot_memories, "variables to initialize memories.") - .AsDuplicable(); - - AddOutput(name.outlinks, "the outputs that need to concated for all steps.") - .AsDuplicable(); - AddOutput(name.step_scopes, "step scopes"); - - // Attributes stored in AttributeMap - AddAttr>(name.pre_memories, - "names of pre-memories"); - AddAttr>(name.memories, "names of memories"); - - AddComment("This is a RNN operator for varience-length sequences."); +inline void RestoreInitialState(const DySeqMetaBatch& metas, + const LoDTensor& tensor, LoDTensor* boot_state, + const platform::Place& dst_place) { + for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { + auto slice = tensor.Slice(seq_id, seq_id + 1); + auto boot_slice = + boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); + boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext()); } -}; +} -void DynamicRecurrentOp::Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const { - cache_.Init(kArgName, *this, scope, &arg_); +} // namespace detail + +// Implementation for forward propagation. +template <> +void RNNAlgorithm::Run( + const framework::Scope& scope, const framework::OperatorBase& op, + const platform::DeviceContext& dev_ctx) { + SetComputeMode(ComputeMode::kForward); + cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); SplitInputs(); CreateScopes(); WriteStepInputs(); InitStates(); WriteStepOutputs(); + RunSteps(); + ConcatOutputs(); +} - // call stepnet in all the time steps - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& step_scope = cache_.GetScope(step); - stepnet_->Run(step_scope, dev_ctx); +// Implementation for backward propagation. +template <> +void RNNAlgorithm::Run( + const framework::Scope& scope, const framework::OperatorBase& op, + const platform::DeviceContext& dev_ctx) { + SetComputeMode(ComputeMode::kBackward); + cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); + SplitInputs(); + WriteStepInputs(); + InitStates(); + WriteStepOutputs(); + RunSteps(); + // copy boot-states' gradients back. + for (const auto& state : arg_.states) { + ExportInitialStateGradient(state); } ConcatOutputs(); } -void DynamicRecurrentOp::SplitInputs() const { +void RNNAlgorithm::SplitInputs() { // TODO(superjom) make level a config // TODO(superjom) check all the inputs has the same LoD int level = 0; - for (const auto& item : cache_.inlinks) { + for (const auto& item : cache_.inputs) { const auto& var = item.second; const auto& tensor = var->Get(); TensorArray& ta = step_inputs_[item.first]; @@ -124,8 +126,8 @@ void DynamicRecurrentOp::SplitInputs() const { } } -void DynamicRecurrentOp::WriteStepInputs() const { - for (const auto& item : cache_.inlinks) { +void RNNAlgorithm::WriteStepInputs() { + for (const auto& item : cache_.inputs) { auto ta_it = step_inputs_.find(item.first); PADDLE_ENFORCE(ta_it != step_inputs_.end(), "step_inputs_ not compatible with memory set"); @@ -142,15 +144,15 @@ void DynamicRecurrentOp::WriteStepInputs() const { } } -void DynamicRecurrentOp::WriteStepOutputs() const { +void RNNAlgorithm::WriteStepOutputs() { // initialize step outputs - for (const auto& item : cache_.outlinks) { + for (const auto& item : cache_.outputs) { step_outputs_.emplace(item.first, TensorArray()); } PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL); } -void DynamicRecurrentOp::CreateScopes() const { +void RNNAlgorithm::CreateScopes() { PADDLE_ENFORCE_GT(cache_.num_steps, 0); // resize scopes size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size(); @@ -159,19 +161,19 @@ void DynamicRecurrentOp::CreateScopes() const { } // init temporary inputs - PADDLE_ENFORCE_NOT_NULL(stepnet_, "stepnet should be set first"); - std::vector memories; - std::vector pre_memories; - std::vector stepnet_outputs; - std::transform(arg_.memories.begin(), arg_.memories.end(), - std::back_inserter(memories), - [](const rnn::MemoryAttr& m) { return m.var; }); - std::transform(arg_.memories.begin(), arg_.memories.end(), - std::back_inserter(pre_memories), - [](const rnn::MemoryAttr& m) { return m.pre_var; }); - for (const auto& item : stepnet_->Outputs()) { + PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first"); + std::vector states; + std::vector ex_states; + std::vector step_unit_outputs; + std::transform(arg_.states.begin(), arg_.states.end(), + std::back_inserter(states), + [](const rnn::StateAttr& m) { return m.var; }); + std::transform(arg_.states.begin(), arg_.states.end(), + std::back_inserter(ex_states), + [](const rnn::StateAttr& m) { return m.pre_var; }); + for (const auto& item : step_unit_->Outputs()) { for (const auto& var : item.second) { - stepnet_outputs.push_back(var); + step_unit_outputs.push_back(var); } } @@ -179,13 +181,13 @@ void DynamicRecurrentOp::CreateScopes() const { auto& scope = cache_.GetScope(step); detail::CreateVariables(scope, arg_.inlinks); detail::CreateVariables(scope, arg_.outlinks); - detail::CreateVariables(scope, memories); - detail::CreateVariables(scope, pre_memories); - detail::CreateVariables(scope, stepnet_outputs); + detail::CreateVariables(scope, states); + detail::CreateVariables(scope, ex_states); + detail::CreateVariables(scope, step_unit_outputs); } } -void DynamicRecurrentOp::ConcatOutputs() const { +void RNNAlgorithm::ConcatOutputs() { // TODO(superjom) transform this to a config int level = 0; for (size_t step = 0; step < cache_.num_steps; step++) { @@ -198,31 +200,45 @@ void DynamicRecurrentOp::ConcatOutputs() const { item.second.WriteShared(step, *tensor); } } - // the inlinks' lods should be the same, so randomly get one lod. + // the inputs' lods should be the same, so randomly get one lod. const auto& some_lod = cache_.scope->FindVar(arg_.inlinks.front())->Get().lod(); const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; for (auto& item : step_outputs_) { auto tensor = item.second.Pack(level, some_meta, some_lod); - auto* output = cache_.outlinks[item.first]->GetMutable(); + auto* output = cache_.outputs[item.first]->GetMutable(); const_cast(output)->ShareDataWith(tensor); } } -void DynamicRecurrentOp::InitStates() const { +void RNNAlgorithm::RunSteps() { + if (IsBackward()) { + // call stepnet in all the time steps reversely + for (int step = cache_.num_steps - 1; step >= 0; step--) { + auto& step_scope = cache_.GetScope(step); + step_unit_->Run(step_scope, *cache_.dev_ctx); + } + } else { + for (size_t step = 0; step < cache_.num_steps; step++) { + auto& step_scope = cache_.GetScope(step); + step_unit_->Run(step_scope, *cache_.dev_ctx); + } + } +} + +void RNNAlgorithm::InitStates() { for (size_t step = 0; step < cache_.num_steps; step++) { - for (const auto& memory : arg_.memories) { - CreateState(memory, step); - LinkState(memory, step); + for (const auto& state : arg_.states) { + CreateState(state, step); + LinkState(state, step); } } } -void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory, - size_t step) const { +void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) { auto& scope = cache_.GetScope(step); - auto& state = *cache_.GetTensor(scope, memory.var); - auto& boot_state = *cache_.GetTensor(*cache_.scope, memory.boot_var); + auto& state = *cache_.GetTensor(scope, state_attr.var); + auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var); size_t num_instances = step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; @@ -231,56 +247,79 @@ void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory, state.Resize(dims); state.mutable_data(platform::CPUPlace()); - states_[memory.var].WriteShared(step, state); + states_[state_attr.var].WriteShared(step, state); } -void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory, - size_t step) const { +void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) { auto& scope = cache_.GetScope(step); - auto& state_pre = *cache_.GetTensor(scope, memory.pre_var); + auto& state_pre = *cache_.GetTensor(scope, state.pre_var); + + // process the first state's boot-state(the 0-step in forward mode or the + // last step in backward mode) + // Only forward mode need to link the boot-state to the `pre-state` in first + // time step. In backward mode, need to copy the gradient of `pre-state` in + // first time step to the gradient of `boot-state`. + if (step == 0 && IsForward()) { + LinkInitialState(state); + } else { + size_t num_instances = + step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; + auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var); + // shink and share from previous state + auto shrinked_pre_state = pre_state->Slice(0, num_instances); + state_pre.ShareDataWith(shrinked_pre_state); + } +} +void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) { // all the step_inputs' metas should be the same, just randomly select one // and get the dyseq meta. const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; + auto& scope = cache_.GetScope(0); + auto& state_pre = *cache_.GetTensor(scope, state.pre_var); + auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var); + pre_state->mutable_data(platform::CPUPlace()); + // allocate state + state_pre.Resize(pre_state->dims()); + state_pre.mutable_data(platform::CPUPlace()); + detail::ReorderInitialState(some_meta, *pre_state, &state_pre, + pre_state->place()); +} - LoDTensor* pre_state{nullptr}; - if (step == 0) { - pre_state = cache_.GetTensor(*cache_.scope, memory.boot_var); - pre_state->mutable_data(platform::CPUPlace()); - // allocate memory - state_pre.Resize(pre_state->dims()); - state_pre.mutable_data(platform::CPUPlace()); - detail::ReorderBootState(some_meta, *pre_state, &state_pre, - pre_state->place()); - } else { - pre_state = cache_.GetTensor(cache_.GetScope(step - 1), memory.var); - } +void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) { + // all the step_inputs' metas should be the same, just randomly select one + // and get the dyseq meta. + const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; + auto& scope = cache_.GetScope(0); - // shink and share from previous state - auto shrinked_pre_state = pre_state->Slice(0, num_instances); - state_pre.ShareDataWith(shrinked_pre_state); + auto& state_pre = *cache_.GetTensor(scope, state.pre_var); + auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var); + pre_state.Resize(state_pre.dims()); + detail::RestoreInitialState(some_meta, state_pre, &pre_state, + pre_state.place()); } -void DynamicRecurrentOp::ArgCache::Init( - const rnn::ArgumentName& name, const paddle::framework::OperatorBase& op, - const paddle::framework::Scope& scope, rnn::Argument* arg) { +void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name, + const paddle::framework::OperatorBase& op, + const paddle::framework::Scope& scope, + platform::DeviceContext const* dev_ctx, + rnn::Argument* arg) { this->scope = &scope; InitArgument(name, op, arg); CacheScopes(scope, *arg); CacheInlinks(scope, arg->inlinks); CacheOutlinks(scope, arg->outlinks); + this->dev_ctx = dev_ctx; } -void DynamicRecurrentOp::ArgCache::InitArgument(const rnn::ArgumentName& name, - const OperatorBase& op, - rnn::Argument* arg) { +void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name, + const OperatorBase& op, + rnn::Argument* arg) { rnn::InitArgument(name, arg, op, false /*is_grad*/); } -void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope, - const rnn::Argument& arg) { +void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope, + const rnn::Argument& arg) { auto scopes_var = scope.FindVar(arg.step_scopes); PADDLE_ENFORCE(scopes_var != nullptr, "the step_scopes output argument [%s] should be created first " @@ -289,45 +328,85 @@ void DynamicRecurrentOp::ArgCache::CacheScopes(const Scope& scope, this->scopes = scopes_var->GetMutable>(); } -void DynamicRecurrentOp::ArgCache::CacheInlinks( +void RNNAlgorithm::ArgCache::CacheInlinks( const Scope& scope, const std::vector& names) { for (auto name : names) { auto* var = GetVariable(scope, name); - inlinks[name] = var; + inputs[name] = var; } } -void DynamicRecurrentOp::ArgCache::CacheOutlinks( +void RNNAlgorithm::ArgCache::CacheOutlinks( const Scope& scope, const std::vector& names) { for (auto name : names) { auto* var = GetVariable(scope, name); - outlinks[name] = var; + outputs[name] = var; } } -Variable* DynamicRecurrentOp::ArgCache::GetVariable(const Scope& scope, - const std::string& name) { +Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope, + const std::string& name) { auto* var = scope.FindVar(name); PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name); return var; } -LoDTensor* DynamicRecurrentOp::ArgCache::GetTensor( - const framework::Scope& scope, const std::string& name) { +LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope, + const std::string& name) { auto* var = GetVariable(scope, name); return var->GetMutable(); } -const rnn::ArgumentName DynamicRecurrentOp::kArgName{ - "step_net", "step_scopes", "inlinks", "outlinks", - "memories", "pre_memories", "boot_memories"}; +const std::array RNNAlgorithm::kArgNames{ + {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs", + "states", "ex_states", "initial_states"}, + rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD", + "inputs@GRAD", "states", "ex_states", + "initial_states@GRAD"}}}; + +void DynamicRecurrentOp::Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const { + rnn.Run( + scope, *dynamic_cast(this), dev_ctx); +} void DynamicRecurrentGradientOp::Run( - const Scope& scope, const platform::DeviceContext& dev_ctx) const {} + const Scope& scope, const platform::DeviceContext& dev_ctx) const { + rnn.Run( + scope, *dynamic_cast(this), dev_ctx); +} + +class DynamicRecurrentOpProtoAndCheckerMaker + : public framework::OpProtoAndCheckerMaker { + public: + DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = + RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward]; + // inputs and outputs stored in proto + AddInput(name.inlinks, + "the inputs that need to be segmented for each step.") + .AsDuplicable(); + AddInput(name.initial_states, "variables to initialize states.") + .AsDuplicable(); + + AddOutput(name.outlinks, "the outputs that need to concated for all steps.") + .AsDuplicable(); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.ex_states, "names of ex_states"); + AddAttr>(name.states, "names of states"); + + AddComment("This is a RNN operator for varience-length sequences."); + } +}; } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( - dynamic_recurrent, paddle::operators::DynamicRecurrentOp, - paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker); +REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp, + paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker, + dynamic_recurrent_grad, + paddle::operators::DynamicRecurrentGradientOp); diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h index ec80a1c90eee3a655febe0dd3d6c67c16ec6c64b..5b0548c3a44c9f58838ecc567ee41a587883c26a 100644 --- a/paddle/operators/dynamic_recurrent_op.h +++ b/paddle/operators/dynamic_recurrent_op.h @@ -27,47 +27,39 @@ namespace paddle { namespace operators { -class DynamicRecurrentOp : public framework::OperatorBase { +class RNNAlgorithm { public: - static const rnn::ArgumentName kArgName; + enum ComputeMode { kForward = 0, kBackward = 1 }; + static const std::array kArgNames; using value_type = float; - DynamicRecurrentOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentOp(const DynamicRecurrentOp& o) - : framework::OperatorBase( - static_cast(o)) { - // TODO(yuyang18): Implement copy ctor well. - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - + /* + * Different `Run` method for forward and backward, `_` is just for template + * specifialization. + */ + template + void Run(const framework::Scope& scope, const framework::OperatorBase& op, + const platform::DeviceContext& dev_ctx); /* * Split the inputs(LoDTensors) to segments for each time step. */ - void SplitInputs() const; + void SplitInputs(); /* * Create step-scopes to store temporary outputs in each time steps. */ - void CreateScopes() const; + void CreateScopes(); /* * Link TensorArray steps to the corresponding variables located in * step-scopes. */ - void WriteStepInputs() const; + void WriteStepInputs(); /* * Write output of each step to the corresponding TensorArray. */ - void WriteStepOutputs() const; + void WriteStepOutputs(); /* * Initialize the states, each state will have a corresponding pre-state, @@ -75,54 +67,83 @@ class DynamicRecurrentOp : public framework::OperatorBase { * pre-state in the first time step will be initialized with an zero tensor or * a tensor in parent scope if is provided. */ - void InitStates() const; + void InitStates(); /* * Create state variables for each time step. */ - void CreateState(const rnn::MemoryAttr& memory, size_t step) const; + void CreateState(const rnn::StateAttr& state, size_t step); /* * Link pre-state variable in current scope to the state variable in the - * previous time step (scope). + * previous time step (scope) by reference. + */ + void LinkState(const rnn::StateAttr& state, size_t step); + + /* + * Link the pre-state of the first time step to the `boot-state` in parent's + * scope. + */ + void LinkInitialState(const rnn::StateAttr& state); + + /* + * Copy the gradient from `pre-state` in the first step-scope to the + * `boot-state` in parent's scope. + */ + void ExportInitialStateGradient(const rnn::StateAttr& state); + + /* + * Calculate time steps. */ - void LinkState(const rnn::MemoryAttr& memory, size_t step) const; + void RunSteps(); /* * Concatenate outputs in each time step and generate a LoDTensor. */ - void ConcatOutputs() const; + void ConcatOutputs(); + + void SetComputeMode(ComputeMode mode) { mode_ = mode; } + bool IsForward() const { return mode_ == ComputeMode::kForward; } + bool IsBackward() const { return mode_ == ComputeMode::kBackward; } /* - * set a stepnet that is created according to a RecurrentOp's stepnet. + * set a step unit that is created according to a RecurrentOp's step unit. */ - void SetStepNet(std::unique_ptr net) { - PADDLE_ENFORCE_NOT_NULL(net); - stepnet_ = std::move(net); + void SetStepUnit(std::unique_ptr step_unit) { + PADDLE_ENFORCE_NOT_NULL(step_unit); + step_unit_ = std::move(step_unit); } - const OperatorBase& GetStepNet() const { return *stepnet_; } + const framework::OperatorBase& GetStepUnit() const { return *step_unit_; } const framework::TensorArray& state(const std::string& name) const { - return states_[name]; + auto it = states_.find(name); + PADDLE_ENFORCE(it != states_.end()); + return it->second; } const framework::TensorArray& step_input(const std::string& name) const { - return step_inputs_[name]; + auto it = step_inputs_.find(name); + PADDLE_ENFORCE(it != step_inputs_.end()); + return it->second; } const framework::TensorArray& step_output(const std::string& name) const { - return step_outputs_[name]; + auto it = step_outputs_.find(name); + PADDLE_ENFORCE(it != step_outputs_.end()); + return it->second; } protected: struct ArgCache { framework::Scope const* scope; std::vector* scopes; - std::map inlinks; - std::map outlinks; + std::map inputs; + std::map outputs; + platform::DeviceContext const* dev_ctx; size_t num_steps{0}; - void Init(const rnn::ArgumentName& name, const OperatorBase& op, - const framework::Scope& scope, rnn::Argument* arg); + void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op, + const framework::Scope& scope, + platform::DeviceContext const* dev_ctx, rnn::Argument* arg); framework::Scope& GetScope(size_t index) { PADDLE_ENFORCE_LT(index, num_steps); @@ -133,8 +154,8 @@ class DynamicRecurrentOp : public framework::OperatorBase { const std::string& name); private: - void InitArgument(const rnn::ArgumentName& name, const OperatorBase& op, - rnn::Argument* arg); + void InitArgument(const rnn::ArgumentName& name, + const framework::OperatorBase& op, rnn::Argument* arg); void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg); void CacheInlinks(const framework::Scope& scope, const std::vector& names); @@ -145,27 +166,49 @@ class DynamicRecurrentOp : public framework::OperatorBase { }; private: - std::unique_ptr stepnet_; - mutable std::map states_; - mutable std::map step_inputs_; - mutable std::map step_outputs_; - mutable std::map> - dy_seq_metas_; - mutable rnn::Argument arg_; - mutable ArgCache cache_; + std::unique_ptr step_unit_; + std::map states_; + std::map step_inputs_; + std::map step_outputs_; + std::map> dy_seq_metas_; + rnn::Argument arg_; + ArgCache cache_; + ComputeMode mode_{ComputeMode::kForward}; #ifdef PADDLE_WITH_TESTING - friend class DynamicRecurrentOpTestHelper; - FRIEND_TEST(DynamicRecurrentOpTestHelper, SplitInputs); - FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateCache); - FRIEND_TEST(DynamicRecurrentOpTestHelper, CreateScopes); - FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepInputs); - FRIEND_TEST(DynamicRecurrentOpTestHelper, WriteStepOutputs); - FRIEND_TEST(DynamicRecurrentOpTestHelper, InitStates); - FRIEND_TEST(DynamicRecurrentOpTestHelper, ConcatOutputs); + // test forward + friend class RNNAlgorithmTestHelper; + FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs); + FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache); + FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes); + FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs); + FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs); + FRIEND_TEST(RNNAlgorithmTestHelper, InitStates); + FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs); +// TODO(superjom) test backward #endif }; +class DynamicRecurrentOp : public framework::OperatorBase { + public: + DynamicRecurrentOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + DynamicRecurrentOp(const DynamicRecurrentOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented"); + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override; + + mutable RNNAlgorithm rnn; +}; + class DynamicRecurrentGradientOp : public framework::OperatorBase { public: DynamicRecurrentGradientOp(const std::string& type, @@ -174,8 +217,16 @@ class DynamicRecurrentGradientOp : public framework::OperatorBase { const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} + DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented"); + } + void Run(const framework::Scope& scope, const platform::DeviceContext& dev_ctx) const override; + + mutable RNNAlgorithm rnn; }; } // namespace operators diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc index 36f405568d7e4ed9a469c3af7a80192b83142b7a..fff63efb24c70b7e864e2d5b011a22883c13dede 100644 --- a/paddle/operators/dynamic_recurrent_op_test.cc +++ b/paddle/operators/dynamic_recurrent_op_test.cc @@ -43,16 +43,16 @@ LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims, return tensor; } -class DynamicRecurrentOpTestHelper : public ::testing::Test { +class RNNAlgorithmTestHelper : public ::testing::Test { protected: - const rnn::ArgumentName argname = DynamicRecurrentOp::kArgName; + const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0]; virtual void SetUp() override { CreateGlobalVariables(); auto op_desc = CreateOpDesc(); op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr); - dop = dynamic_cast(op.get()); + dop = &(dynamic_cast(op.get())->rnn); InitCacheManually(); InitStepNet(); } @@ -63,20 +63,20 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test { op_desc.set_type("dynamic_recurrent"); OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs()); - OpDescNewVar(argname.boot_memories, {"boot_mem"}, op_desc.add_inputs()); + OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs()); OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs()); OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs()); - // set pre-memories + // set pre-states auto pre_memories = op_desc.mutable_attrs()->Add(); - pre_memories->set_name(argname.pre_memories); + pre_memories->set_name(argname.ex_states); pre_memories->set_type(paddle::framework::AttrType::STRINGS); auto pre_memories_item = pre_memories->add_strings(); *pre_memories_item = "mem@pre"; - // set memories + // set states auto memories = op_desc.mutable_attrs()->Add(); - memories->set_name(argname.memories); + memories->set_name(argname.states); memories->set_type(paddle::framework::AttrType::STRINGS); auto memories_item = memories->add_strings(); *memories_item = "mem"; @@ -113,32 +113,33 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test { } void InitCacheManually() { - dop->cache_.Init(DynamicRecurrentOp::kArgName, *dop, scope, &dop->arg_); + dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context, + &dop->arg_); } void InitStepNet() { std::unique_ptr stepnet{new NetOp}; dynamic_cast(stepnet.get()) ->AppendOp(std::unique_ptr(new TestOp( - "test", {{"inlinks", {"in0"}}, {"boot_memories", {"boot_mem"}}}, - {{"outlinks", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {}))); - dop->SetStepNet(std::move(stepnet)); + "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}}, + {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {}))); + dop->SetStepUnit(std::move(stepnet)); } protected: - DynamicRecurrentOp* dop; + RNNAlgorithm* dop; std::unique_ptr op; paddle::platform::CPUDeviceContext device_context; paddle::framework::Scope scope; }; -TEST_F(DynamicRecurrentOpTestHelper, CreateCache) { +TEST_F(RNNAlgorithmTestHelper, CreateCache) { const rnn::Argument& arg = dop->arg_; ASSERT_EQ(arg.inlinks.size(), 1UL); ASSERT_EQ(arg.outlinks.size(), 1UL); } -TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) { +TEST_F(RNNAlgorithmTestHelper, SplitInputs) { dop->SplitInputs(); auto& in0_ta = dop->step_inputs_["in0"]; ASSERT_EQ(in0_ta.size(), 4UL); @@ -153,14 +154,14 @@ TEST_F(DynamicRecurrentOpTestHelper, SplitInputs) { EXPECT_EQ(batch3.dims()[0], 1); } -TEST_F(DynamicRecurrentOpTestHelper, CreateScopes) { +TEST_F(RNNAlgorithmTestHelper, CreateScopes) { dop->SplitInputs(); dop->CreateScopes(); ASSERT_EQ(dop->cache_.num_steps, 4UL); ASSERT_EQ(dop->cache_.scopes->size(), 4UL); } -TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) { +TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) { dop->SplitInputs(); dop->CreateScopes(); dop->WriteStepInputs(); @@ -173,7 +174,7 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepInputs) { } } -TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) { +TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) { dop->SplitInputs(); dop->CreateScopes(); dop->WriteStepInputs(); @@ -187,11 +188,12 @@ TEST_F(DynamicRecurrentOpTestHelper, WriteStepOutputs) { } } -TEST_F(DynamicRecurrentOpTestHelper, ConcatOutputs) { +TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) { // Let's leave this test to python unittest. } -TEST_F(DynamicRecurrentOpTestHelper, InitStates) { +TEST_F(RNNAlgorithmTestHelper, InitStates) { + dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward); dop->SplitInputs(); dop->CreateScopes(); dop->WriteStepInputs(); @@ -208,12 +210,6 @@ TEST_F(DynamicRecurrentOpTestHelper, InitStates) { auto* boot_state = scope.FindVar("boot_mem"); ASSERT_TRUE(boot_state != nullptr); - - if (step == 0) { - // check pre_state is a reference of boot_state - ASSERT_EQ(boot_state->Get().data(), - pre_state->Get().data()); - } } } diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index f59f497d9f32069b764a9f777c7e9d6da9cdb108..04dfdf7c48381240108cf924979764966599151f 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -59,7 +59,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { protected: framework::DataType IndicateDataType( const framework::ExecutionContext& ctx) const override { - return static_cast(Attr("data_type")); + return static_cast(ctx.Attr("data_type")); } }; diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc index 9be4d15a43d87ae1a27c81498e8b19b0049a3bfa..2d4d6f13720f0e6888edbddcb3243116506227ba 100644 --- a/paddle/operators/momentum_op.cc +++ b/paddle/operators/momentum_op.cc @@ -75,12 +75,17 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("VelocityOut", "(Tensor) Output updated velocity"); AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("useNesterov", "(bool) Use Nesterov Momentum") + .SetDefault(false); AddComment(R"DOC( -Momentum Algorithm (momentum). +Momentum Algorithm with a flag for Nestrov Moemntum (momentum). velocity = mu * velocity + gradient -param = param - learning_rate * velocity +if (use_nesterov): + param = param - gradient * learning_rate + mu * velocity * learning_rate +else: + param = param - learning_rate * velocity )DOC"); } diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h index f7a724f048782ceee8509ddafcb4834fd8dbba8a..e6d6d1da3df9f7e43a93fcc2e12658a01a491f81 100644 --- a/paddle/operators/momentum_op.h +++ b/paddle/operators/momentum_op.h @@ -34,6 +34,7 @@ class MomentumOpKernel : public framework::OpKernel { velocity_out->mutable_data(ctx.GetPlace()); float mu = ctx.Attr("mu"); + bool use_nesterov = ctx.Attr("useNesterov"); auto p_out = framework::EigenVector::Flatten(*param_out); auto v_out = framework::EigenVector::Flatten(*velocity_out); @@ -46,8 +47,14 @@ class MomentumOpKernel : public framework::OpKernel { auto place = ctx.GetEigenDevice(); Eigen::DSizes grad_dsize(grad->numel()); + v_out.device(place) = v * mu + g; - p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out; + if (use_nesterov) { + p_out.device(place) = p - g * lr.broadcast(grad_dsize) + + v_out * mu * lr.broadcast(grad_dsize); + } else { + p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out; + } } }; diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index dcc90e5d87c9d54df520fcee1b48198bcd953eb1..40303e3adf4db7e8336ed72667fe69afa56c3f69 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -42,7 +42,7 @@ void RecurrentAlgorithm::Run(const Scope& scope, for (size_t step_id = 0; step_id < seq_len; step_id++) { if (step_id > 0) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1); + rnn::LinkMemories(step_scopes, arg_->states, step_id, -1); } (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); } @@ -59,7 +59,8 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope, // Now all variables in scope must be created outside of op. PADDLE_ENFORCE_NOT_NULL(stepnet_); - PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs"); + PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), + "step_unit_ op has no outputs"); if (seq_len > step_scopes->size()) { for (size_t i = step_scopes->size(); i < seq_len; ++i) { @@ -86,7 +87,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope, } void RecurrentAlgorithm::InitMemories(Scope* step_scope) const { - for (auto& attr : arg_->memories) { + for (auto& attr : arg_->states) { auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable(); PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, "memory [%s]'s boot variable [%s] not exists", attr.var, @@ -100,12 +101,12 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope) const { } const rnn::ArgumentName RecurrentOp::kArgName{ - "step_net", "step_scopes", "inlinks", "outlinks", - "memories", "pre_memories", "boot_memories"}; + "step_net", "step_scopes", "inputs", "outputs", + "states", "ex_states", "initial_states"}; const rnn::ArgumentName RecurrentGradientOp::kArgName{ - "step_net", "step_scopes@GRAD", "outlinks@GRAD", "inlinks@GRAD", - "memories", "pre_memories", "boot_memories@GRAD"}; + "step_net", "step_scopes@GRAD", "outputs@GRAD", "inputs@GRAD", + "states", "ex_states", "initial_states@GRAD"}; RecurrentOp::RecurrentOp(const std::string& type, const framework::VariableNameMap& inputs, @@ -127,7 +128,7 @@ class RecurrentAlgorithmProtoAndCheckerMaker AddInput(name.inlinks, "the inputs that need to be segmented for each step.") .AsDuplicable(); - AddInput(name.boot_memories, "variables to initialize memories.") + AddInput(name.initial_states, "variables to initialize states.") .AsDuplicable(); AddOutput(name.outlinks, "the outputs that need to concated for all steps.") @@ -135,9 +136,8 @@ class RecurrentAlgorithmProtoAndCheckerMaker AddOutput(name.step_scopes, "step scopes"); // Attributes stored in AttributeMap - AddAttr>(name.pre_memories, - "names of pre-memories"); - AddAttr>(name.memories, "names of memories"); + AddAttr>(name.ex_states, "names of pre-states"); + AddAttr>(name.states, "names of states"); AddComment("This is a recurrent group operator."); } @@ -152,7 +152,7 @@ void RecurrentGradientAlgorithm::Run( rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len); for (int step_id = seq_len - 1; step_id >= 0; --step_id) { if (static_cast(step_id) != seq_len - 1) { - rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); + rnn::LinkMemories(step_scopes, arg_->states, step_id, 1); } (*stepnet_)->Run(*step_scopes[step_id], dev_ctx); } @@ -162,7 +162,7 @@ void RecurrentGradientAlgorithm::Run( void RecurrentGradientAlgorithm::LinkBootMemoryGradients( Scope* step_scope) const { - for (auto& attr : arg_->memories) { + for (auto& attr : arg_->states) { PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr, "memory variable [%s] does not exists", attr.var); PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 5e878353ce978830ede03ca6284719615ed39718..46f66a1370a35593d1911fc9b3ce76beb38c0956 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/operators/reduce_op.h" +#include "paddle/operators/net_op.h" namespace paddle { namespace operators { @@ -159,6 +160,66 @@ class ReduceMinOpMaker : public ReduceOpMaker { } }; +class NormOp : public NetOp { + public: + NormOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : NetOp(type, inputs, outputs, attrs) { + PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName, + "Input(X) of NormOp should not be null."); + PADDLE_ENFORCE_NE(Output("AbsOut"), framework::kEmptyVarName, + "Output(AbsOut) of NormOp should not be null."); + PADDLE_ENFORCE_NE(Output("PowOut"), framework::kEmptyVarName, + "Output(PowOut) of NormOp should not be null."); + PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName, + "Output(SumOut) of NormOp should not be null."); + PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName, + "Output(Out) of NormOp should not be null."); + auto dim = Attr("dim"); + auto keep_dim = Attr("keep_dim"); + auto p = Attr("p"); + PADDLE_ENFORCE_GT(p, 0, "Order of the norm should be positive."); + AppendOp(framework::OpRegistry::CreateOp("abs", {{"X", {Input("X")}}}, + {{"Y", {Output("AbsOut")}}}, {})); + AppendOp(framework::OpRegistry::CreateOp("pow", {{"X", {Output("AbsOut")}}}, + {{"Y", {Output("PowOut")}}}, + {{"factor", p}})); + framework::AttributeMap sum_attr; + sum_attr["dim"] = dim; + sum_attr["keep_dim"] = keep_dim; + AppendOp(framework::OpRegistry::CreateOp( + "reduce_sum", {{"X", {Output("PowOut")}}}, + {{"Out", {Output("SumOut")}}}, sum_attr)); + AppendOp(framework::OpRegistry::CreateOp( + "pow", {{"X", {Output("SumOut")}}}, {{"Y", {Output("Out")}}}, + {{"factor", static_cast(1. / p)}})); + CompleteAddOp(false); + } +}; + +class NormOpMaker : public ReduceOpMaker { + public: + NormOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : ReduceOpMaker(proto, op_checker) { + AddOutput("AbsOut", + "(Tensor) The intermediate output of Norm operator, " + "saving the absolute value of the input tensor X.") + .AsIntermediate(); + AddOutput("PowOut", + "(Tensor) The intermediate output of Norm operator, " + "saving the p-th power of the output tensor AbsOut.") + .AsIntermediate(); + AddOutput("SumOut", + "(Tensor) the intermediate output of Norm operator, " + "saving the sum of PowOut reduced on the given dimension.") + .AsIntermediate(); + AddAttr("p", "(float, default 2) The order of Norm.").SetDefault(2); + SetComment("Norm", "vector p-norm"); + AddComment(comment_); + } +}; + } // namespace operators } // namespace paddle @@ -176,6 +237,8 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); +REGISTER_OP_WITHOUT_GRADIENT(norm, ops::NormOp, ops::NormOpMaker); + #define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL( \ reduce_type, \ diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc index d0725f50230f70e927fd2bf55b5932dfd2347d6a..ee61ea300c33722471189d06eb09f67a083d2a4d 100644 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ b/paddle/operators/rnn/recurrent_op_utils.cc @@ -36,7 +36,7 @@ void SegmentInputs(const std::vector& step_scopes, LoDTensor* input = input_var->GetMutable(); f::DDim dims = input->dims(); PADDLE_ENFORCE_EQ(static_cast(dims[0]), seq_len, - "all the inlinks be the same length"); + "all the inputs be the same length"); f::DDim step_dims = slice_ddim(dims, 1, dims.size()); for (size_t j = 0; j < seq_len; j++) { Tensor* step_input = @@ -78,7 +78,7 @@ void ConcatOutputs(const std::vector& step_scopes, } void LinkMemories(const std::vector& scopes, - const std::vector& memories, + const std::vector& memories, const size_t step_id, const int offset) { PADDLE_ENFORCE_LT(step_id, scopes.size(), "step [%d] is out of range of step scopes' size [%d]", @@ -106,26 +106,26 @@ void InitArgument(const ArgumentName& name, Argument* arg, arg->inlinks = op.Inputs(name.inlinks); arg->outlinks = op.Outputs(name.outlinks); - auto& boot_memories = - is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories); + auto& boot_memories = is_grad ? op.Outputs(name.initial_states) + : op.Inputs(name.initial_states); // attributes - auto& memories = op.Attr>(name.memories); - auto& pre_memories = op.Attr>(name.pre_memories); + auto& memories = op.Attr>(name.states); + auto& pre_memories = op.Attr>(name.ex_states); PADDLE_ENFORCE(memories.size() == boot_memories.size(), - "the size of memories, boot_memories don't match:%d,%d", + "the size of states, initial_states don't match:%d,%d", memories.size(), boot_memories.size()); PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), - "the size of pre_memories, boot_memories don't match:%d,%d", + "the size of ex_states, initial_states don't match:%d,%d", pre_memories.size(), boot_memories.size()); - PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set"); + PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set"); for (size_t i = 0; i < memories.size(); ++i) { - rnn::MemoryAttr mem_attr; + rnn::StateAttr mem_attr; mem_attr.var = memories[i]; mem_attr.pre_var = pre_memories[i]; mem_attr.boot_var = boot_memories[i]; - (arg->memories).push_back(mem_attr); + (arg->states).push_back(mem_attr); } } diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h index fe173edb24ad015b9546546565027358f9b93476..fb0e158e07745d58c6211d33e385b324e492b95e 100644 --- a/paddle/operators/rnn/recurrent_op_utils.h +++ b/paddle/operators/rnn/recurrent_op_utils.h @@ -31,7 +31,7 @@ using Scope = framework::Scope; * boot memories in father scope. Other attributes are copied from Op's proto * attributes. */ -struct MemoryAttr { +struct StateAttr { // name of current state variable std::string var; // name of previous step's state variable @@ -46,7 +46,7 @@ struct Argument { std::string step_scopes; std::vector inlinks; std::vector outlinks; - std::vector memories; + std::vector states; }; struct ArgumentName { @@ -54,9 +54,9 @@ struct ArgumentName { std::string step_scopes; std::string inlinks; std::string outlinks; - std::string memories; // the memory name - std::string pre_memories; // the previous memory name - std::string boot_memories; // the boot memory name + std::string states; // the memory name + std::string ex_states; // the previous memory name + std::string initial_states; // the boot memory name }; /** @@ -74,7 +74,7 @@ void ConcatOutputs(const std::vector& step_scopes, const size_t seq_len, const platform::DeviceContext& ctx); void LinkMemories(const std::vector& step_scopes, - const std::vector& memories, const size_t step_id, + const std::vector& memories, const size_t step_id, const int offset); void InitArgument(const ArgumentName& name, Argument* arg, diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index f244ddc51fab3a6a82ffe517e35a97bc77f61b3e..39b53948e3cc58ff1d0ab481143b066b1a2fae16 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -65,7 +65,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { protected: framework::DataType IndicateDataType( const framework::ExecutionContext& ctx) const override { - return static_cast(Attr("data_type")); + return static_cast(ctx.Attr("data_type")); } }; diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 9ef47b88fd08b29ad0c917966c499e8d44f1e7af..26b793a4bbf5df7a2635838a6c6a8264ca8ebb67 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -413,18 +413,18 @@ All parameter, weight, gradient are variables in Paddle. return static_cast( rnn_op.release()); }) - .def("set_stepnet", + .def("set_step_unit", [](operators::DynamicRecurrentOp &self, const operators::NetOp &net) - -> void { self.SetStepNet(net.Clone()); }) + -> void { self.rnn.SetStepUnit(net.Clone()); }) .def("get_state", [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.state(name); }) + -> const TensorArray & { return self.rnn.state(name); }) .def("get_step_input", [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.step_input(name); }) + -> const TensorArray & { return self.rnn.step_input(name); }) .def("get_step_output", [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.step_output(name); }); + -> const TensorArray & { return self.rnn.step_output(name); }); // cond_op py::class_(m, "CondOp") @@ -466,6 +466,8 @@ All parameter, weight, gradient are variables in Paddle. BindVarDsec(m); BindOpDesc(m); + m.def("op_support_gpu", OpSupportGPU); + return m.ptr(); } } // namespace pybind diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 2ac455d771bf78377ce4ee7d921393d3b3958e3c..a08716c5a559def54bb7b989f250b489f6a805a2 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -141,10 +141,17 @@ RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list EOF fi +if [[ ${WITH_GPU} == "ON" ]]; then + NCCL_DEPS="apt-get install -y libnccl-dev &&" +else + NCCL_DEPS="" +fi + cat >> /paddle/build/Dockerfile <