diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
index ab7c61227114fe7a0ce2ff2515dd560706058b64..b892ac77d9ed60210ddadaecb1a4f214e5a25180 100644
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -15,6 +15,7 @@
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/mkldnn_activation_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -23,6 +24,18 @@ using paddle::framework::Tensor;
 using paddle::platform::MKLDNNDeviceContext;
 
 namespace {
+std::string gethash(const mkldnn::memory::dims &operand_dims,
+                    const mkldnn::algorithm algorithm) {
+  auto dim2str = [](const mkldnn::memory::dims &operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  };
+  return dim2str(operand_dims) + std::to_string(algorithm);
+}
+
 template <typename T, typename ExecContext>
 void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
                      const T alpha = 0, const T beta = 0) {
@@ -37,42 +50,70 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const auto *src_data = src->template data<T>();
 
   auto *dst = ctx.template Output<Tensor>("Out");
-  const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+  T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
 
   // get memory dim
   PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
                  "Input dim must be with 2 or 4");
   std::vector<int> src_tz = framework::vectorize2int(src->dims());
 
-  // create memory description
-  auto data_md = src_tz.size() == 2
-                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nc)
-                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-  // create memory primitives
-  auto src_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<T *>(src_data)));
-  auto dst_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<T *>(dst_data)));
-
-  auto forward_desc = mkldnn::eltwise_forward::desc(
-      mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
-
-  // save prim desc into global device context to be referred in backward path
-  const std::string key = ctx.op().Output("Out");
-  const std::string key_eltwise_pd = key + "@eltwise_pd";
-  auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
-      forward_desc, mkldnn_engine);
-  dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
-
-  auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
+  const std::string key = gethash(src_tz, algorithm);
+  const std::string key_src_data =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key + "@eltwise_fwd";
+
+  auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
+      dev_ctx.GetBlob(key_fwd));
+
+  // save input data to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(src_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+
+  if (p_fwd == nullptr) {
+    // create memory description
+    auto data_md = src_tz.size() == 2
+                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nc)
+                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nchw);
+
+    // create memory primitives
+    auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
+        {data_md, mkldnn_engine}, platform::to_void_cast(src_data)));
+    dev_ctx.SetBlob(key_src_mem, p_src_mem);
+
+    auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
+        {data_md, mkldnn_engine}, platform::to_void_cast(dst_data)));
+    dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
+
+    auto fwd_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
+    auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+        fwd_desc, mkldnn_engine);
+    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
+    dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(
+        *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
+    dev_ctx.SetBlob(key_fwd, p_fwd);
+  } else {
+    // primitives already exist
+    auto p_src_mem =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+    PADDLE_ENFORCE(p_src_mem != nullptr,
+                   "Fail to find eltwise p_src_mem in device context.");
+    auto p_dst_mem =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
+    PADDLE_ENFORCE(p_dst_mem != nullptr,
+                   "Fail to find eltwise p_dst_mem in device context.");
+
+    p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data));
+    p_dst_mem->set_data_handle(dst_data);
+  }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {eltwise};
+  std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())};
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 
@@ -83,8 +124,7 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const auto &mkldnn_engine = dev_ctx.GetEngine();
 
   // get buffers
-  const auto *x = ctx.template Input<Tensor>("X");
-  const auto *src = x->template data<T>();
+  const auto *out = ctx.template Input<Tensor>("Out");
 
   auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
   const auto *diff_dst = dout->template data<T>();
@@ -94,45 +134,73 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
 
   // get memory dim
-  std::vector<int> src_tz = framework::vectorize2int(x->dims());
-
-  // create memory description
-  auto data_md = src_tz.size() == 2
-                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nc)
-                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-  // create memory primitives
-  auto src_memory = mkldnn::memory(
-      {data_md, mkldnn_engine}, static_cast<void *>(const_cast<T *>(src)));
-  auto diff_src_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<T *>(diff_src)));
-  auto diff_dst_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<T *>(diff_dst)));
-
-  auto backward_desc =
-      mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
-
-  // retrieve eltwise primitive desc from device context
-  const std::string key = ctx.op().Input("Out");
-  const std::string key_eltwise_pd = key + "@eltwise_pd";
-  const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
-  PADDLE_ENFORCE(forward_pd != nullptr,
-                 "Fail to find eltwise_pd in device context");
-  auto *p_forward_pd =
-      static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
-
-  auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
-      backward_desc, mkldnn_engine, *p_forward_pd);
-
-  auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
-                                              diff_dst_memory, diff_src_memory);
+  std::vector<int> src_tz = framework::vectorize2int(out->dims());
+
+  const std::string key = gethash(src_tz, algorithm);
+  const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key + "@eltwise_grad";
+
+  const std::string key_src_data =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const auto p_src_data =
+      std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));
+
+  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
+  auto p_src_mem =
+      std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+  p_src_mem->set_data_handle(*p_src_data.get());
+
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
+      dev_ctx.GetBlob(key_grad));
+
+  if (p_grad == nullptr) {
+    // create memory description
+    auto data_md = src_tz.size() == 2
+                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nc)
+                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nchw);
+
+    // create memory primitives
+    std::shared_ptr<void> p_diff_src_mem =
+        std::make_shared<mkldnn::memory>(mkldnn::memory(
+            {data_md, mkldnn_engine}, platform::to_void_cast(diff_src)));
+    dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem);
+    std::shared_ptr<void> p_diff_dst_mem =
+        std::make_shared<mkldnn::memory>(mkldnn::memory(
+            {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst)));
+    dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem);
+
+    auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md,
+                                                   alpha, beta);
+
+    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
+    auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>(
+        dev_ctx.GetBlob(key_fwd_pd).get());
+
+    auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
+        bwd_desc, mkldnn_engine, *p_fwd_pd);
+
+    p_grad = std::make_shared<mkldnn::eltwise_backward>(
+        eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()),
+        *(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())),
+        *(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
+  } else {
+    // primitives already exist
+    auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_src_mem));
+    auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_dst_mem));
+
+    p_diff_src_mem->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_src));
+    p_diff_dst_mem->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_dst));
+  }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
+  std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())};
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 }  // anonymous namespace
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 55482abdf09516077a94ca99140ae7961f0915aa..dd71c66a75a039429f6e4b1771bb31508bb6b56d 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -41,7 +41,7 @@ namespace operators {
                                                                            \
  protected:                                                                \
   std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {    \
-    auto *op = new ::paddle::framework::OpDesc();                          \
+    auto* op = new ::paddle::framework::OpDesc();                          \
     op->SetType(#KERNEL_TYPE "_grad");                                     \
     op->SetInput("Out", Output("Out"));                                    \
     op->SetInput(::paddle::framework::GradVarName("Out"),                  \
@@ -54,23 +54,50 @@ namespace operators {
   }                                                                        \
   }
 
+framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
+                                      const framework::OperatorWithKernel& oper,
+                                      const std::string& name) {
+  framework::LibraryType library{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_MKLDNN
+  auto it = oper.Attrs().find("use_mkldnn");
+  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+  }
+#endif
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
+      ctx.GetPlace(), layout, library);
+}
+
 class ActivationOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
"X"); + } }; class ActivationOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "Out"); + } }; __attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC( diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h index f26a165b5a59f01f864d62bbf798f4cbffa65371..85664623d7330e9473286d995bec67879510dbd7 100644 --- a/paddle/fluid/operators/mkldnn_activation_op.h +++ b/paddle/fluid/operators/mkldnn_activation_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" @@ -60,52 +62,5 @@ class MKLDNNActivationGradKernel } }; -namespace { // NOLINT -framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx, - const framework::OperatorWithKernel& oper) { - framework::LibraryType library{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library = framework::LibraryType::kMKLDNN; - } -#endif - framework::DataLayout layout = framework::DataLayout::kAnyLayout; - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace(), layout, library); -} -} // anonymous namespace - -class ActivationWithMKLDNNOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return GetKernelType(ctx, *this); - } -}; - -class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); - } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return GetKernelType(ctx, *this); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 56ed5912a15437b72b769610912c7493d77e5964..f1187620d81ff3bc1deef2106edb54d6199fa927 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -38,6 +38,11 @@ void* to_void_cast(const Type* t) { return static_cast(const_cast(t)); } +template +void* to_void_reinterpret_cast(const Type* t) { + return reinterpret_cast(const_cast(t)); +} + template using tf_desc = typename Type::desc;