From c6c65c65c7a4e90de880cd50d0db4da1ad42da55 Mon Sep 17 00:00:00 2001
From: Jacek Czaja
Date: Wed, 22 Apr 2020 14:19:55 +0200
Subject: [PATCH] [DNNL] Added elementwise_add mkl-dnn inplace (#23477)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   2 +-
 .../framework/ir/graph_pattern_detector.cc    |  20 +--
 .../framework/ir/graph_pattern_detector.h     |   3 +-
 .../ir/mkldnn/mkldnn_inplace_pass.cc          | 146 +++++++++++++-----
 .../ir/mkldnn/mkldnn_inplace_pass_tester.cc   |  27 ++--
 .../mkldnn/elementwise_add_mkldnn_op.cc       |  43 +++---
 .../operators/mkldnn/inplace_op_tests.cmake   |   2 +-
 .../operators/mkldnn/softmax_mkldnn_op.cc     |  11 +-
 .../mkldnn/test_mkldnn_op_inplace.cc          |  75 +++++++--
 paddle/fluid/platform/mkldnn_helper.h         |   5 +
 paddle/fluid/platform/mkldnn_reuse.h          |  36 ++++-
 11 files changed, 259 insertions(+), 111 deletions(-)

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index db78c6bb82..8dd9fd271f 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -86,7 +86,7 @@ endif()
 if(WITH_MKLDNN)
   pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn)
-  pass_library(mkldnn_inplace_pass inference DEPS mkldnn_placement_pass op_registry softmax_op softmax DIR mkldnn)
+  pass_library(mkldnn_inplace_pass inference DEPS mkldnn_placement_pass op_registry elementwise_add_op activation_op softmax_op softmax DIR mkldnn)
   pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn)
   pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn)
   pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn)
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 4fbdaf3330..4b20fd7a82 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1892,30 +1892,30 @@ PDNode *patterns::MultipleQuantize::operator()() {
 }
 
 PDNode *patterns::MKLDNNInPlace::operator()() {
-  // TODO(jczaja): Enable more mkl-dnn ops e.g. activation, elementwise_add,
-  // batch_norm....
   auto possible_inplace_op =
-      pattern->NewNode(inplace_to_be_op_repr())->assert_is_ops({"softmax"});
+      pattern->NewNode(inplace_to_be_op_repr())
+          ->assert_is_ops({"elementwise_add", "softmax"});
 
-  // TODO(jczaja): Enable more mkl-dnn ops e.g. activation, elementwise_add,
-  // batch_norm....
+  // TODO(jczaja): Enable more mkl-dnn ops e.g. activation, batch_norm....
   auto input = pattern->NewNode(inplace_to_be_op_in_repr())
-                   ->assert_is_ops_input({"softmax"})
+                   ->assert_is_ops_input({"elementwise_add", "softmax"})
                    ->AsInput();
-  // TODO(jczaja): Enable more mkl-dnn ops e.g. activation, elementwise_add,
-  // batch_norm....
+  // TODO(jczaja): Enable more mkl-dnn ops e.g. activation, batch_norm....
   auto output = pattern->NewNode(inplace_to_be_op_out_repr())
-                    ->assert_is_ops_output({"softmax"})
-                    ->AsIntermediate();
+                    ->assert_is_ops_output({"elementwise_add", "softmax"})
+                    ->AsOutput();
 
   auto next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+  auto next_output = pattern->NewNode(next_op_out_repr())->AsOutput();
 
   // Check if op is MKL-DNN enabled
   possible_inplace_op->assert_op_attr("use_mkldnn", true);
 
+  // linked structure
   possible_inplace_op->LinksTo({output});
   possible_inplace_op->LinksFrom({input});
   next_op->LinksFrom({output});
+  next_op->LinksTo({next_output});
 
   return possible_inplace_op;
 }
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 3139ec6ba7..7e077a6bdc 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1140,11 +1140,12 @@ struct MKLDNNInPlace : public PatternBase {
       : PatternBase(pattern, name_scope, "mkldnn_inplace") {}
 
   PDNode* operator()();
-  // MKL-DNN's in-place ops: BatchNorm, Softmax, Layer Norm
+  // MKL-DNN's in-place ops: BatchNorm, Softmax, Elementwise_add
   PATTERN_DECL_NODE(inplace_to_be_op);
   PATTERN_DECL_NODE(inplace_to_be_op_in);
   PATTERN_DECL_NODE(inplace_to_be_op_out);
   PATTERN_DECL_NODE(next_op);
+  PATTERN_DECL_NODE(next_op_out);
 };
 
 struct TransposeFlattenConcat : public PatternBase {
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
index 9b56d6831b..6590ef44f8 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
@@ -16,6 +16,7 @@
 #include <algorithm>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -30,6 +31,7 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
   PADDLE_ENFORCE_NOT_NULL(graph,
                           platform::errors::InvalidArgument(
                               "Pointer to graph argument should not be NULL."));
+  std::unordered_map<std::string, std::string> original_output_names;
   GraphPatternDetector gpd;
   patterns::MKLDNNInPlace mkldnn_inplace{gpd.mutable_pattern(),
                                          "mkldnn_inplace"};
@@ -40,72 +42,136 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
                      Graph* g) {
     VLOG(3) << "Start to handle MKL-DNN In-Place pass";
 
-    GET_IR_NODE_FROM_SUBGRAPH(inplace_to_be_op, inplace_to_be_op,
+    GET_IR_NODE_FROM_SUBGRAPH(current_op, inplace_to_be_op, mkldnn_inplace);
+    GET_IR_NODE_FROM_SUBGRAPH(current_op_in, inplace_to_be_op_in,
                               mkldnn_inplace);
-    GET_IR_NODE_FROM_SUBGRAPH(inplace_to_be_op_in, inplace_to_be_op_in,
-                              mkldnn_inplace);
-    GET_IR_NODE_FROM_SUBGRAPH(inplace_to_be_op_out, inplace_to_be_op_out,
+    GET_IR_NODE_FROM_SUBGRAPH(current_op_out, inplace_to_be_op_out,
                               mkldnn_inplace);
     GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, mkldnn_inplace);
+    GET_IR_NODE_FROM_SUBGRAPH(next_op_out, next_op_out, mkldnn_inplace);
 
-    if ((inplace_to_be_op->Op()->HasAttr("use_mkldnn") == false) ||
-        (boost::get<bool>(inplace_to_be_op->Op()->GetAttr("use_mkldnn")) ==
-         false)) {
+    if ((current_op->Op()->HasAttr("use_mkldnn") == false) ||
+        (boost::get<bool>(current_op->Op()->GetAttr("use_mkldnn")) == false)) {
       VLOG(3) << "do not perform mkl-dnn inplace: use_mkldnn missing or set to "
                  "false";
       return;
     }
 
-    auto& infer_inplace = OpInfoMap::Instance()
-                              .Get(inplace_to_be_op->Op()->Type())
-                              .infer_inplace_;
+    auto& infer_inplace =
+        OpInfoMap::Instance().Get(current_op->Op()->Type()).infer_inplace_;
     if (!infer_inplace) {
       VLOG(3) << "do not perform mkl-dnn inplace: missing InplaceInferer";
       return;
     }
 
-    // TODO(jczaja): Enable more ops
-    if (inplace_to_be_op->Op()->Type() != "softmax") {
-      VLOG(3)
-          << "Curently works for softmax only. TODO(jczaja): support other ops";
+    VLOG(3) << "DNNL Inplace op(" << current_op->id() << ") "
+            << "Curr Node In: " << current_op_in->Name()
+            << " Curr Node out: " << current_op_out->Name();
+
+    VLOG(3) << "DNNL Inplace next op(" << next_op->id() << ") "
+            << " next Node out: " << next_op_out->Name();
+
+    auto inputs = current_op->Op()->Inputs();
+    auto outputs = current_op->Op()->Outputs();
+    auto in_to_outs = infer_inplace(false);  // strictly no CUDA for MKL-DNN
+    VLOG(3) << "DNNL InplaceInferer op(" << current_op->id() << ") "
+            << in_to_outs.begin()->first << ": "
+            << inputs[in_to_outs.begin()->first][0] << " "
+            << in_to_outs.begin()->second << ": "
+            << outputs[in_to_outs.begin()->second][0];
+    // If the InferInplace pattern does not contain the input node then skip
+    auto inplace_input_vec = inputs[in_to_outs.begin()->first];
+    if (std::find(inplace_input_vec.begin(), inplace_input_vec.end(),
+                  current_op_in->Name()) == inplace_input_vec.end()) {
+      VLOG(3) << "DNNL in-place pass SKIP pattern ";
       return;
     }
 
-    // Iterate over all nodes that are ops
-    // and check if in-place to be var is part of inputs
-    // if positive then do not perform inplace
-    for (const Node* n : graph->Nodes()) {
-      if (n->IsOp()) {
-        // Avoid searchin in op that is to be inplace
-        if ((n->id() != inplace_to_be_op->id())) {
-          auto* op = n->Op();
-          auto inputs = op->Inputs();
-          auto in_place_input = inplace_to_be_op_in->Name();
-          for (auto& it : inputs) {
-            for (auto& var_name : it.second) {
-              if (var_name == in_place_input) {
-                VLOG(3) << "MKL-DNN in-place pass: in-place var cannot be an "
-                           "input to more than one operator";
-                return;
-              }
-            }
-          }
-        }
-      }
-    }
+    // Checking if this particular node (to be in-placed, overwritten)
+    // is used anywhere else apart from the in-placed op
+    auto input_consumers = current_op_in->outputs;
+    if (input_consumers.size() > 1) {
+      VLOG(3) << "DNNL in-place pass FAIL: in-place var cannot "
+                 "be an input to multiple operators";
+      return;
+    }
 
+    // If this op was already in-placed in a previous pass placement,
+    // then we need to update the input of the next op, but the original
+    // name to be changed is gone, so we need to remember it the first
+    // time a given op is in-placed
+    if (current_op_in->Name() != current_op_out->Name()) {
+      original_output_names[current_op->Name() + current_op_in->Name()] =
+          current_op_out->Name();
+    } else {
+      VLOG(3) << "DNNL Inplace: Current op already inplaced! ";
+    }
+
+    // It may be that the next op is reusing some of the vars; we need to
+    // make sure that an unwanted in-place is not created
+    // TODO(jczaja): Make UT for that one
+    for (auto& n : current_op_out->outputs) {
+      auto& n_op_infer_inplace =
+          OpInfoMap::Instance().Get(n->Op()->Type()).infer_inplace_;
+      if ((n_op_infer_inplace == nullptr)) {
+        for (auto& m : n->outputs) {
+          if (m->Name() == current_op_in->Name()) {
+            VLOG(3) << "DNNL in-place pass FAIL: in-place var cannot "
+                       "be an output to non-inplaced next op";
+            return;
+          }
+        }
+      }
+    }
 
-    auto original_name = inplace_to_be_op_out->Name();
-    inplace_to_be_op_out->RenameVar(inplace_to_be_op_in->Name());
+    auto original_name =
+        original_output_names[current_op->Name() + current_op_in->Name()];
+    current_op_out->RenameVar(current_op_in->Name());
 
     // Get mapping of input to output
-    auto in_to_outs = infer_inplace(false);  // strictly no CUDA for MKL-DNN
-    // TODO(jczaja): Support more complex situations
     auto out_name = in_to_outs.begin()->second;
-    inplace_to_be_op->Op()->SetOutput(
-        out_name, std::vector<std::string>({inplace_to_be_op_out->Name()}));
-    next_op->Op()->RenameInput(original_name, inplace_to_be_op_out->Name());
+    current_op->Op()->SetOutput(
+        out_name, std::vector<std::string>({current_op_out->Name()}));
+
+    // If the next op in line is doing in-place,
+    // then we need to update its output as well
+
+    // Get the inferer of the next op
+    // If there is no inferer then we are done
+    auto& next_op_infer_inplace =
+        OpInfoMap::Instance().Get(next_op->Op()->Type()).infer_inplace_;
+    if (next_op_infer_inplace) {
+      auto in_to_outs = next_op_infer_inplace(false);
+      auto out_name = in_to_outs.begin()->second;
+      auto* op = next_op->Op();
+      auto inputs = op->Inputs();
+      auto outputs = op->Outputs();
+      // Check if in-place happened
+      // for the variable we changed (original name)
+      // TODO(jczaja): make recursive propagation of inplace
+      auto next_op_inplace_inputs = inputs[in_to_outs.begin()->first];
+      if ((next_op_inplace_inputs == outputs[in_to_outs.begin()->second]) &&
+          (std::find(next_op_inplace_inputs.begin(),
+                     next_op_inplace_inputs.end(),
+                     original_name) != next_op_inplace_inputs.end())) {
+        VLOG(3) << "DNNL InPlace: Next Op is in-placed, updating its "
+                   "input and output var!";
+        next_op->Op()->SetOutput(
+            out_name, std::vector<std::string>({current_op_out->Name()}));
+        next_op_out->RenameVar(current_op_in->Name());
+        // Get ops that next_op_out is linked to and update their input
+        auto next_op_out_consumers = next_op_out->outputs;  // Has to be ops
+        for (auto& c : next_op_out_consumers) {
+          c->Op()->RenameInput(original_name, current_op_out->Name());
+        }
+      }
+    }
+
+    next_op->Op()->RenameInput(original_name, current_op_out->Name());
+
     found_inplace_count++;
-    VLOG(3) << "MKL-DNN InPlace applied!";
+    VLOG(3) << "DNNL InPlace applied!";
   };
 
   gpd(graph, handler);
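The rewrite performed by the handler above boils down to renaming the op's output variable to its input variable, while remembering the pre-rename output name so that the next op (or a later application of the pass) can still have its inputs patched up. Below is a minimal standalone sketch of that bookkeeping, using hypothetical single-input/single-output Op structs rather than Paddle's IR node types:

    #include <iostream>
    #include <string>
    #include <unordered_map>

    // Hypothetical stand-ins for Paddle's IR nodes, single-input/single-output.
    struct Op {
      std::string name;
      std::string in;
      std::string out;
    };

    int main() {
      // x -> elementwise_add0 -> y -> softmax0 -> z
      Op op1{"elementwise_add0", "x", "y"};
      Op op2{"softmax0", "y", "z"};

      // Key: op name + current input; value: the output name before renaming.
      std::unordered_map<std::string, std::string> original_output_names;

      // In-place op1: its output buffer becomes its input buffer, so the
      // output variable "y" is renamed to "x"; remember "y" for later fixups.
      original_output_names[op1.name + op1.in] = op1.out;
      op1.out = op1.in;

      // The next op still consumes the old name; patch its input.
      const std::string& original = original_output_names[op1.name + op1.in];
      if (op2.in == original) op2.in = op1.out;

      std::cout << op1.in << " -> " << op1.name << " -> " << op1.out << " -> "
                << op2.name << " -> " << op2.out << "\n";
      // Prints: x -> elementwise_add0 -> x -> softmax0 -> z
    }

The map is what lets the pass run safely more than once: after the first rename, the graph no longer contains the original output name, so it must be remembered externally.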
"; + } + + // It may be that next op is reusing some of vars, we need to + // make sure that unwanted inplace is not created + // TODO(jczaja): Make UT for that one + for (auto& n : current_op_out->outputs) { + auto& n_op_infer_inplace = + OpInfoMap::Instance().Get(n->Op()->Type()).infer_inplace_; + if ((n_op_infer_inplace == nullptr)) { + for (auto& m : n->outputs) { + if (m->Name() == current_op_in->Name()) { + VLOG(3) << "DNNL in-place pass FAIL: in-place var cannot " + "be an output to non-inplaced next op"; + return; } } } } - auto original_name = inplace_to_be_op_out->Name(); - inplace_to_be_op_out->RenameVar(inplace_to_be_op_in->Name()); + auto original_name = + original_output_names[current_op->Name() + current_op_in->Name()]; + current_op_out->RenameVar(current_op_in->Name()); // Get mapping of input to output - auto in_to_outs = infer_inplace(false); // strictly no CUDA for MKL-DNN - // TODO(jczaja): Support more complex situations auto out_name = in_to_outs.begin()->second; - inplace_to_be_op->Op()->SetOutput( - out_name, std::vector({inplace_to_be_op_out->Name()})); - next_op->Op()->RenameInput(original_name, inplace_to_be_op_out->Name()); + current_op->Op()->SetOutput( + out_name, std::vector({current_op_out->Name()})); + + // If next op in a line is doing inplace + // then we need to update its output as well + + // Get inferer of next op + // If no inferer then we are done + auto& next_op_infer_inplace = + OpInfoMap::Instance().Get(next_op->Op()->Type()).infer_inplace_; + if (next_op_infer_inplace) { + auto in_to_outs = next_op_infer_inplace(false); + auto out_name = in_to_outs.begin()->second; + auto* op = next_op->Op(); + auto inputs = op->Inputs(); + auto outputs = op->Outputs(); + // Check if in-place happened + // for variable we changed (original name) + // TODO(jczaja): make recursive propagation of inplace + auto next_op_inplace_inputs = inputs[in_to_outs.begin()->first]; + if ((next_op_inplace_inputs == outputs[in_to_outs.begin()->second]) && + (std::find(next_op_inplace_inputs.begin(), + next_op_inplace_inputs.end(), + original_name) != next_op_inplace_inputs.end())) { + VLOG(3) << "DNNL InPlace: Next Op is in-placed , updating its " + "input " + "and output var!"; + next_op->Op()->SetOutput( + out_name, std::vector({current_op_out->Name()})); + next_op_out->RenameVar(current_op_in->Name()); + // Get ops that next_op_out is linked to and update their input + auto next_op_out_consumers = next_op_out->outputs; // Has to be ops + for (auto& c : next_op_out_consumers) { + c->Op()->RenameInput(original_name, current_op_out->Name()); + } + } + } + + next_op->Op()->RenameInput(original_name, current_op_out->Name()); + found_inplace_count++; - VLOG(3) << "MKL-DNN InPlace applied!"; + VLOG(3) << "DNNL InPlace applied!"; }; gpd(graph, handler); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0486541fae..794345dd7a 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -21,6 +21,9 @@ USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); +USE_OP(relu); namespace paddle { namespace framework { @@ -62,8 +65,9 @@ class MKLDNNInplacePassTest { bool branched) { ProgramDesc prog; - for (auto& v : std::vector( - {"a", "weights", "bias", "f", "g", "h", "i", "j", "k"})) { + for (auto& v : + std::vector({"a", "weights", 
"bias", "f", "g", "h", "i", + "j", "k", "l", "m", "z"})) { auto* var = prog.MutableBlock(0)->Var(v); var->SetType(proto::VarType::SELECTED_ROWS); if (v == "weights" || v == "bias") { @@ -83,9 +87,12 @@ class MKLDNNInplacePassTest { SetOp(&prog, "elementwise_add", "elementwise_add1", std::vector({"h", "i"}), std::vector({"j"}), mkldnn_enabled_op.compare("elementwise_add") == 0); + SetOp(&prog, "relu", "relu2", std::vector({"j"}), + std::vector({"k"}), + mkldnn_enabled_op.compare("softmax") == 0); if (branched == true) { SetOp(&prog, "softmax", "softmax2", std::vector({"g"}), - std::vector({"k"}), + std::vector({"z"}), mkldnn_enabled_op.compare("softmax") == 0); } @@ -105,12 +112,11 @@ class MKLDNNInplacePassTest { unsigned use_mkldnn_true_count = 0; std::unordered_map input_names; std::unordered_map output_names; + input_names["softmax"] = "X"; output_names["softmax"] = "Out"; - input_names["batch_norm"] = "X"; - output_names["batch_norm"] = "Y"; - input_names["layer_norm"] = "X"; - output_names["layer_norm"] = "Y"; + input_names["elementwise_add"] = "X"; + output_names["elementwise_add"] = "Out"; VLOG(3) << DebugString(graph); @@ -135,15 +141,18 @@ class MKLDNNInplacePassTest { TEST(MKLDNNInplacePass, inplace_softmax) { // softmax to be mkl-dnn enabled and made in-place - MKLDNNInplacePassTest().MainTest("softmax", false, 1); } TEST(MKLDNNInplacePass, inplace_softmax_branched) { - // softmax to be mkl-dnn enabled and made in-place + // softmax's input is shared by two branches. so no in-place MKLDNNInplacePassTest().MainTest("softmax", true, 0); } +TEST(MKLDNNInplacePass, inplace_elementwise_add) { + // Two elementwise_add mkl-dnn enabled op instances to be made inplace + MKLDNNInplacePassTest().MainTest("elementwise_add", false, 1); +} } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 72d2855ad4..e45964aadc 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -56,39 +56,34 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { y->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument("Wrong format set for Y tensor")); - const T* x_data = x->data(); - const T* y_data = y->data(); - auto src_x_tz = framework::vectorize(x->dims()); auto src_y_tz = framework::vectorize(y->dims()); auto dst_tz = framework::vectorize(z->dims()); - std::vector scales = {1.0f, 1.0f}; + // Currently MKL-DNN kernel supports only Z <- X + Y, shape(X) == shape(Y) + // TODO(jczaja): Binary primitive support broadcasting, so we can support + // this in kernel + platform::BinaryMKLDNNHandler handler( + dnnl::algorithm::binary_add, src_x_tz, x->format(), y->format(), + dev_ctx, ctx.GetPlace(), ctx.OutputName("Out")); - const std::string key = - platform::CreateKey(src_x_tz, ctx.OutputName("Out")); + auto src_x_memory = handler.AcquireSrcMemory(x); + auto src_y_memory = handler.AcquireSecondSrcMemory(y); - platform::SumMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); + // For Inplace src and and dst are the same memory object + auto dst_memory = + x->IsSharedBufferWith(*z) ? 
diff --git a/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake b/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake
index 0c68184e8b..cf43f5c595 100644
--- a/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake
+++ b/paddle/fluid/operators/mkldnn/inplace_op_tests.cmake
@@ -1,2 +1,2 @@
-cc_test(test_mkldnn_op_inplace SRCS mkldnn/test_mkldnn_op_inplace.cc DEPS op_registry softmax_op softmax scope device_context enforce executor)
+cc_test(test_mkldnn_op_inplace SRCS mkldnn/test_mkldnn_op_inplace.cc DEPS op_registry elementwise_add_op softmax_op softmax scope device_context enforce executor)
 
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index 996530c0e9..e957321e9c 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -45,7 +45,8 @@ class SoftmaxMKLDNNHandler
       : platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
                                  mkldnn::softmax_backward>(
             dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, uniq_name)) {
+            // Softmax may be in-place; then uniq_name is no longer unique
+            platform::CreateKey(dims, axis, uniq_name)) {
     auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
 
     this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md,
@@ -60,7 +61,7 @@ class SoftmaxMKLDNNHandler
       : platform::MKLDNNHandlerT<T, mkldnn::softmax_forward,
                                  mkldnn::softmax_backward>(
             dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, uniq_name)) {
+            platform::CreateKey(dims, axis, uniq_name)) {
     auto data_softmax_md =
         mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
     auto diff_softmax_md =
@@ -95,13 +96,13 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     auto softmax_src_memory_p = handler.AcquireSrcMemory(input);
     auto softmax_p = handler.AcquireForwardPrimitive();
     // For Inplace src and and dst are the same memory object
-    auto softmax_dst_memory_p = input->Holder() == output->Holder()
+    auto softmax_dst_memory_p = input->IsSharedBufferWith(*output)
                                     ? softmax_src_memory_p
                                     : handler.AcquireDstMemory(output);
 
     mkldnn::stream astream(dev_ctx.GetEngine());
-    softmax_p->execute(astream, {{MKLDNN_ARG_SRC, *softmax_src_memory_p},
-                                 {MKLDNN_ARG_DST, *softmax_dst_memory_p}});
+    softmax_p->execute(astream, {{DNNL_ARG_SRC, *softmax_src_memory_p},
+                                 {DNNL_ARG_DST, *softmax_dst_memory_p}});
     astream.wait();
 
     const bool is_test = ctx.Attr<bool>("is_test");
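The extra axis argument in CreateKey above guards the primitive cache: uniq_name is typically derived from the op's output variable name, and once softmax may run in-place that name can coincide across ops, so the key needs more discriminating components. A toy illustration of the idea with a simplified create_key (Paddle's real CreateKey concatenates its arguments in the same spirit):

    #include <cassert>
    #include <cstdint>
    #include <sstream>
    #include <string>
    #include <vector>

    // Simplified stand-in for platform::CreateKey: concatenate all parts.
    std::string create_key(const std::vector<int64_t>& dims, int axis,
                           const std::string& uniq_name) {
      std::ostringstream key;
      for (auto d : dims) key << d << "-";
      key << axis << "-" << uniq_name;
      return key.str();
    }

    int main() {
      // Two softmax ops over identically shaped data whose output variable
      // names have collapsed to the same name after in-place renaming.
      std::vector<int64_t> dims{32, 64};
      auto key1 = create_key(dims, /*axis=*/0, "x");
      auto key2 = create_key(dims, /*axis=*/1, "x");
      // Without the axis component both ops would map to one cached primitive.
      assert(key1 != key2);
      return 0;
    }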
diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
index 7cdc6990f3..4551813db1 100644
--- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
+++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc
@@ -27,38 +27,68 @@
 
 USE_OP(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
+USE_OP(elementwise_add);
+USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 
 namespace paddle {
 namespace operators {
 
+struct InputVars {
+  std::string name;
+  framework::LoDTensor *tensor;
+};
+
 template <typename T>
-bool TestMain(const platform::Place &place, const framework::DDim &dims) {
+bool TestMain(const platform::Place &place, const std::string &op_type,
+              const framework::DDim &dims, const int num_inputs) {
   framework::Scope scope;
-  auto *x = scope.Var("x")->GetMutable<framework::LoDTensor>();
-  auto *y = scope.Var("y")->GetMutable<framework::LoDTensor>();
-  x->Resize(dims);
-  y->Resize(dims);
-
-  size_t numel = static_cast<size_t>(framework::product(dims));
-
-  auto x_ptr = x->mutable_data<T>(place);
-  auto y_ptr = y->mutable_data<T>(place);
 
+  std::vector<InputVars> input_names = {
+      {"x", scope.Var("x")->GetMutable<framework::LoDTensor>()},
+      {"x1", num_inputs > 1
+                 ? scope.Var("x1")->GetMutable<framework::LoDTensor>()
+                 : nullptr},
+      {"x2", num_inputs > 2
+                 ? scope.Var("x2")->GetMutable<framework::LoDTensor>()
+                 : nullptr},
+      {"x3", num_inputs > 3
+                 ? scope.Var("x3")->GetMutable<framework::LoDTensor>()
+                 : nullptr},
+      {"x4", num_inputs > 4
+                 ? scope.Var("x4")->GetMutable<framework::LoDTensor>()
+                 : nullptr}};
+  auto *y = scope.Var("y")->GetMutable<framework::LoDTensor>();
+
+  // Initialize input data
   std::uniform_real_distribution<T> dist(static_cast<T>(10.0),
                                          static_cast<T>(20.0));
   std::mt19937 engine;
+  size_t numel = static_cast<size_t>(framework::product(dims));
+  for (int i = 0; i < num_inputs; ++i) {
+    input_names[i].tensor->Resize(dims);
+    auto data_ptr = input_names[i].tensor->mutable_data<T>(place);
+    for (size_t i = 0; i < numel; ++i) {
+      data_ptr[i] = dist(engine);
+    }
+  }
 
+  // Initialize output
+  y->Resize(dims);
+  auto y_ptr = y->mutable_data<T>(place);
   for (size_t i = 0; i < numel; ++i) {
-    x_ptr[i] = dist(engine);
     y_ptr[i] = static_cast<T>(0);
   }
 
   auto &pool = platform::DeviceContextPool::Instance();
 
   // Out of place (reference) computation
-  auto op_ref = framework::OpRegistry::CreateOp(
-      "softmax", {{"X", {"x"}}}, {{"Out", {"y"}}}, {{"use_mkldnn", {true}}});
+  auto op_ref = num_inputs > 1 ? framework::OpRegistry::CreateOp(
+                                     op_type, {{"X", {"x"}}, {"Y", {"x1"}}},
+                                     {{"Out", {"y"}}}, {{"use_mkldnn", {true}}})
+                               : framework::OpRegistry::CreateOp(
+                                     op_type, {{"X", {"x"}}}, {{"Out", {"y"}}},
+                                     {{"use_mkldnn", {true}}});
+
   op_ref->Run(scope, place);
   pool.Get(place)->Wait();
 
@@ -66,15 +96,20 @@ bool TestMain(const platform::Place &place, const framework::DDim &dims) {
   auto &ref_tensor = scope.FindVar("y")->Get<framework::LoDTensor>();
 
   // In-place (to be tested) computation
-  auto op = framework::OpRegistry::CreateOp(
-      "softmax", {{"X", {"x"}}}, {{"Out", {"x"}}}, {{"use_mkldnn", {true}}});
+  auto op = num_inputs > 1 ? framework::OpRegistry::CreateOp(
+                                 op_type, {{"X", {"x"}}, {"Y", {"x1"}}},
+                                 {{"Out", {"x"}}}, {{"use_mkldnn", {true}}})
+                           : framework::OpRegistry::CreateOp(
+                                 op_type, {{"X", {"x"}}}, {{"Out", {"x"}}},
+                                 {{"use_mkldnn", {true}}});
+
   op->Run(scope, place);
   platform::DeviceContextPool::Instance().Get(place)->Wait();
 
   // Get in-place result
   auto &out_tensor = scope.FindVar("x")->Get<framework::LoDTensor>();
   PADDLE_ENFORCE_EQ(
-      &out_tensor, x,
+      &out_tensor, input_names[0].tensor,
       platform::errors::InvalidArgument(
          "Input and output vars should share tensor for In-place test"));
 
@@ -88,7 +123,13 @@ bool TestMain(const platform::Place &place, const framework::DDim &dims) {
 TEST(test_softmax_inplace, cpu_place) {
   framework::DDim dims({32, 64});
   platform::CPUPlace p;
-  ASSERT_TRUE(TestMain<float>(p, dims));
+  ASSERT_TRUE(TestMain<float>(p, "softmax", dims, 1));
+}
+
+TEST(test_elementwise_add_inplace, cpu_place) {
+  framework::DDim dims({1, 12, 20, 20});
+  platform::CPUPlace p;
+  ASSERT_TRUE(TestMain<float>(p, "elementwise_add", dims, 2));
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index cc5e6b0da1..08b86b0727 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -101,6 +101,11 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in,
   }
 }
 
+struct mkldnn_dummy_primitive {
+  struct primitive_desc {};
+  struct desc {};
+};
+
 inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int64_t>& dims,
                                           mkldnn::memory::data_type data_type,
                                           MKLDNNMemoryFormat format) {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 25d50f1fc8..9c9e7924b3 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -30,7 +30,8 @@ namespace platform {
 using user_function = std::function<std::shared_ptr<float>(const float*)>;
 using memory = mkldnn::memory;
 
-template <typename T, typename TForward, typename TBackward>
+template <typename T, typename TForward,
+          typename TBackward = mkldnn_dummy_primitive>
 class MKLDNNHandlerT {
  public:
   MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
@@ -351,6 +352,35 @@ class MKLDNNHandler {
   std::string key_common_;
 };
 
+template <typename T>
+class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
+ public:
+  BinaryMKLDNNHandler(const dnnl::algorithm algo,
+                      const std::vector<int64_t>& dims,
+                      const MKLDNNMemoryFormat src0_fmt,
+                      const MKLDNNMemoryFormat src1_fmt,
+                      const platform::MKLDNNDeviceContext& dev_ctx,
+                      platform::Place cpu_place, const std::string& uniq_name)
+      : platform::MKLDNNHandlerT<T, dnnl::binary>(
+            dev_ctx, dev_ctx.GetEngine(), cpu_place,
+            platform::CreateKey(dims, uniq_name)) {
+    // TODO(jczaja): Add function checking if data already exists
+    auto src0_md = dnnl::memory::desc(dims, MKLDNNGetDataType<T>(), src0_fmt);
+    auto src1_md = dnnl::memory::desc(dims, MKLDNNGetDataType<T>(), src1_fmt);
+    auto dst_md =
+        memory::desc(dims, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
+
+    this->AcquireForwardPrimitiveDescriptor(algo, src0_md, src1_md, dst_md);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSecondSrcMemory(
+      const framework::Tensor* input) {
+    const T* input_data = input->data<T>();
+    return this->AcquireMemoryFromPrimitive(
+        this->fwd_pd_->src1_desc(), to_void_cast<T>(input_data), "@src1_mem_p");
+  }
+};
+
 class SumMKLDNNHandler : public MKLDNNHandler {
  public:
   SumMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
@@ -419,7 +449,7 @@ class ActivationMKLDNNHandler
       : platform::MKLDNNHandlerT<T, mkldnn::eltwise_forward,
                                  mkldnn::eltwise_backward>(
            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, unique_name)) {
+            platform::CreateKey(dims, "a", algorithm, unique_name)) {
     auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
 
     this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training,
@@ -437,7 +467,7 @@ class ActivationMKLDNNHandler
       : platform::MKLDNNHandlerT<T, mkldnn::eltwise_forward,
                                  mkldnn::eltwise_backward>(
            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            platform::CreateKey(dims, unique_name)) {
+            platform::CreateKey(dims, "a", algorithm, unique_name)) {
     auto diff_dst_md = platform::MKLDNNMemDesc(
         dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
     auto src_md =
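The TBackward default introduced in mkldnn_reuse.h above is what lets a forward-only primitive such as dnnl::binary instantiate MKLDNNHandlerT without naming a backward primitive type; mkldnn_dummy_primitive merely supplies the nested primitive_desc/desc types that the handler's member declarations mention. A minimal sketch of the mechanism in plain C++ (stand-in types, not the Paddle classes):

    #include <iostream>
    #include <memory>

    // Stand-in for mkldnn_dummy_primitive: just enough nested types for the
    // handler's member declarations to compile; nothing is ever created.
    struct dummy_primitive {
      struct primitive_desc {};
      struct desc {};
    };

    // Stand-in for a forward-only primitive such as dnnl::binary.
    struct binary_like {
      struct primitive_desc {};
      struct desc {};
    };

    template <typename TForward, typename TBackward = dummy_primitive>
    class HandlerT {
     public:
      std::shared_ptr<typename TForward::primitive_desc> fwd_pd_;
      // This declaration must type-check even when no backward pass exists,
      // which is exactly what the dummy's nested primitive_desc enables.
      std::shared_ptr<typename TBackward::primitive_desc> bwd_pd_;
    };

    int main() {
      HandlerT<binary_like> handler;  // TBackward defaulted away
      std::cout << (handler.bwd_pd_ == nullptr) << "\n";  // prints 1
    }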
-- 
GitLab