diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ffcdc079985fa66793599c0b7f1b3a71f400b393..ddad70a6a5f31ccb974f78ca35f045c59f45b8be 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -47,24 +47,13 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z, + scale_x, scale_y, scale_o, ctx.OutputName("Out")); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - // (jczaja) For Inplace src and dst should be the same memory object. - // So x should share buffer with z. But UT mechanics is testing inplace - // execution for this op not checking that x can be bradcasted to match in - // shape y tensor. - // This is wrong as when x is to be broadcasted then z(out) will match the - // shape of y which is bigger than x. Hence if x is smaller in shape than z - // and they share a buffer (of - // shape x) then this buffer is not big enough to hold result of elementwise - // operation. - auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z)) - ? src_x_memory - : handler.AcquireDstMemory(z); + const auto dst_memory = handler.AcquireDstMemory(z); const auto binary_prim = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index af4aab8047888a9b41b0b576535ac4f7e81002ec..1c246e8d18937087639129d32001a297eec3ca42 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -48,8 +48,9 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { if (dx) { // dx = dout*y platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); + dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, + ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f, + ctx.InputName(framework::GradVarName("Out"))); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -74,8 +75,9 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { // Handler is having nullptr passed instead of output tensor as // we want Dst buffer to be allocated by oneDNN not to use Tensor platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); + dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, + ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f, + ctx.InputName(framework::GradVarName("Out"))); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_x_memory = handler.AcquireSecondSrcMemory(x); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index d992890adeec3eb4e899f8099c8bbe210e786abc..3b92d2e2d889137c005ae6de9be6942b5af49bd3 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, paddle::platform::errors::PreconditionNotMet( "Operator DNNL eletwise_forward must use CPUPlace")); auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); bool is_inplaced = x->IsSharedBufferWith(*y); - platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, - ctx.GetPlace(), x); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, dev_ctx, + ctx.GetPlace(), x, + ctx.InputName("X"), is_inplaced); auto src_memory_p = handler.AcquireSrcMemory(x); auto dst_memory_p = is_inplaced ? src_memory_p : handler.AcquireDstMemory(y); @@ -106,14 +106,13 @@ template void eltwise_grad(const framework::ExecutionContext &ctx, mkldnn::algorithm algorithm) { auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, - ctx.GetPlace(), x, diff_y); + platform::ActivationMKLDNNHandler handler( + algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X")); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake index d7c295672e002169dc72e3ae4916280ad2082891..4130c295b203eb0fddaf7e9fd8f398baa4144c99 100644 --- a/paddle/fluid/operators/mkldnn/caching_tests.cmake +++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake @@ -1 +1 @@ -cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce) +cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc index 84ac14d04b85b3817ddf50ef0eda9a622b2b4c27..ae17048b5d568baf4722e63299c9ef2ca3fb6bae 100644 --- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -29,7 +29,6 @@ class ScaleMKLDNNKernel : public framework::OpKernel { void RunKernel(const framework::ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); @@ -37,12 +36,11 @@ class ScaleMKLDNNKernel : public framework::OpKernel { bool is_inplaced = x->IsSharedBufferWith(*out); platform::ActivationMKLDNNHandler handler( - mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(), - x); + mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x, + ctx.InputName("X"), is_inplaced); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = - is_inplaced ? src_memory_p : handler.AcquireDstMemory(out); + auto dst_memory_p = handler.AcquireDstMemory(out); auto activation_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index b0f27719bf9adc41d09b486b3bacc484cf506929..e065800e4d1c71ee4bc47fe09b26ed1ea0b9d2c9 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -32,56 +32,69 @@ using platform::to_void_cast; template class SoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { + : public platform::MKLDNNHandlerT { public: - SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine, + SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, - Tensor* output, const int axis) - : platform::MKLDNNHandlerNoCachingT( - mkldnn_engine, cpu_place) { - PADDLE_ENFORCE_EQ( - input->dims(), output->dims(), - platform::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - - auto softmax_tz = framework::vectorize(input->dims()); - auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); + Tensor* output, const int axis, + const std::string uniq_name, bool is_inplaced) + : platform::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + // Softmax may be inplace then uniq_name is no longer unique + is_inplaced ? platform::CreateKey( + dev_ctx, framework::vectorize(input->dims()), + axis, uniq_name) + : platform::CreateKey( + dev_ctx, framework::vectorize(input->dims()), + uniq_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + platform::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + + auto softmax_tz = framework::vectorize(input->dims()); + auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, + axis); + } } SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, - const mkldnn::engine mkldnn_engine, + const MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, const Tensor* out, const Tensor* out_grad, Tensor* in_x_grad, const std::string& unique_name) - : platform::MKLDNNHandlerNoCachingT( - mkldnn_engine, cpu_place) { - PADDLE_ENFORCE_EQ(out_grad->dims(), in_x_grad->dims(), - platform::errors::InvalidArgument( - "The shape of softmax_grad's input " - "and output must be identical, but shapes differ, " - "out_grad: %s in_grad: %s", - out_grad->dims(), in_x_grad->dims())); - - auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - auto softmax_tz = framework::vectorize(dims); - - auto data_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out->format()); - auto diff_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, - data_softmax_md, axis); - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument("The shape of softmax_grad's input " + "and output must be identical.")); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, + data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, + axis); + } } }; @@ -98,8 +111,9 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); - SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, - output, axis); + SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, ctx.GetPlace(), + input, output, axis, ctx.OutputName("Out"), + is_inplaced); auto softmax_src_memory_p = handler.AcquireSrcMemory(input); // For Inplace src and and dst are the same memory object @@ -135,12 +149,11 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); auto* out_grad = ctx.template Input(framework::GradVarName("Out")); auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), output, + SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 7251653793f89900efa5382db74201a1fc232574..cad4f47ec14022243ec04b50901a13f8d305a54e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,8 +33,6 @@ USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); -USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { namespace operators { @@ -66,19 +64,16 @@ class CacheTester { template void RunOperator(const platform::Place &place, const std::string &op_type, - const framework::DDim &dims, const std::string &first_input) { + const framework::DDim &dims, const std::string &output_name, + bool inplace = false) { framework::Scope scope; std::map num_inputs = {{"softmax", 1}, {"relu", 1}, - {"conv2d", 2}, {"elementwise_add", 2}, {"elementwise_mul", 2}}; - std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X"; - std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y"; - std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out"; - std::string output_name = "output"; + std::string first_input = inplace == true ? output_name : "x"; std::vector input_names = { {first_input, scope.Var(first_input)->GetMutable()}, @@ -118,40 +113,71 @@ void RunOperator(const platform::Place &place, const std::string &op_type, auto &pool = platform::DeviceContextPool::Instance(); - auto op = - num_inputs[op_type] > 1 - ? framework::OpRegistry::CreateOp( - op_type, {{first_input_var_name, {first_input}}, - {second_input_var_name, {"x1"}}}, - {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}) - : framework::OpRegistry::CreateOp( - op_type, {{first_input_var_name, {first_input}}}, - {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}); + auto op = num_inputs[op_type] > 1 + ? framework::OpRegistry::CreateOp( + op_type, {{"X", {first_input}}, {"Y", {"x1"}}}, + {{"Out", {output_name}}}, {{"use_mkldnn", {true}}}) + : framework::OpRegistry::CreateOp( + op_type, {{"X", {first_input}}}, {{"Out", {output_name}}}, + {{"use_mkldnn", {true}}}); op->Run(scope, place); pool.Get(place)->Wait(); } -TEST(test_conv2d_reuse_cache, cpu_place) { - framework::DDim dims({1, 16, 32, 64}); +TEST(test_softmax_reuse_cache, cpu_place) { + framework::DDim dims({32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "conv2d", dims, "input_signal"); - RunOperator(p, "conv2d", dims, "input_signal"); - PADDLE_ENFORCE_EQ(ct.Analyze(9), true, + RunOperator(p, "softmax", dims, "softmax_out"); + RunOperator(p, "softmax", dims, "softmax_out"); + PADDLE_ENFORCE_EQ(ct.Analyze(4), true, platform::errors::InvalidArgument( - "Invalid number of cached oneDNN objects")); + "Wrong number of cached oneDNN objects")); } -TEST(test_conv2d_noreuse_cache, cpu_place) { - framework::DDim dims({1, 16, 32, 64}); +TEST(test_softmax_noreuse_cache, cpu_place) { + framework::DDim dims({32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "conv2d", dims, "input_signal"); - RunOperator(p, "conv2d", dims, "input_signal2"); - PADDLE_ENFORCE_EQ(ct.Analyze(18), true, + RunOperator(p, "softmax", dims, "softmax_out"); + RunOperator(p, "softmax", dims, "softmax_out2"); + PADDLE_ENFORCE_EQ(ct.Analyze(8), true, platform::errors::InvalidArgument( - "Invalid number of cached oneDNN objects")); + "Wrong number of cached oneDNN objects")); +} + +TEST(test_softmax_inplace_cache, cpu_place) { + framework::DDim dims({32, 64}); + platform::CPUPlace p; + CacheTester ct; + RunOperator(p, "softmax", dims, "softmax_out"); + RunOperator(p, "softmax", dims, "softmax_out", true); + PADDLE_ENFORCE_EQ(ct.Analyze(7), true, + platform::errors::InvalidArgument( + "Wrong number of cached oneDNN objects")); +} + +TEST(test_relu_inplace_cache, cpu_place) { + framework::DDim dims({32, 64}); + platform::CPUPlace p; + CacheTester ct; + RunOperator(p, "relu", dims, "relu_out"); + RunOperator(p, "relu", dims, "relu_out", true); + PADDLE_ENFORCE_EQ(ct.Analyze(7), true, + platform::errors::InvalidArgument( + "Wrong number of cached oneDNN objects")); +} + +TEST(test_elementwise_add_reuse_cache, cpu_place) { + framework::DDim dims({32, 64}); + platform::CPUPlace p; + CacheTester ct; + RunOperator(p, "elementwise_add", dims, "elementwise_add_out"); + RunOperator(p, "relu", dims, "elementwise_add_out", true); + PADDLE_ENFORCE_EQ(ct.Analyze(8), true, + platform::errors::InvalidArgument( + "Wrong number of cached oneDNN objects")); } } // namespace operators diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 95b8e0c610b1d4b3b8892c451258c07363e2609f..f63d45d7ff6ae611dc1633e94dac00c4f6db2339 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -34,211 +34,6 @@ using framework::Tensor; using user_function = std::function(const float*)>; using memory = mkldnn::memory; -template -class MKLDNNHandlerNoCachingT { - public: - MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) - : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireForwardPrimitive() { - return std::make_shared(*fwd_pd_); - } - - std::shared_ptr AcquireBackwardPrimitive() { - return std::make_shared(*bwd_pd_); - } - - std::shared_ptr AcquireBackwardWeightsPrimitive() { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, platform::errors::Unavailable("BWD_PD should be set when " - "getting BWD prim .")); - return std::make_shared(*bwd_w_pd_); - } - - std::shared_ptr AcquireSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), - to_void_cast(input_data)); - } - - template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = - output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); - } - - template - std::shared_ptr AcquireDstMemory(void) { - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); - } - - template - std::shared_ptr AcquireDstMemory( - const framework::Tensor* output) { - const T_out* output_data = output->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), - to_void_cast(output_data)); - } - - std::shared_ptr AcquireDiffDstMemory( - const framework::Tensor* diffdst) { - const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), - to_void_cast(ptr)); - } - - std::shared_ptr AcquireDiffSrcMemory( - framework::Tensor* diffsrc) { - T* ptr = - diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); - } - - // Buffer of given Tensor is used for oneDNN computation - std::shared_ptr AcquireDiffWeightsMemory( - framework::Tensor* diff_weights) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - T* ptr = diff_weights->mutable_data( - place_, bwd_w_pd_->diff_weights_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), - ptr); - } - - // Buffer is allocated by oneDNN to store computation results - std::shared_ptr AcquireDiffWeightsMemory(void) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); - } - - protected: - // If your primitive descriptor requires attributes, pass them as a - // first argument and paramters to descriptor constructor in the following - // arguments. Otherwise, all arguments will be forwarded to descriptor - // constructor, including the first one. - template - void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { - CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); - } - - // Using sfinae to specialise variadic function. Workaround for not having - // if constexpr in C++ 11. - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(args)...); - fwd_pd_ = std::make_shared( - fwd_desc, first, engine_); - } - - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(first), - std::forward(args)...); - fwd_pd_ = - std::make_shared(fwd_desc, engine_); - } - - template - void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = typename TBackward::desc(std::forward(args)...); - bwd_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - template - void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = - typename TBackward_params::desc(std::forward(args)...); - bwd_w_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr) { - return std::make_shared(md, engine_, ptr); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md) { - return std::make_shared(md, engine_); - } - - void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p) { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - template - std::shared_ptr AcquireMemoryWithReorder( - const mkldnn::memory::desc& user_md, - const mkldnn::memory::desc& target_md, void* ptr, - const std::string& suffix, bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { - std::shared_ptr target_memory_p; - if (custom_reorder_func) { - auto reordered_data = - custom_reorder_func(reinterpret_cast(ptr)); - ptr = reinterpret_cast(reordered_data.get()); - } - auto user_memory_p = std::make_shared(user_md, engine_, ptr); - if (user_md != target_md) { - target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } else { - target_memory_p = user_memory_p; - } - return target_memory_p; - } - - mkldnn::engine engine_; - platform::Place place_; - std::shared_ptr fwd_pd_; - std::shared_ptr bwd_pd_; - std::shared_ptr bwd_w_pd_; -}; - template @@ -284,7 +79,7 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (backward_p == nullptr) { PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( - "BWD_PD should be set when " + "Error: BWD_PD should be set when " "getting BWD prim witk key: %s .", key_p)); backward_p = std::make_shared(*bwd_w_pd_); @@ -343,7 +138,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); T* ptr = diff_weights->mutable_data( place_, bwd_w_pd_->diff_weights_desc().get_size()); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr, @@ -355,7 +150,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), "@diff_wei_mem_p"); } @@ -794,70 +589,70 @@ class MKLDNNHandler { }; template -class BinaryMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { +class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, + const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z) - : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, x->layout())); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for X tensor : %d (undef)", - static_cast(x->format()))); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, y->layout())); - PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Y tensor : %d (undef)", - static_cast(y->format()))); - - const auto src_x_tz = framework::vectorize(x->dims()); - const auto src_y_tz = framework::vectorize(y->dims()); - // if output tensor(z) is nullptr then we are computing into oneDNN - // managed buffer - auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : framework::vectorize(z->dims()); - - auto src0_md = dnnl::memory::desc( - src_x_tz, platform::MKLDNNGetDataType(), x->format()); - auto src1_md = dnnl::memory::desc( - src_y_tz, platform::MKLDNNGetDataType(), y->format()); - if (rankdiff > 0) { // Second input is of smaller rank than first - std::vector dims1_ex(rankdiff, 1); - dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), - src_y_tz.begin(), src_y_tz.end()); - src1_md = src1_md.reshape(dims1_ex); - } else if (rankdiff < 0) { // First input is of smaller than second - std::vector dims0_ex(-rankdiff, 1); - dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), - src_x_tz.begin(), src_x_tz.end()); - src0_md = src0_md.reshape(dims0_ex); - } - const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + float scale_x, float scale_y, float scale_z, + const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor.")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor.")); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); + PADDLE_ENFORCE_NE( + y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Y tensor.")); + + const auto src_x_tz = framework::vectorize(x->dims()); + const auto src_y_tz = framework::vectorize(y->dims()); + // if output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer + auto rankdiff = x->dims().size() - y->dims().size(); + const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : framework::vectorize(z->dims()); + + auto src0_md = dnnl::memory::desc( + src_x_tz, platform::MKLDNNGetDataType(), x->format()); + auto src1_md = dnnl::memory::desc( + src_y_tz, platform::MKLDNNGetDataType(), y->format()); + if (rankdiff > 0) { // Second input is of smaller rank than first + std::vector dims1_ex(rankdiff, 1); + dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), + src_y_tz.begin(), src_y_tz.end()); + src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), src_x_tz.end()); + src0_md = src0_md.reshape(dims0_ex); + } + const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, - dst_md); + auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, + src1_md, dst_md); + } } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), - to_void_cast(input_data)); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); } private: @@ -980,95 +775,111 @@ class ReductionMKLDNNHandler template class ActivationMKLDNNHandler - : public MKLDNNHandlerNoCachingT { + : public MKLDNNHandlerT { public: ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const mkldnn::engine engine, Place cpu_place, - const framework::Tensor* in_x) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - // eltwise_linear means we are in scale op - if (algorithm == mkldnn::algorithm::eltwise_linear) { - bool bias_after_scale = ctx.Attr("bias_after_scale"); - auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") - : (float)*(scale_tensor->data()); - beta = ctx.Attr("bias"); - // if bias_after_scale == true - // out = scale*X + bias - // else - // out = scale*(X + bias) = scale*X + scale*bias - if (!bias_after_scale) beta *= alpha; - } else { - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); + const MKLDNNDeviceContext& dev_ctx, Place cpu_place, + const framework::Tensor* in_x, + const std::string& unique_name, bool is_inplaced) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + is_inplaced ? platform::CreateKey( + dev_ctx, framework::vectorize(in_x->dims()), "a", + algorithm, unique_name) + : platform::CreateKey( + dev_ctx, framework::vectorize(in_x->dims()), "a", + unique_name)) { + if (!this->isCached()) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + // eltwise_linear means we are in scale op + if (algorithm == mkldnn::algorithm::eltwise_linear) { + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") + : (float)*(scale_tensor->data()); + beta = ctx.Attr("bias"); + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + if (!bias_after_scale) beta *= alpha; + } else { + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } } - } - PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, - platform::errors::Unimplemented( - "Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - in_x->dims().size())); + PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, + platform::errors::Unimplemented( + "Input dimension size can be 1, 2, 3, 4, " + "5, or 6, but now the dimension size is", + in_x->dims().size())); - auto src_tz = framework::vectorize(in_x->dims()); - auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto md = - mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); + auto src_tz = framework::vectorize(in_x->dims()); + auto src_fmt = + src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), + src_fmt); - this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, - algorithm, md, alpha, beta); + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); + } } ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const mkldnn::engine engine, Place cpu_place, - const framework::Tensor* in_x, const Tensor* out_grad) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const MKLDNNDeviceContext& dev_ctx, Place cpu_place, + const framework::Tensor* in_x, const Tensor* out_grad, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + "a", unique_name)) { + if (!this->isBwdCached()) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } - auto diff_dst_tz = framework::vectorize(out_grad->dims()); + auto diff_dst_tz = framework::vectorize(out_grad->dims()); - auto src_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto diff_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); - auto dims = framework::vectorize(in_x->dims()); - auto diff_dst_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), src_fmt); + auto dims = framework::vectorize(in_x->dims()); + auto diff_dst_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, - algorithm, src_md, alpha, beta); - this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, - alpha, beta); + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta); + this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, + alpha, beta); + } } std::shared_ptr AcquireBackwardSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), - to_void_cast(input_data)); + to_void_cast(input_data), + "@bwd-src_mem_p"); } }; @@ -1619,6 +1430,11 @@ using ConvMKLDNNHandler = mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights>; +using ConvTransposeMKLDNNHandler = + ConvMKLDNNTemplateHandler; + template static std::shared_ptr SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output,