From 4730a4be24ba2634dccf9ee32b1ae32d1c61b553 Mon Sep 17 00:00:00 2001 From: pzelazko-intel Date: Sat, 10 Mar 2018 16:48:23 +0100 Subject: [PATCH] MKLDNN pool2d OP kernel added (#8879) * MKLDNN pool2d OP kernel added * conv2d and pool2d MKLDNN kernels renamed * MKLDNN conv2d kernel refactoring --- paddle/fluid/operators/conv_mkldnn_op.cc | 289 ++++++++---------- paddle/fluid/operators/pool_mkldnn_op.cc | 217 +++++++++++++ paddle/fluid/operators/pool_op.cc | 48 +-- python/paddle/fluid/layers/nn.py | 4 +- python/paddle/fluid/nets.py | 6 +- .../fluid/tests/unittests/test_pool2d_op.py | 39 +++ 6 files changed, 417 insertions(+), 186 deletions(-) create mode 100644 paddle/fluid/operators/pool_mkldnn_op.cc diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index d59cc2c9d..0a8a5d4c7 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -12,58 +12,21 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "mkldnn.hpp" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; -using paddle::platform::MKLDNNMemDesc; - -using mkldnn::memory; // Note: paddle has also "memory" namespace -using mkldnn::primitive; -using mkldnn::convolution_forward; -using mkldnn::convolution_backward_weights; -using mkldnn::convolution_backward_data; -using mkldnn::convolution_direct; -using mkldnn::prop_kind; -using mkldnn::padding_kind; -using mkldnn::stream; - -namespace { -std::unique_ptr -ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine); - -convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc( - const memory::desc& src, const memory::desc& diff_weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine); - -convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const memory::desc& diff_src, const memory::desc& weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine); -} // anonymous namespace - template -class ConvOpMkldnnKernel : public paddle::framework::OpKernel { +class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); auto* input = ctx.Input("Input"); @@ -88,7 +51,6 @@ class ConvOpMkldnnKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* filter_data = filter->data(); - // allocate memory for output T* output_data = output->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE(input->dims().size() == 4, @@ -102,48 +64,69 @@ class ConvOpMkldnnKernel : public paddle::framework::OpKernel { std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // TODO(pzelazko-intel): support more formats - // memory descriptors for convolution src/weight/dst - auto conv_src_md = - MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); - auto conv_weights_md = - MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); - auto conv_dst_md = - MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw); - - // create memory primitives - auto conv_src_memory = - memory({conv_src_md, mkldnn_engine}, (void*)input_data); - auto conv_weights_memory = - memory({conv_weights_md, mkldnn_engine}, (void*)filter_data); - auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data); - - std::unique_ptr conv_pd = - ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides, - paddings, mkldnn_engine); - - // save p_conv_pd into dev_ctx to be referred in backward path - auto p_conv_pd = conv_pd.get(); - std::shared_ptr conv_pd_value = std::move(conv_pd); - dev_ctx.SetBlob(key_conv_pd, conv_pd_value); + auto src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + auto weights_memory = + mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); + auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); + + std::shared_ptr conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine); + + // save conv_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_conv_pd, conv_pd); // create convolution op primitive - auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory, - conv_weights_memory, conv_dst_memory); + auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory, + weights_memory, dst_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{conv_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } - // push op to stream and wait MKLDNN until it's executed - std::vector pipeline{conv_prim}; - stream(stream::kind::eager).submit(pipeline).wait(); + private: + std::unique_ptr + ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src, + const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine) const { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, + dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + auto p_conv_pd = + new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + + return std::unique_ptr( + p_conv_pd); } }; template -class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel { +class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("Input"); @@ -170,7 +153,6 @@ class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel { T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - // allocate memory for gradient of input/filter if (input_grad) { input_grad_data = input_grad->mutable_data(ctx.GetPlace()); } @@ -184,130 +166,111 @@ class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel { std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // TODO(pzelazko-intel): support more formats - auto conv_src_md = - MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); - auto conv_diff_src_md = - MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); - auto conv_weights_md = - MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); - auto conv_diff_weights_md = - MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); - auto conv_diff_dst_md = - MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw); + auto src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto diff_src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto diff_weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto diff_dst_md = platform::MKLDNNMemDesc( + dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); // create memory - auto conv_diff_dst_memory = - memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data); + auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine}, + (void*)output_grad_data); // Retrieve conv_pd from device context - std::shared_ptr conv_pd; - convolution_forward::primitive_desc* p_conv_pd; - - conv_pd = dev_ctx.GetBlob(key_conv_pd); + auto conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); - p_conv_pd = - static_cast(conv_pd.get()); // create backward conv primitive for weights if (filter_grad) { // create primitive descriptor - convolution_backward_weights::primitive_desc conv_bwd_weights_pd = - ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md, - conv_diff_dst_md, strides, paddings, - *p_conv_pd, mkldnn_engine); + mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd = + ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md, + strides, paddings, *conv_pd, + mkldnn_engine); // create memory - auto conv_diff_weights_memory = memory( - {conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data); - auto conv_src_memory = - memory({conv_src_md, mkldnn_engine}, (void*)input_data); + auto diff_weights_memory = mkldnn::memory( + {diff_weights_md, mkldnn_engine}, (void*)filter_grad_data); + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); // create backward conv primitive for weights - auto conv_bwd_weights_prim = convolution_backward_weights( - conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory, - conv_diff_weights_memory); + auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( + conv_bwd_weights_pd, src_memory, diff_dst_memory, + diff_weights_memory); // push primitive and execute it - std::vector pipeline{conv_bwd_weights_prim}; - stream(stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline{conv_bwd_weights_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } if (input_grad) { // create primitive descriptor - convolution_backward_data::primitive_desc conv_bwd_data_pd = - ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md, - conv_diff_dst_md, strides, paddings, - *p_conv_pd, mkldnn_engine); + mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd = + ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md, + strides, paddings, *conv_pd, mkldnn_engine); // create memory - auto conv_diff_src_memory = - memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data); - auto conv_weights_memory = - memory({conv_weights_md, mkldnn_engine}, (void*)filter_data); + auto diff_src_memory = + mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data); + auto weights_memory = + mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); // create backward conv primitive for data - auto conv_bwd_data_prim = - convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory, - conv_weights_memory, conv_diff_src_memory); + auto conv_bwd_data_prim = mkldnn::convolution_backward_data( + conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory); - // push primitive and execute it - std::vector pipeline{conv_bwd_data_prim}; - stream(stream::kind::eager).submit(pipeline).wait(); + // push primitive to stream and wait until it's executed + std::vector pipeline{conv_bwd_data_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } } // Compute() + + private: + mkldnn::convolution_backward_weights::primitive_desc + ConvBwdWeightsPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, + const mkldnn::memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) const { + auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( + mkldnn::convolution_direct, src, diff_weights, diff_dst, strides, + paddings, paddings, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc( + conv_bwd_weights_desc, engine, conv_pd); + } + + mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( + const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) const { + auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( + mkldnn::convolution_direct, diff_src, weights, diff_dst, strides, + paddings, paddings, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc, + engine, conv_pd); + } }; -namespace { -std::unique_ptr ConvFwdPrimitiveDesc( - const memory::desc& src, const memory::desc& weights, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, const mkldnn::engine& engine) { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto conv_desc = mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst, - stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - - auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine); - - return std::unique_ptr( - p_conv_pd); -} - -convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc( - const memory::desc& src, const memory::desc& diff_weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) { - auto conv_bwd_weights_desc = convolution_backward_weights::desc( - convolution_direct, src, diff_weights, diff_dst, strides, paddings, - paddings, padding_kind::zero); - return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc, - engine, conv_pd); -} - -convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const memory::desc& diff_src, const memory::desc& weights, - const memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) { - auto conv_bwd_data_desc = convolution_backward_data::desc( - convolution_direct, diff_src, weights, diff_dst, strides, paddings, - paddings, padding_kind::zero); - return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine, - conv_pd); -} -} // anonymous namespace } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvOpMkldnnKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvGradOpMkldnnKernel); + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc new file mode 100644 index 000000000..c88578570 --- /dev/null +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +template +class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* input = ctx.Input("X"); + Tensor* output = ctx.Output("Out"); + + // Get an unique name from "argument" name of "Out" variable + // This name will be used as key when saving info into device context + const std::string key = ctx.op().Output("Out"); + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + // Only 2D pooling is supported now + PADDLE_ENFORCE(ksize.size() == 2, "ksize must be 2D, i.e. 2D pooling"); + PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg", + "pooling_type must be 'max' or 'avg'"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input dim must be with 4, i.e. NCHW"); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + std::shared_ptr pool_pd = + CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, + pooling_type, mkldnn_engine); + + // save pool_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_pool_pd, pool_pd); + + std::shared_ptr workspace_memory = + CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); + + // save pool_workspace_memory to be referred in backward path + dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); + + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + auto dst_memory = + mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + + auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, + *workspace_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{pool_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + private: + std::unique_ptr CreatePrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, + const std::vector& stride, const std::vector& padding, + const std::vector& kernel, const std::string& pooling_type, + const mkldnn::engine& engine) const { + auto pool_desc = mkldnn::pooling_forward::desc( + mkldnn::prop_kind::forward, + pooling_type == "max" ? mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); + + auto p_pool_pd = + new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); + return std::unique_ptr(p_pool_pd); + } + + std::unique_ptr CreateWorkspaceMemory( + std::shared_ptr pool_pd, + const std::string& pooling_type, const mkldnn::engine& engine) const { + mkldnn::memory::primitive_desc workspace_md = + pooling_type == "max" + ? pool_pd->workspace_primitive_desc() + : mkldnn::memory::primitive_desc( + {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw}, + engine); + + auto p_workspace_memory = new mkldnn::memory(workspace_md); + return std::unique_ptr(p_workspace_memory); + } +}; + +template +class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + const Tensor* in_x = ctx.Input("X"); + const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + + // Get an unique name from "argument" name of "Out" variable + // This name will be used as key when referring info from device context + const std::string key = ctx.op().Input("Out"); + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + + auto& dev_ctx = + ctx.template device_context(); + const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine(); + + const T* out_grad_data = out_grad->data(); + T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + + std::vector diff_src_tz = + paddle::framework::vectorize2int(in_x_grad->dims()); + std::vector diff_dst_tz = + paddle::framework::vectorize2int(out_grad->dims()); + + auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + // Retrieve pool_pd/pool_workspace_memory from device context + auto pool_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_pd)); + PADDLE_ENFORCE(pool_pd != nullptr, + "Fail to find pool_pd in device context"); + + auto workspace_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_workspace_memory)); + PADDLE_ENFORCE(workspace_memory != nullptr, + "Fail to find workspace_memory in device context"); + + auto pool_bwd_desc = mkldnn::pooling_backward::desc( + pooling_type == "max" ? mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, + mkldnn::padding_kind::zero); + auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( + pool_bwd_desc, mkldnn_engine, *pool_pd); + + auto diff_src_memory = + mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + auto diff_dst_memory = + mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + + auto bwd_prim = mkldnn::pooling_backward( + pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{bwd_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } // Compute() +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::PoolMKLDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ac22acb25..d78da1001 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -13,6 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -76,20 +82,18 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -107,20 +111,18 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -181,6 +183,9 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -276,6 +281,9 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) "output height and width. False is the default. If it is set to False, " "the floor function will be used.") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b4fa530aa..10b0405f4 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1406,6 +1406,7 @@ def pool2d(input, global_pooling=False, use_cudnn=True, ceil_mode=False, + use_mkldnn=False, name=None): """ This function adds the operator for pooling in 2 dimensions, using the @@ -1443,7 +1444,8 @@ def pool2d(input, "strides": pool_stride, "paddings": pool_padding, "use_cudnn": use_cudnn, - "ceil_mode": ceil_mode + "ceil_mode": ceil_mode, + "use_mkldnn": use_mkldnn }) return pool_out diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 8c627ad55..3b2e1a307 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -45,7 +45,8 @@ def simple_img_conv_pool(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, - use_cudnn=use_cudnn) + use_cudnn=use_cudnn, + use_mkldnn=use_mkldnn) return pool_out @@ -107,7 +108,8 @@ def img_conv_group(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, - use_cudnn=use_cudnn) + use_cudnn=use_cudnn, + use_mkldnn=use_mkldnn) return pool_out diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index d2107fb47..964d78f19 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -79,6 +79,7 @@ def avg_pool2D_forward_naive(x, class TestPool2d_Op(OpTest): def setUp(self): self.use_cudnn = False + self.use_mkldnn = False self.init_test_case() self.init_global_pool() self.init_op_type() @@ -99,6 +100,7 @@ class TestPool2d_Op(OpTest): 'pooling_type': self.pool_type, 'global_pooling': self.global_pool, 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter } @@ -260,5 +262,42 @@ class TestCeilModeCase4(TestCase2): self.ceil_mode = True +#--------------------test pool2d MKLDNN-------------------- +class TestMKLDNNCase1(TestPool2d_Op): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase2(TestCase1): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase3(TestCase2): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase4(TestCase3): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase5(TestCase4): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + +class TestMKLDNNCase6(TestCase5): + def init_op_type(self): + self.use_mkldnn = True + self.op_type = "pool2d" + + if __name__ == '__main__': unittest.main() -- GitLab