Commit 8c71adaa authored by pzelazko-intel, committed by Tao Luo

MKLDNN conv2d kernel added (#8451)

* MKLDNN conv2d OP kernel added

* TODOs added

* mkldnn conv2d OP refactor

* CanCUDNNBeUsed and CanMKLDNNBeUsed moved
Parent 049383c6
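In short, this commit wires MKLDNN conv2d sources into the operator build, adds conv2d/conv2d_grad MKLDNN CPU kernels, routes kernel selection through CanCUDNNBeUsed/CanMKLDNNBeUsed, and exposes a use_mkldnn attribute up to the Python layers. A minimal usage sketch follows (the paddle.fluid import path and the surrounding layer arguments are assumptions based on the Fluid API of this period; only the use_mkldnn flag itself comes from this diff):

# Minimal sketch, assuming a CPU build configured with WITH_MKLDNN=ON;
# use_mkldnn=True lets ConvOp::GetExpectedKernelType pick the MKLDNN kernel.
import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[3, 224, 224], dtype='float32')
conv = fluid.layers.conv2d(
    input=data,
    num_filters=64,
    filter_size=3,
    use_cudnn=False,
    use_mkldnn=True)  # new attribute added by this commit; defaults to False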
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
@@ -13,6 +15,8 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
@@ -36,12 +40,20 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
@@ -62,11 +74,11 @@ function(op_library TARGET)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
@@ -101,7 +113,8 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
@@ -112,6 +125,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory;  // Note: Paddle also has a "memory" namespace
using mkldnn::primitive;
using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_backward_data;
using mkldnn::convolution_direct;
using mkldnn::prop_kind;
using mkldnn::padding_kind;
using mkldnn::stream;
namespace {
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine);
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
} // anonymous namespace
template <typename T>
class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
// Get a unique name from the "argument" name of the "Output" variable.
// This name will be used as the key when saving info into the device context.
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
// memory descriptors for convolution src/weight/dst
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory primitives
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data);
std::unique_ptr<convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides,
paddings, mkldnn_engine);
// save p_conv_pd into dev_ctx to be referred in backward path
auto p_conv_pd = conv_pd.get();
std::shared_ptr<void> conv_pd_value = std::move(conv_pd);
dev_ctx.SetBlob(key_conv_pd, conv_pd_value);
// create convolution op primitive
auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory,
conv_weights_memory, conv_dst_memory);
// push op to stream and wait until MKLDNN executes it
std::vector<primitive> pipeline{conv_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
};
template <typename T>
class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output = ctx.Input<Tensor>("Output");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (!input_grad && !filter_grad) return;
// Get a unique name from the "argument" name of the "Output" variable.
// This name will be used as the key when saving info into the device context.
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
// allocate memory for gradient of input/filter
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_diff_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory
auto conv_diff_dst_memory =
memory({conv_diff_dst_md, mkldnn_engine}, (void*)output_grad_data);
// Retrieve conv_pd from device context
std::shared_ptr<void> conv_pd;
convolution_forward::primitive_desc* p_conv_pd;
conv_pd = dev_ctx.GetBlob(key_conv_pd);
PADDLE_ENFORCE(conv_pd != nullptr,
"Fail to find conv_pd in device context");
p_conv_pd =
static_cast<convolution_forward::primitive_desc*>(conv_pd.get());
// create backward conv primitive for weights
if (filter_grad) {
// create primitive descriptor
convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_weights_memory = memory(
{conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
// create backward conv primitive for weights
auto conv_bwd_weights_prim = convolution_backward_weights(
conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
conv_diff_weights_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_weights_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
// create primitive descriptor
convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_src_memory =
memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
// create backward conv primitive for data
auto conv_bwd_data_prim =
convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
conv_weights_memory, conv_diff_src_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_data_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
} // Compute()
};
namespace {
std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine) {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_weights_desc = convolution_backward_weights::desc(
convolution_direct, src, diff_weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc,
engine, conv_pd);
}
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_data_desc = convolution_backward_data::desc(
convolution_direct, diff_src, weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine,
conv_pd);
}
} // anonymous namespace
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvOpMkldnnKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvGradOpMkldnnKernel<float>);
@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
};
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (use_cudnn) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
}
#endif
return use_cudnn;
}
} // namespace platform
} // namespace paddle
@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) {
#ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace(places[i],
new platform::MKLDNNDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#else
device_contexts_.emplace(places[i],
new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#endif
} else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i],
@@ -170,64 +176,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
}
void MKLDNNDeviceContext::SetBlob(const std::string& name,
std::shared_ptr<void> data) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
auto it = p->find(name);
if (it == p->end()) {
(*p)[name] = data;  // create new blob
} else {
it->second = data;  // set data to existing blob
}
return;
}
std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
const std::string& name) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
auto it = p->find(name);
if (it != p->end()) {
return it->second;
}
return nullptr;
}
#endif
......
@@ -22,7 +22,7 @@ limitations under the License. */
#endif
#ifdef PADDLE_WITH_MKLDNN
#include <mkldnn.hpp>
#endif
#include "paddle/fluid/platform/enforce.h"
@@ -114,46 +114,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
public:
explicit MKLDNNDeviceContext(CPUPlace place);
/* \brief Get the active engine */
const mkldnn::engine& GetEngine() const { return engine_; }
// Set data to blob (i.e. name/data pair). Create blob if not existing
void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
// Find a saved blob. Return nullptr if not found
std::shared_ptr<void> GetBlob(const std::string& name) const;
private:
mkldnn::engine engine_;
std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
p_blobs_;
};
#endif
......
@@ -16,12 +16,15 @@ limitations under the License. */
#include <mkldnn.hpp>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace platform {
using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory;
using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
mkldnn::memory::data_type data_type,
mkldnn::memory::format format) {
mkldnn::memory::dims tz = dims;
return mkldnn::memory::desc({tz}, data_type, format);
}
inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
}
} // namespace platform
} // namespace paddle
@@ -1111,6 +1111,7 @@ def conv2d(input,
param_attr=None,
bias_attr=None,
use_cudnn=True,
use_mkldnn=False,
act=None):
"""
**Convolution2D Layer**
@@ -1252,7 +1253,8 @@ def conv2d(input,
'strides': stride,
'paddings': padding,
'groups': groups,
'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn
})
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
......
@@ -29,14 +29,16 @@ def simple_img_conv_pool(input,
act,
param_attr=None,
pool_type='max',
use_cudnn=True,
use_mkldnn=False):
conv_out = layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act,
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d(
input=conv_out,
@@ -58,7 +60,8 @@ def img_conv_group(input,
conv_batchnorm_drop_rate=0.0,
pool_stride=1,
pool_type=None,
use_cudnn=True,
use_mkldnn=False):
""" """
Image Convolution Group, Used for vgg net. Image Convolution Group, Used for vgg net.
""" """
...@@ -90,7 +93,8 @@ def img_conv_group(input, ...@@ -90,7 +93,8 @@ def img_conv_group(input,
padding=conv_padding[i], padding=conv_padding[i],
param_attr=param_attr[i], param_attr=param_attr[i],
act=local_conv_act, act=local_conv_act,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act)
......
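The image-network helpers above only forward the new flag down to layers.conv2d. A hedged sketch of enabling MKLDNN through simple_img_conv_pool (the paddle.fluid import path and the data-layer shape are assumptions; only the use_cudnn/use_mkldnn keywords come from this diff):

# Sketch only: simple_img_conv_pool forwards use_mkldnn unchanged to conv2d,
# so MKLDNN can be toggled without touching the underlying operator code.
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
conv_pool = fluid.nets.simple_img_conv_pool(
    input=img,
    num_filters=20,
    filter_size=5,
    pool_size=2,
    pool_stride=2,
    act="relu",
    use_cudnn=False,
    use_mkldnn=True)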
@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
class TestConv2dOp(OpTest):
def setUp(self):
self.use_cudnn = False
self.use_mkldnn = False
self.init_op_type()
self.init_group()
self.init_dilation()
@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn
}
self.outputs = {'Output': output}
@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
# def init_op_type(self):
# self.op_type = "conv_cudnn"
#----------------Conv2dMKLDNN----------------
class TestMKLDNN(TestConv2dOp):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithPad(TestWithPad):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithStride(TestWithStride):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
if __name__ == '__main__':
unittest.main()
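Since the new MKLDNN cases only override init_op_type, they can also be collected and run on their own with the stock unittest machinery; a hedged sketch (assumes test_conv2d_op.py is importable from the working directory):

# Run only the MKLDNN conv2d tests added by this commit (sketch).
import unittest
from test_conv2d_op import TestMKLDNN, TestMKLDNNWithPad, TestMKLDNNWithStride

suite = unittest.TestSuite()
for case in (TestMKLDNN, TestMKLDNNWithPad, TestMKLDNNWithStride):
    suite.addTests(unittest.defaultTestLoader.loadTestsFromTestCase(case))
unittest.TextTestRunner(verbosity=2).run(suite)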