Refactor of conv fp32 oneDNN operator (#25137) (#25572)

e7724a2c · Adam · GitHub · 9bf70039 · e7724a2c · e7724a2c
Showing 2 changed files with 467 additions and 266 deletions

paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +345 -261

paddle/fluid/platform/mkldnn_reuse.h paddle/fluid/platform/mkldnn_reuse.h +122 -5

File not found.
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -26,42 +26,24 @@ using mkldnn::memory;
 using mkldnn::primitive;
 using mkldnn::reorder;
 using mkldnn::stream;
-using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
+using platform::to_void_cast;
 inline void GetWeightsTz(std::vector<int64_t>& weights_tz,  // NOLINT
-                         int groups, bool is_conv3d) {
+                         const int groups) {
  if (groups > 1) {
-    if (is_conv3d) {
+    // if (is_conv3d) [o, i, d, h, w]->[g, o/g, i, d, h, w]
-      int output = weights_tz[0];
+    // else [o, i, h, w] -> [g, o/g, i, h, w]
-      int input = weights_tz[1];
+    weights_tz.push_back(0);
-      int dimension = weights_tz[2];
+    std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end());
-      int height = weights_tz[3];
+    weights_tz[0] = groups;
-      int width = weights_tz[4];
+    weights_tz[1] = weights_tz[1] / groups;
-      weights_tz.resize(6);
-      weights_tz[0] = groups;
-      weights_tz[1] = output / groups;
-      weights_tz[2] = input;
-      weights_tz[3] = dimension;
-      weights_tz[4] = height;
-      weights_tz[5] = width;
-    } else {
-      int output = weights_tz[0];
-      int input = weights_tz[1];
-      int height = weights_tz[2];
-      int width = weights_tz[3];
-      weights_tz.resize(5);
-      weights_tz[0] = groups;
-      weights_tz[1] = output / groups;
-      weights_tz[2] = input;
-      weights_tz[3] = height;
-      weights_tz[4] = width;
-    }
  }
 }
-inline MKLDNNMemoryFormat GetWeightsFormat(MKLDNNMemoryFormat format,
+inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format,
-                                           int groups, bool is_conv3d) {
+                                           const int groups,
+                                           const bool is_conv3d) {
  if (is_conv3d) {
    return (groups == 1) ? format : MKLDNNMemoryFormat::goidhw;
  } else {
@@ -90,284 +72,386 @@ static mkldnn::memory::data_type GetDstType(bool is_int8,
  return dst_dt;
 }
-template <typename T, typename K>
+template <typename T>
-class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+class ConvMKLDNNHandlerT
+    : public platform::MKLDNNHandlerT<T, mkldnn::convolution_forward> {
 public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+  ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx,
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                     const platform::MKLDNNDeviceContext& dev_ctx,
-                   "It must use CPUPlace.");
+                     const mkldnn::engine mkldnn_engine,
-    bool is_INT8 =
+                     platform::Place cpu_place, const Tensor* input,
-        std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
+                     const Tensor* filter, const Tensor* bias, Tensor* output,
-    if (!is_INT8) {
+                     const std::string& unique_name)
-      ComputeFP32(ctx);
+      : platform::MKLDNNHandlerT<T, mkldnn::convolution_forward>(
-    } else {
+            dev_ctx, mkldnn_engine, cpu_place,
-      std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
+            platform::CreateKey(framework::vectorize(input->dims()),
-      bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+                                unique_name)) {
-      bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
+    if (!this->isCached()) {
-      auto residual_param = ctx.Input<Tensor>("ResidualData");
+      PADDLE_ENFORCE_EQ(
-      auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation,
+          input->layout(), DataLayout::kMKLDNN,
-                               fuse_residual_conn, residual_param);
+          platform::errors::InvalidArgument(
-      if (dst_dt == mkldnn::memory::data_type::f32) {
+              "The input tensor's layout should be %d, but got %d.",
-        ComputeINT8<float>(ctx);
+              DataLayout::kMKLDNN, input->layout()));
-      } else if (dst_dt == mkldnn::memory::data_type::u8) {
+      PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
-        ComputeINT8<uint8_t>(ctx);
+                        platform::errors::InvalidArgument(
-      } else if (dst_dt == mkldnn::memory::data_type::s8) {
+                            "Wrong format set for Input tensor"));
-        ComputeINT8<int8_t>(ctx);
-      }
-    }
-  }
-  void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
+      PADDLE_ENFORCE_EQ(
-    const bool is_test = ctx.Attr<bool>("is_test");
+          filter->layout(), DataLayout::kMKLDNN,
+          platform::errors::InvalidArgument(
+              "The Filter tensor's layout should be %d, but got %d.",
+              DataLayout::kMKLDNN, filter->layout()));
+      PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef,
+                        platform::errors::InvalidArgument(
+                            "Wrong format set for Filter tensor"));
-    auto& dev_ctx =
+      PADDLE_ENFORCE_GE(
-        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+          input->dims().size(), 4,
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
+          platform::errors::InvalidArgument(
+              "Input must be with 4 or 5 dimensions, i.e. NCHW or "
+              "NCDHW, but got dimension = %d .",
+              input->dims().size()));
+      PADDLE_ENFORCE_LE(
+          input->dims().size(), 5,
+          platform::errors::InvalidArgument(
+              "Input must be with 4 or 5 dimensions, i.e. NCHW or "
+              "NCDHW, but got dimension = %d .",
+              input->dims().size()));
-    auto* input = ctx.Input<Tensor>("Input");
+      PADDLE_ENFORCE_GE(
-    auto* filter = ctx.Input<Tensor>("Filter");
+          filter->dims().size(), 4,
-    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
+          platform::errors::InvalidArgument(
-    auto* output = ctx.Output<Tensor>("Output");
+              "Filter must be with 4 or 5 dimensions, i.e. OIHW or "
+              "OIDHW, but got dimension = %d .",
+              filter->dims().size()));
+      PADDLE_ENFORCE_LE(
+          filter->dims().size(), 5,
+          platform::errors::InvalidArgument(
+              "Filter must be with 4 or 5 dimensions, i.e. OIHW or "
+              "OIDHW, but got dimension = %d .",
+              filter->dims().size()));
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
+      if (bias) {
-                      platform::errors::InvalidArgument(
+        PADDLE_ENFORCE_EQ(
-                          "The input tensor's layout should be %d, but got %d.",
+            bias->layout(), DataLayout::kMKLDNN,
-                          DataLayout::kMKLDNN, input->layout()));
+            platform::errors::InvalidArgument(
-    PADDLE_ENFORCE_NE(
+                "The Bias tensor's layout should be %d, but got %d.",
-        input->format(), MKLDNNMemoryFormat::undef,
+                DataLayout::kMKLDNN, bias->layout()));
-        platform::errors::InvalidArgument("Wrong format set for Input tensor"));
+        PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef,
+                          platform::errors::InvalidArgument(
+                              "Got wrong format for Bias tensor."));
-    PADDLE_ENFORCE_EQ(
+        PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
-        filter->layout(), DataLayout::kMKLDNN,
+                          platform::errors::InvalidArgument(
-        platform::errors::InvalidArgument(
+                              "Bias must only have 1 dimension, "
-            "The Filter tensor's layout should be %d, but got %d.",
+                              "i.e. X, but got dimension = %d .",
-            DataLayout::kMKLDNN, filter->layout()));
+                              bias->dims().size()));
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef,
+      }
-                      "Wrong format set for Filter tensor");
-    PADDLE_ENFORCE_GE(
+      const std::string fuse_activation =
-        input->dims().size(), 4,
+          ctx.Attr<std::string>("fuse_activation");
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+      const float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    PADDLE_ENFORCE_LE(
+      const float fuse_beta = ctx.Attr<float>("fuse_beta");
-        input->dims().size(), 5,
+      const bool fuse_residual_conn =
-        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+          ctx.Attr<bool>("fuse_residual_connection");
+      const int groups = ctx.Attr<int>("groups");
+      const std::string padding_algorithm =
+          ctx.Attr<std::string>("padding_algorithm");
-    PADDLE_ENFORCE_GE(
+      const auto input_dims = input->dims();
-        filter->dims().size(), 4,
+      const auto data_dims =
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
+          framework::slice_ddim(input_dims, 2, input_dims.size());
-    PADDLE_ENFORCE_LE(
+      const auto filter_dims = filter->dims();
-        filter->dims().size(), 5,
+      const auto filter_data_dims =
-        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
+          framework::slice_ddim(filter_dims, 2, filter_dims.size());
-    if (bias) {
+      const auto ksize = framework::vectorize(filter_data_dims);
-      PADDLE_ENFORCE_EQ(
+      const bool is_test = ctx.Attr<bool>("is_test");
-          bias->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument(
-              "The Bias tensor's layout should be %d, but got %d.",
-              DataLayout::kMKLDNN, bias->layout()));
-      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef,
-                        "Wrong format set for Bias tensor");
-      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
+      auto strides_temp = ctx.Attr<std::vector<int>>("strides");
-                        "Bias must only have 1 dimension, i.e. X");
+      std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
-    }
-    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
+      auto paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
+      std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
-    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
+      auto dilations_temp = ctx.Attr<std::vector<int>>("dilations");
-    std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
+      std::vector<int64_t> dilations(begin(dilations_temp),
+                                     end(dilations_temp));
-    std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
+      UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-    std::vector<int64_t> dilations(begin(dilations_temp), end(dilations_temp));
+                               data_dims, strides, ksize);
+      const bool is_conv3d = strides.size() == 3U;
-    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
+      PADDLE_ENFORCE_EQ(
-    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
+          is_conv3d
-    float fuse_beta = ctx.Attr<float>("fuse_beta");
+              ? dilations.size() == 3 && dilations[0] == 1 &&
-    bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+                    dilations[1] == 1 && dilations[2] == 1
-    int groups = ctx.Attr<int>("groups");
+              : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+          true, platform::errors::Unimplemented(
-    bool is_conv3d = strides.size() == 3U;
+                    "Dilation in oneDNN convolution is not implemented yet"));
-    auto input_dims = input->dims();
+      const auto src_tz = paddle::framework::vectorize(input->dims());
-    auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size());
-    auto filter_dims = filter->dims();
-    auto filter_data_dims =
-        framework::slice_ddim(filter_dims, 2, filter_dims.size());
-    auto ksize = framework::vectorize(filter_data_dims);
+      auto weights_tz = paddle::framework::vectorize(filter->dims());
+      GetWeightsTz(weights_tz, groups);
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
+      const auto dst_tz = paddle::framework::vectorize(output->dims());
-                             data_dims, strides, ksize);
-    std::vector<primitive> pipeline;
+      const mkldnn::memory::dims stride_dims = strides;
+      const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
-    PADDLE_ENFORCE(
+      /* create memory descriptor for convolution without specified format
-        is_conv3d
+       * ('any') which lets a primitive (convolution in this case) choose
-            ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
+       * the memory format preferred for best performance
-                  dilations[2] == 1
+       */
-            : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
+      // TODO(jczaja): This is workaround to make grad op UT's numerical
-        "dilation in convolution is not implemented yet");
+      // gradient computation proper as this op is called directly without
+      // fetch op following it, so numerical grad is computed (in python)
+      // using block formats which will give wrong results
+      const std::string data_format = ctx.Attr<std::string>("data_format");
+      auto chosen_memory_format =
+          is_test ? MKLDNNMemoryFormat::any
+                  : platform::data_format_to_memory_format(data_format);
+      // Check the format for user's special output
+      if (chosen_memory_format != MKLDNNMemoryFormat::any) {
+        if (is_conv3d) {
+          chosen_memory_format = platform::MKLDNNFormatForSize(
+              src_tz.size(), chosen_memory_format);
+        }
+      }
-    const T* input_data = input->data<T>();
+      const auto src_md = platform::MKLDNNMemDesc(
-    const T* filter_data = filter->data<T>();
+          src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+      const auto weights_md =
+          platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType<T>(),
+                                  MKLDNNMemoryFormat::any);
+      const auto dst_md = platform::MKLDNNMemDesc(
+          dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    auto src_tz = paddle::framework::vectorize(input->dims());
+      const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
-    auto weights_tz = paddle::framework::vectorize(filter->dims());
+                                         : mkldnn::prop_kind::forward_training;
-    int g = std::max(groups, 1);
-    GetWeightsTz(weights_tz, g, is_conv3d);
+      const mkldnn::primitive_attr conv_attr = CreatePostOps(
+          fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn);
-    auto dst_tz = paddle::framework::vectorize(output->dims());
+      if (bias) {
+        auto bias_tz = framework::vectorize(bias->dims());
+        auto bias_md = platform::MKLDNNMemDesc(
+            bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+        this->AcquireForwardPrimitiveDescriptor(
+            conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct,
+            src_md, weights_md, bias_md, dst_md, stride_dims,
+            mkldnn_paddings[0], mkldnn_paddings[1]);
+      } else {
+        this->AcquireForwardPrimitiveDescriptor(
+            conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct,
+            src_md, weights_md, dst_md, stride_dims, mkldnn_paddings[0],
+            mkldnn_paddings[1]);
+      }
+    }
+  }
-    // Get unique name for storing MKLDNN primitives
+  mkldnn::primitive_attr CreatePostOps(
-    const std::string key = platform::CreateKey(
+      std::string fuse_activation, float fuse_alpha, float fuse_beta,
-        src_tz, ctx.InputName("Input") + ctx.InputName("Filter"));
+      bool fuse_residual_conn, const std::vector<float> output_shift_scale = {},
+      float sum_scale = 1.0f) {
+    mkldnn::primitive_attr conv_attr;
+    mkldnn::post_ops post_operations;
+    if (output_shift_scale.size() > 0) {
+      int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
+      conv_attr.set_output_scales(mask, output_shift_scale);
+    }
-    auto src_format = input->format();
+    // Fusion with Elementwise layer relies on adding a sum post-operation with
-    MKLDNNMemoryFormat weights_format =
+    // the scale parameter. It is assumed that when fuse_residual_connection is
-        GetWeightsFormat(filter->format(), g, is_conv3d);
+    // true, the output tensor contains the data coming from residual
+    // connection. The result of this post_op is:
+    // Output = scale * Output + Conv_Out.
+    if (fuse_residual_conn) {
+      post_operations.append_sum(sum_scale);
+    }
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    if (fuse_activation == "relu" || fuse_activation == "leaky_relu") {
+      constexpr float scale = 1.0f;
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     fuse_alpha, fuse_beta);
+    } else if (fuse_activation == "relu6") {
+      constexpr float scale = 1.0f;
+      post_operations.append_eltwise(scale,
+                                     mkldnn::algorithm::eltwise_bounded_relu,
+                                     fuse_alpha, fuse_beta);
+    } else if (fuse_activation == "swish") {
+      constexpr float scale = 1.0f;
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_swish,
+                                     fuse_alpha, fuse_beta);
+    }
+    conv_attr.set_post_ops(post_operations);
+    return conv_attr;
+  }
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryWithReorder(
+      const framework::Tensor* input) {
+    const T* input_data = input->data<T>();
    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
+        framework::vectorize(input->dims()), platform::MKLDNNGetDataType<T>(),
-    auto user_weights_md = platform::MKLDNNMemDesc(
+        input->format());
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
-    /* create memory descriptor for convolution without specified format
+    return this->AcquireMemoryWithReorder(
-     * ('any') which lets a primitive (convolution in this case) choose
+        user_src_md, this->fwd_pd_->src_desc(), to_void_cast<T>(input_data),
-     * the memory format preferred for best performance
+        "@src_mem_p");
-     */
+  }
-    // TODO(jczaja): This is workaround to make grad op UT's numerical
-    // gradient computation proper as this op is called directly without
-    // fetch op following it , so numercial grad is computed (in python)
-    // using block formats which will give wrong results
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    auto chosen_memory_format =
-        is_test ? MKLDNNMemoryFormat::any
-                : platform::data_format_to_memory_format(data_format);
-    weights_format = MKLDNNMemoryFormat::any;
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryWithReorder(
-    // Check the format for user's special output
+      const framework::Tensor* filter, const int groups, const bool is_conv3d,
-    if (chosen_memory_format != MKLDNNMemoryFormat::any) {
+      const bool is_test) {
-      if (is_conv3d) {
+    // This is workaround to make execution faster, delete
-        chosen_memory_format =
+    // if statement after including md inside Tensor
-            platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target");
-      }
+    if (is_test && weights_mem_p) {
+      return weights_mem_p;
+    } else {
+      const T* filter_data = filter->data<T>();
+      auto weights_tz = framework::vectorize(filter->dims());
+      GetWeightsTz(weights_tz, groups);
+      auto user_src_md = platform::MKLDNNMemDesc(
+          weights_tz, platform::MKLDNNGetDataType<T>(),
+          GetWeightsFormat(filter->format(), groups, is_conv3d));
+      return this->AcquireMemoryWithReorder(
+          user_src_md, this->fwd_pd_->weights_desc(),
+          to_void_cast<T>(filter_data), "@weights_mem_p", is_test);
    }
+  }
-    auto src_md = platform::MKLDNNMemDesc(
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryWithReorder(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+      const framework::Tensor* bias, const bool is_test) {
-    auto weights_md = platform::MKLDNNMemDesc(
+    const T* bias_data = bias->data<T>();
-        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
+    auto user_bias_md = platform::MKLDNNMemDesc(
-    std::vector<int64_t> bias_tz;
+        framework::vectorize(bias->dims()), platform::MKLDNNGetDataType<T>(),
-    auto dst_md = platform::MKLDNNMemDesc(
+        MKLDNNMemoryFormat::x);
-        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+    return this->AcquireMemoryWithReorder(
+        user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast<T>(bias_data),
+        "@bias_mem_p", is_test);
+  }
-    platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
+  std::shared_ptr<mkldnn::memory> AcquireResidualMemory(
+      const framework::Tensor* residual_param) {
+    const T* residual_data = residual_param->data<T>();
+    auto user_residual_md = platform::MKLDNNMemDesc(
+        framework::vectorize(residual_param->dims()),
+        framework::ToMKLDNNDataType(residual_param->type()),
+        residual_param->format());
+    return this->AcquireMemoryFromPrimitive(user_residual_md,
+                                            to_void_cast<T>(residual_data),
+                                            "@user_residual_data_mem_p");
+  }
-    // create a conv primitive descriptor and save it for usage in backward
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryWithResidual(
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
+      framework::Tensor* output, const framework::Tensor* residual_param) {
-    auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
+    std::shared_ptr<dnnl::memory> dst_memory_p;
-                                 : mkldnn::prop_kind::forward_training;
+    if (residual_param->format() !=
-    if (bias) {
+        platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) {
-      bias_tz = paddle::framework::vectorize(bias->dims());
+      auto residual_memory_p = this->AcquireResidualMemory(residual_param);
-      auto bias_md = platform::MKLDNNMemDesc(
+      dst_memory_p = this->AcquireDstMemory(output);
-          bias_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+      this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst");
-      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
-          src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn,
-          fwd_prop_kind);
    } else {
-      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
+      // Changing ShareDataWith to TensorCopy results in performance drop
-          src_md, weights_md, boost::none, dst_md, strides, paddings,
+      // on ResNet architectures
-          mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta,
+      // (https://github.com/PaddlePaddle/Paddle/issues/22964)
-          fuse_residual_conn, fwd_prop_kind);
+      output->ShareDataWith(*residual_param);
+      dst_memory_p = this->AcquireDstMemory(output);
    }
+    return dst_memory_p;
+  }
+};
-    // create mkldnn memory from input tensors (data/weights)
+template <typename T, typename K>
-    auto user_src_memory_p =
+class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
-        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
+ public:
-    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-        user_weights_md, to_void_cast<T>(filter_data));
+    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "Operator DNNL Conv must use CPUPlace"));
+    bool is_INT8 =
+        std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
+    if (!is_INT8) {
+      ComputeFP32(ctx);
+    } else {
+      std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
+      bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
+      bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
+      auto residual_param = ctx.Input<Tensor>("ResidualData");
+      auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation,
+                               fuse_residual_conn, residual_param);
+      if (dst_dt == mkldnn::memory::data_type::f32) {
+        ComputeINT8<float>(ctx);
+      } else if (dst_dt == mkldnn::memory::data_type::u8) {
+        ComputeINT8<uint8_t>(ctx);
+      } else if (dst_dt == mkldnn::memory::data_type::s8) {
+        ComputeINT8<int8_t>(ctx);
+      }
+    }
+  }
-    // create reorder primitive if the input format is not the preferred one
+  void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
-    auto src_memory_p =
+    auto& dev_ctx =
-        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
-    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
-        user_weights_memory_p, pipeline, is_test);
-    std::shared_ptr<mkldnn::memory> dst_memory_p, user_residual_memory_p;
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const bool is_conv3d = ctx.Attr<std::vector<int>>("strides").size() == 3U;
+    const bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
-    if (fuse_residual_conn) {
+    const auto* input = ctx.Input<Tensor>("Input");
-      auto residual_param = ctx.Input<Tensor>("ResidualData");
+    const auto* filter = ctx.Input<Tensor>("Filter");
-      auto residual_param_data = residual_param->data<T>();
+    const auto* bias =
+        ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
-      PADDLE_ENFORCE_NE(
+    auto* output = ctx.Output<Tensor>("Output");
-          residual_param_data, nullptr,
-          "Provide data if you want MKLDNN conv+elementwise_add fusion");
-      PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(),
-                        "Output and elementwise parameter need to have the "
-                        "same dimension sizes");
-      if (residual_param->format() != handler.GetDstFormat()) {
-        auto output_data =
-            output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
-        auto residual_data_tz =
-            paddle::framework::vectorize(residual_param->dims());
-        auto residual_data_type =
-            paddle::framework::ToMKLDNNDataType(residual_param->type());
-        auto user_residual_md = platform::MKLDNNMemDesc(
+    ConvMKLDNNHandlerT<T> handler(
-            residual_data_tz, residual_data_type, residual_param->format());
+        ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias,
-        user_residual_memory_p = handler.AcquireResidualDataMemory(
+        output, ctx.InputName("Input") + ctx.InputName("Filter"));
-            user_residual_md, to_void_cast<T>(residual_param_data));
-        dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory(
+    auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
-            user_residual_memory_p, to_void_cast<T>(output_data), pipeline);
-      } else {
-        // Changing ShareDataWith to TensorCopy results in performance drop
-        // on ResNet architectures
-        // (https://github.com/PaddlePaddle/Paddle/issues/22964)
-        output->ShareDataWith(*residual_param);
-        auto output_data = output->mutable_data<T>(ctx.GetPlace());
-        dst_memory_p =
-            handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
-      }
-    } else {
-      auto output_data =
-          output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
-      dst_memory_p =
-          handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
-    }
-    auto conv_p = handler.AcquireConvolution();
+    auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
+        filter, ctx.Attr<int>("groups"), is_conv3d, is_test);
-    mkldnn::stream astream(mkldnn_engine);
+    std::shared_ptr<dnnl::memory> dst_memory_p;
-    if (bias) {
+    if (fuse_residual_conn) {
-      const T* bias_data = bias->data<T>();
+      auto* residual_param = ctx.Input<Tensor>("ResidualData");
-      auto user_bias_md = platform::MKLDNNMemDesc(
+      dst_memory_p =
-          {bias_tz}, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::x);
+          handler.AcquireDstMemoryWithResidual(output, residual_param);
-      auto user_bias_memory_p =
+    } else {
-          handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));
+      dst_memory_p = handler.AcquireDstMemory(output);
+    }
-      auto bias_memory_p =
+    auto conv_p = handler.AcquireForwardPrimitive();
-          handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
-      conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p},
+    std::unordered_map<int, dnnl::memory> args = {
-                                {MKLDNN_ARG_WEIGHTS, *weights_memory_p},
+        {MKLDNN_ARG_SRC, *src_memory_p},
-                                {MKLDNN_ARG_BIAS, *bias_memory_p},
+        {MKLDNN_ARG_WEIGHTS, *weights_memory_p},
-                                {MKLDNN_ARG_DST, *dst_memory_p}});
+        {MKLDNN_ARG_DST, *dst_memory_p}};
-    } else {
+    if (bias) {
-      conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p},
+      auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test);
-                                {MKLDNN_ARG_WEIGHTS, *weights_memory_p},
+      args.insert({MKLDNN_ARG_BIAS, *bias_memory_p});
-                                {MKLDNN_ARG_DST, *dst_memory_p}});
    }
+    mkldnn::stream astream(mkldnn_engine);
+    conv_p->execute(astream, args);
    astream.wait();
    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(GetMKLDNNFormat(*dst_memory_p));
  }
  template <typename T_out>
  void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
    const bool is_test = ctx.Attr<bool>("is_test");
@@ -516,7 +600,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      auto weights_tz = paddle::framework::vectorize(filter->dims());
      int g = std::max(groups, 1);
-      GetWeightsTz(weights_tz, g, is_conv3d);
+      GetWeightsTz(weights_tz, g);
      auto dst_tz = paddle::framework::vectorize(output->dims());
      PADDLE_ENFORCE_EQ(
@@ -562,9 +646,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
          ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw);
      /* create memory descriptor for convolution without specified format
-      * ('any') which lets a primitive (convolution in this case) choose
+       * ('any') which lets a primitive (convolution in this case) choose
-      * the memory format preferred for best performance
+       * the memory format preferred for best performance
-      */
+       */
      auto chosen_memory_format = MKLDNNMemoryFormat::any;
      std::vector<int64_t> bias_tz;
@@ -823,7 +907,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto weights_tz = paddle::framework::vectorize(filter->dims());
    int g = std::max(groups, 1);
-    GetWeightsTz(weights_tz, g, is_conv3d);
+    GetWeightsTz(weights_tz, g);
    auto dst_tz = paddle::framework::vectorize(output_grad->dims());
    auto src_format = input->format();
@@ -836,7 +920,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    const std::string key = platform::CreateKey(
        src_tz, ctx.InputName("Input") + ctx.InputName("Filter"));
-    const std::string key_conv_pd = key + "@conv_pd";
+    const std::string key_conv_pd = key + "@forward_pd";
    std::vector<primitive> pipeline;
    // Create user memory descriptors

--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -108,8 +108,20 @@ class MKLDNNHandlerT {
  }
 protected:
-  template <typename... Args>
+  bool isCached() {
-  void AcquireForwardPrimitiveDescriptor(Args&&... args) {
+    const std::string key_pd = key_common_ + "@forward_pd";
+    fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
+        dev_ctx_.GetBlob(key_pd));
+    const std::string key_p = key_ + "@forward_p";
+    return (dev_ctx_.GetBlob(key_p) != nullptr);
+  }
+  // If your primitive descriptor requires attributes, pass them as the
+  // first argument and the parameters of the descriptor constructor as the
+  // following arguments. Otherwise, all arguments will be forwarded to the
+  // descriptor constructor, including the first one.
+  template <typename Arg, typename... Args>
+  void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) {
    // Forward PD has to be passed to the Grad op, which
    // may be executed by a different thread; hence
    // for that one we use a key that does not contain the TID
@@ -123,14 +135,34 @@ class MKLDNNHandlerT {
      fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
          dev_ctx_.GetBlob(key_pd));
      if (fwd_pd_ == nullptr) {
-        auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
+        CreateForwardPrimitiveDescriptor(first_arg,
-        fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(fwd_desc,
+                                         std::forward<Args>(args)...);
-                                                                      engine_);
        dev_ctx_.SetBlob(key_pd, fwd_pd_);
      }
    }
  }
+  // Using SFINAE to select an overload of the variadic function. Workaround
+  // for `if constexpr` not being available in C++11.
+  template <class First, class... Args>
+  typename std::enable_if<std::is_same<typename std::decay<First>::type,
+                                       dnnl::primitive_attr>::value>::type
+  CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
+    auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
+    fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(
+        fwd_desc, first, engine_);
+  }
+  template <class First, class... Args>
+  typename std::enable_if<!std::is_same<typename std::decay<First>::type,
+                                        dnnl::primitive_attr>::value>::type
+  CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
+    auto fwd_desc = typename TForward::desc(std::forward<First>(first),
+                                            std::forward<Args>(args)...);
+    fwd_pd_ =
+        std::make_shared<typename TForward::primitive_desc>(fwd_desc, engine_);
+  }
  template <typename... Args>
  void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
    const std::string key_fwd_pd = key_common_ + "@forward_pd";
@@ -162,6 +194,91 @@ class MKLDNNHandlerT {
    return mem_p;
  }
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::desc md, const std::string& suffix) {
+    const auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(md, engine_);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    }
+    return mem_p;
+  }
+  void AcquireReorder(const std::shared_ptr<mkldnn::memory>& user_memory_p,
+                      const std::shared_ptr<mkldnn::memory>& target_memory_p,
+                      const std::string& suffix) {
+    const auto key_reorder_p = key_ + suffix + "reorder_p";
+    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+        dev_ctx_.GetBlob(key_reorder_p));
+    if (reorder_p == nullptr) {
+      reorder_p =
+          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+    }
+    mkldnn::stream astream(engine_);
+    reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                 {MKLDNN_ARG_TO, *target_memory_p}});
+    astream.wait();
+  }
+  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
+      const mkldnn::memory::desc& user_md,
+      const mkldnn::memory::desc& target_md, void* ptr,
+      const std::string& suffix, bool is_persistent = false) {
+    const auto target_key = key_ + suffix + "_target";
+    const auto key_reorder_p = key_ + suffix + "reorder_p";
+    const auto user_key = key_ + suffix + "_user";
+    auto target_memory_p =
+        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(target_key));
+    if (target_memory_p == nullptr) {
+      auto user_memory_p =
+          std::make_shared<dnnl::memory>(user_md, engine_, ptr);
+      if (user_md != target_md) {
+        target_memory_p = std::make_shared<mkldnn::memory>(target_md, engine_);
+        auto reorder_p =
+            std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
+        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+        mkldnn::stream astream(engine_);
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
+      } else {
+        target_memory_p = user_memory_p;
+      }
+      dev_ctx_.SetBlob(user_key, user_memory_p);
+      dev_ctx_.SetBlob(target_key, target_memory_p);
+    } else if (!is_persistent) {
+      mkldnn::stream astream(engine_);
+      auto user_memory_p =
+          std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(user_key));
+      user_memory_p->set_data_handle(ptr);
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx_.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                     {MKLDNN_ARG_TO, *target_memory_p}});
+        astream.wait();
+      }
+    }
+    return target_memory_p;
+  }
+  std::shared_ptr<mkldnn::memory> AcquireMemory(const std::string& suffix) {
+    const auto local_key = key_ + suffix;
+    return std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx_.GetBlob(local_key));
+  }
  const MKLDNNDeviceContext& dev_ctx_;
  mkldnn::engine engine_;
  platform::Place place_;