Unverified commit c923e6c9, authored by Chen Weihang, committed by GitHub

Adapting device-specific Extra Attributes for the PHI kernel (#46342)

* add extra attr property set

* add type_info for all context

* add onednn context to all context

* fix context compile error

* simplify conv kernel args

* pass runtime attr into dev_ctx

* fix macro error

* clear conv_grad_kernel extra args

* merge conv_grad_grad into conv_grad

* clear conv2d_grad_grad extra attrs

* clear yaml and eager extra attr

* fix conv1d error

* change to thread local

* fix npu compile failed

* try to fix windows compile failed

* add conv2d onednn phi kernel

* fix ci bugs (#36)

* fix compile bugs (#38)

* fix extra input transform bug (#39)

* support dynamic created attr (#40)

* reset extra info gen code

* rm conv_grad_grad kernel

* reimpl pass attr adapting

* add int attr support

* remove vector inputnames creating

* fix map at error

* Update paddle/phi/kernels/onednn/conv_grad_kernel.cc
Co-authored-by: Sławomir Siwek <slawomir.siwek@intel.com>

* remove useless extra attrs

* replace mkldnn_engine by onednn_engine
Co-authored-by: YuanRisheng <yuanrisheng@baidu.com>
Co-authored-by: Sławomir Siwek <slawomir.siwek@intel.com>
Parent f82d7e3c
......@@ -24,10 +24,7 @@ paddle::experimental::Tensor conv2d_ad_func(
const paddle::experimental::Tensor& filter,
std::vector<int> strides,
std::vector<int> paddings,
std::string paddding_algorithm,
int groups,
std::string padding_algorithm,
std::vector<int> dilations,
std::string data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search);
int groups,
std::string data_format);
......@@ -29,13 +29,10 @@ paddle::experimental::Tensor conv2d_ad_func(
const paddle::experimental::Tensor& filter,
std::vector<int> strides,
std::vector<int> paddings,
std::string paddding_algorithm,
int groups,
std::string padding_algorithm,
std::vector<int> dilations,
std::string data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search) {
int groups,
std::string data_format) {
// Dygraph Record Event
paddle::platform::RecordEvent dygraph_entrance_record_event(
"conv2d dygraph", paddle::platform::TracerEventType::Operator, 1);
......@@ -64,13 +61,10 @@ paddle::experimental::Tensor conv2d_ad_func(
new_filter,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search);
groups,
data_format);
}
}
......@@ -92,13 +86,10 @@ paddle::experimental::Tensor conv2d_ad_func(
filter,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search);
groups,
data_format);
transformer->SetOutTensorLayout(&out);
if (need_tune) {
egr::Controller::Instance().EnableLayoutAutoTune();
......@@ -119,13 +110,10 @@ paddle::experimental::Tensor conv2d_ad_func(
filter,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search);
groups,
data_format);
// Check NaN and Inf if needed
if (FLAGS_check_nan_inf) {
egr::CheckTensorHasNanOrInf("conv2d", api_result);
......@@ -157,13 +145,10 @@ paddle::experimental::Tensor conv2d_ad_func(
// SetAttributes if needed
grad_node->SetAttributestrides(strides);
grad_node->SetAttributepaddings(paddings);
grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
grad_node->SetAttributepadding_algorithm(padding_algorithm);
grad_node->SetAttributegroups(groups);
grad_node->SetAttributedilations(dilations);
grad_node->SetAttributedata_format(data_format);
grad_node->SetAttributeuse_addto(use_addto);
grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
grad_node->SetAttributeexhaustive_search(exhaustive_search);
// Set TensorWrappers for Forward Inputs if needed
grad_node->SetTensorWrapperinput(input);
grad_node->SetTensorWrapperfilter(filter);
......
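The hunks above shrink the eager conv2d signature to the six structural attributes; the GPUDNN/oneDNN extras (use_addto, workspace_size_MB, exhaustive_search, fuse_*) are instead carried by the device context, as the later operator.cc and context changes show. A minimal, self-contained sketch of that split, using invented stand-ins (FakeDeviceContext, conv2d_like_kernel) rather than Paddle's real types:

```cpp
// Minimal sketch (not Paddle code): core attributes travel through the API,
// backend-specific extras travel through a per-context attribute store.
#include <iostream>
#include <map>
#include <string>
#include <variant>
#include <vector>

using Attr = std::variant<bool, int, float, std::string>;

struct FakeDeviceContext {                // hypothetical stand-in for phi::DeviceContext
  std::map<std::string, Attr> dnn_attrs;  // extras like use_addto, workspace_size_MB
  template <typename T>
  T GetOr(const std::string& k, T def) const {
    auto it = dnn_attrs.find(k);
    return it == dnn_attrs.end() ? def : std::get<T>(it->second);
  }
};

// Only the core conv attributes appear in the "kernel" signature.
void conv2d_like_kernel(const FakeDeviceContext& ctx,
                        const std::vector<int>& strides,
                        const std::vector<int>& paddings,
                        const std::string& padding_algorithm,
                        const std::vector<int>& dilations,
                        int groups,
                        const std::string& data_format) {
  // Extras are read from the context with safe defaults.
  bool exhaustive_search = ctx.GetOr<bool>("exhaustive_search", false);
  int workspace_size_MB = ctx.GetOr<int>("workspace_size_MB", 512);
  std::cout << "groups=" << groups << " data_format=" << data_format
            << " exhaustive_search=" << exhaustive_search
            << " workspace_size_MB=" << workspace_size_MB << "\n";
}

int main() {
  FakeDeviceContext ctx;
  ctx.dnn_attrs["exhaustive_search"] = true;  // set by the framework, not by the API caller
  conv2d_like_kernel(ctx, {1, 1}, {0, 0}, "EXPLICIT", {1, 1}, 1, "NCHW");
}
```

The point of the split is that callers of the new API never see the backend extras; only the framework-side adapter decides whether to place them on the context.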
......@@ -46,13 +46,10 @@ Conv2dGradNodeFinal::operator()(
auto& grad_out = hooked_grads[0][0];
auto& strides = this->strides_;
auto& paddings = this->paddings_;
auto& paddding_algorithm = this->paddding_algorithm_;
auto& padding_algorithm = this->padding_algorithm_;
auto& groups = this->groups_;
auto& dilations = this->dilations_;
auto& data_format = this->data_format_;
auto& use_addto = this->use_addto_;
auto& workspace_size_MB = this->workspace_size_MB_;
auto& exhaustive_search = this->exhaustive_search_;
// Prepare Grad function call
const auto& out_metas = OutputMeta();
......@@ -87,13 +84,10 @@ Conv2dGradNodeFinal::operator()(
grad_out,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
groups,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
api_output_0,
api_output_1);
// Check NaN and Inf if needed
......@@ -134,13 +128,10 @@ Conv2dGradNodeFinal::operator()(
// SetAttributes if needed
grad_node->SetAttributestrides(strides);
grad_node->SetAttributepaddings(paddings);
grad_node->SetAttributepaddding_algorithm(paddding_algorithm);
grad_node->SetAttributepadding_algorithm(padding_algorithm);
grad_node->SetAttributegroups(groups);
grad_node->SetAttributedilations(dilations);
grad_node->SetAttributedata_format(data_format);
grad_node->SetAttributeuse_addto(use_addto);
grad_node->SetAttributeworkspace_size_MB(workspace_size_MB);
grad_node->SetAttributeexhaustive_search(exhaustive_search);
// Set TensorWrappers for Forward Inputs if needed
grad_node->SetTensorWrapperinput(input);
grad_node->SetTensorWrapperfilter(filter);
......@@ -215,13 +206,10 @@ Conv2dDoubleGradNodeFinal::operator()(
auto& strides = this->strides_;
auto& paddings = this->paddings_;
auto& paddding_algorithm = this->paddding_algorithm_;
auto& padding_algorithm = this->padding_algorithm_;
auto& groups = this->groups_;
auto& dilations = this->dilations_;
auto& data_format = this->data_format_;
auto& use_addto = this->use_addto_;
auto& workspace_size_MB = this->workspace_size_MB_;
auto& exhaustive_search = this->exhaustive_search_;
// Prepare Grad function call
const auto& out_metas = OutputMeta();
......@@ -261,13 +249,10 @@ Conv2dDoubleGradNodeFinal::operator()(
grad_filter_grad_optional,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
groups,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
api_output_0,
api_output_1,
api_output_2);
......
......@@ -63,8 +63,8 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase {
void SetAttributepaddings(const std::vector<int>& paddings) {
paddings_ = paddings;
}
void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) {
paddding_algorithm_ = paddding_algorithm;
void SetAttributepadding_algorithm(const std::string& padding_algorithm) {
padding_algorithm_ = padding_algorithm;
}
void SetAttributegroups(const int& groups) { groups_ = groups; }
void SetAttributedilations(const std::vector<int>& dilations) {
......@@ -73,13 +73,6 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase {
void SetAttributedata_format(const std::string& data_format) {
data_format_ = data_format;
}
void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; }
void SetAttributeworkspace_size_MB(const int& workspace_size_MB) {
workspace_size_MB_ = workspace_size_MB;
}
void SetAttributeexhaustive_search(const bool& exhaustive_search) {
exhaustive_search_ = exhaustive_search;
}
private:
// TensorWrappers
......@@ -89,13 +82,10 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase {
// Attributes
std::vector<int> strides_;
std::vector<int> paddings_;
std::string paddding_algorithm_;
std::string padding_algorithm_;
int groups_;
std::vector<int> dilations_;
std::string data_format_;
bool use_addto_;
int workspace_size_MB_;
bool exhaustive_search_;
};
class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase {
......@@ -146,8 +136,8 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase {
void SetAttributepaddings(const std::vector<int>& paddings) {
paddings_ = paddings;
}
void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) {
paddding_algorithm_ = paddding_algorithm;
void SetAttributepadding_algorithm(const std::string& padding_algorithm) {
padding_algorithm_ = padding_algorithm;
}
void SetAttributegroups(const int& groups) { groups_ = groups; }
void SetAttributedilations(const std::vector<int>& dilations) {
......@@ -156,13 +146,6 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase {
void SetAttributedata_format(const std::string& data_format) {
data_format_ = data_format;
}
void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; }
void SetAttributeworkspace_size_MB(const int& workspace_size_MB) {
workspace_size_MB_ = workspace_size_MB;
}
void SetAttributeexhaustive_search(const bool& exhaustive_search) {
exhaustive_search_ = exhaustive_search;
}
private:
// TensorWrappers
......@@ -173,13 +156,10 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase {
// Attributes
std::vector<int> strides_;
std::vector<int> paddings_;
std::string paddding_algorithm_;
std::string padding_algorithm_;
int groups_;
std::vector<int> dilations_;
std::string data_format_;
bool use_addto_;
int workspace_size_MB_;
bool exhaustive_search_;
};
class AddNGradNodeFinal : public egr::GradNodeBase {
......
......@@ -32,8 +32,8 @@
#include <valarray>
#include <vector>
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/core/expect.h"
namespace paddle {
namespace framework {
......
......@@ -30,7 +30,7 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/expect.h"
#include "paddle/phi/core/expect.h"
namespace paddle {
namespace framework {
......
......@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/operators/ops_extra_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -2269,7 +2270,8 @@ Scope* OperatorWithKernel::PrepareData(
}
std::unique_ptr<OpKernelType> new_expected_kernel_key = nullptr;
if (run_phi_kernel_ && in_def->backend != phi::Backend::ALL_BACKEND) {
if (run_phi_kernel_ && in_def != nullptr &&
in_def->backend != phi::Backend::ALL_BACKEND) {
auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
if ((in_def->backend != tensor_backend &&
(in_def->backend != phi::Backend::GPUDNN ||
......@@ -2388,7 +2390,6 @@ Scope* OperatorWithKernel::PrepareData(
input_names.size(),
input_defs.size()));
for (size_t i = 0; i < input_defs.size(); ++i) {
const auto& input_defs = phi_kernel_->args_def().input_defs();
auto& in_def = input_defs.at(i);
std::string input_name = input_names[i];
auto iter = ctx->inputs.find(input_name);
......@@ -2400,6 +2401,22 @@ Scope* OperatorWithKernel::PrepareData(
no_buffer_ins && no_buffer_ins->count(input_name) > 0;
prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input);
}
#ifdef PADDLE_WITH_MKLDNN
// Inputs marked as Extra are only used by MKLDNN kernels, so prepare them here
auto& extra_input_names =
paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap(
Type());
for (const auto& input_name : extra_input_names) {
auto iter = ctx->inputs.find(input_name);
if (iter == ctx->inputs.end()) {
continue;
}
bool should_skip_input =
no_buffer_ins && no_buffer_ins->count(input_name) > 0;
std::vector<Variable*>& input_vars = iter->second;
prepare_input_data(input_name, &input_vars, nullptr, should_skip_input);
}
#endif
} else {
for (auto& var_name_item : Inputs()) {
bool should_skip_input =
......@@ -2699,6 +2716,65 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
return (*arg_map_fn_)(arg_mapping_ctx);
}
static void SetDnnAttrIntoDeviceContext(
phi::DeviceContext* dev_ctx,
const Attribute& attr,
const std::string& attr_name,
const operators::ExtraAttrPropertySet& attr_propertys) {
#ifdef PADDLE_WITH_MKLDNN
if (phi::OneDNNContext::classof(dev_ctx) &&
attr_propertys.Support(operators::ExtraAttrProperty::ONEDNN)) {
VLOG(4) << "Runtime attr `" << attr_name << "` is passed to OneDNNContext.";
phi::OneDNNContext* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
switch (AttrTypeID(attr)) {
case proto::AttrType::FLOAT:
one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(float, attr));
break;
case proto::AttrType::INT:
one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr));
break;
case proto::AttrType::STRING:
one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(std::string, attr));
break;
case proto::AttrType::INTS:
one_dnn_ctx->SetDnnAttr(attr_name,
PADDLE_GET_CONST(std::vector<int>, attr));
break;
case proto::AttrType::FLOATS:
one_dnn_ctx->SetDnnAttr(attr_name,
PADDLE_GET_CONST(std::vector<float>, attr));
break;
case proto::AttrType::BOOLEAN:
one_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported Attribute value type `%s` for phi.",
platform::demangle(attr.type().name())));
}
}
#endif
#ifdef PADDLE_WITH_CUDA
if (phi::GPUContext::classof(dev_ctx) &&
attr_propertys.Support(operators::ExtraAttrProperty::GPUDNN)) {
VLOG(4) << "Runtime attr `" << attr_name << "` is passed to GPUDNNContext.";
phi::GPUContext* gpu_dnn_ctx = static_cast<phi::GPUContext*>(dev_ctx);
switch (AttrTypeID(attr)) {
case proto::AttrType::INT:
gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(int, attr));
break;
case proto::AttrType::BOOLEAN:
gpu_dnn_ctx->SetDnnAttr(attr_name, PADDLE_GET_CONST(bool, attr));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported Attribute value type `%s` for phi.",
platform::demangle(attr.type().name())));
}
}
#endif
}
void OperatorWithKernel::BuildPhiKernelContext(
const RuntimeContext& ctx,
platform::DeviceContext* dev_ctx,
......@@ -2713,6 +2789,15 @@ void OperatorWithKernel::BuildPhiKernelContext(
auto attr_defs = phi_kernel_->args_def().attribute_defs();
auto output_defs = phi_kernel_->args_def().output_defs();
#if defined(PADDLE_WITH_MKLDNN)
if (phi::OneDNNContext::classof(dev_ctx)) {
// OneDNN holds this op's variable names and initializes them here.
phi::OneDNNContext* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
one_dnn_ctx->SetInputsName(Inputs());
one_dnn_ctx->SetOutputsName(Outputs());
}
#endif
PADDLE_ENFORCE_EQ(input_names.size(),
input_defs.size(),
platform::errors::InvalidArgument(
......@@ -2992,6 +3077,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
} break;
default: {
if (attr_iter == Attrs().end()) {
// TODO(chenweihang): remove this backup searching later
attr_iter = RuntimeAttrs().find(attr_names[i]);
PADDLE_ENFORCE_NE(attr_iter,
RuntimeAttrs().end(),
......@@ -3075,6 +3161,63 @@ void OperatorWithKernel::BuildPhiKernelContext(
}
}
VLOG(4) << "Done attributes";
// For compatibility with ops that carry extra attrs for a specific backend
#if defined(PADDLE_WITH_MKLDNN) || defined(PADDLE_WITH_CUDA)
auto& runtime_attrs = RuntimeAttrs();
for (const auto& attr_iter : runtime_attrs) {
auto& attr_name = attr_iter.first;
auto& attr = attr_iter.second;
auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name);
SetDnnAttrIntoDeviceContext(dev_ctx, attr, attr_name, attr_propertys);
}
// TODO(chenweihang): Since passes still `SetAttr` on the OpDesc, we tried to
// add those attrs to the RuntimeAttrs, but the OpDesc loses the RuntimeAttrs
// information when the Graph is converted back to the Program, so an extra
// record configuration would have to be introduced, which increases the cost
// of development and understanding. Therefore, for the time being, we still
// read the attributes set by these passes from Attrs. In the future, the
// positioning of RuntimeAttrs needs to be clarified and the related
// functions extended.
auto& attrs = Attrs();
for (const auto& attr_iter : attrs) {
auto& attr_name = attr_iter.first;
auto& attr = attr_iter.second;
auto attr_propertys = paddle::operators::GetExtraAttrPropertys(attr_name);
SetDnnAttrIntoDeviceContext(dev_ctx, attr, attr_name, attr_propertys);
}
VLOG(4) << "Done runtime attributes";
#endif
// For compatibility with ops that carry extra inputs for the oneDNN backend
#ifdef PADDLE_WITH_MKLDNN
if (phi::OneDNNContext::classof(dev_ctx)) {
phi::OneDNNContext* one_dnn_ctx = static_cast<phi::OneDNNContext*>(dev_ctx);
auto& extra_input_names =
paddle::operators::ExtraInfoUtils::Instance().GetExtraInputNamesMap(
Type());
for (const auto& input_name : extra_input_names) {
auto it = ctx.inputs.find(input_name);
if (it == ctx.inputs.end() || it->second.size() == 0) {
one_dnn_ctx->SetDnnInput(input_name, nullptr);
} else {
auto ins_vector = it->second;
PADDLE_ENFORCE_EQ(
ins_vector.size(),
1UL,
phi::errors::InvalidArgument(
"OneDNN's extra input only allows one input tensor."));
auto* var = ins_vector[0];
PADDLE_ENFORCE_EQ(var->IsType<phi::DenseTensor>(),
true,
phi::errors::InvalidArgument(
"OneDNN's extra input only can be DenseTensor."));
one_dnn_ctx->SetDnnInput(input_name, &(var->Get<phi::DenseTensor>()));
}
}
}
VLOG(4) << "Done runtime extra inputs";
#endif
}
} // namespace framework
......
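SetDnnAttrIntoDeviceContext above routes each attribute to the backend context that declares support for it (OneDNNContext for ONEDNN-tagged attributes, GPUContext for GPUDNN-tagged ones), unpacking the Attribute by its stored type. A standalone sketch of the same gating idea under simplified, hypothetical types (Ctx, PropertySet, Prop); the real code additionally switches on proto::AttrType and uses PADDLE_GET_CONST:

```cpp
// Minimal sketch (not Paddle code) of property-gated attribute forwarding.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <variant>

using Attr = std::variant<bool, int, float, std::string>;

enum class Prop : uint8_t { ONEDNN = 1, GPUDNN = 2 };

struct PropertySet {
  uint8_t bits;
  bool Support(Prop p) const { return (bits & static_cast<uint8_t>(p)) != 0; }
};

struct Ctx {
  std::string kind;  // "onednn" or "gpudnn" in this sketch
  std::map<std::string, Attr> dnn_attrs;
};

void SetDnnAttrIntoCtx(Ctx* ctx, const std::string& name, const Attr& attr,
                       PropertySet props) {
  const bool matches = (ctx->kind == "onednn" && props.Support(Prop::ONEDNN)) ||
                       (ctx->kind == "gpudnn" && props.Support(Prop::GPUDNN));
  if (!matches) return;  // attribute is not meant for this backend
  ctx->dnn_attrs[name] = attr;
  std::cout << "forwarded `" << name << "` to " << ctx->kind << " context\n";
}

int main() {
  Ctx gpu{"gpudnn", {}};
  PropertySet gpudnn_only{static_cast<uint8_t>(Prop::GPUDNN)};
  PropertySet onednn_only{static_cast<uint8_t>(Prop::ONEDNN)};

  SetDnnAttrIntoCtx(&gpu, "exhaustive_search", Attr(true), gpudnn_only);  // kept
  SetDnnAttrIntoCtx(&gpu, "fuse_activation", Attr(std::string("relu")),
                    onednn_only);  // dropped: not a GPUDNN attribute
}
```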
......@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
#include "paddle/phi/core/expect.h"
namespace paddle {
namespace operators {
......
......@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
#include "paddle/phi/core/expect.h"
namespace paddle {
namespace operators {
......
......@@ -14,11 +14,11 @@
#include <tuple>
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/phi/core/expect.h"
#include "paddle/phi/core/visit_type.h"
......@@ -1184,20 +1184,6 @@ class ConvMKLDNNGradOpKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d,
MKLDNN,
::paddle::platform::CPUPlace,
ops::ConvMKLDNNOpKernel<float>,
ops::ConvMKLDNNOpKernel<paddle::platform::bfloat16>,
ops::ConvMKLDNNOpKernel<uint8_t>,
ops::ConvMKLDNNOpKernel<int8_t>);
REGISTER_OP_KERNEL(conv2d_grad,
MKLDNN,
::paddle::platform::CPUPlace,
ops::ConvMKLDNNGradOpKernel<float>,
ops::ConvMKLDNNGradOpKernel<paddle::platform::bfloat16>);
REGISTER_OP_KERNEL(depthwise_conv2d,
MKLDNN,
::paddle::platform::CPUPlace,
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/phi/core/expect.h"
namespace paddle {
namespace operators {
......
......@@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(relu, OneDNN, ONEDNN);
USE_OP_ITSELF(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP_ITSELF(conv2d);
USE_OP_DEVICE_KERNEL(conv2d, MKLDNN);
PD_DECLARE_KERNEL(conv2d, OneDNN, ONEDNN);
namespace paddle {
namespace operators {
......
......@@ -14,11 +14,137 @@
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
namespace paddle {
namespace operators {
// This file is to be compatible with the bad design and
// implementation of fluid in the past
// Many operators in fluid have extra attributes, which are generally added
// to implement some specific kernel selection and to meet the specialization
// needs of a specific operation library like mkldnn or cudnn
enum class ExtraAttrProperty : uint8_t {
// The attributes that are no longer used in any scenario
DEPRECATED = 0,
// The attributes used for framework execution scheduling,
// such as `use_mkldnn`, `use_cudnn`, no need to save
SCHEDULE,
// The attributes for ONEDNN only, can be saved in OneDNNContext
ONEDNN,
// The attributes for GPUDNN only, can be saved in GPUContext
GPUDNN,
// Add necessary properties as needed
};
class ExtraAttrPropertySet final {
public:
constexpr ExtraAttrPropertySet() : bitset_(0) {}
constexpr ExtraAttrPropertySet(ExtraAttrProperty e) // NOLINT
: bitset_(e == ExtraAttrProperty::DEPRECATED
? 0
: 1ULL << (static_cast<uint8_t>(e) - 1)) {}
inline uint64_t bitset() const { return bitset_; }
bool inline Support(ExtraAttrProperty e) const {
// DEPRECATED ExtraAttr always return false
return static_cast<bool>(bitset_ & ExtraAttrPropertySet(e).bitset());
}
bool IsEmpty() const { return bitset_ == 0; }
ExtraAttrPropertySet operator|(const ExtraAttrPropertySet& other) const {
return ExtraAttrPropertySet(bitset_ | other.bitset());
}
ExtraAttrPropertySet operator&(const ExtraAttrPropertySet& other) const {
return ExtraAttrPropertySet(bitset_ & other.bitset());
}
ExtraAttrPropertySet operator-(const ExtraAttrPropertySet& other) const {
return ExtraAttrPropertySet(bitset_ & ~other.bitset());
}
ExtraAttrPropertySet operator^(const ExtraAttrPropertySet& other) const {
return ExtraAttrPropertySet(bitset_ ^ other.bitset());
}
bool operator==(const ExtraAttrPropertySet& other) const {
return bitset_ == other.bitset();
}
private:
constexpr ExtraAttrPropertySet(uint64_t bitset) : bitset_(bitset) {}
uint64_t bitset_;
};
const std::unordered_map<std::string, ExtraAttrPropertySet>
extra_attr_properties = {
// DEPRECATED attributes
{"use_quantizer", ExtraAttrProperty::DEPRECATED},
// SCHEDULE attributes
{"use_cudnn", ExtraAttrProperty::SCHEDULE},
{"use_mkldnn", ExtraAttrProperty::SCHEDULE},
// ONEDNN dedicated attributes
{"Bias", ExtraAttrProperty::ONEDNN},
{"data_format", ExtraAttrProperty::ONEDNN},
{"force_fp32_output", ExtraAttrProperty::ONEDNN},
{"fuse_activation", ExtraAttrProperty::ONEDNN},
{"fuse_activation_type", ExtraAttrProperty::ONEDNN},
{"fuse_activation_alpha", ExtraAttrProperty::ONEDNN},
{"fuse_activation_beta", ExtraAttrProperty::ONEDNN},
{"fuse_activation_scale", ExtraAttrProperty::ONEDNN},
{"fuse_alpha", ExtraAttrProperty::ONEDNN},
{"fuse_beta", ExtraAttrProperty::ONEDNN},
{"fuse_relu", ExtraAttrProperty::ONEDNN},
{"fuse_residual_connection", ExtraAttrProperty::ONEDNN},
{"fuse_with_relu", ExtraAttrProperty::ONEDNN},
{"fused_reshape_Out", ExtraAttrProperty::ONEDNN},
{"fused_transpose_Out", ExtraAttrProperty::ONEDNN},
{"fused_reshape_X", ExtraAttrProperty::ONEDNN},
{"fused_reshape_Y", ExtraAttrProperty::ONEDNN},
{"fused_transpose_X", ExtraAttrProperty::ONEDNN},
{"fused_transpose_Y", ExtraAttrProperty::ONEDNN},
{"mkldnn_data_type", ExtraAttrProperty::ONEDNN},
{"ResidualData", ExtraAttrProperty::ONEDNN},
{"scale_x", ExtraAttrProperty::ONEDNN},
{"scale_y", ExtraAttrProperty::ONEDNN},
{"scale_out", ExtraAttrProperty::ONEDNN},
{"Scale_in", ExtraAttrProperty::ONEDNN},
{"Scale_in_eltwise", ExtraAttrProperty::ONEDNN},
{"Scale_x", ExtraAttrProperty::ONEDNN},
{"Scale_y", ExtraAttrProperty::ONEDNN},
{"Scale_out", ExtraAttrProperty::ONEDNN},
{"Scale_weights", ExtraAttrProperty::ONEDNN},
{"x_data_format", ExtraAttrProperty::ONEDNN},
{"y_data_format", ExtraAttrProperty::ONEDNN},
// ONEDNN pass dedicated attributes
{"Activation_scale", ExtraAttrProperty::ONEDNN},
{"Bias_scales", ExtraAttrProperty::ONEDNN},
{"Output_shift_scale", ExtraAttrProperty::ONEDNN},
{"Sum_scale", ExtraAttrProperty::ONEDNN},
// GPUDNN dedicated attributes
{"exhaustive_search", ExtraAttrProperty::GPUDNN},
{"fuse_relu_before_depthwise_conv", ExtraAttrProperty::GPUDNN},
{"use_addto", ExtraAttrProperty::GPUDNN},
{"workspace_size_MB", ExtraAttrProperty::GPUDNN},
// Mixed-use attributes
{"is_test",
ExtraAttrPropertySet(ExtraAttrProperty::ONEDNN) |
ExtraAttrPropertySet(ExtraAttrProperty::GPUDNN)},
};
inline ExtraAttrPropertySet GetExtraAttrPropertys(
const std::string& attr_name) {
auto iter = extra_attr_properties.find(attr_name);
if (iter != extra_attr_properties.end()) {
return iter->second;
}
return ExtraAttrPropertySet();
}
template <typename T>
struct ExtraAttrChecker {
ExtraAttrChecker(const std::string& attr_name, T default_value)
......@@ -71,6 +197,15 @@ class ExtraInfoUtils {
return empty_extra_attrs_checker_;
}
const std::vector<std::string>& GetExtraInputNamesMap(
const std::string& op_type) const {
auto iter = g_extra_input_names_map_.find(op_type);
if (iter != g_extra_input_names_map_.end()) {
return iter->second;
}
return empty_extra_input_names_;
}
private:
ExtraInfoUtils();
......@@ -83,6 +218,12 @@ class ExtraInfoUtils {
g_extra_attrs_checker_;
std::vector<std::function<void(framework::AttributeMap*, bool)>>
empty_extra_attrs_checker_{};
// TODO(chenweihang): move these extra inputs into op_compat.yaml
std::unordered_map<std::string, std::vector<std::string>>
g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}},
{"conv2d_grad", {"Bias"}}};
std::vector<std::string> empty_extra_input_names_;
};
} // namespace operators
......
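ExtraAttrPropertySet above packs each property into one bit, with DEPRECATED deliberately collapsing to the empty set so deprecated attributes never report support for any backend; unknown attribute names fall back to a default-constructed (empty) set in GetExtraAttrPropertys. A standalone sketch that mirrors the class closely enough to demonstrate those three behaviors:

```cpp
// Minimal sketch (not Paddle code) mirroring ExtraAttrPropertySet semantics.
#include <cassert>
#include <cstdint>

enum class ExtraAttrProperty : uint8_t { DEPRECATED = 0, SCHEDULE, ONEDNN, GPUDNN };

class ExtraAttrPropertySet final {
 public:
  constexpr ExtraAttrPropertySet() : bitset_(0) {}
  constexpr ExtraAttrPropertySet(ExtraAttrProperty e)  // NOLINT
      : bitset_(e == ExtraAttrProperty::DEPRECATED
                    ? 0
                    : 1ULL << (static_cast<uint8_t>(e) - 1)) {}
  uint64_t bitset() const { return bitset_; }
  bool Support(ExtraAttrProperty e) const {
    // A DEPRECATED query always yields false because its bitset is 0.
    return static_cast<bool>(bitset_ & ExtraAttrPropertySet(e).bitset());
  }
  bool IsEmpty() const { return bitset_ == 0; }
  ExtraAttrPropertySet operator|(const ExtraAttrPropertySet& o) const {
    return ExtraAttrPropertySet(bitset_ | o.bitset());
  }

 private:
  constexpr ExtraAttrPropertySet(uint64_t b) : bitset_(b) {}
  uint64_t bitset_;
};

int main() {
  // `is_test` style: supported by both oneDNN and GPUDNN.
  ExtraAttrPropertySet is_test =
      ExtraAttrPropertySet(ExtraAttrProperty::ONEDNN) |
      ExtraAttrPropertySet(ExtraAttrProperty::GPUDNN);
  assert(is_test.Support(ExtraAttrProperty::ONEDNN));
  assert(is_test.Support(ExtraAttrProperty::GPUDNN));
  assert(!is_test.Support(ExtraAttrProperty::SCHEDULE));

  // `use_quantizer` style: DEPRECATED collapses to the empty set.
  ExtraAttrPropertySet deprecated(ExtraAttrProperty::DEPRECATED);
  assert(deprecated.IsEmpty());
  assert(!deprecated.Support(ExtraAttrProperty::ONEDNN));

  // Unknown attribute names get a default-constructed (empty) set.
  assert(ExtraAttrPropertySet().IsEmpty());
}
```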
......@@ -89,7 +89,9 @@ class MLUContext {
DISABLE_COPY_AND_ASSIGN(MLUContext);
};
class MLUDeviceContext : public DeviceContext {
class MLUDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, MLUDeviceContext> {
public:
explicit MLUDeviceContext(MLUPlace place);
virtual ~MLUDeviceContext();
......@@ -148,6 +150,8 @@ class MLUDeviceContext : public DeviceContext {
return thread_ctx_.at(this);
}
static const char* name() { return "MLUDeviceContext"; }
private:
int compute_capability_;
int driver_version_;
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <set>
#include "glog/logging.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
......@@ -28,6 +27,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/expect.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
......
......@@ -144,7 +144,9 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
// Graphcore IPU
#ifdef PADDLE_WITH_IPU
class IPUDeviceContext : public DeviceContext {
class IPUDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, IPUDeviceContext> {
public:
IPUDeviceContext() = delete;
explicit IPUDeviceContext(IPUPlace place);
......@@ -154,6 +156,8 @@ class IPUDeviceContext : public DeviceContext {
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
static const char* name() { return "IPUDeviceContext"; }
private:
IPUPlace place_;
};
......@@ -188,7 +192,9 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext : public DeviceContext {
class NPUDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, NPUDeviceContext> {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
......@@ -224,6 +230,8 @@ class NPUDeviceContext : public DeviceContext {
// void WaitStreamCallback() const { return stream_->WaitCallback(); }
static const char* name() { return "NPUDeviceContext"; }
private:
NPUPlace place_;
aclrtContext context_;
......@@ -248,7 +256,9 @@ struct DefaultDeviceContextType<platform::NPUPlace> {
};
// Currently, NPUPinnedDeviceContext is only used to data copying.
class NPUPinnedDeviceContext : public DeviceContext {
class NPUPinnedDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, NPUPinnedDeviceContext> {
public:
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
......@@ -257,6 +267,8 @@ class NPUPinnedDeviceContext : public DeviceContext {
Eigen::DefaultDevice* eigen_device() const;
static const char* name() { return "NPUPinnedDeviceContext"; }
private:
NPUPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
......@@ -276,7 +288,9 @@ struct DefaultDeviceContextType<platform::CUDAPlace> {
};
// Currently, CUDAPinnedDeviceContext is only used to data copying.
class CUDAPinnedDeviceContext : public DeviceContext {
class CUDAPinnedDeviceContext
: public DeviceContext,
public phi::TypeInfoTraits<DeviceContext, CUDAPinnedDeviceContext> {
public:
CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
......@@ -285,6 +299,8 @@ class CUDAPinnedDeviceContext : public DeviceContext {
Eigen::DefaultDevice* eigen_device() const;
static const char* name() { return "CUDAPinnedDeviceContext"; }
private:
CUDAPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
......
......@@ -122,77 +122,80 @@ using namespace ::phi::enforce; // NOLINT
#endif
/*
* Summary: This PADDLE_GET(_**) series macros are used to call paddle::get
* safely. paddle::get is not a completely safe api, although it will not
* go wrong in most cases, but in extreme cases, it may fail and directly
* throw a paddle::bad_variant_access const exception, without any stack
*information.
* This kind of problems is difficult to debug, so add these macros to
* enrich paddle::get error information. At the same time, we restrict
* the direct use of paddle::get by CI rule.
* Summary: This macro is used to get Variable or internal type
* data (such as LoDTensor or SelectedRows) of the Input and
* Output in op, generally used when call scope.FindVar(Input/
* Output("Name")) or ctx.Input<LoDTensor>().
* Firstly this macro check whether the obtained pointer is null,
* and then return data if it is not null.
*
* Note: This macro is only suitable for specific scenarios and
* does not intended to be widely used. If it cannot meet the
* requirements, please use other PADDLE_ENFORCE** check macro.
*
* Parameters:
*     __TYPE: the target variable type
* __VALUE: the target variable to get
*     __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Examples:
* - unsafe writing: int x = paddle::get<int>(y);
* - safe writing: int x = PADDLE_GET(int, y);
* Return: The data pointed to by the pointer.
*
* Note: GCC 4.8 cannot select right overloaded function here, so need
* to define different functions and macros here, after we upgreade
* CI gcc version, we can only define one PADDLE_GET macro.
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
namespace details {
using namespace phi::enforce::details; // NOLINT
#define DEFINE_SAFE_PADDLE_GET( \
__InputType, __OutputType, __OutputTypePtr, __FuncName) \
template <typename OutputType, typename InputType> \
auto __FuncName( \
__InputType input, const char* expression, const char* file, int line) \
->typename std::conditional<std::is_pointer<InputType>::value, \
__OutputTypePtr, \
__OutputType>::type { \
try { \
return paddle::get<OutputType>(input); \
} catch (paddle::bad_variant_access const&) { \
HANDLE_THE_ERROR \
throw ::phi::enforce::EnforceNotMet( \
phi::errors::InvalidArgument( \
"paddle::get failed, cannot get value " \
"(%s) by type %s, its type is %s.", \
expression, \
phi::enforce::demangle(typeid(OutputType).name()), \
phi::enforce::demangle(input.type().name())), \
file, \
line); \
END_HANDLE_THE_ERROR \
} \
}
DEFINE_SAFE_PADDLE_GET(InputType&, OutputType&, OutputType*, SafeBoostGet);
DEFINE_SAFE_PADDLE_GET(const InputType&,
const OutputType&,
const OutputType*,
SafeBoostGetConst);
DEFINE_SAFE_PADDLE_GET(InputType&&,
OutputType,
OutputType*,
SafeBoostGetMutable);
} // namespace details
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
#define PADDLE_GET(__TYPE, __VALUE) \
paddle::platform::details::SafeBoostGet<__TYPE>( \
__VALUE, #__VALUE, __FILE__, __LINE__)
#define PADDLE_GET_CONST(__TYPE, __VALUE) \
paddle::platform::details::SafeBoostGetConst<__TYPE>( \
__VALUE, #__VALUE, __FILE__, __LINE__)
#define PADDLE_GET_MUTABLE(__TYPE, __VALUE) \
paddle::platform::details::SafeBoostGetMutable<__TYPE>( \
__VALUE, #__VALUE, __FILE__, __LINE__)
/*
* Summary: This macro is used to check whether op has specified
* Input or Output Variables. Because op's Input and Output
* checking are written similarly, so abstract this macro.
*
* Parameters:
*     __EXPR: (bool), the bool expression
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Examples:
* OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
*/
#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \
do { \
PADDLE_ENFORCE_EQ( \
__EXPR, \
true, \
phi::errors::NotFound( \
"No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \
} while (0)
/** OTHER EXCEPTION AND ENFORCE **/
......
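The PADDLE_GET(_CONST/_MUTABLE) helpers shown above (which this patch appears to relocate toward phi::enforce, judging by the test change that follows) wrap variant access so that a type mismatch produces a descriptive error carrying the expression and location instead of a bare bad_variant_access; the rest of the patch uses them to unpack Attribute variants in SetDnnAttrIntoDeviceContext and AppendActivation. A standalone sketch of that pattern with std::variant; SKETCH_GET_CONST and AttrVariant are illustrative names, not Paddle's:

```cpp
// Minimal sketch (not Paddle code) of a "safe get" over a variant attribute,
// reporting the expression, expected type, and location on failure.
#include <iostream>
#include <stdexcept>
#include <string>
#include <variant>
#include <vector>

using AttrVariant = std::variant<bool, int, float, std::string, std::vector<int>>;

template <typename T>
const T& SafeGetConst(const AttrVariant& v, const char* expr, const char* file,
                      int line) {
  if (auto* p = std::get_if<T>(&v)) return *p;
  throw std::runtime_error(std::string("cannot get value (") + expr +
                           ") with the requested type at " + file + ":" +
                           std::to_string(line));
}

#define SKETCH_GET_CONST(TYPE, VALUE) \
  SafeGetConst<TYPE>(VALUE, #VALUE, __FILE__, __LINE__)

int main() {
  AttrVariant workspace = 512;                            // holds int
  std::cout << SKETCH_GET_CONST(int, workspace) << "\n";  // ok: 512
  try {
    SKETCH_GET_CONST(std::string, workspace);             // wrong type: throws
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";
  }
}
```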
......@@ -528,10 +528,9 @@ struct CannotToStringType {
};
TEST(enforce, cannot_to_string_type) {
static_assert(
!paddle::platform::details::CanToString<CannotToStringType>::kValue,
"CannotToStringType must not be converted to string");
static_assert(paddle::platform::details::CanToString<int>::kValue,
static_assert(!phi::enforce::details::CanToString<CannotToStringType>::kValue,
"CannotToStringType must not be converted to string");
static_assert(phi::enforce::details::CanToString<int>::kValue,
"int can be converted to string");
CannotToStringType obj1(3), obj2(4), obj3(3);
......
......@@ -312,8 +312,8 @@
func : conj
- backward_op : conv2d_grad
forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out)
args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(out)
args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format)
output : Tensor(input_grad), Tensor(filter_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
......@@ -324,8 +324,8 @@
backward : conv2d_grad_grad
- backward_op : conv2d_grad_grad
forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter)
args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(grad_input), Tensor(grad_filter)
args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format)
output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad)
infer_meta :
func : GeneralTernaryGradInferMeta
......@@ -357,8 +357,8 @@
backward : conv2d_transpose_double_grad
- backward_op : conv3d_grad
forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out)
args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out)
args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
output : Tensor(input_grad), Tensor(filter_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
......@@ -369,8 +369,8 @@
backward : conv3d_grad_grad
- backward_op : conv3d_grad_grad
forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter)
args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter)
args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad)
infer_meta :
func : GeneralTernaryGradInferMeta
......@@ -439,21 +439,21 @@
optional : mask
- backward_op : depthwise_conv2d_grad
forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out)
args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn)
forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out)
args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn)
output : Tensor(input_grad), Tensor(filter_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [input, filter]
kernel :
func : depthwise_conv2d_grad
param : [input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu]
param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu]
use_gpudnn : use_gpudnn
backward : depthwise_conv2d_grad_grad
- backward_op : depthwise_conv2d_grad_grad
forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter)
args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu)
forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter)
args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu)
output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad)
infer_meta :
func : GeneralTernaryGradInferMeta
......
......@@ -454,7 +454,7 @@
backward : conj_grad
- op : conv2d
args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format)
output : Tensor
infer_meta :
func : ConvInferMeta
......@@ -474,10 +474,10 @@
backward : conv2d_transpose_grad
- op : conv3d
args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search)
output : Tensor
infer_meta :
func : ConvInferMeta
func : Conv3DInferMeta
kernel :
func : conv3d
use_gpudnn : true
......@@ -564,7 +564,7 @@
args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn)
output : Tensor(out)
infer_meta :
func : ConvInferMeta
func : DepthwiseConvInferMeta
param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search]
kernel :
func : depthwise_conv2d
......
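After the yaml cleanup, conv2d carries only the six structural attributes, with dilations ordered before groups. As a rough sketch inferred from the args list above (not from the generated code), the corresponding PHI kernel interface would look approximately like this, with DenseTensor and the context stubbed so the snippet stands alone:

```cpp
// Rough sketch of the cleaned-up conv2d kernel interface implied by the yaml
// args above; DenseTensor/CPUContext are stubs, not phi's real types.
#include <string>
#include <vector>

namespace sketch {
struct DenseTensor {};
struct CPUContext {};

template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
                const DenseTensor& input,
                const DenseTensor& filter,
                const std::vector<int>& strides,
                const std::vector<int>& paddings,
                const std::string& padding_algorithm,
                const std::vector<int>& dilations,
                int groups,
                const std::string& data_format,
                DenseTensor* out) {
  // Device-specific extras (use_addto, workspace_size_MB, exhaustive_search,
  // fuse_* ...) are no longer parameters; kernels query them from dev_ctx.
  (void)dev_ctx; (void)input; (void)filter; (void)strides; (void)paddings;
  (void)padding_algorithm; (void)dilations; (void)groups; (void)data_format;
  (void)out;
}
}  // namespace sketch

int main() {
  sketch::DenseTensor x, w, y;
  sketch::CPUContext ctx;
  sketch::ConvKernel<float>(ctx, x, w, {1, 1}, {0, 0}, "EXPLICIT", {1, 1}, 1,
                            "NCHW", &y);
}
```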
......@@ -23,9 +23,8 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/custom/custom_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#endif
#ifndef PADDLE_WITH_CUSTOM_KERNEL
// TODO(wilber): DeviceContextPool nees include fluid file.
......
......@@ -24,7 +24,8 @@ limitations under the License. */
namespace phi {
class PADDLE_API CPUContext : public DeviceContext {
class PADDLE_API CPUContext : public DeviceContext,
public TypeInfoTraits<DeviceContext, CPUContext> {
public:
CPUContext();
CPUContext(CPUContext&&);
......@@ -34,6 +35,8 @@ class PADDLE_API CPUContext : public DeviceContext {
Eigen::DefaultDevice* eigen_device() const;
const Place& GetPlace() const override;
static const char* name() { return "CPUContext"; }
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
......
......@@ -21,7 +21,8 @@ limitations under the License. */
namespace phi {
class CustomContext : public DeviceContext {
class CustomContext : public DeviceContext,
public TypeInfoTraits<DeviceContext, CustomContext> {
public:
explicit CustomContext(const CustomPlace&);
......@@ -35,6 +36,8 @@ class CustomContext : public DeviceContext {
// Wait for all operations completion in the stream.
void Wait() const override;
static const char* name() { return "CustomContext"; }
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
// The interface used by the training scene, DeviceContext will initialize
......
......@@ -717,6 +717,23 @@ struct GPUContext::Impl {
}
}
bool HasDnnAttr(const std::string& attr_name) const {
return dnn_attrs_.count(attr_name) != 0UL;
}
const Attribute& GetDnnAttr(const std::string& attr_name) const {
auto iter = dnn_attrs_.find(attr_name);
PADDLE_ENFORCE_NE(
iter,
dnn_attrs_.end(),
phi::errors::NotFound("Attribute `%s` is not found in OneDNNContext."));
return iter->second;
}
void SetDnnAttr(const std::string& attr_name, Attribute attr) {
dnn_attrs_[attr_name] = attr;
}
// use one flag for all handles?
// they should be accessed consistently
bool owned_{false};
......@@ -780,8 +797,15 @@ struct GPUContext::Impl {
Allocator* allocator_{nullptr}; // external resource.
// A internal resouce to initinalize eigen_device.
std::unique_ptr<internal::EigenGpuStreamDevice> eigen_stream_{nullptr};
// Holds attributes that are only used by gpudnn kernel computation.
// Because the DeviceContext is effectively a global singleton, thread safety
// must be guaranteed, so a thread_local variable is used.
static thread_local AttributeMap dnn_attrs_;
};
thread_local AttributeMap GPUContext::Impl::dnn_attrs_ = {};
GPUContext::GPUContext(GPUContext&&) = default;
GPUContext& GPUContext::operator=(GPUContext&&) = default;
......@@ -1000,4 +1024,16 @@ void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; }
void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; }
bool GPUContext::HasDnnAttr(const std::string& attr_name) const {
return impl_->HasDnnAttr(attr_name);
}
const Attribute& GPUContext::GetDnnAttr(const std::string& attr_name) const {
return impl_->GetDnnAttr(attr_name);
}
void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) {
return impl_->SetDnnAttr(attr_name, std::move(attr));
}
} // namespace phi
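The GPUContext::Impl change above keeps dnn_attrs_ in a static thread_local AttributeMap: the context object is shared process-wide, but the attributes are only meaningful for the kernel the current thread is about to launch, so each thread needs its own copy. A small self-contained sketch of that storage choice, with a simplified SharedCtx standing in for the real GPUContext:

```cpp
// Minimal sketch (not Paddle code): a shared context whose per-launch "dnn
// attributes" live in thread_local storage, so concurrent threads do not
// overwrite each other's values.
#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <thread>

struct SharedCtx {
  void SetDnnAttr(const std::string& k, int v) { attrs()[k] = v; }
  int GetDnnAttr(const std::string& k) const { return attrs().at(k); }

 private:
  static std::map<std::string, int>& attrs() {
    static thread_local std::map<std::string, int> dnn_attrs;  // one per thread
    return dnn_attrs;
  }
};

int main() {
  SharedCtx ctx;  // imagine this is the process-wide device context
  std::mutex io_mutex;
  auto worker = [&](int workspace_mb) {
    ctx.SetDnnAttr("workspace_size_MB", workspace_mb);
    std::lock_guard<std::mutex> guard(io_mutex);
    std::cout << "this thread sees workspace_size_MB="
              << ctx.GetDnnAttr("workspace_size_MB") << "\n";
  };
  std::thread t1(worker, 512), t2(worker, 4096);
  t1.join();
  t2.join();
}
```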
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/attribute.h"
#include "paddle/phi/core/device_context.h"
namespace phi {
......@@ -77,7 +78,8 @@ class DnnWorkspaceHandle {
std::unique_ptr<std::mutex> mtx_;
};
class PADDLE_API GPUContext : public DeviceContext {
class PADDLE_API GPUContext : public DeviceContext,
public TypeInfoTraits<DeviceContext, GPUContext> {
public:
explicit GPUContext(const GPUPlace& place, bool init = true);
......@@ -166,6 +168,13 @@ class PADDLE_API GPUContext : public DeviceContext {
void WaitStreamCallback() const;
// Several methods for adapting Dnn-specific attributes
bool HasDnnAttr(const std::string& attr_name) const;
const Attribute& GetDnnAttr(const std::string& attr_name) const;
void SetDnnAttr(const std::string& attr_name, Attribute attr);
static const char* name() { return "GPUContext"; }
public:
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const;
......@@ -250,10 +259,10 @@ class PADDLE_API GPUContext : public DeviceContext {
std::unique_ptr<Impl> impl_;
};
// Note: In order to register the kernel of CUDNN, GPUDNNContext is required.
// Note: In order to register the kernel of CUDNN, DnnContext is required.
// Currently, CUDNN kernel directly uses GPUContext. But if the kernel function
// has the same name, this will lead to duplicate instantiations of GPU kernel
// and GPUDNN kernel function, so if we using GPUDNNContext = GPUContext, we
// and Dnn kernel function, so if we using DnnContext = GPUContext, we
// must use different function name for cudnn kernel
using GPUDNNContext = GPUContext;
......
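With HasDnnAttr/GetDnnAttr exposed on GPUContext, a GPUDNN kernel can recover the former extra attributes when the fluid operator path has set them and otherwise fall back to defaults. A hedged sketch of that query pattern; the attribute names match the GPUDNN entries in ops_extra_info.h, but FakeGpuCtx, conv_gpudnn_like, and the default values are illustrative only:

```cpp
// Minimal sketch (not Paddle code) of how a GPUDNN kernel could read the
// former extra attributes from the device context, with defaults when the
// attribute was never set (e.g. when called through the new PHI API).
#include <iostream>
#include <map>
#include <string>
#include <variant>

using Attribute = std::variant<bool, int, float, std::string>;

struct FakeGpuCtx {  // stand-in for phi::GPUContext
  std::map<std::string, Attribute> attrs;
  bool HasDnnAttr(const std::string& k) const { return attrs.count(k) != 0; }
  const Attribute& GetDnnAttr(const std::string& k) const { return attrs.at(k); }
};

void conv_gpudnn_like(const FakeGpuCtx& dev_ctx) {
  // Same shape as the AppendActivation pattern: query, then unpack or default.
  const bool exhaustive_search =
      dev_ctx.HasDnnAttr("exhaustive_search")
          ? std::get<bool>(dev_ctx.GetDnnAttr("exhaustive_search"))
          : false;
  const int workspace_size_MB =
      dev_ctx.HasDnnAttr("workspace_size_MB")
          ? std::get<int>(dev_ctx.GetDnnAttr("workspace_size_MB"))
          : 512;
  const bool use_addto = dev_ctx.HasDnnAttr("use_addto")
                             ? std::get<bool>(dev_ctx.GetDnnAttr("use_addto"))
                             : false;
  std::cout << exhaustive_search << " " << workspace_size_MB << " "
            << use_addto << "\n";
}

int main() {
  FakeGpuCtx ctx;
  ctx.attrs["workspace_size_MB"] = 1024;  // set by the fluid op path
  conv_gpudnn_like(ctx);                  // prints: 0 1024 0
}
```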
......@@ -16,9 +16,10 @@
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/fluid/framework/expect.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/expect.h"
namespace phi {
......@@ -284,6 +285,69 @@ struct OneDNNContext::Impl {
return key_it->second;
}
bool HasDnnAttr(const std::string& attr_name) const {
return dnn_attrs_.count(attr_name) != 0UL;
}
const Attribute& GetDnnAttr(const std::string& attr_name) const {
auto iter = dnn_attrs_.find(attr_name);
PADDLE_ENFORCE_NE(
iter,
dnn_attrs_.end(),
phi::errors::NotFound("Attribute `%s` is not found in OneDNNContext."));
return iter->second;
}
void SetDnnAttr(const std::string& attr_name, Attribute attr) {
dnn_attrs_[attr_name] = attr;
}
bool HasDnnInput(const std::string& input_name) const {
return dnn_inputs_.count(input_name) != 0UL;
}
const DenseTensor* GetDnnInput(const std::string& input_name) const {
auto iter = dnn_inputs_.find(input_name);
PADDLE_ENFORCE_NE(
iter,
dnn_inputs_.end(),
phi::errors::NotFound(
"Input DenseTensor `%s` is not found in OneDNNContext.", input_name));
return iter->second;
}
void SetDnnInput(const std::string& input_name, const DenseTensor* input) {
dnn_inputs_[input_name] = input;
}
void SetInputsName(const TensorNameMap& inputs_name) {
inputs_name_ = inputs_name;
}
void SetOutputsName(const TensorNameMap& outputs_name) {
outputs_name_ = outputs_name;
}
const std::vector<std::string>& GetInputsName(
const std::string& input) const {
auto it = inputs_name_.find(input);
PADDLE_ENFORCE_NE(it,
inputs_name_.end(),
phi::errors::NotFound(
"OneDnnContext does not have the input %s.", input));
return it->second;
}
const std::vector<std::string>& GetOutputsName(
const std::string& output) const {
auto it = outputs_name_.find(output);
PADDLE_ENFORCE_NE(
it,
outputs_name_.end(),
phi::errors::NotFound("OneDnnContext does not have the output %s.",
output));
return it->second;
}
std::shared_ptr<BlobMap> p_blobmap_;
// Map key is pointer of executor and value is a data(iterator in map) needed
// to erase
......@@ -291,8 +355,35 @@ struct OneDNNContext::Impl {
std::shared_ptr<std::mutex> p_mutex_;
// 0 - clearing is allowed. x > 0 do not clear.
unsigned int block_next_cache_clearing_ = 0;
// Holds attributes that are only used by oneDNN kernel computation.
// The original mkldnn op kernels add the operations that require fusion
// directly to the native kernel and control them with `fuse_xxx` attributes,
// so for oneDNN some attributes that look device-independent are also saved
// here. Ideally, fusion would be implemented as separate fusion ops and
// kernels instead of being patched onto a basic operation.
// Because the DeviceContext is effectively a global singleton, thread safety
// must be guaranteed, so thread_local variables are used.
static thread_local AttributeMap dnn_attrs_;
// For oneDNN, in addition to extra attrs there are also extra inputs,
// but only a few. Hopefully the implementation can be optimized to
// remove this member in the future.
static thread_local paddle::flat_hash_map<std::string, const DenseTensor*>
dnn_inputs_;
// oneDNN needs the input and output names of the current kernel to
// generate the unique_key.
static thread_local TensorNameMap inputs_name_;
static thread_local TensorNameMap outputs_name_;
};
thread_local AttributeMap OneDNNContext::Impl::dnn_attrs_ = {};
thread_local paddle::flat_hash_map<std::string, const DenseTensor*>
OneDNNContext::Impl::dnn_inputs_ = {};
thread_local TensorNameMap OneDNNContext::Impl::inputs_name_ = {};
thread_local TensorNameMap OneDNNContext::Impl::outputs_name_ = {};
OneDNNContext::OneDNNContext(const Place& place)
: CPUContext(place), impl_(std::make_unique<Impl>()) {}
......@@ -322,5 +413,49 @@ OneDNNContext::BlobPtr_t<void> OneDNNContext::GetBlob(
return impl_->GetBlob(name);
}
bool OneDNNContext::HasDnnAttr(const std::string& attr_name) const {
return impl_->HasDnnAttr(attr_name);
}
const Attribute& OneDNNContext::GetDnnAttr(const std::string& attr_name) const {
return impl_->GetDnnAttr(attr_name);
}
void OneDNNContext::SetDnnAttr(const std::string& attr_name, Attribute attr) {
return impl_->SetDnnAttr(attr_name, std::move(attr));
}
bool OneDNNContext::HasDnnInput(const std::string& input_name) const {
return impl_->HasDnnInput(input_name);
}
const DenseTensor* OneDNNContext::GetDnnInput(
const std::string& input_name) const {
return impl_->GetDnnInput(input_name);
}
void OneDNNContext::SetDnnInput(const std::string& input_name,
const DenseTensor* input) {
return impl_->SetDnnInput(input_name, input);
}
void OneDNNContext::SetInputsName(const TensorNameMap& inputs_name) {
impl_->SetInputsName(inputs_name);
}
void OneDNNContext::SetOutputsName(const TensorNameMap& outputs_name) {
impl_->SetOutputsName(outputs_name);
}
const std::vector<std::string>& OneDNNContext::GetInputsName(
const std::string& input) const {
return impl_->GetInputsName(input);
}
const std::vector<std::string>& OneDNNContext::GetOutputsName(
const std::string& output) const {
return impl_->GetOutputsName(output);
}
} // namespace phi
#endif
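OneDNNContext additionally carries extra inputs (e.g. Bias, ResidualData) and the operator's tensor names, which the oneDNN kernels need for fusion operands and primitive-cache keys. A hedged sketch of how a PHI oneDNN kernel might consume them, with FakeOneDNNCtx and conv_onednn_like as stand-ins for the real types:

```cpp
// Minimal sketch (not Paddle code): extra inputs and tensor names consumed
// from a oneDNN-style context. DenseTensor and the context are stubs.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct DenseTensor {};
using TensorNameMap = std::map<std::string, std::vector<std::string>>;

struct FakeOneDNNCtx {  // stand-in for phi::OneDNNContext
  std::map<std::string, const DenseTensor*> dnn_inputs;
  TensorNameMap inputs_name;

  bool HasDnnInput(const std::string& k) const { return dnn_inputs.count(k) != 0; }
  const DenseTensor* GetDnnInput(const std::string& k) const {
    return dnn_inputs.at(k);
  }
  const std::vector<std::string>& GetInputsName(const std::string& k) const {
    return inputs_name.at(k);
  }
};

void conv_onednn_like(const FakeOneDNNCtx& dev_ctx) {
  // Optional fused operand: absent means "no residual fusion requested".
  const DenseTensor* residual =
      dev_ctx.HasDnnInput("ResidualData") ? dev_ctx.GetDnnInput("ResidualData")
                                          : nullptr;
  std::cout << "fuse_residual=" << (residual != nullptr) << "\n";
  // Variable names feed the oneDNN primitive cache key.
  std::cout << "input var: " << dev_ctx.GetInputsName("Input")[0] << "\n";
}

int main() {
  DenseTensor residual;
  FakeOneDNNCtx ctx;
  ctx.dnn_inputs["ResidualData"] = &residual;
  ctx.inputs_name["Input"] = {"conv2d_0.tmp_in"};  // hypothetical variable name
  conv_onednn_like(ctx);
}
```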
......@@ -20,9 +20,12 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/attribute.h"
namespace phi {
using TensorNameMap = std::map<std::string, std::vector<std::string>>;
class OneDNNContextThreadLocals {
// default mkldnn session id
......@@ -134,6 +137,26 @@ class OneDNNContext : public CPUContext {
return OneDNNContextThreadLocals::fetch();
}
// Several methods for adapting ONEDNN-specific attributes and inputs
bool HasDnnAttr(const std::string& attr_name) const;
const Attribute& GetDnnAttr(const std::string& attr_name) const;
void SetDnnAttr(const std::string& attr_name, Attribute attr);
bool HasDnnInput(const std::string& input_name) const;
const DenseTensor* GetDnnInput(const std::string& input_name) const;
void SetDnnInput(const std::string& input_name, const DenseTensor* input);
void SetInputsName(const TensorNameMap& inputs_name);
void SetOutputsName(const TensorNameMap& outputs_name);
const std::vector<std::string>& GetInputsName(const std::string& input) const;
const std::vector<std::string>& GetOutputsName(
const std::string& output) const;
static const char* name() { return "OneDNNContext"; }
private:
struct Impl;
std::unique_ptr<Impl> impl_;
......
......@@ -195,6 +195,41 @@ inline std::string CreateKey(const OneDNNContext& dev_ctx, ArgTypes&&... args) {
return key;
}
inline std::vector<std::vector<int64_t>> ToOnednnPadding(
const std::vector<int64_t>& paddings) {
if (paddings.size() == 6) {
int padding_front = paddings[0];
int padding_back = paddings[1];
int padding_top = paddings[2];
int padding_bottom = paddings[3];
int padding_left = paddings[4];
int padding_right = paddings[5];
return {{padding_front, padding_top, padding_left},
{padding_back, padding_bottom, padding_right}};
} else {
int padding_top = paddings[0];
int padding_bottom = paddings[1];
int padding_left = paddings[2];
int padding_right = paddings[3];
return {{padding_top, padding_left}, {padding_bottom, padding_right}};
}
}
// The function adjusts the vector of weight dimensions for group convolutions
inline void GetGroupConvWeightsTz(std::vector<int64_t>& weights_tz, // NOLINT
const int groups) {
if (groups > 1) {
// if (is_conv3d) [o, i, d, h, w]->[g, o/g, i, d, h, w]
// else [o, i, h, w] -> [g, o/g, i, h, w]
weights_tz.push_back(0);
std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end());
weights_tz[0] = groups;
weights_tz[1] = weights_tz[1] / groups;
}
}
inline void MatchShapeToLayout(DenseTensor* tensor_in,
DataLayout from,
DataLayout to) {
......
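ToOnednnPadding and GetGroupConvWeightsTz above are pure shape manipulations, so a concrete run pins down their behavior: a 4-element paddings vector splits into {top, left} / {bottom, right} begin/end pairs, and a grouped weight shape [o, i, h, w] becomes [g, o/g, i, h, w]. A standalone copy of the logic for illustration (the real versions live in the oneDNN helper header):

```cpp
// Standalone copy (for illustration) of the padding / grouped-weights helpers.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

inline std::vector<std::vector<int64_t>> ToOnednnPadding(
    const std::vector<int64_t>& paddings) {
  if (paddings.size() == 6) {  // 3-D conv: front/back, top/bottom, left/right
    return {{paddings[0], paddings[2], paddings[4]},
            {paddings[1], paddings[3], paddings[5]}};
  }
  // 2-D conv: top/bottom, left/right
  return {{paddings[0], paddings[2]}, {paddings[1], paddings[3]}};
}

inline void GetGroupConvWeightsTz(std::vector<int64_t>& weights_tz,  // NOLINT
                                  const int groups) {
  if (groups > 1) {
    // [o, i, h, w] -> [g, o/g, i, h, w]
    weights_tz.push_back(0);
    std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end());
    weights_tz[0] = groups;
    weights_tz[1] = weights_tz[1] / groups;
  }
}

int main() {
  // paddings = {top, bottom, left, right}
  auto pads = ToOnednnPadding({1, 2, 3, 4});
  std::cout << "begin pads: " << pads[0][0] << "," << pads[0][1] << "\n";  // 1,3
  std::cout << "end pads:   " << pads[1][0] << "," << pads[1][1] << "\n";  // 2,4

  std::vector<int64_t> weights = {32, 8, 3, 3};  // [o, i, h, w]
  GetGroupConvWeightsTz(weights, 4);
  for (auto d : weights) std::cout << d << " ";  // 4 8 8 3 3
  std::cout << "\n";
}
```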
......@@ -39,6 +39,67 @@ using memory = dnnl::memory;
using OneDNNMemoryFormat = dnnl::memory::format_tag;
static void AppendActivation(const OneDNNContext& dev_ctx,
dnnl::post_ops& post_ops, // NOLINT
float activation_scale = 1.0f) {
const auto invalid_attribute =
dev_ctx.HasDnnAttr("fuse_activation")
? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation"))
.empty()
: true;
if (invalid_attribute) return;
const auto fuse_activation =
dev_ctx.HasDnnAttr("fuse_activation")
? PADDLE_GET_CONST(std::string, dev_ctx.GetDnnAttr("fuse_activation"))
: "";
const auto fuse_alpha =
dev_ctx.HasDnnAttr("fuse_alpha")
? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_alpha"))
: 0.0f;
const auto fuse_beta =
dev_ctx.HasDnnAttr("fuse_beta")
? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fuse_beta"))
: 0.0f;
if (fuse_activation == "hard_sigmoid") {
post_ops.append_eltwise(activation_scale,
dnnl::algorithm::eltwise_linear,
fuse_alpha,
fuse_beta);
post_ops.append_eltwise(
activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f);
} else {
const std::unordered_map<std::string, dnnl::algorithm> activation_map = {
{"abs", dnnl::algorithm::eltwise_abs},
{"clip", dnnl::algorithm::eltwise_clip},
{"gelu", dnnl::algorithm::eltwise_gelu_erf},
{"gelu_erf", dnnl::algorithm::eltwise_gelu_erf},
{"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh},
{"hard_swish", dnnl::algorithm::eltwise_hardswish},
{"leaky_relu", dnnl::algorithm::eltwise_relu},
{"mish", dnnl::algorithm::eltwise_mish},
{"relu", dnnl::algorithm::eltwise_relu},
{"relu6", dnnl::algorithm::eltwise_bounded_relu},
{"sigmoid", dnnl::algorithm::eltwise_logistic},
{"sqrt", dnnl::algorithm::eltwise_sqrt},
{"swish", dnnl::algorithm::eltwise_swish},
{"tanh", dnnl::algorithm::eltwise_tanh}};
const auto& activation_type = activation_map.find(fuse_activation);
PADDLE_ENFORCE_NE(
activation_type,
activation_map.end(),
phi::errors::InvalidArgument(
"Activation '%s' not found in oneDNN algorithms mapper",
fuse_activation));
post_ops.append_eltwise(
activation_scale, activation_type->second, fuse_alpha, fuse_beta);
}
}
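A typical caller builds the post-ops of a fused primitive and lets AppendActivation translate the `fuse_activation` extra attribute into the matching eltwise post-op. A sketch; the surrounding handler code and dev_ctx are assumed:
dnnl::post_ops post_ops;
// Appends e.g. eltwise_relu when dev_ctx carries fuse_activation == "relu";
// a missing or empty attribute leaves post_ops untouched.
AppendActivation(dev_ctx, post_ops);
dnnl::primitive_attr attrs;
attrs.set_post_ops(post_ops);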
template <typename T,
typename TForward,
typename TBackward = onednn_dummy_primitive,
......@@ -1085,5 +1146,6 @@ class ClipOneDNNHandler
to_void_cast<T>(input_data));
}
};
} // namespace funcs
} // namespace phi
......@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include <memory>
#include "paddle/phi/backends/xpu/forwards.h"
......@@ -26,7 +28,8 @@ namespace xpu = baidu::xpu::api;
namespace phi {
class XPUContext : public DeviceContext {
class XPUContext : public DeviceContext,
public TypeInfoTraits<DeviceContext, XPUContext> {
public:
XPUContext();
......@@ -65,6 +68,8 @@ class XPUContext : public DeviceContext {
XPUStream stream() const;
static const char* name() { return "XPUContext"; }
private:
struct Impl;
std::unique_ptr<Impl> impl_;
......@@ -79,3 +84,5 @@ using KPSContext = XPUContext;
#endif
} // namespace phi
#endif
......@@ -48,6 +48,6 @@ using Attribute = paddle::variant<bool,
DataLayout,
Place>;
using RuntimeAttrs = paddle::flat_hash_map<std::string, Attribute>;
using AttributeMap = paddle::flat_hash_map<std::string, Attribute>;
} // namespace phi
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/utils/type_registry.h"
namespace phi {
class TensorBase;
......@@ -188,9 +189,21 @@ class PADDLE_API DeviceContext {
*/
Generator* GetHostGenerator() const;
/**
* @brief Return the type information of the derived class to support
* safe downcasting in a non-RTTI environment.
*
* @return The type information of the derived class.
*/
TypeInfo<DeviceContext> type_info() const { return type_info_; }
private:
struct Impl;
std::unique_ptr<Impl> impl_;
template <typename T, typename U>
friend class TypeInfoTraits;
TypeInfo<DeviceContext> type_info_{TypeInfo<DeviceContext>::kUnknownType};
};
} // namespace phi
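The stored TypeInfo allows a safe downcast without RTTI. A sketch of the intended pattern, assuming TypeInfoTraits in paddle/phi/core/utils/type_registry.h provides the usual classof helper (not shown in this diff):
void MaybeQueryOneDNNAttrs(const phi::DeviceContext& dev_ctx) {
  // classof compares dev_ctx.type_info() against OneDNNContext's type id.
  if (phi::OneDNNContext::classof(&dev_ctx)) {
    const auto& onednn_ctx = static_cast<const phi::OneDNNContext&>(dev_ctx);
    // ... onednn_ctx.HasDnnAttr(...) / GetDnnAttr(...) ...
  }
}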
......@@ -43,6 +43,7 @@ limitations under the License. */
#include "paddle/phi/core/errors.h"
#include "paddle/utils/string/printf.h"
#include "paddle/utils/string/to_string.h"
#include "paddle/utils/variant.h"
DECLARE_int32(call_stack_level);
......@@ -409,80 +410,75 @@ struct EnforceNotMet : public std::exception {
/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/
/*
* Summary: This macro is used to get Variable or internal type
* data (such as LoDTensor or SelectedRows) of the Input and
* Output in op, generally used when call scope.FindVar(Input/
* Output("Name")) or ctx.Input<LoDTensor>().
* Firstly this macro check whether the obtained pointer is null,
* and then return data if it is not null.
*
* Note: This macro is only suitable for specific scenarios and
* is not intended to be widely used. If it cannot meet the
* requirements, please use other PADDLE_ENFORCE** check macros.
* Summary: The PADDLE_GET(_**) series of macros is used to call paddle::get
* safely. paddle::get is not a completely safe API: although it will not
* go wrong in most cases, in extreme cases it may fail and directly
* throw a paddle::bad_variant_access exception, without any stack
* information.
* This kind of problem is difficult to debug, so these macros are added to
* enrich paddle::get error information. At the same time, direct use of
* paddle::get is restricted by CI rule.
*
* Parameters:
*     __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Return: The data pointed to by the pointer.
*     __TYPE: the target variable type
* __VALUE: the target variable to get
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
/*
* Summary: This macro is used to check whether op has specified
* Input or Output Variables. Because op's Input and Output
* checking are written similarly, so abstract this macro.
*
* Parameters:
*     __EXPR: (bool), the bool expression
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
* - unsafe writing: int x = paddle::get<int>(y);
* - safe writing: int x = PADDLE_GET(int, y);
*
* Examples:
* OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
* Note: GCC 4.8 cannot select the right overloaded function here, so we need
* to define different functions and macros here; after we upgrade the
* CI gcc version, we can define a single PADDLE_GET macro.
*/
#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \
do { \
PADDLE_ENFORCE_EQ( \
__EXPR, \
true, \
phi::errors::NotFound( \
"No %s(%s) found for %s operator.", __ROLE, __NAME, __OP_TYPE)); \
} while (0)
namespace details {
#define DEFINE_SAFE_PADDLE_GET( \
__InputType, __OutputType, __OutputTypePtr, __FuncName) \
template <typename OutputType, typename InputType> \
auto __FuncName( \
__InputType input, const char* expression, const char* file, int line) \
->typename std::conditional<std::is_pointer<InputType>::value, \
__OutputTypePtr, \
__OutputType>::type { \
try { \
return paddle::get<OutputType>(input); \
} catch (paddle::bad_variant_access const&) { \
HANDLE_THE_ERROR \
throw ::phi::enforce::EnforceNotMet( \
phi::errors::InvalidArgument( \
"paddle::get failed, cannot get value " \
"(%s) by type %s, its type is %s.", \
expression, \
phi::enforce::demangle(typeid(OutputType).name()), \
phi::enforce::demangle(input.type().name())), \
file, \
line); \
END_HANDLE_THE_ERROR \
} \
}
DEFINE_SAFE_PADDLE_GET(InputType&, OutputType&, OutputType*, SafeBoostGet);
DEFINE_SAFE_PADDLE_GET(const InputType&,
const OutputType&,
const OutputType*,
SafeBoostGetConst);
DEFINE_SAFE_PADDLE_GET(InputType&&,
OutputType,
OutputType*,
SafeBoostGetMutable);
} // namespace details
#define PADDLE_GET(__TYPE, __VALUE) \
phi::enforce::details::SafeBoostGet<__TYPE>( \
__VALUE, #__VALUE, __FILE__, __LINE__)
#define PADDLE_GET_CONST(__TYPE, __VALUE) \
phi::enforce::details::SafeBoostGetConst<__TYPE>( \
__VALUE, #__VALUE, __FILE__, __LINE__)
#define PADDLE_GET_MUTABLE(__TYPE, __VALUE) \
phi::enforce::details::SafeBoostGetMutable<__TYPE>( \
__VALUE, #__VALUE, __FILE__, __LINE__)
} // namespace enforce
using namespace enforce; // NOLINT
......
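In this PR the macros above are what kernels use to unpack phi::Attribute values fetched from a device context. A short sketch; the attribute name follows the oneDNN usage shown earlier, and dev_ctx is assumed:
// Throws phi::enforce::EnforceNotMet with expression/file/line information
// instead of a bare paddle::bad_variant_access on a type mismatch.
const phi::Attribute& attr = dev_ctx.GetDnnAttr("fuse_alpha");
float fuse_alpha = PADDLE_GET_CONST(float, attr);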
......@@ -138,8 +138,6 @@ class KernelContext {
template <typename AttrType>
const AttrType& AttrAt(size_t idx) const;
const RuntimeAttrs& GetRuntimeAttrs() const { return runtime_attrs_; }
size_t InputsSize() const { return inputs_.size(); }
size_t OutputsSize() const { return outputs_.size(); }
size_t AttrsSize() const { return attrs_.size(); }
......@@ -161,8 +159,6 @@ class KernelContext {
paddle::small_vector<std::pair<int, int>, kInputSmallVectorSize> input_range_;
paddle::small_vector<std::pair<int, int>, kOutputSmallVectorSize>
output_range_;
RuntimeAttrs runtime_attrs_;
};
} // namespace phi
......@@ -233,8 +233,6 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
args_def->AppendAttribute(AttributeType::DATA_LAYOUT);
} else if (arg_type == std::type_index(typeid(Place))) {
args_def->AppendAttribute(AttributeType::PLACE);
} else if (arg_type == std::type_index(typeid(RuntimeAttrs))) {
// do nothing
} else {
PADDLE_THROW(phi::errors::Unavailable(
"Unsupported kernel argument type `%s`.", arg_type.name()));
......
......@@ -14,13 +14,7 @@
#pragma once
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/custom/custom_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/onednn/onednn_context.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_context.h"
#endif
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -330,21 +324,6 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(TensorArray);
template <typename... Tail>
struct KernelCallHelper<const RuntimeAttrs&, Tail...> {
template <int dev_ctx_idx,
int in_idx,
int attr_idx,
int out_idx,
typename... PreviousArgs>
static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {
const auto& runtime_attrs = ctx->GetRuntimeAttrs();
KernelCallHelper<Tail...>::
template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx>(
ctx, pargs..., runtime_attrs);
}
};
/* End case */
template <typename T>
struct KernelCallHelper<TypeTag<T>> {
......
......@@ -409,12 +409,9 @@ void ConvInferMeta(const MetaTensor& input,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
int groups,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
MetaTensor* out,
MetaConfig config) {
std::vector<int> paddings = paddings_t;
......@@ -559,27 +556,27 @@ void ConvInferMeta(const MetaTensor& input,
out->set_dtype(input.dtype());
}
void ConvInferInferMeta(const MetaTensor& input,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* out,
MetaConfig config) {
void Conv3DInferMeta(const MetaTensor& input,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
MetaTensor* out,
MetaConfig config) {
ConvInferMeta(input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
groups,
data_format,
/*use_addto=*/false,
/*workspace_size_MB=*/512, // useless in infermeta
/*exhaustive_search=*/false,
out,
config);
}
......@@ -922,6 +919,31 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits,
loss->share_lod(logits);
}
void DepthwiseConvInferMeta(const MetaTensor& input,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
MetaTensor* out,
MetaConfig config) {
ConvInferMeta(input,
filter,
strides,
paddings,
padding_algorithm,
dilations,
groups,
data_format,
out,
config);
}
void DistInferMeta(const MetaTensor& x,
const MetaTensor& y,
float p,
......@@ -2876,4 +2898,3 @@ void Unpool3dInferMeta(const MetaTensor& x,
} // namespace phi
PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta);
PD_REGISTER_INFER_META_FN(conv2d_infer, phi::ConvInferInferMeta);
......@@ -80,26 +80,26 @@ void ConvInferMeta(const MetaTensor& input,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ConvInferInferMeta(const MetaTensor& input,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
MetaTensor* out,
MetaConfig config = MetaConfig());
void Conv3DInferMeta(const MetaTensor& input,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
MetaTensor* out,
MetaConfig config = MetaConfig());
void ConvTransposeInferMeta(const MetaTensor& x,
const MetaTensor& filter,
......@@ -143,6 +143,20 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits,
MetaTensor* loss,
MetaConfig config = MetaConfig());
void DepthwiseConvInferMeta(const MetaTensor& input,
const MetaTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
MetaTensor* out,
MetaConfig config = MetaConfig());
void DistInferMeta(const MetaTensor& x,
const MetaTensor& y,
float p,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad);
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad);
} // namespace phi
......@@ -25,13 +25,10 @@ void ConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad);
......@@ -42,7 +39,7 @@ void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
......@@ -59,7 +56,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
......@@ -70,4 +67,41 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad);
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void ConvInferKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
ConvKernel<T, Context>(dev_ctx,
input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
/*use_addto=*/false,
/*workspace_size_MB=*/
paddle::platform::GetDefaultConvWorkspaceSizeLimitMB(),
/*exhaustive_search=*/false,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_infer, CPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(
conv2d_infer, GPU, ALL_LAYOUT, phi::ConvInferKernel, float, double) {}
#endif
......@@ -25,12 +25,9 @@ void ConvKernel(const Context& dev_ctx,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out);
template <typename T, typename Context>
......@@ -54,7 +51,7 @@ void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
......@@ -64,16 +61,4 @@ void DepthwiseConvKernel(const Context& dev_ctx,
bool fuse_relu,
DenseTensor* out);
template <typename T, typename Context>
void ConvInferKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
namespace phi {
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad) {
ConvGradGradKernel<T>(ctx,
input,
filter,
out_grad,
input_grad_grad,
filter_grad_grad,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
input_grad,
filter_grad,
out_grad_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
PD_REGISTER_KERNEL(conv3d_grad_grad,
CPU,
ALL_LAYOUT,
phi::Conv3DGradGradKernel,
float,
double) {}
......@@ -27,7 +27,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
......@@ -43,13 +43,10 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
out_grad,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
groups,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
......@@ -61,7 +58,7 @@ void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
......@@ -76,17 +73,50 @@ void Conv3DGradKernel(const Context& dev_ctx,
out_grad,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
groups,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad) {
ConvGradGradKernel<T>(ctx,
input,
filter,
out_grad,
input_grad_grad,
filter_grad_grad,
strides,
paddings_t,
padding_algorithm,
dilations_t,
groups,
data_format,
input_grad,
filter_grad,
out_grad_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
......@@ -101,3 +131,14 @@ PD_REGISTER_KERNEL(depthwise_conv2d_grad,
PD_REGISTER_KERNEL(
conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
PD_REGISTER_KERNEL(
conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
PD_REGISTER_KERNEL(conv3d_grad_grad,
CPU,
ALL_LAYOUT,
phi::Conv3DGradGradKernel,
float,
double) {}
......@@ -19,6 +19,30 @@
#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
namespace phi {
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
DenseTensor* out) {
ConvKernelImpl<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
......@@ -34,19 +58,16 @@ void DepthwiseConvKernel(const Context& dev_ctx,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
ConvKernelImpl<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
template <typename T, typename Context>
......@@ -63,19 +84,16 @@ void Conv3DKernel(const Context& dev_ctx,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
ConvKernelImpl<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
} // namespace phi
......
......@@ -12,6 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows
#endif
#include "paddle/phi/kernels/erfinv_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
......
......@@ -12,10 +12,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows
#endif
#include "paddle/phi/kernels/erfinv_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T, typename Context>
void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
ctx.template Alloc<T>(out);
auto eigen_in = EigenVector<T>::Flatten(x);
auto eigen_out = EigenVector<T>::Flatten(*out);
auto& place = *ctx.eigen_device();
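  // erfinv(x) = ndtri((x + 1) / 2) / sqrt(2): shift x from [-1, 1] to [0, 1],
  // apply Eigen's inverse standard-normal CDF, then rescale by 1/sqrt(2).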
constexpr T half = static_cast<T>(0.5);
constexpr T half_sqrt = static_cast<T>(M_SQRT1_2);
eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt;
}
} // namespace phi
PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
PD_REGISTER_KERNEL(
conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
......@@ -27,7 +27,7 @@ void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
......@@ -42,13 +42,10 @@ void Conv3DGradKernel(const Context& dev_ctx,
out_grad,
strides,
paddings,
paddding_algorithm,
groups,
padding_algorithm,
dilations,
groups,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
......@@ -60,3 +57,7 @@ PD_REGISTER_KERNEL(
PD_REGISTER_KERNEL(
conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
PD_REGISTER_KERNEL(
conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
......@@ -20,6 +20,29 @@
namespace phi {
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
const std::vector<int>& dilations,
int groups,
const std::string& data_format,
DenseTensor* out) {
ConvKernelImpl<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
......@@ -34,19 +57,16 @@ void Conv3DKernel(const Context& dev_ctx,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
ConvKernelImpl<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
out);
}
} // namespace phi
......
......@@ -12,6 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows
#endif
#include "paddle/phi/kernels/erfinv_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
......
......@@ -42,18 +42,23 @@ void ConvCudnnKernel(const Context& ctx,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
int groups,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* output) {
ctx.template Alloc<T>(output);
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool has_exhaustive_search = ctx.HasDnnAttr("exhaustive_search");
VLOG(4) << "GPUContext contains `exhaustive_search`: "
<< has_exhaustive_search;
bool exhaustive_search_attr =
has_exhaustive_search
? PADDLE_GET_CONST(bool, ctx.GetDnnAttr("exhaustive_search"))
: false;
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || exhaustive_search_attr;
bool deterministic = FLAGS_cudnn_deterministic;
PADDLE_ENFORCE_EQ(exhaustive_search && deterministic,
false,
......@@ -402,12 +407,9 @@ void Conv3DCudnnKernel(const Context& dev_ctx,
strides,
paddings,
padding_algorithm,
groups,
dilations,
groups,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
......@@ -432,12 +434,9 @@ void DepthwiseConvCudnnKernel(const Context& dev_ctx,
strides,
paddings,
padding_algorithm,
groups,
dilations,
groups,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad) {
const DenseTensor* X = &input;
const DenseTensor* dY = &out_grad;
const DenseTensor* ddX = input_grad_grad.get_ptr();
const DenseTensor* ddW_in = filter_grad_grad.get_ptr();
DenseTensor* ddY = out_grad_grad;
DenseTensor* dW = filter_grad;
DenseTensor* dX = input_grad;
DenseTensor W = filter;
if (!ddY && !dW && !dX) return;
const std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensor
DenseTensor transformed_X(X->type());
DenseTensor transformed_dY(dY->type());
DenseTensor transformed_ddX(X->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
TransToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
ResizeToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
TransToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
if (ddX) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
TransToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
}
} else {
transformed_X = *X;
transformed_dY = *dY;
if (ddX) {
transformed_ddX = *ddX;
}
}
// update padding and dilation
auto in_dims = transformed_X.dims();
auto filter_dims = W.dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_X.dims()[0]);
std::vector<int64_t> filter_shape_vec(vectorize(W.dims()));
std::vector<int64_t> output_shape_vec(vectorize(transformed_dY.dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
// col_shape [in_channel/group, kh, kw, oh, ow]
col_shape_vec[0] = transformed_X.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// col_matrix_shape [in_channel/group * kh * kw, oh * ow]
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
// input_shape [Cin, H, W]
DDim input_shape =
slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
// filter_matrix_shape [Cout, Cin * kh * kw]
DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]};
W.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_dY.dims()[1],
transformed_dY.numel() /
(transformed_dY.dims()[0] * transformed_dY.dims()[1])};
int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
DenseTensor col;
DenseTensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
dev_ctx.template Alloc<T>(&col);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<Context, T> set_zero;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
// dx convolution double grad: gemm + col2im(col2vol)
// dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
// oH, oW)
if (dX && ddW_in) {
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
dev_ctx.template Alloc<T>(dX);
DenseTensor transformed_dX(dX->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, dX, &transformed_dX);
} else {
transformed_dX = *dX;
}
// if is_expand is false, the operation of set_zero is unnecessary
// because math::matmul will reset dx
if (is_expand) {
set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
}
paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
paddle::operators::math::
Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
col2im;
for (int i = 0; i < batch_size; i++) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(dx_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(
ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx,
col,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&dx_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_dX, dX);
}
}
// dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
// oH, oW)
// dw convolution double grad: im2col(vol2col) + gemm
if (dW && ddX) {
dev_ctx.template Alloc<T>(dW);
set_zero(dev_ctx, dW, static_cast<T>(0));
DenseTensor dW_arr = *dW;
dW_arr.Resize(filter_matrix_shape);
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; ++g) {
// im2col
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0));
}
}
}
// ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
// w/ddw(Cout, Cin, kh, kw)
// ddy convolution double grad: im2col(vol2col) + gemm
if (ddY) {
dev_ctx.template Alloc<T>(ddY);
DenseTensor transformed_ddY(ddY->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddY, &transformed_ddY);
} else {
transformed_ddY = *ddY;
}
set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor ddy_batch =
transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; ++g) {
// gemm
DenseTensor ddy_slice =
ddy_batch.Slice(g * out_step, (g + 1) * out_step);
if (ddX) {
DenseTensor ddx_batch =
transformed_ddX.Slice(i, i + 1).Resize(input_shape);
DenseTensor ddx_slice =
ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0));
}
if (ddW_in) {
DenseTensor x_batch =
transformed_X.Slice(i, i + 1).Resize(input_shape);
DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
DenseTensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
if (!is_expand) {
col.ShareDataWith(x_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
x_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
}
// gemm
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0));
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_ddY, ddY);
}
}
}
} // namespace phi
......@@ -16,7 +16,6 @@
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
......@@ -32,12 +31,9 @@ void ConvGradKernel(const Context& dev_ctx,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
int groups,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
// The filter and filter_grad will be reshaped in the calculations,
......@@ -254,4 +250,304 @@ void ConvGradKernel(const Context& dev_ctx,
}
}
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& input_grad_grad,
const paddle::optional<DenseTensor>& filter_grad_grad,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
const std::vector<int>& dilations_t,
int groups,
const std::string& data_format,
DenseTensor* input_grad,
DenseTensor* filter_grad,
DenseTensor* out_grad_grad) {
const DenseTensor* X = &input;
const DenseTensor* dY = &out_grad;
const DenseTensor* ddX = input_grad_grad.get_ptr();
const DenseTensor* ddW_in = filter_grad_grad.get_ptr();
DenseTensor* ddY = out_grad_grad;
DenseTensor* dW = filter_grad;
DenseTensor* dX = input_grad;
DenseTensor W = filter;
if (!ddY && !dW && !dX) return;
const std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensor
DenseTensor transformed_X(X->type());
DenseTensor transformed_dY(dY->type());
DenseTensor transformed_ddX(X->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
TransToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
ResizeToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
TransToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
if (ddX) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
TransToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
}
} else {
transformed_X = *X;
transformed_dY = *dY;
if (ddX) {
transformed_ddX = *ddX;
}
}
// update padding and dilation
auto in_dims = transformed_X.dims();
auto filter_dims = W.dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_X.dims()[0]);
std::vector<int64_t> filter_shape_vec(vectorize(W.dims()));
std::vector<int64_t> output_shape_vec(vectorize(transformed_dY.dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
// col_shape [in_channel/group, kh, kw, oh, ow]
col_shape_vec[0] = transformed_X.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// col_matrix_shape [in_channel/group * kh * kw, oh * ow]
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
// input_shape [Cin, H, W]
DDim input_shape =
slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
// filter_matrix_shape [Cout, Cin * kh * kw]
DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]};
W.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_dY.dims()[1],
transformed_dY.numel() /
(transformed_dY.dims()[0] * transformed_dY.dims()[1])};
int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
DenseTensor col;
DenseTensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
dev_ctx.template Alloc<T>(&col);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<Context, T> set_zero;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
// dx convolution double grad: gemm + col2im(col2vol)
// dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
// oH, oW)
if (dX && ddW_in) {
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
dev_ctx.template Alloc<T>(dX);
DenseTensor transformed_dX(dX->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, dX, &transformed_dX);
} else {
transformed_dX = *dX;
}
// if is_expand is false, the operation of set_zero is unnecessary
// because math::matmul will reset dx
if (is_expand) {
set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
}
paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
paddle::operators::math::
Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
col2im;
for (int i = 0; i < batch_size; i++) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(dx_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(
ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx,
col,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&dx_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_dX, dX);
}
}
// dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
// oH, oW)
// dw convolution double grad: im2col(vol2col) + gemm
if (dW && ddX) {
dev_ctx.template Alloc<T>(dW);
set_zero(dev_ctx, dW, static_cast<T>(0));
DenseTensor dW_arr = *dW;
dW_arr.Resize(filter_matrix_shape);
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; ++g) {
// im2col
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0));
}
}
}
// ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
// w/ddw(Cout, Cin, kh, kw)
// ddy convolution double grad: im2col(vol2col) + gemm
if (ddY) {
dev_ctx.template Alloc<T>(ddY);
DenseTensor transformed_ddY(ddY->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddY, &transformed_ddY);
} else {
transformed_ddY = *ddY;
}
set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor ddy_batch =
transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; ++g) {
// gemm
DenseTensor ddy_slice =
ddy_batch.Slice(g * out_step, (g + 1) * out_step);
if (ddX) {
DenseTensor ddx_batch =
transformed_ddX.Slice(i, i + 1).Resize(input_shape);
DenseTensor ddx_slice =
ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0));
}
if (ddW_in) {
DenseTensor x_batch =
transformed_X.Slice(i, i + 1).Resize(input_shape);
DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
DenseTensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
if (!is_expand) {
col.ShareDataWith(x_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
x_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
}
// gemm
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0));
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_ddY, ddY);
}
}
}
} // namespace phi
......@@ -13,9 +13,6 @@
// limitations under the License.
#pragma once
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows
#endif
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
......@@ -289,12 +289,9 @@ class Conv2D(layers.Layer):
self._stride,
self._padding,
"EXPLICIT",
self._groups if self._groups else 1,
self._dilation,
self._groups if self._groups else 1,
"NCHW",
False,
-1,
False,
)
if self.bias is not None:
pre_act = F.elementwise_add(pre_bias, self.bias, axis=1)
......
This diff has been collapsed.