Unverified commit d50fb43e, authored by hong, committed by GitHub

Move conv to pten (#39354)

* move conv to pten

* move conv to pten; test=develop

* fix bug;

* add conv cudnn impl; test=develop

* update

* update operator; test=develop

* fix bug; test=develop

* move operator and prepared_operator to develop; test=develop

* resolve conflict; test=develop

* remove useless code;test=develop

* add dependency; test=develop

* fix bug;

* add sig.cc ; test=develop

* fix use_op error; test=develop

* fix bug; test=develop

* fix bug; test=develop

* add conv3d register; test=develop

* fix star gan and conv_nn_grad test failed; test=develop

* add header; test=develop

* manually recover to develop;

* resolve conflict; test=develop

* remove useless code

* fix bug;

* remove conv2d_cudnn; test=develop

* fix bugs; test=develop

* fix cpu rocm compile bugs; test=develop

* fix blas error; test=develop

* fix compile bug; test=develop

* fix windows compile error; test=develop

* fix windows error; test=develop

* resolve conflict; test=develop
Parent commit eaacf8bf
@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-USE_OP(conv2d);
+USE_OP_ITSELF(conv2d);
USE_OP(conv2d_transpose);
namespace paddle {
...
@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
namespace paddle {
namespace operators {
@@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
}
template <typename DeviceContext, typename T, size_t D>
-static void RemovePaddingSlice(const framework::ExecutionContext& context,
+static void RemovePaddingSlice(const phi::GPUContext& context,
const Tensor* input, Tensor* out,
const std::vector<int>& starts,
const std::vector<int>& axes) {
-auto& place =
-    *context.template device_context<DeviceContext>().eigen_device();
+auto& place = *context.eigen_device();
auto in_dims = input->dims();
auto new_out_dims = out->dims();
auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
@@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector<PerfType>& perf_results,
using framework::ConvSearchCache;
-static void SetConvMathType(const framework::ExecutionContext& ctx,
-                            cudnnDataType_t dtype,
+static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
@@ -231,8 +230,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                   bool deterministic,
-                   const framework::ExecutionContext& ctx) {
+                   bool deterministic, const phi::GPUContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type;
bool has_got_workspace_size = true;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -284,8 +282,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
} else if (deterministic) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
} else {
-auto& dev_ctx =
-    ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
@@ -346,8 +343,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                   bool deterministic,
-                   const framework::ExecutionContext& ctx) {
+                   bool deterministic, const phi::GPUContext& ctx) {
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
@@ -413,8 +409,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
-auto& dev_ctx =
-    ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
@@ -478,8 +473,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                   bool deterministic,
-                   const framework::ExecutionContext& ctx) {
+                   bool deterministic, const phi::GPUContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
@@ -534,8 +528,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
-auto& dev_ctx =
-    ctx.template device_context<platform::CUDADeviceContext>();
+auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardFilter());
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/kernels/funcs/padding.h"
DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) {
return dev_ctx.GetComputeCapability() >= 70;
}
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
const Tensor* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
ctx.Attr<bool>("exhaustive_search"));
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
platform::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
// HIP MIOPEN ONLY SUPPORT NCHW format
auto compute_format = DataLayout::kNCHW;
#else
// Tensor Cores, introduced with Volta GPUs, support faster conv ops
// with FP16 in NHWC data format.
const bool compute_in_nhwc =
dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
// We will only do data format conversion from NHWC to NCHW.
// cudnn will convert NCHW to NHWC automatically on Tensor Core.
auto compute_format =
compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
// ------------ transformed tensor -----------
Tensor transformed_input_channel(input->type());
Tensor transformed_output(output->type());
Tensor transformed_filter_channel(filter->type());
T* output_data = nullptr;
if (channel_last && compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(ctx, output,
&transformed_output);
} else {
transformed_input_channel.ShareDataWith(*input);
transformed_output.ShareDataWith(*output);
}
if (compute_format == DataLayout::kNHWC) {
VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
TransToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
} else {
transformed_filter_channel.ShareDataWith(*filter);
}
output_data = transformed_output.data<T>();
// update padding and dilation
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (compute_format == DataLayout::kNCHW) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims =
phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
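// cuDNN only supports padding the same amount on both sides of a dimension.
// When the padding is asymmetric, the input is padded explicitly below so that
// only the symmetric part (padding_common) is passed to cuDNN.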
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2,
0);
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
transformed_input =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors ---------------------
ConvArgs args{&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
padding_common,
dilations,
dtype};
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
: DataLayout::kNCDHW;
}
auto layout_format = GetCudnnTensorFormat(layout);
args.handle = handle;
#ifdef PADDLE_WITH_HIP
// MIOPEN needs to set groups in cdesc in miopen_desc.h
args.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), groups);
#else
args.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn());
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it manually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount(
args.cdesc.desc(), groups));
groups = 1;
#endif
#ifdef PADDLE_WITH_HIP
// MIOPEN do not set groups in wdesc after set groups in cdesc
groups = 1;
#endif
args.idesc.set(transformed_input, layout_format);
args.wdesc.set(transformed_filter_channel, layout_format, groups);
args.odesc.set(transformed_output, layout_format);
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == DataLayout::kNHWC) {
GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d,
&o_h, &o_w);
} else {
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
&o_h, &o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
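// Per-group offsets: when the convolution is run group by group below, each
// cuDNN call is offset into the input, output and filter data by one group.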
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t algo{};
using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args);
algo = search::Find<T>(args, exhaustive_search, deterministic,
workspace_size, ctx);
#else
cudnnConvolutionFwdAlgo_t algo{};
using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
workspace_size = search::GetWorkspaceSize(args, algo);
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
// When groups > 1, the algorithm found by SearchAlgorithm is
// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is
// unstable in forward computation, so change the algorithm to
// CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM manually.
if (ctx.Attr<int>("groups") > 1) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
}
#endif
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args.idesc.desc(), input_data,
args.wdesc.desc(), filter_data, args.cdesc.desc(), algo,
&beta, args.odesc.desc(), output_data, workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
handle, &alpha, args.idesc.desc(),
input_data + i * group_offset_in, args.wdesc.desc(),
filter_data + i * group_offset_filter, args.cdesc.desc(),
algo, workspace_ptr, workspace_size, &beta,
args.odesc.desc(), output_data + i * group_offset_out));
},
workspace_size);
}
#endif
if (channel_last && compute_format == DataLayout::kNCHW) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_output, output);
}
}
};
template <typename T>
class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
int groups = ctx.Attr<int>("groups");
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
ctx.Attr<bool>("exhaustive_search"));
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
platform::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
// HIP MIOPEN ONLY SUPPORT NCHW format
auto compute_format = DataLayout::kNCHW;
#else
const bool compute_in_nhwc =
dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx);
auto compute_format =
compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvGradOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == DataLayout::kNHWC ? "NHWC" : "NCHW");
// transform Tensor
Tensor transformed_input_channel(input->type());
Tensor transformed_output_grad_channel(output_grad->type());
Tensor transformed_input_grad_channel(input->type());
Tensor transformed_filter_channel(filter->type());
Tensor transformed_filter_grad_channel(filter->type());
if (channel_last && compute_format == DataLayout::kNCHW) {
VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
"NHWC to NCHW.";
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input, &transformed_input_channel);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, output_grad, &transformed_output_grad_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, output_grad, &transformed_output_grad_channel);
if (input_grad) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input_grad, &transformed_input_grad_channel);
// NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy
// the data of input_grad to transformed_input_grad_channel.
if (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) {
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, input_grad, &transformed_input_grad_channel);
}
}
} else {
transformed_input_channel.ShareDataWith(*input);
transformed_output_grad_channel.ShareDataWith(*output_grad);
if (input_grad) {
transformed_input_grad_channel.ShareDataWith(*input_grad);
}
}
if (compute_format == DataLayout::kNHWC) {
VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
TransToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter, &transformed_filter_channel);
if (filter_grad) {
ResizeToChannelLast<platform::CUDADeviceContext, T>(
ctx, filter_grad, &transformed_filter_grad_channel);
}
} else {
transformed_filter_channel.ShareDataWith(*filter);
if (filter_grad) {
transformed_filter_grad_channel.ShareDataWith(*filter_grad);
}
}
// update paddings
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (compute_format == DataLayout::kNCHW) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims =
phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
// cuDNN only supports padding the same amount on every dimension.
// So we create a new padded input tensor.
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input->type());
Tensor transformed_input_grad(input->type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
transformed_input_grad.Resize(new_input_shape);
auto& dev_ctx =
ctx.template device_context<paddle::platform::CUDADeviceContext>();
transformed_input =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
if (input_grad) {
transformed_input_grad =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
}
// pad for input
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_input_channel, pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (input_grad) {
transformed_input_grad.ShareDataWith(transformed_input_grad_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* output_grad_data = transformed_output_grad_channel.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
T* filter_grad_data = nullptr;
T* input_grad_data = nullptr;
T* transformed_input_grad_data = nullptr;
ConvArgs args1{&transformed_input_grad,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
ConvArgs args2{&transformed_input,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
auto handle = dev_ctx.cudnn_handle();
DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC
: DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC
: DataLayout::kNCDHW;
}
auto layout_tensor = GetCudnnTensorFormat(layout);
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == DataLayout::kNHWC) {
GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n,
&o_c, &o_d, &o_h, &o_w);
} else {
GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d,
&i_h, &i_w);
GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n,
&o_c, &o_d, &o_h, &o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
// input data workspace_size
size_t workspace_size_d = 0;
// weight workspace_size
size_t workspace_size_w = 0;
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
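// With cuDNN 7+ (and with MIOPEN) the group count is carried by the
// convolution descriptor (c_groups), so the tensor/filter descriptors use
// iwo_groups = 1 and the per-group loops below run only once.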
if (input_grad) {
// ------------------- cudnn descriptors ---------------------
input_grad_data = input_grad->data<T>();
transformed_input_grad_data = transformed_input_grad.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_input_grad, layout_tensor);
args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
args1.odesc.set(transformed_output_grad_channel, layout_tensor);
args1.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_groups);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
workspace_size_d, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo =
search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
workspace_size_d = std::max(workspace_size_d,
search1::GetWorkspaceSize(args1, data_algo));
#endif
}
if (filter_grad) {
// ------------------- cudnn descriptors ---------------------
filter_grad_data = transformed_filter_grad_channel.data<T>();
args2.handle = handle;
args2.idesc.set(transformed_input, layout_tensor);
args2.wdesc.set(transformed_filter_grad_channel, layout_tensor,
iwo_groups);
args2.odesc.set(transformed_output_grad_channel, layout_tensor);
args2.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
workspace_size_w, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
workspace_size_w = std::max(
workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo));
#endif
}
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f;
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
ScalingParamType<T> beta = 0.0f;
#else
ScalingParamType<T> beta =
(ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) ? 1.0f : 0.0f;
#endif
VLOG(4) << "Conv_grad: use_addto = "
<< (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto"));
if (input_grad) {
// When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, the output cannot be reset since the addto strategy is used.
#ifdef PADDLE_WITH_HIP
if (ctx.HasAttr("use_addto") && ctx.Attr<bool>("use_addto")) {
Tensor temp_tensor(transformed_input_grad.type());
temp_tensor.Resize(transformed_input_grad.dims());
T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args1.odesc.desc(), output_grad_data,
args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
data_algo, &beta, args1.idesc.desc(), temp_tensor_data,
cudnn_workspace_ptr, workspace_size_d));
},
workspace_size_d);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor(
handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(),
transformed_input_grad_data, &alpha, args1.idesc.desc(),
temp_tensor_data, &beta, args1.idesc.desc(),
transformed_input_grad_data));
} else {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args1.odesc.desc(), output_grad_data,
args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
data_algo, &beta, args1.idesc.desc(),
transformed_input_grad_data, cudnn_workspace_ptr,
workspace_size_d));
},
workspace_size_d);
}
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, args1.wdesc.desc(),
filter_data + i * group_offset_filter, args1.odesc.desc(),
output_grad_data + i * group_offset_out,
args1.cdesc.desc(), data_algo, cudnn_workspace_ptr,
workspace_size_d, &beta, args1.idesc.desc(),
transformed_input_grad_data + i * group_offset_in));
},
workspace_size_d);
}
#endif
if (!is_sys_pad) {
std::vector<int> starts(transformed_input_channel.dims().size(), 0);
std::vector<int> axes(transformed_input_channel.dims().size(), 0);
for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
transformed_input_grad_channel.mutable_data(ctx.GetPlace());
if (transformed_input_channel.dims().size() == 4) {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_input_grad, &transformed_input_grad_channel,
starts, axes);
} else {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 5>(
ctx, &transformed_input_grad, &transformed_input_grad_channel,
starts, axes);
}
}
if (channel_last && compute_format == DataLayout::kNCHW) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_input_grad_channel, input_grad);
}
}
// filter_grad does not use inplace addto.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
handle, &alpha, args2.odesc.desc(), output_grad_data,
args2.idesc.desc(), input_data, args2.cdesc.desc(),
filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
cudnn_workspace_ptr, workspace_size_w));
},
workspace_size_w);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, args2.idesc.desc(),
input_data + i * group_offset_in, args2.odesc.desc(),
output_grad_data + i * group_offset_out,
args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
workspace_size_w, &beta_filter, args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size_w);
}
#endif
if (compute_format == DataLayout::kNHWC) {
TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_filter_grad_channel, filter_grad);
}
}
}
};
/*
* Inputs: I, W, dO, ddI, ddW
* Outputs: ddO, dW, dI
* ddo = conv(ddI, W) + conv(I, ddW)
* dW = conv_bp_filter(ddI, dO)
* dI = conv_bp_data(ddW, dO)
*/
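// In the CUDA path below, conv(ddI, W) is written into ddO first (output blend
// factor beta = 0) and conv(I, ddW) is then accumulated on top of it: the
// second cudnnConvolutionForward call passes alpha in the beta position.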
template <typename T>
class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace."));
auto X = ctx.Input<Tensor>("Input");
auto W = ctx.Input<Tensor>("Filter");
auto dO = ctx.Input<Tensor>("DOutput");
auto ddX = ctx.Input<Tensor>("DDInput");
auto ddW = ctx.Input<Tensor>("DDFilter");
auto ddO = ctx.Output<Tensor>("DDOutput");
auto dW = ctx.Output<Tensor>("DFilter");
auto dX = ctx.Output<Tensor>("DInput");
if (ddO) {
ddO->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero;
set_zero(dev_ctx, ddO, static_cast<T>(0));
}
if (dW) {
dW->mutable_data<T>(ctx.GetPlace());
}
if (dX) {
dX->mutable_data<T>(ctx.GetPlace());
}
// const T* x = X->data<T>();
const T* dy = dO->data<T>();
const T* w = W->data<T>();
const T* ddx = nullptr;
const T* ddw = nullptr;
T *dw, *dx, *ddy;
dw = dx = ddy = nullptr;
T* transformed_dx = nullptr;
const std::vector<int>& strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool exhaustive_search =
FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") &&
ctx.Attr<bool>("exhaustive_search"));
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic, false,
platform::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensors to channel first-----------
Tensor transformed_X_channel(X->type());
Tensor transformed_dO_channel(dO->type());
Tensor transformed_ddX_channel(X->type());
Tensor transformed_ddO_channel(dO->type());
Tensor transformed_dX_channel(X->type());
if (channel_last) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, X, &transformed_X_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, X, &transformed_X_channel);
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dO, &transformed_dO_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dO, &transformed_dO_channel);
if (ddX) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddX, &transformed_ddX_channel);
TransToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddX, &transformed_ddX_channel);
}
if (ddO) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, ddO, &transformed_ddO_channel);
}
if (dX) {
ResizeToChannelFirst<platform::CUDADeviceContext, T>(
ctx, dX, &transformed_dX_channel);
transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
}
} else {
transformed_X_channel = *X;
transformed_dO_channel = *dO;
if (ddX) {
transformed_ddX_channel = *ddX;
}
if (ddO) {
transformed_ddO_channel.ShareDataWith(*ddO);
}
if (dX) {
transformed_dX_channel.ShareDataWith(*dX);
}
}
auto in_dims = transformed_X_channel.dims();
auto filter_dims = W->dims();
framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_X(X->type());
Tensor transformed_ddX(X->type());
Tensor transformed_dX(X->type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(X->dims().size() * 2, 0);
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_X_channel.dims()[0];
new_input_shape_vec[1] = transformed_X_channel.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_X_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
}
framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec));
transformed_X.Resize(new_input_shape);
transformed_ddX.Resize(new_input_shape);
transformed_dX.Resize(new_input_shape);
transformed_X =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
if (ddX) {
transformed_ddX =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
}
if (dX) {
transformed_dX =
ctx.AllocateTmpTensor<T, paddle::platform::CUDADeviceContext>(
new_input_shape, dev_ctx);
}
// pad for input
const int rank = X->dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 4>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
case 5: {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_X_channel, pad_value,
&transformed_X);
if (ddX) {
phi::funcs::PadFunction<paddle::platform::CUDADeviceContext, T, 5>(
dev_ctx, input_pad, transformed_ddX_channel, pad_value,
&transformed_ddX);
}
} break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_X.ShareDataWith(transformed_X_channel);
if (ddX) {
transformed_ddX.ShareDataWith(transformed_ddX_channel);
}
if (dX) {
transformed_dX.ShareDataWith(transformed_dX_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* x = transformed_X.data<T>();
int iwo_group = groups;
int c_group = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_group = 1;
c_group = groups;
groups = 1;
#endif
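// As in the grad kernel, groups are handled through the convolution descriptor
// (c_group), so the descriptors use iwo_group = 1 and the group loops run once.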
auto dtype = platform::CudnnDataType<T>::type;
auto handle = dev_ctx.cudnn_handle();
ConvArgs args1{&transformed_ddX,
W,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype};
ConvArgs args2{
&transformed_X, ddW, &transformed_ddO_channel, strides, padding_common,
dilations, dtype};
ConvArgs args3{&transformed_ddX,
dW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype};
ConvArgs args4{
&transformed_dX, ddW, &transformed_dO_channel, strides, padding_common,
dilations, dtype};
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t fwd_algo1 =
static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvFwdAlgorithm_t fwd_algo2 =
static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionFwdAlgo_t fwd_algo1 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionFwdAlgo_t fwd_algo2 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
auto layout = GetCudnnTensorFormat(DataLayout::kNCHW);
// ddo = conv(ddI, W) + conv(I, ddW)
size_t workspace_size = 0;
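// A single workspace, sized to the maximum requirement of all the algorithms
// searched above, is reused for every cuDNN call in this kernel.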
T* transformed_ddy_channel = nullptr;
if (ddO) {
ddy = ddO->data<T>();
transformed_ddy_channel = transformed_ddO_channel.data<T>();
if (ddX) {
args1.handle = handle;
args1.idesc.set(transformed_ddX, iwo_group);
args1.wdesc.set(*W, layout, iwo_group);
args1.odesc.set(transformed_ddO_channel, iwo_group);
args1.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false,
workspace_size, ctx);
#else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
#endif
}
if (ddW) {
ddw = ddW->data<T>();
args2.handle = handle;
args2.idesc.set(transformed_X, iwo_group);
args2.wdesc.set(*ddW, layout, iwo_group);
args2.odesc.set(transformed_ddO_channel, iwo_group);
args2.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false,
workspace_size, ctx);
#else
using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, fwd_algo2));
#endif
}
}
if (dW && ddX) {
dw = dW->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_ddX, iwo_group);
args3.wdesc.set(*dW, layout, iwo_group);
args3.odesc.set(transformed_dO_channel, iwo_group);
args3.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = search3::Find<T>(args3, exhaustive_search, deterministic,
workspace_size, ctx);
#else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
workspace_size = std::max(workspace_size,
search3::GetWorkspaceSize(args3, filter_algo));
#endif
}
if (ddW && dX) {
transformed_dx = transformed_dX.data<T>();
args4.handle = handle;
args4.idesc.set(transformed_dX, iwo_group);
args4.wdesc.set(*ddW, layout, iwo_group);
args4.odesc.set(transformed_dO_channel, iwo_group);
args4.cdesc.set(dtype, padding_common, strides, dilations,
platform::AllowTF32Cudnn(), c_group);
#ifdef PADDLE_WITH_HIP
using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = search4::Find<T>(args4, exhaustive_search, deterministic,
workspace_size, ctx);
#else
using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo =
search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif
}
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h,
&i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d,
&o_h, &o_w);
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
if (ddO) {
if (ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args1.idesc.desc(), ddx,
args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1,
&beta, args1.odesc.desc(), transformed_ddy_channel,
workspace_ptr, workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
handle, &alpha, args1.idesc.desc(),
ddx + i * group_offset_in, args1.wdesc.desc(),
w + i * group_offset_filter, args1.cdesc.desc(),
fwd_algo1, workspace_ptr, workspace_size, &beta,
args1.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (ddW) {
#ifdef PADDLE_WITH_HIP
// MIOPEN ONLY support beta to be 0.0f
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionForward(
handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(),
ddw, args2.cdesc.desc(), fwd_algo2, &beta,
args2.odesc.desc(), transformed_ddy_channel,
workspace_ptr, workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionForward(
handle, &alpha, args2.idesc.desc(),
x + i * group_offset_in, args2.wdesc.desc(),
ddw + i * group_offset_filter, args2.cdesc.desc(),
fwd_algo2, workspace_ptr, workspace_size, &alpha,
args2.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (channel_last) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_ddO_channel, ddO);
}
}
T* transformed_dy_channel = transformed_dO_channel.data<T>();
if (dW && ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardWeights(
handle, &alpha, args3.odesc.desc(), transformed_dy_channel,
args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo,
&beta, args3.wdesc.desc(), dw, workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, args3.idesc.desc(),
ddx + i * group_offset_in, args3.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args3.cdesc.desc(), filter_algo, workspace_ptr,
workspace_size, &beta, args3.wdesc.desc(),
dw + i * group_offset_filter));
},
workspace_size);
}
#endif
}
if (dX && ddW) {
ddw = ddW->data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::miopenConvolutionBackwardData(
handle, &alpha, args4.odesc.desc(), transformed_dy_channel,
args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo,
&beta, args4.idesc.desc(), transformed_dx, workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, args4.wdesc.desc(),
ddw + i * group_offset_filter, args4.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args4.cdesc.desc(), data_algo, workspace_ptr,
workspace_size, &beta, args4.idesc.desc(),
transformed_dx + i * group_offset_in));
},
workspace_size);
}
#endif
if (!is_sys_pad) {
// reverse padded input
std::vector<int> starts(X->dims().size(), 0);
std::vector<int> axes(X->dims().size(), 0);
for (size_t i = 0; i < X->dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
if (X->dims().size() == 4) {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 4>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
} else {
RemovePaddingSlice<paddle::platform::CUDADeviceContext, T, 5>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
}
}
if (channel_last) {
TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
ctx, &transformed_dX_channel, dX);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN does not support double
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
// ROCm limits the number of threads in depthwise_conv.cu, which will result in accuracy issues.
// Use depthwise_conv2d in MIOPEN to resolve this issue
REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>);
REGISTER_OP_KERNEL(
conv3d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>,
paddle::operators::CUDNNConvOpKernel<plat::float16>,
paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>,
paddle::operators::CUDNNConvGradOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);
#else
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
#endif
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>);
REGISTER_OP_KERNEL(
conv3d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
#endif
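// --- Standalone sketch (not part of this diff) of the version gating used in
// the registration block above: bfloat16 kernels are only listed when the
// cuDNN headers report 8.1+. CUDNN_VERSION normally comes from cudnn.h and the
// CUDNN_VERSION_MIN macro from Paddle's cudnn helpers; both are stubbed here
// so the example compiles on its own.
#include <cstdio>
#ifndef CUDNN_VERSION
#define CUDNN_VERSION 8201  // stand-in value for illustration
#endif
#ifndef CUDNN_VERSION_MIN
#define CUDNN_VERSION_MIN(major, minor, patch) \
  (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
#endif
int main() {
#if CUDNN_VERSION_MIN(8, 1, 0)
  std::printf("register conv2d for float, double, float16, bfloat16\n");
#else
  std::printf("register conv2d for float, double, float16\n");
#endif
  return 0;
}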
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, ...@@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims,
} }
template <typename DeviceContext, typename T, size_t D> template <typename DeviceContext, typename T, size_t D>
static void RemovePaddingSlice(const framework::ExecutionContext& context, static void RemovePaddingSlice(const phi::GPUContext& context,
const Tensor* input, Tensor* out, const Tensor* input, Tensor* out,
const std::vector<int>& starts, const std::vector<int>& starts,
const std::vector<int>& axes) { const std::vector<int>& axes) {
auto& place = auto& place = *context.eigen_device();
*context.template device_context<DeviceContext>().eigen_device();
auto in_dims = input->dims(); auto in_dims = input->dims();
auto new_out_dims = out->dims(); auto new_out_dims = out->dims();
auto offsets = Eigen::array<int, D>(); auto offsets = Eigen::array<int, D>();
...@@ -128,11 +128,10 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> { ...@@ -128,11 +128,10 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size, bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) { const phi::GPUContext& ctx) {
algo_t algo; algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto workspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int find_count; int find_count;
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
...@@ -170,11 +169,10 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> { ...@@ -170,11 +169,10 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size, bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) { const phi::GPUContext& ctx) {
algo_t algo; algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto workspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int find_count; int find_count;
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
...@@ -212,11 +210,10 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> { ...@@ -212,11 +210,10 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static algo_t Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, size_t workspace_size, bool deterministic, size_t workspace_size,
const framework::ExecutionContext& ctx) { const phi::GPUContext& ctx) {
algo_t algo; algo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto workspace_handle = ctx.cudnn_workspace_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
int find_count; int find_count;
miopenConvAlgoPerf_t find_result; miopenConvAlgoPerf_t find_result;
......
...@@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(input_data_type),
paddle::framework::DataTypeToString(filter_data_type))); paddle::framework::DataTypeToString(filter_data_type)));
} }
#ifndef PADDLE_WITH_ASCEND_CL // #ifndef PADDLE_WITH_ASCEND_CL
if (input_data_type == framework::proto::VarType::FP16) { // if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ( // PADDLE_ENFORCE_EQ(
library, framework::LibraryType::kCUDNN, // library, framework::LibraryType::kCUDNN,
platform::errors::InvalidArgument( // platform::errors::InvalidArgument(
"float16 can only be used when CUDNN or NPU is used")); // "float16 can only be used when CUDNN or NPU is used"));
} // }
#endif // #endif
#if PADDLE_WITH_CUDA #if PADDLE_WITH_CUDA
if (input_data_type == framework::proto::VarType::BF16 && if (input_data_type == framework::proto::VarType::BF16 &&
library == framework::LibraryType::kCUDNN) { library == framework::LibraryType::kCUDNN) {
...@@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ...@@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad,
ops::Conv3DDoubleGradMaker<paddle::imperative::OpBase>); ops::Conv3DDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad);
// depthwise conv kernel
// TODO(xingzhaolong): neon kernel for mobile
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
depthwise_conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad_grad,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad_grad,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::GemmConvDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_VERSION(conv2d) REGISTER_OP_VERSION(conv2d)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv2d_grad,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
conv3d_grad,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -21,7 +21,6 @@ limitations under the License. */ ...@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/layout_utils.h" #include "paddle/fluid/operators/layout_utils.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
...@@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { ...@@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override; const framework::ExecutionContext& ctx) const override;
}; };
template <typename DeviceContext, typename T>
class GemmConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
// The filter will be reshaped in the calculations,
// so a copy is made here
// to avoid modifying the variable in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
const int groups = context.Attr<int>("groups");
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
Tensor transformed_input(input->dtype());
Tensor transformed_output(output->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input,
&transformed_input);
TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
ResizeToChannelFirst<DeviceContext, T>(context, output,
&transformed_output);
} else {
transformed_input = *input;
transformed_output = *output;
}
// update padding and dilation
auto trans_in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims =
phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
auto& dev_ctx = context.template device_context<DeviceContext>();
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
// filter_shape_vec:
// {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
// output_shape_vec:
// {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
phi::vectorize(transformed_output.dims()));
// use col_shape in the im2col calculation
// col_shape_vec:
// {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w,
// o_d,o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = trans_in_dims[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
// o_w)
framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim in_matrix_shape = phi::slice_ddim(
transformed_input.dims(), 1, transformed_input.dims().size());
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
transformed_output.dims()[1],
transformed_output.numel() /
(transformed_output.dims()[0] * transformed_output.dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
for (int i = 0; i < batch_size; i++) {
Tensor in_batch =
transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
Tensor out_batch =
transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice,
T(0.0));
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(context, &transformed_output,
output);
}
}
};
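// --- Standalone shape walk-through (illustrative numbers only, no Paddle
// dependencies) of the im2col + GEMM scheme in GemmConvKernel above. When the
// filter is 1x1 with stride 1 and no padding, IsExpand is false and im2col is
// skipped: col simply aliases the input slice.
#include <cstdio>
int main() {
  // Input N=1, C_in=4, H=W=5; filter C_out=8, C_in/groups=2, k=3; groups=2,
  // stride=1, pad=0, dilation=1  =>  output 1 x 8 x 3 x 3.
  const int groups = 2, c_in = 4, c_out = 8, k = 3, o = 3;
  // col buffer per group: {C_in/g, k_h, k_w, o_h, o_w} = {2, 3, 3, 3, 3},
  // flattened for GEMM to (C_in/g * k_h * k_w, o_h * o_w) = (18, 9).
  const int col_rows = (c_in / groups) * k * k;  // 18
  const int col_cols = o * o;                    // 9
  // filter slice per group: (C_out/g, C_in/g * k_h * k_w) = (4, 18).
  const int m = c_out / groups, kk = col_rows, n = col_cols;
  std::printf("GEMM per group: (%d x %d) * (%d x %d) -> (%d x %d)\n",
              m, kk, kk, n, m, n);  // (4 x 18) * (18 x 9) -> (4 x 9)
  return 0;
}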
template <typename DeviceContext, typename T>
class GemmConvGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
// The filter and filter_grad will be reshaped in the calculations,
// so copies are made here
// to avoid modifying the variables in the Scope.
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
int groups = context.Attr<int>("groups");
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
Tensor transformed_input(input->dtype());
Tensor transformed_output_grad(output_grad->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input,
&transformed_input);
TransToChannelFirst<DeviceContext, T>(context, input, &transformed_input);
ResizeToChannelFirst<DeviceContext, T>(context, output_grad,
&transformed_output_grad);
TransToChannelFirst<DeviceContext, T>(context, output_grad,
&transformed_output_grad);
} else {
transformed_input = *input;
transformed_output_grad = *output_grad;
}
// update padding and dilation
auto in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
auto& dev_ctx = context.template device_context<DeviceContext>();
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
// output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
phi::vectorize(transformed_output_grad.dims()));
// use col_shape in the im2col calculation
// col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
// o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = transformed_input.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (i_c/g * k_h * k_w, o_h * o_w)
// or
// (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
framework::DDim col_matrix_shape =
phi::flatten_to_2d(col_shape, data_dim + 1);
framework::DDim input_shape = phi::slice_ddim(
transformed_input.dims(), 1, transformed_input.dims().size());
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
transformed_output_grad.dims()[1],
transformed_output_grad.numel() / (transformed_output_grad.dims()[0] *
transformed_output_grad.dims()[1])};
// convolution backward input operator: gemm + col2im(or col2vol)
// convolution backward weight operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
if (is_expand) {
col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
Tensor transformed_input_grad(input_grad->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(context, input_grad,
&transformed_input_grad);
} else {
transformed_input_grad = *input_grad;
}
// if is_expand is false, the operation of set_zero is unnecessary,
// because math::matmul will reset input_grad.
if (is_expand) {
set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
}
math::Col2VolFunctor<DeviceContext, T> col2vol;
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch =
transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(in_grad_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0),
&col_matrix, T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx, col, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&in_grad_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
}
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(context, &transformed_input_grad,
input_grad);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor filter_grad_slice =
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0),
&filter_grad_slice, T(1.0));
}
}
}
}
};
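// --- Standalone sketch (illustrative only) of the two GEMMs performed per
// group in GemmConvGradKernel above, reusing the sizes from the forward
// example: m = C_out/g = 4, k = C_in/g * k_h * k_w = 18, n = o_h * o_w = 9.
#include <cstdio>
int main() {
  const int m = 4, k = 18, n = 9;
  // dX path:  filter_slice^T (k x m) * dY_slice (m x n) -> col (k x n),
  //           then col2im scatters col back onto the input window.
  std::printf("dX GEMM: (%d x %d) * (%d x %d) -> (%d x %d), then col2im\n",
              k, m, m, n, k, n);
  // dW path:  dY_slice (m x n) * col^T (n x k) -> dW_slice (m x k),
  //           accumulated over the batch (hence beta = 1.0 in that MatMul).
  std::printf("dW GEMM: (%d x %d) * (%d x %d) -> (%d x %d), accumulated\n",
              m, n, n, k, m, k);
  return 0;
}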
template <typename DeviceContext, typename T>
class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_cpu_place(ctx.GetPlace()), true,
paddle::platform::errors::PreconditionNotMet("It must use CPUPlace."));
const Tensor* X = ctx.Input<Tensor>("Input");
const Tensor* dY = ctx.Input<Tensor>("DOutput");
const Tensor* ddX = ctx.Input<Tensor>("DDInput");
const Tensor* ddW_in = ctx.Input<Tensor>("DDFilter");
Tensor* ddY = ctx.Output<Tensor>("DDOutput");
Tensor* dW = ctx.Output<Tensor>("DFilter");
Tensor* dX = ctx.Output<Tensor>("DInput");
Tensor W = GET_DATA_SAFELY(ctx.Input<Tensor>("Filter"), "Input", "Filter",
"GemmConvDoubleGrad");
if (!ddY && !dW && !dX) return;
const int groups = ctx.Attr<int>("groups");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensor
Tensor transformed_X(X->dtype());
Tensor transformed_dY(dY->dtype());
Tensor transformed_ddX(X->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(ctx, X, &transformed_X);
TransToChannelFirst<DeviceContext, T>(ctx, X, &transformed_X);
ResizeToChannelFirst<DeviceContext, T>(ctx, dY, &transformed_dY);
TransToChannelFirst<DeviceContext, T>(ctx, dY, &transformed_dY);
if (ddX) {
ResizeToChannelFirst<DeviceContext, T>(ctx, ddX, &transformed_ddX);
TransToChannelFirst<DeviceContext, T>(ctx, ddX, &transformed_ddX);
}
} else {
transformed_X = *X;
transformed_dY = *dY;
if (ddX) {
transformed_ddX = *ddX;
}
}
// update padding and dilation
auto in_dims = transformed_X.dims();
auto filter_dims = W.dims();
framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_X.dims()[0]);
std::vector<int64_t> filter_shape_vec(phi::vectorize(W.dims()));
std::vector<int64_t> output_shape_vec(
phi::vectorize(transformed_dY.dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
// col_shape [in_channel/group, kh, kw, oh, ow]
col_shape_vec[0] = transformed_X.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
}
framework::DDim col_shape(phi::make_ddim(col_shape_vec));
// col_matrix_shape [in_channel/group * kh * kw, oh * ow]
framework::DDim col_matrix_shape =
phi::flatten_to_2d(col_shape, data_dim + 1);
// input_shape [Cin, H, W]
framework::DDim input_shape =
phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
// filter_matrix_shape [Cout, Cin * kh * kw]
framework::DDim filter_matrix_shape = {W.dims()[0],
W.numel() / W.dims()[0]};
W.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
transformed_dY.dims()[1],
transformed_dY.numel() /
(transformed_dY.dims()[0] * transformed_dY.dims()[1])};
int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col = ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// dx convolution double grad: gemm + col2im(col2vol)
// dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
// oH, oW)
if (dX && ddW_in) {
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
dX->mutable_data<T>(ctx.GetPlace());
Tensor transformed_dX(dX->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(ctx, dX, &transformed_dX);
} else {
transformed_dX = *dX;
}
// if is_expand is false, the operation of set_zero is unnecessary
// because math::matmul will reset dx
if (is_expand) {
set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
}
math::Col2VolFunctor<DeviceContext, T> col2vol;
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
for (int i = 0; i < batch_size; i++) {
Tensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(dx_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix,
T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx, col, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&dx_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
}
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(ctx, &transformed_dX, dX);
}
}
// dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
// oH, oW)
// dw convolution double grad: im2col(vol2col) + gemm
if (dW && ddX) {
dW->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, dW, static_cast<T>(0));
Tensor dW_arr = *dW;
dW_arr.Resize(filter_matrix_shape);
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
Tensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; ++g) {
// im2col
Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, ddx_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice,
T(1.0));
}
}
}
// ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
// w/ddw(Cout, Cin, kh, kw)
// ddy convolution double grad: im2col(vol2col) + gemm
if (ddY) {
ddY->mutable_data<T>(ctx.GetPlace());
Tensor transformed_ddY(ddY->dtype());
if (channel_last) {
ResizeToChannelFirst<DeviceContext, T>(ctx, ddY, &transformed_ddY);
} else {
transformed_ddY = *ddY;
}
set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
Tensor ddy_batch =
transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; ++g) {
// gemm
Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step);
if (ddX) {
Tensor ddx_batch =
transformed_ddX.Slice(i, i + 1).Resize(input_shape);
Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, ddx_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice,
T(0.0));
}
if (ddW_in) {
Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape);
Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
if (!is_expand) {
col.ShareDataWith(x_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx, x_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1],
paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice,
T(1.0));
}
}
}
if (channel_last) {
TransToChannelLast<DeviceContext, T>(ctx, &transformed_ddY, ddY);
}
}
}
};
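// --- Standalone 1-D check (illustrative values, no framework code) of the
// double-grad identity in the comments above: conv(X, W) is bilinear, so its
// forward second-order derivative is ddY = conv(ddX, W) + conv(X, ddW), which
// a finite difference along the (ddX, ddW) direction reproduces.
#include <cstdio>
#include <vector>
// valid 1-D convolution (cross-correlation), stride 1, no padding
static std::vector<double> conv1d(const std::vector<double>& x,
                                  const std::vector<double>& w) {
  std::vector<double> y(x.size() - w.size() + 1, 0.0);
  for (size_t i = 0; i < y.size(); ++i)
    for (size_t j = 0; j < w.size(); ++j) y[i] += x[i + j] * w[j];
  return y;
}
int main() {
  std::vector<double> x = {1, 2, 3, 4}, w = {0.5, -1.0};
  std::vector<double> ddx = {0.1, 0.2, 0.3, 0.4}, ddw = {2.0, 1.0};
  const double eps = 1e-3;
  std::vector<double> xp(x), wp(w);
  for (size_t i = 0; i < x.size(); ++i) xp[i] += eps * ddx[i];
  for (size_t j = 0; j < w.size(); ++j) wp[j] += eps * ddw[j];
  std::vector<double> y0 = conv1d(x, w), y1 = conv1d(xp, wp);
  std::vector<double> lhs = conv1d(ddx, w), rhs = conv1d(x, ddw);
  for (size_t i = 0; i < y0.size(); ++i) {
    double fd = (y1[i] - y0[i]) / eps;   // numeric directional derivative
    double analytic = lhs[i] + rhs[i];   // conv(ddX, W) + conv(X, ddW)
    std::printf("ddy[%zu]: fd=%.4f analytic=%.4f\n", i, fd, analytic);
  }
  return 0;
}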
template <typename DeviceContext, typename T>
class DepthwiseConvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
const std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
if (channel_last) {
PADDLE_ENFORCE_EQ(
output->dims()[output->dims().size() - 1] %
input->dims()[input->dims().size() - 1],
0, platform::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[output->dims().size() - 1],
input->dims()[input->dims().size() - 1]));
} else {
PADDLE_ENFORCE_EQ(
output->dims()[1] % input->dims()[1], 0,
platform::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[1], input->dims()[1]));
}
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_format);
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
auto& dev_ctx = context.template device_context<DeviceContext>();
if (fuse_relu) {
math::DepthwiseConvFunctor<DeviceContext, T, true> depthwiseConv;
depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
output, data_layout);
} else {
math::DepthwiseConvFunctor<DeviceContext, T, false> depthwiseConv;
depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
output, data_layout);
}
}
};
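// --- Standalone illustration (not part of this diff) of the padding
// normalization loop in DepthwiseConvKernel above: when the canonical
// {front, back} pair per spatial dimension is present (4 entries for 2-D),
// every second entry is erased so that one value per dimension remains.
#include <cstdio>
#include <vector>
int main() {
  std::vector<int> strides = {1, 1};
  std::vector<int> paddings = {1, 1, 2, 2};  // {top, bottom, left, right}
  bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
  if (!is_sys_pad) {
    for (size_t i = 0; i < strides.size(); ++i) {
      paddings.erase(paddings.begin() + i + 1);
    }
  }
  // paddings is now {1, 2}: one padding value per spatial dimension.
  std::printf("paddings after erase: {%d, %d}\n", paddings[0], paddings[1]);
  return 0;
}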
template <typename DeviceContext, typename T>
class DepthwiseConvGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
const std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
const std::string data_format = context.Attr<std::string>("data_format");
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_format);
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
phi::funcs::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = context.template device_context<DeviceContext>();
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (fuse_relu) {
math::DepthwiseConvInputGradFunctor<DeviceContext, T, true>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, dilations, input_grad, data_layout);
} else {
math::DepthwiseConvInputGradFunctor<DeviceContext, T, false>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
paddings, dilations, input_grad, data_layout);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
if (fuse_relu) {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T, true>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
paddings, dilations, filter_grad, data_layout);
} else {
math::DepthwiseConvFilterGradFunctor<DeviceContext, T, false>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
paddings, dilations, filter_grad, data_layout);
}
}
}
};
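// --- Standalone 1-D sketch (illustrative sizes, no framework code) of what
// "depthwise" means in the kernels above: with groups == C_in, channel c of
// the output depends only on channel c of the input and its own filter row.
#include <cstdio>
int main() {
  const int C = 2, W = 5, K = 3, OW = W - K + 1;  // stride 1, no padding
  double x[C][W] = {{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}};
  double f[C][K] = {{1, 0, -1}, {0.5, 0.5, 0.5}};
  double y[C][OW] = {};
  for (int c = 0; c < C; ++c)  // one independent convolution per channel
    for (int ow = 0; ow < OW; ++ow)
      for (int k = 0; k < K; ++k) y[c][ow] += x[c][ow + k] * f[c][k];
  for (int c = 0; c < C; ++c)
    std::printf("y[%d] = {%g, %g, %g}\n", c, y[c][0], y[c][1], y[c][2]);
  return 0;
}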
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -244,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
algo = search::Find<T>(args, false, deterministic, workspace_size, ctx); algo = search::Find<T>(
args, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
algo = search::Find<T>(args, false, deterministic, ctx); algo = search::Find<T>(
args, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, algo)); std::max(workspace_size, search::GetWorkspaceSize(args, algo));
#endif #endif
...@@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -501,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1)); std::max(workspace_size, search1::GetWorkspaceSize(args1));
data_algo = data_algo = search1::Find<T>(
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search1::Find<T>(args1, false, deterministic, ctx); data_algo = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = workspace_size =
std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
#endif #endif
...@@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -523,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2)); std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_algo = filter_algo = search2::Find<T>(
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search2::Find<T>(args2, false, deterministic, ctx); filter_algo = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size, workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, filter_algo)); search2::GetWorkspaceSize(args2, filter_algo));
#endif #endif
...@@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -944,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1); workspace_size = search1::GetWorkspaceSize(args1);
bwd_algo1 = bwd_algo1 = search1::Find<T>(
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); args1, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx); bwd_algo1 = search1::Find<T>(
args1, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
#endif #endif
} }
...@@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -965,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2)); std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_algo2 = bwd_algo2 = search2::Find<T>(
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); args2, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx); bwd_algo2 = search2::Find<T>(
args2, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size, workspace_size = std::max(workspace_size,
search2::GetWorkspaceSize(args2, bwd_algo2)); search2::GetWorkspaceSize(args2, bwd_algo2));
#endif #endif
...@@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -990,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3)); std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = filter_algo = search3::Find<T>(
search3::Find<T>(args3, false, deterministic, workspace_size, ctx); args3, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search3::Find<T>(args3, false, deterministic, ctx); filter_algo = search3::Find<T>(
args3, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = std::max(workspace_size, workspace_size = std::max(workspace_size,
search3::GetWorkspaceSize(args3, filter_algo)); search3::GetWorkspaceSize(args3, filter_algo));
#endif #endif
...@@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -1013,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4)); std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = data_algo = search4::Find<T>(
search4::Find<T>(args4, false, deterministic, workspace_size, ctx); args4, false, deterministic, workspace_size,
ctx.template device_context<platform::CUDADeviceContext>());
#else #else
using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search4::Find<T>(args4, false, deterministic, ctx); data_algo = search4::Find<T>(
args4, false, deterministic,
ctx.template device_context<platform::CUDADeviceContext>());
workspace_size = workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif #endif
......
...@@ -13,10 +13,150 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,150 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext; using CUDA = paddle::platform::CUDADeviceContext;
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(
groups, filter.dims()[0],
platform::errors::InvalidArgument(
"groups should be error to the 1st dimension of filter. But "
"received groups is %d and filter dimension[0] is %d",
groups, filter.dims()[0]));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
"dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<phi::GPUContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output, filter, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, output, data_layout);
}
};
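// --- Standalone sketch of what the "SAME" branch of UpdatePaddingAndDilation
// (called above) is expected to produce. The real helper lives elsewhere in
// Paddle; this follows the usual SAME-padding formula and is an assumption
// for illustration only.
#include <algorithm>
#include <cstdio>
int main() {
  const int in = 5, k = 3, stride = 2, dilation = 1;
  const int out = (in + stride - 1) / stride;  // ceil(in / stride) = 3
  const int pad_total =
      std::max((out - 1) * stride + (k - 1) * dilation + 1 - in, 0);  // 2
  const int pad_front = pad_total / 2, pad_back = pad_total - pad_front;
  std::printf("SAME padding: front=%d back=%d (output length %d)\n",
              pad_front, pad_back, out);
  return 0;
}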
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
if (input_grad) {
math::DepthwiseConvFunctor<phi::GPUContext, T> depthwiseConv;
depthwiseConv(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, filter, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, input_grad, data_layout);
}
if (filter_grad) {
phi::funcs::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<phi::GPUContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
static_cast<const typename framework::ConvertToPhiContext<
DeviceContext>::TYPE&>(dev_ctx),
*output_grad, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, filter_grad, data_layout);
}
}
};
} // namespace operators
} // namespace paddle
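// --- Standalone 1-D sketch (illustrative only) of why the transpose kernels
// above call the *input grad* functor of the forward convolution: transposed
// convolution scatters every input element through the filter, which is
// exactly the adjoint of the forward conv.
#include <cstdio>
int main() {
  const int IW = 3, K = 2, OW = IW + K - 1;  // stride 1, no padding
  double in[IW] = {1, 2, 3};
  double w[K] = {10, 1};
  double out[OW] = {};
  for (int i = 0; i < IW; ++i)  // scatter-add: adjoint of the forward conv
    for (int k = 0; k < K; ++k) out[i + k] += in[i] * w[k];
  std::printf("conv_transpose out = {%g, %g, %g, %g}\n",
              out[0], out[1], out[2], out[3]);  // {10, 21, 32, 3}
  return 0;
}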
// conv2d // conv2d
REGISTER_OP_CUDA_KERNEL(conv2d_transpose, REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
ops::GemmConvTransposeKernel<CUDA, float>, ops::GemmConvTransposeKernel<CUDA, float>,
......
...@@ -21,7 +21,6 @@ limitations under the License. */ ...@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
...@@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
} }
}; };
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
PADDLE_ENFORCE_EQ(
groups, filter.dims()[0],
platform::errors::InvalidArgument(
"groups should be error to the 1st dimension of filter. But "
"received groups is %d and filter dimension[0] is %d",
groups, filter.dims()[0]));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
for (auto v : dilations) {
PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument(
"dilations should be 1 in depthwise conv. "
"But received dilations is %d",
v));
}
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
output->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
phi::funcs::SetConstant<DeviceContext, T> set_zero;
set_zero(dev_ctx, output, static_cast<T>(0));
math::DepthwiseConvInputGradFunctor<DeviceContext, T>
depthwiseConvInputGrad;
depthwiseConvInputGrad(
dev_ctx, *output, filter, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, output, data_layout);
}
};
template <typename DeviceContext, typename T>
class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const std::string data_layout_str =
context.Attr<std::string>("data_format");
const framework::DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
Tensor filter = *context.Input<Tensor>("Filter");
if (!input_grad && !filter_grad) return;
auto& dev_ctx = context.template device_context<DeviceContext>();
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
std::string padding_algorithm =
context.Attr<std::string>("padding_algorithm");
auto in_dims = input->dims();
auto filter_dims = filter.dims();
framework::DDim in_data_dims;
if (data_layout != framework::DataLayout::kNHWC) {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
in_data_dims, strides, ksize);
if (input_grad) {
math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
depthwiseConv(
dev_ctx, *output_grad, filter, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, input_grad, data_layout);
}
if (filter_grad) {
phi::funcs::SetConstant<DeviceContext, T> set_zero;
filter_grad->mutable_data<T>(context.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(
dev_ctx, *output_grad, *input, strides,
std::vector<int>{paddings[0], paddings[2], paddings[1], paddings[3]},
dilations, filter_grad, data_layout);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace framework = paddle::framework;
...@@ -29,10 +30,10 @@ namespace platform = paddle::platform;
namespace op = paddle::operators;
using Tensor = paddle::framework::Tensor;
USE_OP(conv2d);
USE_OP_ITSELF(conv2d);
USE_OP(conv2d_grad);
USE_OP_ITSELF(conv2d_grad);
USE_OP_DEVICE_KERNEL(conv2d, CUDNN);
PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT);
USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN);
PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT);
template <typename T>
void InitRandomTensor(const std::vector<int64_t> &dims,
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/*
* \brief Compute the depthwise convolution which include
* forward process and backpropagation process
*/
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations, framework::Tensor* output,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvInputGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext, typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFilterGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
...@@ -141,6 +143,116 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
  }
};
template <class T>
class Vol2ColFunctor<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context, const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(col->dims().size(), 7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]);
int filter_depth = col->dims()[1];
int filter_height = col->dims()[2];
int filter_width = col->dims()[3];
int output_depth = col->dims()[4];
int output_height = col->dims()[5];
int output_width = col->dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
// changed
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int c_in = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
int col_idx =
((c * output_depth + d) * output_height + h) * output_width + w;
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
c_in;
}
col_data[col_idx] =
(h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
? static_cast<T>(0)
: vol_data[vol_idx];
}
}
}
}
}
};
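The three PADDLE_ENFORCE_EQ checks above all rely on the usual convolution output-extent formula. A small standalone sketch (not part of the diff) of that arithmetic for one spatial dimension:

// Standalone sketch: the output-extent formula behind the input/output
// consistency checks in Vol2ColFunctor and Col2VolFunctor.
#include <cassert>

int ConvOutputSize(int input, int pad_before, int pad_after, int dilation,
                   int filter, int stride) {
  // effective filter extent once dilation is applied
  int dkernel = dilation * (filter - 1) + 1;
  return (input + pad_before + pad_after - dkernel) / stride + 1;
}

int main() {
  // depth 5, symmetric padding 1, no dilation, filter 3, stride 1 -> 5
  assert(ConvOutputSize(5, 1, 1, /*dilation=*/1, /*filter=*/3, /*stride=*/1) == 5);
  // stride 2 roughly halves the extent: (5 + 2 - 3) / 2 + 1 = 3
  assert(ConvOutputSize(5, 1, 1, 1, 3, 2) == 3);
  return 0;
}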
/*
 * vol = [input_channels, input_depth, input_height, input_width]
 * col =
...@@ -258,10 +370,125 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
  }
};
template <class T>
class Col2VolFunctor<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context, const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(col.dims().size(), 7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]);
int filter_depth = col.dims()[1];
int filter_height = col.dims()[2];
int filter_width = col.dims()[3];
int output_depth = col.dims()[4];
int output_height = col.dims()[5];
int output_width = col.dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp, output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp, output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp, output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp, output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int cIm = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx =
((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
cIm;
}
int col_idx =
((c * output_depth + d) * output_height + h) * output_width +
w;
vol_data[vol_idx] += col_data[col_idx];
}
}
}
}
}
}
};
template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
template class Vol2ColFunctor<phi::CPUContext, float>;
template class Vol2ColFunctor<phi::CPUContext, double>;
template class Col2VolFunctor<platform::CPUDeviceContext, float>;
template class Col2VolFunctor<platform::CPUDeviceContext, double>;
template class Col2VolFunctor<phi::CPUContext, float>;
template class Col2VolFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
...
...@@ -33,7 +33,7 @@ USE_OP(relu);
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
USE_OP_ITSELF(softmax);
USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
USE_OP(conv2d);
USE_OP_ITSELF(conv2d);
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);
namespace paddle {
...@@ -55,7 +55,7 @@ class CacheTester {
    onednn_dev_ctx_->ResetBlobMap(nullptr);
  }
  bool Analyze(unsigned short int num_entries) {
  bool Analyze(uint16_t num_entries) {
    // Number of created objects in cache should be as expected (num_entries)
    return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries;
  }
...
...@@ -18,6 +18,7 @@ limitations under the License. */
#include <string>
#include <tuple>
#include "paddle/phi/common/place.h"
#include "paddle/utils/any.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/utils/small_vector.h"
...
...@@ -10,7 +10,7 @@ add_subdirectory(funcs)
set_property(GLOBAL PROPERTY PHI_KERNELS "")
set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax)
# remove this dep after removing fluid deps on tensor creation
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad);
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out);
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out);
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out);
} // namespace phi
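The declarations above all follow the same phi kernel signature convention: the device context comes first, then the dense-tensor inputs, then the attributes, and finally the output pointers. A hedged sketch of a declaration written in that convention (the kernel name and namespace below are illustrative, not part of this PR):

// Hypothetical declaration following the same ordering as the conv kernels
// above; "MyConvLikeKernel" and "phi_sketch" are placeholders.
#include <string>
#include <vector>

namespace phi_sketch {

class DenseTensor;  // stand-in for phi::DenseTensor

template <typename T, typename Context>
void MyConvLikeKernel(const Context& dev_ctx,           // context first
                      const DenseTensor& input,         // tensor inputs
                      const DenseTensor& filter,
                      const std::vector<int>& strides,  // attributes
                      const std::string& data_format,
                      DenseTensor* out);                // outputs last

}  // namespace phi_sketch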
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DGradGradKernel(const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradGradKernel<T>(ctx,
input_grad_grad,
filter_grad_grad,
out_grad,
input,
filter,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
out_grad_grad,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad_grad, CPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
PD_REGISTER_KERNEL(conv3d_grad_grad,
CPU,
ALL_LAYOUT,
phi::Conv3DGradGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad, CPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad,
CPU,
ALL_LAYOUT,
phi::DepthwiseConvGradKernel,
float,
double) {}
PD_REGISTER_KERNEL(
conv3d_grad, CPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d, CPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
PD_REGISTER_KERNEL(depthwise_conv2d,
CPU,
ALL_LAYOUT,
phi::DepthwiseConvKernel,
float,
double) {}
PD_REGISTER_KERNEL(conv3d, CPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
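With these registrations, code that previously pulled in the fluid conv kernels via USE_OP now combines USE_OP_ITSELF with PD_DECLARE_KERNEL, as the cudnn_norm_conv test earlier in this diff does. A minimal sketch of that pattern (the backend token depends on which registration a test actually needs, so treat the CPU variant below as an assumption):

// Sketch of the declaration pattern used elsewhere in this PR; the exact
// headers and backend token a given test requires may differ.
#include "paddle/fluid/framework/op_registry.h"  // USE_OP_ITSELF
#include "paddle/phi/core/kernel_registry.h"     // PD_DECLARE_KERNEL

USE_OP_ITSELF(conv2d);                       // operator definition only
PD_DECLARE_KERNEL(conv2d, CPU, ALL_LAYOUT);  // phi CPU kernel registered above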
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/ddim.h"
namespace phi {
template <typename T = int>
inline void UpdatePaddingAndDilation(std::vector<T>* paddings,
std::vector<T>* dilation,
const std::string padding_algorithm,
const DDim data_dims,
const std::vector<T>& strides,
const std::vector<T>& ksize) {
// set padding size == data_dims.size() * 2
auto data_shape = vectorize<T>(data_dims);
if (static_cast<int>(paddings->size()) == data_dims.size()) {
for (int i = 0; i < data_dims.size(); ++i) {
T copy_pad = *(paddings->begin() + 2 * i);
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(
data_dims.size() * 2,
paddings->size(),
phi::errors::InvalidArgument(
"Attribute padding's size should be the same or twice as the "
"input's dimension. "
"But recieved: padding's size is %d, padding is [%s]; input's "
"dimension is %d, input's shape is [%s].",
paddings->size(),
make_ddim(*paddings),
data_dims.size(),
data_dims));
}
// when padding_algorithm is "VALID" or "SAME"
if (padding_algorithm == "SAME") {
for (int i = 0; i < data_dims.size(); ++i) {
T out_size = (data_dims[i] + strides[i] - 1) / strides[i];
T pad_sum =
std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i],
static_cast<T>(0));
T pad_0 = pad_sum / 2;
T pad_1 = pad_sum - pad_0;
*(paddings->begin() + i * 2) = pad_0;
*(paddings->begin() + i * 2 + 1) = pad_1;
// dilation
*(dilation->begin() + i) = 1;
}
} else if (padding_algorithm == "VALID") {
for (auto it = paddings->begin(); it != paddings->end(); it++) {
*it = 0;
}
}
}
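A small standalone illustration of the "SAME" branch above: the padding is chosen so that output_size == ceil(input_size / stride), and any odd remainder of the total padding goes to the trailing side.

// Standalone sketch of the SAME-padding arithmetic implemented above,
// for a single spatial dimension.
#include <algorithm>
#include <cassert>

void SamePadding(int input, int stride, int ksize, int* pad_0, int* pad_1) {
  int out_size = (input + stride - 1) / stride;  // ceil(input / stride)
  int pad_sum = std::max((out_size - 1) * stride + ksize - input, 0);
  *pad_0 = pad_sum / 2;       // leading side
  *pad_1 = pad_sum - *pad_0;  // trailing side takes the odd remainder
}

int main() {
  int p0 = 0, p1 = 0;
  SamePadding(/*input=*/7, /*stride=*/2, /*ksize=*/3, &p0, &p1);
  // out_size = 4, pad_sum = (4 - 1) * 2 + 3 - 7 = 2 -> one on each side
  assert(p0 == 1 && p1 == 1);
  return 0;
}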
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
if (paddings.size() != strides.size()) {
for (size_t j = 0; j < paddings.size(); ++j) {
padding_0 = padding_0 && (paddings[j] == 0);
}
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
} // namespace phi
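IsExpand above reports whether the im2col/vol2col expansion is needed; it is false only for a 1x1 (or 1x1x1) filter with unit strides, zero paddings, and unit dilations, in which case the convolution is already a plain GEMM. A self-contained re-statement of that check, for illustration only (not the phi function itself):

// Minimal sketch mirroring the decision IsExpand makes.
#include <cassert>
#include <cstdint>
#include <vector>

bool IsExpandSketch(const std::vector<int64_t>& filter_dim,
                    const std::vector<int>& strides,
                    const std::vector<int>& paddings,
                    const std::vector<int>& dilations) {
  bool trivial = true;
  for (size_t j = 0; j < strides.size(); ++j) {
    trivial = trivial && filter_dim[j + 2] == 1 && strides[j] == 1 &&
              paddings[j] == 0 && dilations[j] == 1;
  }
  return !trivial;  // expansion needed unless everything is trivial
}

int main() {
  assert(!IsExpandSketch({64, 3, 1, 1}, {1, 1}, {0, 0}, {1, 1}));  // 1x1 conv
  assert(IsExpandSketch({64, 3, 3, 3}, {1, 1}, {1, 1}, {1, 1}));   // 3x3 conv
  return 0;
}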
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
using Tensor = DenseTensor;
template <typename DeviceContext, typename T>
inline void ResizeToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[4];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
in_dims_vec[4] = input->dims()[3];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[3];
in_dims_vec[2] = input->dims()[1];
in_dims_vec[3] = input->dims()[2];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
// input
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelFirst(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
VLOG(5) << "Why am I called?";
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 4, 1, 2, 3};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 3, 1, 2};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const DeviceContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
std::vector<int> axis{0, 2, 3, 4, 1};
phi::funcs::Transpose<DeviceContext, T, 5> trans5;
trans5(context, *input, transformed_input, axis);
} else if (dim == 2) {
std::vector<int> axis{0, 2, 3, 1};
phi::funcs::Transpose<DeviceContext, T, 4> trans4;
trans4(context, *input, transformed_input, axis);
} else if (dim == 1) {
std::vector<int> axis{0, 2, 1};
phi::funcs::Transpose<DeviceContext, T, 3> trans3;
trans3(context, *input, transformed_input, axis);
}
}
} // namespace phi
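The helpers above reshape and transpose NHWC/NDHWC data into channel-first layout before the math functors run. A standalone sketch of the dimension permutation ResizeToChannelFirst applies in the 2-D case (the actual data movement is done later by TransToChannelFirst via phi::funcs::Transpose):

// Standalone sketch: NHWC dims -> NCHW dims, matching the dim == 2 branch.
#include <cassert>
#include <vector>

std::vector<int> ToChannelFirst4D(const std::vector<int>& nhwc) {
  // {N, H, W, C} -> {N, C, H, W}
  return {nhwc[0], nhwc[3], nhwc[1], nhwc[2]};
}

int main() {
  std::vector<int> nchw = ToChannelFirst4D({8, 32, 32, 3});
  assert(nchw == (std::vector<int>{8, 3, 32, 32}));
  return 0;
}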
...@@ -15,10 +15,10 @@ limitations under the License. */
#pragma once
#include <utility>
#include <vector>
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace phi {
namespace funcs {
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
PD_REGISTER_KERNEL(
conv2d_grad_grad, GPU, ALL_LAYOUT, phi::ConvGradGradKernel, float, double) {
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/impl/conv_grad_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& paddding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
PD_REGISTER_KERNEL(
conv2d_grad, GPU, ALL_LAYOUT, phi::ConvGradKernel, float, double) {}
PD_REGISTER_KERNEL(
conv3d_grad, GPU, ALL_LAYOUT, phi::Conv3DGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void Conv3DKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
PD_REGISTER_KERNEL(conv2d, GPU, ALL_LAYOUT, phi::ConvKernel, float, double) {}
PD_REGISTER_KERNEL(conv3d, GPU, ALL_LAYOUT, phi::Conv3DKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...@@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#pragma once
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/hostdevice.h"
#ifdef __NVCC__
#include <cub/cub.cuh>
#endif
...@@ -21,7 +25,7 @@ limitations under the License. */
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"
...@@ -30,6 +34,58 @@ namespace paddle {
namespace operators {
namespace math {
using DataLayout = framework::DataLayout;
/*
* \brief Compute the depthwise convolution which include
* forward process and backpropagation process
*/
template <typename DeviceContext,
typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFunctor {
public:
void operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* output,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext,
typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvInputGradFunctor {
public:
void operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& filter,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* input_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
template <typename DeviceContext,
typename T,
bool fuse_relu_before_conv = false>
class DepthwiseConvFilterGradFunctor {
public:
void operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& output_grad,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
framework::Tensor* filter_grad,
const DataLayout data_layout = DataLayout::kNCHW);
};
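For reference, a scalar CPU sketch of the computation these functors implement on the GPU: in a depthwise convolution with filter_multiplier == 1 and unit dilation, every output channel convolves exactly one input channel. This is an illustrative reference only, not the CUDA implementation below.

// Reference sketch (single image, NCHW layout, filter_multiplier == 1,
// dilation == 1). input: [C, H, W], filter: [C, 1, kH, kW].
#include <cassert>
#include <vector>

std::vector<float> DepthwiseConv2DRef(const std::vector<float>& input,
                                      const std::vector<float>& filter,
                                      int C, int H, int W, int kH, int kW,
                                      int stride, int pad) {
  int outH = (H + 2 * pad - kH) / stride + 1;
  int outW = (W + 2 * pad - kW) / stride + 1;
  std::vector<float> out(C * outH * outW, 0.f);
  for (int c = 0; c < C; ++c)
    for (int oh = 0; oh < outH; ++oh)
      for (int ow = 0; ow < outW; ++ow) {
        float s = 0.f;
        for (int fh = 0; fh < kH; ++fh)
          for (int fw = 0; fw < kW; ++fw) {
            int ih = oh * stride - pad + fh;
            int iw = ow * stride - pad + fw;
            if (ih < 0 || ih >= H || iw < 0 || iw >= W) continue;  // padding
            s += input[(c * H + ih) * W + iw] * filter[(c * kH + fh) * kW + fw];
          }
        out[(c * outH + oh) * outW + ow] = s;
      }
  return out;
}

int main() {
  // one 2x2 channel, 2x2 filter of ones, stride 1, no padding -> sum of inputs
  std::vector<float> in = {1, 2, 3, 4};
  std::vector<float> flt = {1, 1, 1, 1};
  auto out = DepthwiseConv2DRef(in, flt, 1, 2, 2, 2, 2, 1, 0);
  assert(out.size() == 1 && out[0] == 10.f);
  return 0;
}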
template <typename T>
static __forceinline__ __device__ T WarpReduceSum(T val, int warp_size) {
  typedef cub::WarpReduce<T> WarpReduce;
...@@ -293,8 +349,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC(
  }
}
template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
          DataLayout data_layout, bool fuse_relu_before_conv>
__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
  int final_filter_multiplier = filter_multiplier;
  int h_stride = stride_height;
...@@ -306,34 +366,88 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
  }
  if (c_filter == -1) {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvNCHW<T, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    } else {
      KernelDepthwiseConvNHWC<T, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    }
  } else {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvCFilterNCHW<T, c_filter, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    } else {
      KernelDepthwiseConvCFilterNHWC<T, c_filter, fuse_relu_before_conv>(
          input_data, filter_data, batch_size, output_channels, output_height,
          output_width, input_channels, input_height, input_width,
          final_filter_multiplier, filter_height, filter_width, h_stride,
          w_stride, padding_height, padding_width, dilate_height, dilate_width,
          output_data);
    }
  }
...@@ -464,7 +578,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradNHWC(
  }
}
template <typename T, int c_filter, int c_filter_multiplier,
          bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
...@@ -525,7 +641,9 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW(
  }
}
template <typename T, int c_filter, int c_filter_multiplier,
          bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
...@@ -595,8 +713,12 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC(
  }
}
template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
          DataLayout data_layout, bool fuse_relu_before_conv>
__global__ void KernelDepthwiseConvInputGradSp(
    ARG_DEFINE_KernelDepthwiseConvInputGrad) {
  int final_filter_multiplier = filter_multiplier;
...@@ -611,36 +733,100 @@ __global__ void KernelDepthwiseConvInputGradSp(
  if (c_filter_multiplier == 0 || c_filter == -1) {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvInputGradNCHW<T, fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, final_filter_multiplier, filter_height,
          filter_width, h_stride, w_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    } else {
      KernelDepthwiseConvInputGradNHWC<T, fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, final_filter_multiplier, filter_height,
          filter_width, h_stride, w_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    }
  } else {
    if (data_layout != DataLayout::kNHWC) {
      KernelDepthwiseConvInputGradCFilterNCHW<T, c_filter, c_filter_multiplier,
                                              fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, c_filter_multiplier, filter_height,
          filter_width, c_stride, c_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    } else {
      KernelDepthwiseConvInputGradCFilterNHWC<T, c_filter, c_filter_multiplier,
                                              fuse_relu_before_conv>(
          input_data, output_grad_data, filter_data, batch_size,
          output_channels, output_height, output_width, input_channels,
          input_height, input_width, c_filter_multiplier, filter_height,
          filter_width, c_stride, c_stride, padding_height, padding_width,
          dilate_height, dilate_width, input_grad_data);
    }
  }
}
@@ -648,13 +834,25 @@ __global__ void KernelDepthwiseConvInputGradSp(
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
const T* output_grad_data, const T* input_data, const int num, const T* output_grad_data,
const int output_channels, const int output_height, const int output_width, const T* input_data,
const int input_channels, const int input_height, const int input_width, const int num,
const int filter_multiplier, const int filter_height, const int output_channels,
const int filter_width, const int stride_height, const int stride_width, const int output_height,
const int padding_height, const int padding_width, const int dilate_height, const int output_width,
const int dilate_width, T* filter_grad_data) { const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
T s = 0; T s = 0;
int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
@@ -697,13 +895,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW(
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
const T* output_grad_data, const T* input_data, const int num, const T* output_grad_data,
const int output_channels, const int output_height, const int output_width, const T* input_data,
const int input_channels, const int input_height, const int input_width, const int num,
const int filter_multiplier, const int filter_height, const int output_channels,
const int filter_width, const int stride_height, const int stride_width, const int output_height,
const int padding_height, const int padding_width, const int dilate_height, const int output_width,
const int dilate_width, T* filter_grad_data) { const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
int bid = blockIdx.z; int bid = blockIdx.z;
int image_h = blockIdx.y; int image_h = blockIdx.y;
int kernel_iw = blockIdx.x % filter_width; int kernel_iw = blockIdx.x % filter_width;
@@ -743,13 +953,25 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC(
template <typename T, int c_filter, bool fuse_relu_before_conv> template <typename T, int c_filter, bool fuse_relu_before_conv>
__device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
const T* output_grad_data, const T* input_data, const int num, const T* output_grad_data,
const int output_channels, const int output_height, const int output_width, const T* input_data,
const int input_channels, const int input_height, const int input_width, const int num,
const int filter_multiplier, const int filter_height, const int output_channels,
const int filter_width, const int stride_height, const int stride_width, const int output_height,
const int padding_height, const int padding_width, const int dilate_height, const int output_width,
const int dilate_width, T* filter_grad_data) { const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
const int bid = blockIdx.z; const int bid = blockIdx.z;
int image_h = blockIdx.x * dilate_height + blockIdx.y; int image_h = blockIdx.x * dilate_height + blockIdx.y;
if (image_h >= output_height) { if (image_h >= output_height) {
@@ -804,16 +1026,31 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC(
} }
} }
template <typename T, int c_filter_multiplier, int c_stride, int c_filter, template <typename T,
DataLayout data_layout, bool fuse_relu_before_conv> int c_filter_multiplier,
__global__ void KernelDepthwiseConvFilterGradSp( int c_stride,
const T* output_grad_data, const T* input_data, const int num, int c_filter,
const int output_channels, const int output_height, const int output_width, DataLayout data_layout,
const int input_channels, const int input_height, const int input_width, bool fuse_relu_before_conv>
const int filter_multiplier, const int filter_height, __global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data,
const int filter_width, const int stride_height, const int stride_width, const T* input_data,
const int padding_height, const int padding_width, const int dilate_height, const int num,
const int dilate_width, T* filter_grad_data) { const int output_channels,
const int output_height,
const int output_width,
const int input_channels,
const int input_height,
const int input_width,
const int filter_multiplier,
const int filter_height,
const int filter_width,
const int stride_height,
const int stride_width,
const int padding_height,
const int padding_width,
const int dilate_height,
const int dilate_width,
T* filter_grad_data) {
int final_filter_multiplier = filter_multiplier; int final_filter_multiplier = filter_multiplier;
int h_stride = stride_height; int h_stride = stride_height;
int w_stride = stride_width; int w_stride = stride_width;
@@ -825,34 +1062,91 @@ __global__ void KernelDepthwiseConvFilterGradSp(
if (c_filter_multiplier == 0 || c_filter == -1) { if (c_filter_multiplier == 0 || c_filter == -1) {
if (data_layout != DataLayout::kNHWC) { if (data_layout != DataLayout::kNHWC) {
KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>( KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} else { } else {
KernelDepthwiseConvFilterGradNHWC<T, fuse_relu_before_conv>( KernelDepthwiseConvFilterGradNHWC<T, fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} }
} else { } else {
if (data_layout != DataLayout::kNHWC) { if (data_layout != DataLayout::kNHWC) {
KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>( KernelDepthwiseConvFilterGradNCHW<T, fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} else { } else {
KernelDepthwiseConvFilterGradCFilterNHWC<T, c_filter, KernelDepthwiseConvFilterGradCFilterNHWC<T,
c_filter,
fuse_relu_before_conv>( fuse_relu_before_conv>(
output_grad_data, input_data, num, output_channels, output_height, output_grad_data,
output_width, input_channels, input_height, input_width, input_data,
final_filter_multiplier, filter_height, filter_width, h_stride, num,
w_stride, padding_height, padding_width, dilate_height, dilate_width, output_channels,
output_height,
output_width,
input_channels,
input_height,
input_width,
final_filter_multiplier,
filter_height,
filter_width,
h_stride,
w_stride,
padding_height,
padding_width,
dilate_height,
dilate_width,
filter_grad_data); filter_grad_data);
} }
} }
@@ -864,15 +1158,15 @@ __global__ void KernelDepthwiseConvFilterGradSp(
* height and width, respectively. * height and width, respectively.
*/ */
template <class T, bool fuse_relu_before_conv> template <class T, bool fuse_relu_before_conv>
class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
 public:
  void operator()(const phi::GPUContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& dilations,
                  framework::Tensor* output,
                  const DataLayout data_layout = DataLayout::kNCHW) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_channels = const int input_channels =
@@ -905,12 +1199,14 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
framework::Tensor filter_hwc; framework::Tensor filter_hwc;
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], framework::DDim filter_hwc_dims({filter.dims()[2],
filter.dims()[0], filter.dims()[1]}); filter.dims()[3],
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims); filter_hwc.Resize(filter_hwc_dims);
filter_hwc.mutable_data<T>(context.GetPlace()); filter_hwc.mutable_data<T>(context.GetPlace());
std::vector<int> perm_axis({2, 3, 0, 1}); std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans; phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
trans(context, filter, &filter_hwc, perm_axis); trans(context, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>(); filter_data = filter_hwc.data<T>();
} }
@@ -940,7 +1236,8 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
((output_width + dilate_width - 1) / dilate_width) * dilate_width); ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
threads = dim3(std::min(output_channels, thread), blocks, 1); threads = dim3(std::min(output_channels, thread), blocks, 1);
grid = dim3((output_height + dilate_height - 1) / dilate_height, grid = dim3((output_height + dilate_height - 1) / dilate_height,
dilate_height, batch_size); dilate_height,
batch_size);
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
int nums_output = int nums_output =
@@ -952,37 +1249,73 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
#endif #endif
int grid_size = (nums_output + block_size - 1) / block_size; int grid_size = (nums_output + block_size - 1) / block_size;
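    // The check_case macro below dispatches to a kernel instantiation that is
    // specialized at compile time for the given (filter_multiplier, stride,
    // filter size) combination; a zero multiplier or a filter size of -1
    // selects the generic path inside the kernel.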
#define check_case(c_filter_multiplier, c_stride, c_filter) \ #define check_case(c_filter_multiplier, c_stride, c_filter) \
if (c_filter_multiplier == 0 || \ if (c_filter_multiplier == 0 || \
filter_multiplier == c_filter_multiplier && \ filter_multiplier == c_filter_multiplier && \
stride_height == stride_width && stride_height == c_stride && \ stride_height == stride_width && stride_height == c_stride && \
(ksize_height == ksize_width && ksize_height == c_filter || \ (ksize_height == ksize_width && ksize_height == c_filter || \
c_filter == -1)) { \ c_filter == -1)) { \
if (c_filter == -1) { \ if (c_filter == -1) { \
threads.x = block_size; \ threads.x = block_size; \
grid.x = grid_size; \ grid.x = grid_size; \
threads.y = threads.z = grid.y = grid.z = 1; \ threads.y = threads.z = grid.y = grid.z = 1; \
} \ } \
if (data_layout != DataLayout::kNHWC) { \ if (data_layout != DataLayout::kNHWC) { \
KernelDepthwiseConvSp< \ KernelDepthwiseConvSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ T, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ c_filter_multiplier, \
input_data, filter_data, batch_size, output_channels, output_height, \ c_stride, \
output_width, input_channels, input_height, input_width, \ c_filter, \
filter_multiplier, ksize_height, ksize_width, stride_height, \ DataLayout::kNCHW, \
stride_width, padding_height, padding_width, dilate_height, \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
dilate_width, output_data); \ input_data, \
} else { \ filter_data, \
KernelDepthwiseConvSp< \ batch_size, \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ output_channels, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ output_height, \
input_data, filter_data, batch_size, output_channels, output_height, \ output_width, \
output_width, input_channels, input_height, input_width, \ input_channels, \
filter_multiplier, ksize_height, ksize_width, stride_height, \ input_height, \
stride_width, padding_height, padding_width, dilate_height, \ input_width, \
dilate_width, output_data); \ filter_multiplier, \
} \ ksize_height, \
return; \ ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
output_data); \
} else { \
KernelDepthwiseConvSp< \
T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
input_data, \
filter_data, \
batch_size, \
output_channels, \
output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
output_data); \
} \
return; \
} }
check_case(1, 1, 3); check_case(1, 1, 3);
check_case(1, 1, 5); check_case(1, 1, 5);
@@ -1004,10 +1337,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
}; };
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
 public:
  void operator()(const phi::GPUContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const framework::Tensor& output_grad,
@@ -1048,12 +1380,14 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
framework::Tensor filter_hwc; framework::Tensor filter_hwc;
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], framework::DDim filter_hwc_dims({filter.dims()[2],
filter.dims()[0], filter.dims()[1]}); filter.dims()[3],
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims); filter_hwc.Resize(filter_hwc_dims);
filter_hwc.mutable_data<T>(context.GetPlace()); filter_hwc.mutable_data<T>(context.GetPlace());
std::vector<int> perm_axis({2, 3, 0, 1}); std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans; phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
trans(context, filter, &filter_hwc, perm_axis); trans(context, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>(); filter_data = filter_hwc.data<T>();
} }
@@ -1078,7 +1412,8 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
((input_width + dilate_width - 1) / dilate_width) * dilate_width); ((input_width + dilate_width - 1) / dilate_width) * dilate_width);
threads = dim3(std::min(input_channels, thread), blocks, 1); threads = dim3(std::min(input_channels, thread), blocks, 1);
grid = dim3((input_height + dilate_height - 1) / dilate_height, grid = dim3((input_height + dilate_height - 1) / dilate_height,
dilate_height, batch_size); dilate_height,
batch_size);
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
@@ -1090,22 +1425,60 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
c_filter == -1)) { \ c_filter == -1)) { \
if (data_layout != DataLayout::kNHWC) { \ if (data_layout != DataLayout::kNHWC) { \
KernelDepthwiseConvInputGradSp< \ KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
input_data, output_grad_data, filter_data, batch_size, \ input_data, \
output_channels, output_height, output_width, input_channels, \ output_grad_data, \
input_height, input_width, filter_multiplier, ksize_height, \ filter_data, \
ksize_width, stride_height, stride_width, padding_height, \ batch_size, \
padding_width, dilate_height, dilate_width, input_grad_data); \ output_channels, \
output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
input_grad_data); \
} else { \ } else { \
KernelDepthwiseConvInputGradSp< \ KernelDepthwiseConvInputGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
input_data, output_grad_data, filter_data, batch_size, \ input_data, \
output_channels, output_height, output_width, input_channels, \ output_grad_data, \
input_height, input_width, filter_multiplier, ksize_height, \ filter_data, \
ksize_width, stride_height, stride_width, padding_height, \ batch_size, \
padding_width, dilate_height, dilate_width, input_grad_data); \ output_channels, \
output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
input_grad_data); \
} \ } \
return; \ return; \
} }
@@ -1129,10 +1502,11 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
}; };
template <typename T, bool fuse_relu_before_conv> template <typename T, bool fuse_relu_before_conv>
class DepthwiseConvFilterGradFunctor<phi::GPUContext,
                                     T,
                                     fuse_relu_before_conv> {
 public:
  void operator()(const phi::GPUContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
@@ -1187,7 +1561,8 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
std::max(block_size / output_channels, 1), std::max(block_size / output_channels, 1),
((output_width + dilate_width - 1) / dilate_width) * dilate_width); ((output_width + dilate_width - 1) / dilate_width) * dilate_width);
grid = dim3((output_height + dilate_height - 1) / dilate_height, grid = dim3((output_height + dilate_height - 1) / dilate_height,
dilate_height, batch_size); dilate_height,
batch_size);
threads = dim3(std::min(output_channels, block_size), blocks, 1); threads = dim3(std::min(output_channels, block_size), blocks, 1);
} }
int filter_multiplier = output_channels / input_channels; int filter_multiplier = output_channels / input_channels;
@@ -1200,22 +1575,41 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
c_filter == -1)) { \ c_filter == -1)) { \
if (data_layout != DataLayout::kNHWC) { \ if (data_layout != DataLayout::kNHWC) { \
KernelDepthwiseConvFilterGradSp< \ KernelDepthwiseConvFilterGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, input_data, batch_size, output_channels, \ output_grad_data, \
output_height, output_width, input_channels, input_height, \ input_data, \
input_width, filter_multiplier, ksize_height, ksize_width, \ batch_size, \
stride_height, stride_width, padding_height, padding_width, \ output_channels, \
dilate_height, dilate_width, filter_grad_data); \ output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
filter_grad_data); \
} else { \ } else { \
framework::Tensor filter_grad_hwc; \ framework::Tensor filter_grad_hwc; \
if (c_filter != -1) { \ if (c_filter != -1) { \
framework::DDim filter_grad_hwc_dims( \ framework::DDim filter_grad_hwc_dims({filter_grad->dims()[2], \
{filter_grad->dims()[2], filter_grad->dims()[3], \ filter_grad->dims()[3], \
filter_grad->dims()[0], filter_grad->dims()[1]}); \ filter_grad->dims()[0], \
filter_grad->dims()[1]}); \
filter_grad_hwc.Resize(filter_grad_hwc_dims); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \
filter_grad_hwc.mutable_data<T>(context.GetPlace()); \ filter_grad_hwc.mutable_data<T>(context.GetPlace()); \
phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_zero; \ phi::funcs::SetConstant<phi::GPUContext, T> set_zero; \
set_zero(context, &filter_grad_hwc, static_cast<T>(0)); \ set_zero(context, &filter_grad_hwc, static_cast<T>(0)); \
filter_grad_data = filter_grad_hwc.data<T>(); \ filter_grad_data = filter_grad_hwc.data<T>(); \
} else { \ } else { \
@@ -1231,16 +1625,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
threads = dim3(std::min(output_channels, block_size), blocks, 1); \ threads = dim3(std::min(output_channels, block_size), blocks, 1); \
} \ } \
KernelDepthwiseConvFilterGradSp< \ KernelDepthwiseConvFilterGradSp< \
T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ T, \
c_filter_multiplier, \
c_stride, \
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \ fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
output_grad_data, input_data, batch_size, output_channels, \ output_grad_data, \
output_height, output_width, input_channels, input_height, \ input_data, \
input_width, filter_multiplier, ksize_height, ksize_width, \ batch_size, \
stride_height, stride_width, padding_height, padding_width, \ output_channels, \
dilate_height, dilate_width, filter_grad_data); \ output_height, \
output_width, \
input_channels, \
input_height, \
input_width, \
filter_multiplier, \
ksize_height, \
ksize_width, \
stride_height, \
stride_width, \
padding_height, \
padding_width, \
dilate_height, \
dilate_width, \
filter_grad_data); \
if (c_filter != -1) { \ if (c_filter != -1) { \
std::vector<int> perm_axis({2, 3, 0, 1}); \ std::vector<int> perm_axis({2, 3, 0, 1}); \
phi::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans; \ phi::funcs::TransposeNormal<phi::GPUContext, T> trans; \
trans(context, filter_grad_hwc, filter_grad, perm_axis); \ trans(context, filter_grad_hwc, filter_grad, perm_axis); \
} \ } \
} \ } \
@@ -1263,31 +1675,23 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
} }
}; };
template class DepthwiseConvFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>;
}  // namespace math
}  // namespace operators
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
namespace phi {
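// Gradient of depthwise conv2d on GPU, computed with the hand-written
// depthwise CUDA functors from paddle::operators::math (no cuDNN involved).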
template <typename T, typename Context>
void DepthwiseConvGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
const DenseTensor* output_grad = &out_grad;
if (!input_grad && !filter_grad) return;
std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
// update padding and dilation
auto in_dims = input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims;
const paddle::framework::DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
if (data_layout != paddle::framework::DataLayout::kNHWC) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
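  // The depthwise functors expect one padding value per spatial dimension, so
  // when the paddings were expanded to (before, after) pairs only the first
  // value of each pair is kept below.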
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
phi::funcs::SetConstant<Context, T> set_zero;
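  // fuse_relu selects the functor specializations that account for the ReLU
  // fused onto the input of the forward depthwise conv.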
if (input_grad) {
input_grad->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
if (fuse_relu) {
paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, true>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx,
input,
filter,
*output_grad,
strides,
paddings,
dilations,
input_grad,
data_layout);
} else {
paddle::operators::math::DepthwiseConvInputGradFunctor<Context, T, false>
depthwiseConvInputGrad;
depthwiseConvInputGrad(dev_ctx,
input,
filter,
*output_grad,
strides,
paddings,
dilations,
input_grad,
data_layout);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
if (fuse_relu) {
paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, true>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx,
input,
*output_grad,
strides,
paddings,
dilations,
filter_grad,
data_layout);
} else {
paddle::operators::math::DepthwiseConvFilterGradFunctor<Context, T, false>
depthwiseConvFilterGrad;
depthwiseConvFilterGrad(dev_ctx,
input,
*output_grad,
strides,
paddings,
dilations,
filter_grad,
data_layout);
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(depthwise_conv2d_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/phi/kernels/gpu/depthwise_conv.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
namespace phi {
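// Forward depthwise conv2d on GPU, computed with the hand-written depthwise
// CUDA functors from paddle::operators::math; fuse_relu picks the variant
// with a ReLU fused onto the input.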
template <typename T, typename Context>
void DepthwiseConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
bool fuse_relu,
DenseTensor* out) {
DenseTensor* output = out;
output->mutable_data<T>(dev_ctx.GetPlace());
const std::vector<int> strides = strides_t;
std::vector<int> dilations = dilations_t;
std::vector<int> paddings = paddings_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
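  // Depthwise convolution requires the output channel count to be a multiple
  // of the input channel count (the channel multiplier).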
if (channel_last) {
PADDLE_ENFORCE_EQ(
output->dims()[output->dims().size() - 1] %
input.dims()[input.dims().size() - 1],
0,
phi::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[output->dims().size() - 1],
input.dims()[input.dims().size() - 1]));
} else {
PADDLE_ENFORCE_EQ(
output->dims()[1] % input.dims()[1],
0,
phi::errors::InvalidArgument(
"ShapeError: The output channels must be a multiple of the "
"input channels. But receivced output channel number is %d "
"and input channel number is %d",
output->dims()[1],
input.dims()[1]));
}
// update padding and dilation
auto in_dims = input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims;
const paddle::framework::DataLayout data_layout =
paddle::framework::StringToDataLayout(data_format);
if (data_layout != paddle::framework::DataLayout::kNHWC) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
}
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true;
if (!is_sys_pad) {
for (size_t i = 0; i < strides.size(); ++i) {
paddings.erase(paddings.begin() + i + 1);
}
}
if (fuse_relu) {
paddle::operators::math::DepthwiseConvFunctor<Context, T, true>
depthwiseConv;
depthwiseConv(dev_ctx,
input,
filter,
strides,
paddings,
dilations,
output,
data_layout);
} else {
paddle::operators::math::DepthwiseConvFunctor<Context, T, false>
depthwiseConv;
depthwiseConv(dev_ctx,
input,
filter,
strides,
paddings,
dilations,
output,
data_layout);
}
}
} // namespace phi
PD_REGISTER_KERNEL(depthwise_conv2d,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/conv_grad_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
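// Second-order (double) gradient of convolution, implemented with cuDNN (or
// MIOpen on ROCm); it is also reused by the conv3d and depthwise_conv2d
// double-grad registrations below.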
template <typename T, typename Context>
void ConvCudnnGradGradKernel(
const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
auto X = &input;
auto W = &filter;
auto dO = &out_grad;
auto ddX = input_grad_grad.get_ptr();
auto ddW = filter_grad_grad.get_ptr();
auto ddO = out_grad_grad;
auto dW = filter_grad;
auto dX = input_grad;
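  // Double-grad relations computed below:
  //   ddO = conv(ddX, W) + conv(X, ddW)   (args1, args2)
  //   dW  = conv_bwd_filter(ddX, dO)      (args3)
  //   dX  = conv_bwd_data(dO, ddW)        (args4)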
if (ddO) {
ddO->mutable_data<T>(ctx.GetPlace());
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, ddO, static_cast<T>(0));
}
if (dW) {
dW->mutable_data<T>(ctx.GetPlace());
}
if (dX) {
dX->mutable_data<T>(ctx.GetPlace());
}
// const T* x = X->data<T>();
const T* dy = dO->data<T>();
const T* w = W->data<T>();
const T* ddx = nullptr;
const T* ddw = nullptr;
T *dw, *dx, *ddy;
dw = dx = ddy = nullptr;
T* transformed_dx = nullptr;
std::vector<int> dilations = dilations_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
std::vector<int> paddings = paddings_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensors to channel first-----------
DenseTensor transformed_X_channel(X->type());
DenseTensor transformed_dO_channel(dO->type());
DenseTensor transformed_ddX_channel(X->type());
DenseTensor transformed_ddO_channel(dO->type());
DenseTensor transformed_dX_channel(X->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(ctx, X, &transformed_X_channel);
TransToChannelFirst<Context, T>(ctx, X, &transformed_X_channel);
ResizeToChannelFirst<Context, T>(ctx, dO, &transformed_dO_channel);
TransToChannelFirst<Context, T>(ctx, dO, &transformed_dO_channel);
if (ddX) {
ResizeToChannelFirst<Context, T>(ctx, ddX, &transformed_ddX_channel);
TransToChannelFirst<Context, T>(ctx, ddX, &transformed_ddX_channel);
}
if (ddO) {
ResizeToChannelFirst<Context, T>(ctx, ddO, &transformed_ddO_channel);
}
if (dX) {
ResizeToChannelFirst<Context, T>(ctx, dX, &transformed_dX_channel);
transformed_dX_channel.mutable_data<T>(ctx.GetPlace());
}
} else {
transformed_X_channel = *X;
transformed_dO_channel = *dO;
if (ddX) {
transformed_ddX_channel = *ddX;
}
if (ddO) {
transformed_ddO_channel.ShareDataWith(*ddO);
}
if (dX) {
transformed_dX_channel.ShareDataWith(*dX);
}
}
auto in_dims = transformed_X_channel.dims();
auto filter_dims = W->dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
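  // cuDNN/MIOpen only accept symmetric padding, so asymmetric paddings are
  // handled by explicitly padding the input and passing the common (smaller)
  // value of each pair to the convolution descriptor.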
DenseTensor transformed_X(X->type());
DenseTensor transformed_ddX(X->type());
DenseTensor transformed_dX(X->type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(X->dims().size() * 2, 0);
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_X_channel.dims()[0];
new_input_shape_vec[1] = transformed_X_channel.dims()[1];
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
new_input_shape_vec[i + 2] =
transformed_X_channel.dims()[i + 2] + padding_diff[i];
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_X.Resize(new_input_shape);
transformed_ddX.Resize(new_input_shape);
transformed_dX.Resize(new_input_shape);
transformed_X.mutable_data<T>(ctx.GetPlace());
if (ddX) {
transformed_ddX.mutable_data<T>(ctx.GetPlace());
}
if (dX) {
transformed_dX.mutable_data<T>(ctx.GetPlace());
}
// pad for input
const int rank = X->dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
if (ddX) {
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_ddX_channel,
pad_value,
&transformed_ddX);
}
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(
ctx, input_pad, transformed_X_channel, pad_value, &transformed_X);
if (ddX) {
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_ddX_channel,
pad_value,
&transformed_ddX);
}
} break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_X.ShareDataWith(transformed_X_channel);
if (ddX) {
transformed_ddX.ShareDataWith(transformed_ddX_channel);
}
if (dX) {
transformed_dX.ShareDataWith(transformed_dX_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* x = transformed_X.data<T>();
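  // With cuDNN >= 7 (and MIOpen) grouped convolution is expressed through the
  // descriptor group count (c_group); otherwise groups are looped over
  // manually (iwo_group).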
int iwo_group = groups;
int c_group = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_group = 1;
c_group = groups;
groups = 1;
#endif
auto dtype = paddle::platform::CudnnDataType<T>::type;
auto handle = ctx.cudnn_handle();
paddle::operators::ConvArgs args1{&transformed_ddX,
W,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args2{&transformed_X,
ddW,
&transformed_ddO_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args3{&transformed_ddX,
dW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args4{&transformed_dX,
ddW,
&transformed_dO_channel,
strides,
padding_common,
dilations,
dtype};
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t fwd_algo1 = static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvFwdAlgorithm_t fwd_algo2 = static_cast<miopenConvFwdAlgorithm_t>(0);
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionFwdAlgo_t fwd_algo1 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionFwdAlgo_t fwd_algo2 =
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
auto layout = paddle::platform::GetCudnnTensorFormat(
paddle::platform::DataLayout::kNCHW);
// ddo = conv(ddI, W) + conv(I, ddW)
size_t workspace_size = 0;
T* transformed_ddy_channel = nullptr;
if (ddO) {
ddy = ddO->data<T>();
transformed_ddy_channel = transformed_ddO_channel.data<T>();
if (ddX) {
args1.handle = handle;
args1.idesc.set(transformed_ddX, iwo_group);
args1.wdesc.set(*W, layout, iwo_group);
args1.odesc.set(transformed_ddO_channel, iwo_group);
args1.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1);
fwd_algo1 = search1::Find<T>(
args1, exhaustive_search, false, workspace_size, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
#endif
}
if (ddW) {
ddw = ddW->data<T>();
args2.handle = handle;
args2.idesc.set(transformed_X, iwo_group);
args2.wdesc.set(*ddW, layout, iwo_group);
args2.odesc.set(transformed_ddO_channel, iwo_group);
args2.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2));
fwd_algo2 = search2::Find<T>(
args2, exhaustive_search, false, workspace_size, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2));
#endif
}
}
if (dW && ddX) {
dw = dW->data<T>();
args3.handle = handle;
args3.idesc.set(transformed_ddX, iwo_group);
args3.wdesc.set(*dW, layout, iwo_group);
args3.odesc.set(transformed_dO_channel, iwo_group);
args3.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search3 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = search3::Find<T>(
args3, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
workspace_size =
std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo));
#endif
}
if (ddW && dX) {
transformed_dx = transformed_dX.data<T>();
args4.handle = handle;
args4.idesc.set(transformed_dX, iwo_group);
args4.wdesc.set(*ddW, layout, iwo_group);
args4.odesc.set(transformed_dO_channel, iwo_group);
args4.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search4 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = search4::Find<T>(
args4, exhaustive_search, deterministic, workspace_size, ctx);
#else
using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo = search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
workspace_size =
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
#endif
}
int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(
transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w);
int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_dO_channel.dims(),
DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups;
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
  // NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = ctx.cudnn_workspace_handle();
if (ddO) {
if (ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args1.idesc.desc(),
ddx,
args1.wdesc.desc(),
w,
args1.cdesc.desc(),
fwd_algo1,
&beta,
args1.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args1.idesc.desc(),
ddx + i * group_offset_in,
args1.wdesc.desc(),
w + i * group_offset_filter,
args1.cdesc.desc(),
fwd_algo1,
workspace_ptr,
workspace_size,
&beta,
args1.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (ddW) {
#ifdef PADDLE_WITH_HIP
      // MIOpen only supports beta == 0.0f
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args2.idesc.desc(),
x,
args2.wdesc.desc(),
ddw,
args2.cdesc.desc(),
fwd_algo2,
&beta,
args2.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args2.idesc.desc(),
x + i * group_offset_in,
args2.wdesc.desc(),
ddw + i * group_offset_filter,
args2.cdesc.desc(),
fwd_algo2,
workspace_ptr,
workspace_size,
&alpha,
args2.odesc.desc(),
transformed_ddy_channel + i * group_offset_out));
},
workspace_size);
}
#endif
}
if (channel_last) {
TransToChannelLast<Context, T>(ctx, &transformed_ddO_channel, ddO);
}
}
T* transformed_dy_channel = transformed_dO_channel.data<T>();
if (dW && ddX) {
ddx = transformed_ddX.data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args3.odesc.desc(),
transformed_dy_channel,
args3.idesc.desc(),
ddx,
args3.cdesc.desc(),
filter_algo,
&beta,
args3.wdesc.desc(),
dw,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args3.idesc.desc(),
ddx + i * group_offset_in,
args3.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args3.cdesc.desc(),
filter_algo,
workspace_ptr,
workspace_size,
&beta,
args3.wdesc.desc(),
dw + i * group_offset_filter));
},
workspace_size);
}
#endif
}
if (dX && ddW) {
ddw = ddW->data<T>();
#ifdef PADDLE_WITH_HIP
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args4.odesc.desc(),
transformed_dy_channel,
args4.wdesc.desc(),
ddw,
args4.cdesc.desc(),
data_algo,
&beta,
args4.idesc.desc(),
transformed_dx,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
wkspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args4.wdesc.desc(),
ddw + i * group_offset_filter,
args4.odesc.desc(),
transformed_dy_channel + i * group_offset_out,
args4.cdesc.desc(),
data_algo,
workspace_ptr,
workspace_size,
&beta,
args4.idesc.desc(),
transformed_dx + i * group_offset_in));
},
workspace_size);
}
#endif
if (!is_sys_pad) {
// reverse padded input
std::vector<int> starts(X->dims().size(), 0);
std::vector<int> axes(X->dims().size(), 0);
for (size_t i = 0; i < X->dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
if (X->dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
} else {
paddle::operators::RemovePaddingSlice<Context, T, 5>(
ctx, &transformed_dX, &transformed_dX_channel, starts, axes);
}
}
if (channel_last) {
TransToChannelLast<Context, T>(ctx, &transformed_dX_channel, dX);
}
}
}
template <typename T, typename Context>
void DepthwiseConvCudnnGradGradKernel(
const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
bool fuse_relu,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvCudnnGradGradKernel<T>(ctx,
input_grad_grad,
filter_grad_grad,
out_grad,
input,
filter,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
out_grad_grad,
input_grad,
filter_grad);
}
template <typename T, typename Context>
void Conv3DCudnnGradGradKernel(
const Context& ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvCudnnGradGradKernel<T>(ctx,
input_grad_grad,
filter_grad_grad,
out_grad,
input,
filter,
strides,
paddings_t,
padding_algorithm,
groups,
dilations_t,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search_t,
out_grad_grad,
input_grad,
filter_grad);
}
} // namespace phi
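// Registration note: bfloat16 kernels require cuDNN >= 8.1, and the
// ROCm/MIOpen build registers only float and float16.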
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(conv2d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvCudnnGradGradKernel,
float,
phi::dtype::float16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(conv3d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(conv2d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad,
GPU,
ALL_LAYOUT,
phi::DepthwiseConvCudnnGradGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
namespace phi {
template <typename T, typename Context>
void ConvCudnnGradKernel(const Context& ctx,
const DenseTensor& output_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> dilations = dilations_t;
std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
  // HIP MIOPEN only supports the NCHW format
auto compute_format = paddle::platform::DataLayout::kNCHW;
#else
const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
auto compute_format = compute_in_nhwc && channel_last
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvGradOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
: "NCHW");
// transform Tensor
DenseTensor transformed_input_channel(input.type());
DenseTensor transformed_output_grad_channel(output_grad.type());
DenseTensor transformed_input_grad_channel(input.type());
DenseTensor transformed_filter_channel(filter.type());
DenseTensor transformed_filter_grad_channel(filter.type());
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
VLOG(3) << "Transform input, output_grad, input_grad and tensor from "
"NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
ResizeToChannelFirst<Context, T>(
ctx, &output_grad, &transformed_output_grad_channel);
TransToChannelFirst<Context, T>(
ctx, &output_grad, &transformed_output_grad_channel);
if (input_grad) {
ResizeToChannelFirst<Context, T>(
ctx, input_grad, &transformed_input_grad_channel);
// NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy
// the data of input_grad to transformed_input_grad_channel.
if (use_addto) {
TransToChannelFirst<Context, T>(
ctx, input_grad, &transformed_input_grad_channel);
}
}
} else {
transformed_input_channel.ShareDataWith(input);
transformed_output_grad_channel.ShareDataWith(output_grad);
if (input_grad) {
transformed_input_grad_channel.ShareDataWith(*input_grad);
}
}
if (compute_format == paddle::platform::DataLayout::kNHWC) {
VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC.";
ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
if (filter_grad) {
ResizeToChannelLast<Context, T>(
ctx, filter_grad, &transformed_filter_grad_channel);
}
} else {
transformed_filter_channel.ShareDataWith(filter);
if (filter_grad) {
transformed_filter_grad_channel.ShareDataWith(*filter_grad);
}
}
// update paddings
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
DDim in_data_dims;
DDim filter_data_dims;
if (compute_format == paddle::platform::DataLayout::kNCHW) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
// cuDNN only supports padding the same amount on every dimension.
// So we create a new padded input tensor.
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
Tensor transformed_input(input.type());
Tensor transformed_input_grad(input.type());
std::vector<int> padding_common(data_dim, 0);
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
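  // When the two sides of a spatial dimension are padded differently, only the
  // common (symmetric) part is handed to cuDNN via padding_common; the rest is
  // applied explicitly through input_pad. Illustrative values: for
  // paddings = {1, 2} on one dimension, padding_common = 1 and input_pad holds
  // {0, 1} for that dimension.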
if (!is_sys_pad) {
// get pad
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == paddle::platform::DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
transformed_input.mutable_data<T>(ctx.GetPlace());
transformed_input_grad.Resize(new_input_shape);
if (input_grad) {
transformed_input_grad.mutable_data<T>(ctx.GetPlace());
}
// pad for input
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (input_grad) {
transformed_input_grad.ShareDataWith(transformed_input_grad_channel);
}
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* output_grad_data = transformed_output_grad_channel.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
T* filter_grad_data = nullptr;
T* input_grad_data = nullptr;
T* transformed_input_grad_data = nullptr;
paddle::operators::ConvArgs args1{&transformed_input_grad,
&transformed_filter_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
paddle::operators::ConvArgs args2{&transformed_input,
&transformed_filter_grad_channel,
&transformed_output_grad_channel,
strides,
padding_common,
dilations,
dtype};
auto handle = ctx.cudnn_handle();
  // TODO(phlrain): replace paddle::platform::DataLayout with phi::DataLayout
paddle::platform::DataLayout layout =
compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNDHWC
: paddle::platform::DataLayout::kNCDHW;
}
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
auto workspace_handle = ctx.cudnn_workspace_handle();
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
} else {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output_grad_channel.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
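  // The cuDNN calls below loop over groups explicitly; these offsets select
  // the i-th group's slice of the input, output and filter buffers.
  // Illustrative values: i_c = 8, groups = 2 and a 4x4 feature map give
  // group_offset_in = 8 / 2 * 4 * 4 * 1 = 64 elements per group.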
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t data_algo =
static_cast<miopenConvBwdDataAlgorithm_t>(0);
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else
cudnnConvolutionBwdDataAlgo_t data_algo =
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif
// input data workspace_size
size_t workspace_size_d = 0;
// weight workspace_size
size_t workspace_size_w = 0;
int iwo_groups = groups;
int c_groups = 1;
#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1)
iwo_groups = 1;
c_groups = groups;
groups = 1;
#endif
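  // With cuDNN >= 7.0.1 (or HIP), grouped convolution is expressed through the
  // convolution descriptor (c_groups) while the filter descriptor uses
  // iwo_groups = 1, so the per-group loops below run only a single iteration
  // (groups has been reset to 1).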
if (input_grad) {
// ------------------- cudnn descriptors ---------------------
input_grad_data = input_grad->data<T>();
transformed_input_grad_data = transformed_input_grad.data<T>();
args1.handle = handle;
args1.idesc.set(transformed_input_grad, layout_tensor);
args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups);
args1.odesc.set(transformed_output_grad_channel, layout_tensor);
args1.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search1 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
data_algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size_d, ctx);
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo = search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo));
#endif
}
if (filter_grad) {
// ------------------- cudnn descriptors ---------------------
filter_grad_data = transformed_filter_grad_channel.data<T>();
args2.handle = handle;
args2.idesc.set(transformed_input, layout_tensor);
args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups);
args2.odesc.set(transformed_output_grad_channel, layout_tensor);
args2.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
filter_algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size_w, ctx);
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo =
search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
workspace_size_w = std::max(workspace_size_w,
search2::GetWorkspaceSize(args2, filter_algo));
#endif
}
// ------------------- cudnn conv backward data ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f;
#ifdef PADDLE_WITH_HIP
  // MIOPEN only supports beta = 0.0f
paddle::operators::ScalingParamType<T> beta = 0.0f;
#else
paddle::operators::ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
#endif
VLOG(4) << "Conv_grad: use_addto = " << use_addto;
if (input_grad) {
// When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, input_grad cannot be reset since the addto strategy is used.
#ifdef PADDLE_WITH_HIP
if (use_addto) {
DenseTensor temp_tensor(transformed_input_grad.type());
temp_tensor.Resize(transformed_input_grad.dims());
T* temp_tensor_data = temp_tensor.mutable_data<T>(ctx.GetPlace());
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
data_algo,
&beta,
args1.idesc.desc(),
temp_tensor_data,
cudnn_workspace_ptr,
workspace_size_d));
},
workspace_size_d);
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
handle,
miopenTensorOpAdd,
&alpha,
args1.idesc.desc(),
transformed_input_grad_data,
&alpha,
args1.idesc.desc(),
temp_tensor_data,
&beta,
args1.idesc.desc(),
transformed_input_grad_data));
} else {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
data_algo,
&beta,
args1.idesc.desc(),
transformed_input_grad_data,
cudnn_workspace_ptr,
workspace_size_d));
},
workspace_size_d);
}
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardData(
handle,
&alpha,
args1.wdesc.desc(),
filter_data + i * group_offset_filter,
args1.odesc.desc(),
output_grad_data + i * group_offset_out,
args1.cdesc.desc(),
data_algo,
cudnn_workspace_ptr,
workspace_size_d,
&beta,
args1.idesc.desc(),
transformed_input_grad_data + i * group_offset_in));
},
workspace_size_d);
}
#endif
if (!is_sys_pad) {
std::vector<int> starts(transformed_input_channel.dims().size(), 0);
std::vector<int> axes(transformed_input_channel.dims().size(), 0);
for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) {
starts[i] = input_pad[2 * i];
axes[i] = i;
}
transformed_input_grad_channel.mutable_data(ctx.GetPlace());
if (transformed_input_channel.dims().size() == 4) {
paddle::operators::RemovePaddingSlice<Context, T, 4>(
ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
} else {
paddle::operators::RemovePaddingSlice<Context, T, 5>(
ctx,
&transformed_input_grad,
&transformed_input_grad_channel,
starts,
axes);
}
}
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
TransToChannelLast<Context, T>(
ctx, &transformed_input_grad_channel, input_grad);
}
}
// filter_grad do not use inplace addto.
paddle::operators::ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args2.odesc.desc(),
output_grad_data,
args2.idesc.desc(),
input_data,
args2.cdesc.desc(),
filter_algo,
&beta,
args2.wdesc.desc(),
filter_grad_data,
cudnn_workspace_ptr,
workspace_size_w));
},
workspace_size_w);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionBackwardFilter(
handle,
&alpha,
args2.idesc.desc(),
input_data + i * group_offset_in,
args2.odesc.desc(),
output_grad_data + i * group_offset_out,
args2.cdesc.desc(),
filter_algo,
cudnn_workspace_ptr,
workspace_size_w,
&beta_filter,
args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size_w);
}
#endif
if (compute_format == paddle::platform::DataLayout::kNHWC) {
TransToChannelFirst<Context, T>(
ctx, &transformed_filter_grad_channel, filter_grad);
}
}
}
template <typename T, typename Context>
void Conv3DCudnnGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
                           const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
ConvCudnnGradKernel<T>(dev_ctx,
out_grad,
input,
filter,
strides,
paddings,
                         padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
input_grad,
filter_grad);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(conv2d_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradKernel,
float,
phi::dtype::float16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(conv3d_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(conv2d_grad,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d_grad,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnGradKernel,
float,
double,
phi::dtype::float16) {}
#endif
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
namespace phi {
template <typename T, typename Context>
void ConvCudnnKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search_t,
DenseTensor* output) {
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP
  // HIP MIOPEN only supports the NCHW format
auto compute_format = paddle::platform::DataLayout::kNCHW;
#else
  // Tensor Cores, introduced with Volta GPUs, support faster conv ops
  // with FP16 in the NHWC data format.
const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx);
// We will only do data format conversion from NHWC to NCHW.
// cudnn will convert NCHW to NHWC automatically on Tensor Core.
auto compute_format = compute_in_nhwc && channel_last
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
#endif
VLOG(3) << "Compute ConvOp with cuDNN:"
<< " data_format=" << data_format << " compute_format="
<< (compute_format == paddle::platform::DataLayout::kNHWC ? "NHWC"
: "NCHW");
// ------------ transformed tensor -----------
DenseTensor transformed_input_channel(input.type());
DenseTensor transformed_output(output->type());
DenseTensor transformed_filter_channel(filter.type());
T* output_data = nullptr;
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
VLOG(3) << "Transform input tensor from NHWC to NCHW.";
ResizeToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
TransToChannelFirst<Context, T>(ctx, &input, &transformed_input_channel);
ResizeToChannelFirst<Context, T>(ctx, output, &transformed_output);
} else {
transformed_input_channel.ShareDataWith(input);
transformed_output.ShareDataWith(*output);
}
if (compute_format == paddle::platform::DataLayout::kNHWC) {
VLOG(3) << "Transform filter tensor from NCHW to NHWC.";
ResizeToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
TransToChannelLast<Context, T>(ctx, &filter, &transformed_filter_channel);
} else {
transformed_filter_channel.ShareDataWith(filter);
}
output_data = transformed_output.data<T>();
// update padding and dilation
auto in_dims = transformed_input_channel.dims();
auto filter_dims = transformed_filter_channel.dims();
DDim in_data_dims;
DDim filter_data_dims;
if (compute_format == paddle::platform::DataLayout::kNCHW) {
in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
} else {
in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1);
filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1);
}
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
int data_dim = strides.size(); // 2d or 3d
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
DenseTensor transformed_input;
std::vector<int> padding_common(data_dim, 0);
if (!is_sys_pad) {
std::vector<int> padding_diff(data_dim);
std::vector<int> new_input_shape_vec(data_dim + 2);
new_input_shape_vec[0] = transformed_input_channel.dims()[0];
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[1] = transformed_input_channel.dims()[1];
} else {
new_input_shape_vec[data_dim + 1] =
transformed_input_channel.dims()[data_dim + 1];
}
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
for (size_t i = 0; i < data_dim; ++i) {
padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]);
padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]);
if (compute_format == paddle::platform::DataLayout::kNCHW) {
new_input_shape_vec[i + 2] =
transformed_input_channel.dims()[i + 2] + padding_diff[i];
} else {
new_input_shape_vec[i + 1] =
transformed_input_channel.dims()[i + 1] + padding_diff[i];
}
if (compute_format == paddle::platform::DataLayout::kNCHW) {
input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i];
} else {
input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i];
input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i];
}
}
DDim new_input_shape(make_ddim(new_input_shape_vec));
transformed_input.Resize(new_input_shape);
transformed_input.mutable_data<T>(ctx.GetPlace());
const int rank = transformed_input_channel.dims().size();
T pad_value(0.0);
switch (rank) {
case 4: {
funcs::PadFunction<Context, T, 4>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
case 5: {
funcs::PadFunction<Context, T, 5>(ctx,
input_pad,
transformed_input_channel,
pad_value,
&transformed_input);
} break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"ConvOp only support tensors with 4 or 5 dimensions."));
}
} else {
transformed_input.ShareDataWith(transformed_input_channel);
if (paddings.size() == data_dim) {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[i];
}
} else {
for (size_t i = 0; i < data_dim; ++i) {
padding_common[i] = paddings[2 * i];
}
}
}
const T* input_data = transformed_input.data<T>();
const T* filter_data = transformed_filter_channel.data<T>();
// ------------------- cudnn descriptors ---------------------
paddle::operators::ConvArgs args{&transformed_input,
&transformed_filter_channel,
&transformed_output,
strides,
padding_common,
dilations,
dtype};
auto handle = ctx.cudnn_handle();
auto workspace_handle = ctx.cudnn_workspace_handle();
paddle::platform::DataLayout layout =
compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNHWC
: paddle::platform::DataLayout::kNCHW;
if (transformed_input.dims().size() == 5) {
layout = compute_format == paddle::platform::DataLayout::kNHWC
? paddle::platform::DataLayout::kNDHWC
: paddle::platform::DataLayout::kNCDHW;
}
auto layout_format = paddle::platform::GetCudnnTensorFormat(layout);
args.handle = handle;
#ifdef PADDLE_WITH_HIP
  // MIOPEN needs groups to be set in cdesc (see miopen_desc.h)
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
#else
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn());
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
  // cuDNN 7+ supports groups natively, so there is no need to handle them
  // manually.
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetConvolutionGroupCount(
args.cdesc.desc(), groups));
groups = 1;
#endif
#ifdef PADDLE_WITH_HIP
  // MIOPEN: do not set groups in wdesc after groups have been set in cdesc
groups = 1;
#endif
args.idesc.set(transformed_input, layout_format);
args.wdesc.set(transformed_filter_channel, layout_format, groups);
args.odesc.set(transformed_output, layout_format);
int i_n, i_c, i_d, i_h, i_w;
int o_n, o_c, o_d, o_h, o_w;
if (compute_format == paddle::platform::DataLayout::kNHWC) {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNHWC,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNHWC,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
} else {
paddle::operators::GetNCDHW(transformed_input.dims(),
paddle::platform::DataLayout::kNCHW,
&i_n,
&i_c,
&i_d,
&i_h,
&i_w);
paddle::operators::GetNCDHW(transformed_output.dims(),
paddle::platform::DataLayout::kNCHW,
&o_n,
&o_c,
&o_d,
&o_h,
&o_w);
}
int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn conv workspace ---------------------
size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t algo{};
using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args);
algo = search::Find<T>(
args, exhaustive_search, deterministic, workspace_size, ctx);
#else
cudnnConvolutionFwdAlgo_t algo{};
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
algo = search::Find<T>(args, exhaustive_search, deterministic, ctx);
workspace_size = search::GetWorkspaceSize(args, algo);
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
  // When groups > 1, SearchAlgorithm may pick
  // CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, but that algorithm is
  // unstable in forward computation, so fall back to
  // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM manually.
if (groups > 1) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
}
#endif
// ------------------- cudnn conv forward ---------------------
paddle::operators::ScalingParamType<T> alpha = 1.0f;
paddle::operators::ScalingParamType<T> beta = 0.0f;
  // NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
// VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
#ifdef PADDLE_WITH_HIP
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data,
args.wdesc.desc(),
filter_data,
args.cdesc.desc(),
algo,
&beta,
args.odesc.desc(),
output_data,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data + i * group_offset_in,
args.wdesc.desc(),
filter_data + i * group_offset_filter,
args.cdesc.desc(),
algo,
workspace_ptr,
workspace_size,
&beta,
args.odesc.desc(),
output_data + i * group_offset_out));
},
workspace_size);
}
#endif
if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) {
TransToChannelLast<Context, T>(ctx, &transformed_output, output);
}
}
template <typename T, typename Context>
void Conv3DCudnnKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out) {
ConvCudnnKernel<T>(dev_ctx,
input,
filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search,
out);
}
} // namespace phi
#ifdef PADDLE_WITH_HIP
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
phi::dtype::float16) {}
#else
#if CUDNN_VERSION_MIN(8, 1, 0)
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#else
PD_REGISTER_KERNEL(conv2d,
GPUDNN,
ALL_LAYOUT,
phi::ConvCudnnKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(conv3d,
GPUDNN,
ALL_LAYOUT,
phi::Conv3DCudnnKernel,
float,
double,
phi::dtype::float16) {}
#endif
#endif
// TODO: register bfloat16
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#include "paddle/fluid/operators/conv_cudnn_helper.h"
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/funcs/padding.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search);
namespace phi {
static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) {
return dev_ctx.GetComputeCapability() >= 70;
}
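// Volta (compute capability 7.0) and newer GPUs provide Tensor Cores; the
// FP16 NHWC compute path in the cuDNN kernels is enabled only when this
// returns true.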
// inline cudnnTensorFormat_t GetCudnnTensorFormat(
// const phi::DataLayout& order) { // Not use
// switch (order) {
// case phi::DataLayout::kNHWC:
// return CUDNN_TENSOR_NHWC;
// case phi::DataLayout::kNCHW:
// return CUDNN_TENSOR_NCHW;
// case phi::DataLayout::NCDHW:
// return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same
// case phi::DataLayout::NDHWC:
// return CUDNN_TENSOR_NHWC; // add, liyamei
// default:
// PADDLE_THROW(phi::errors::Unimplemented(
// "CUDNN has no equivalent dataLayout for input order."));
// }
// return CUDNN_TENSOR_NCHW;
// }
static inline void GetNCDHW(const DDim& dims,
const phi::DataLayout& layout,
int* N,
int* C,
int* D,
int* H,
int* W) {
*N = dims[0];
*C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int i = layout == phi::DataLayout::kNCHW ? 0 : 1;
if (dims.size() == 5) {
*D = dims[2 - i];
*H = dims[3 - i];
*W = dims[4 - i];
} else {
*D = 1;
*H = dims[2 - i];
*W = dims[3 - i];
}
}
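// Illustrative example: for an NHWC tensor with dims {8, 32, 32, 16} this
// yields N = 8, C = 16, D = 1, H = 32, W = 32; a 5-D NDHWC tensor is handled
// the same way, with D taken from dims[1].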
} // namespace phi
// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double
// ) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void ConvGradGradKernel(const Context& dev_ctx,
paddle::optional<const DenseTensor&> input_grad_grad,
paddle::optional<const DenseTensor&> filter_grad_grad,
const DenseTensor& out_grad,
const DenseTensor& input,
const DenseTensor& filter,
const std::vector<int>& strides_t,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* out_grad_grad,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
const DenseTensor* X = &input;
const DenseTensor* dY = &out_grad;
const DenseTensor* ddX = input_grad_grad.get_ptr();
const DenseTensor* ddW_in = filter_grad_grad.get_ptr();
DenseTensor* ddY = out_grad_grad;
DenseTensor* dW = filter_grad;
DenseTensor* dX = input_grad;
DenseTensor W = filter;
if (!ddY && !dW && !dX) return;
const std::vector<int> strides = strides_t;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
// transform Tensor
DenseTensor transformed_X(X->type());
DenseTensor transformed_dY(dY->type());
DenseTensor transformed_ddX(X->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
TransToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
ResizeToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
TransToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
if (ddX) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
TransToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
}
} else {
transformed_X = *X;
transformed_dY = *dY;
if (ddX) {
transformed_ddX = *ddX;
}
}
// update padding and dilation
auto in_dims = transformed_X.dims();
auto filter_dims = W.dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_X.dims()[0]);
std::vector<int64_t> filter_shape_vec(vectorize(W.dims()));
std::vector<int64_t> output_shape_vec(vectorize(transformed_dY.dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
// col_shape [in_channel/group, kh, kw, oh, ow]
col_shape_vec[0] = transformed_X.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// col_matrix_shape [in_channel/group * kh * kw, oh * ow]
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
// input_shape [Cin, H, W]
DDim input_shape =
slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
// filter_matrix_shape [Cout, Cin * kh * kw]
DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]};
W.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_dY.dims()[1],
transformed_dY.numel() /
(transformed_dY.dims()[0] * transformed_dY.dims()[1])};
int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
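  // When is_expand is false (which is expected only for a 1x1 filter with unit
  // strides/dilations and zero paddings), im2col reduces to the identity, so
  // col/col_matrix below simply share data with the corresponding slices
  // instead of being materialized.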
DenseTensor col;
DenseTensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>(dev_ctx.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<Context, T> set_zero;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
// dx convolution double grad: gemm + col2im(col2vol)
// dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
// oH, oW)
if (dX && ddW_in) {
Tensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
dX->mutable_data<T>(dev_ctx.GetPlace());
DenseTensor transformed_dX(dX->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, dX, &transformed_dX);
} else {
transformed_dX = *dX;
}
// if is_expand is false, the operation of set_zero is unnecessary
// because math::matmul will reset dx
if (is_expand) {
set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
}
paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
paddle::operators::math::
Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
col2im;
for (int i = 0; i < batch_size; i++) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(dx_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(
ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx,
col,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&dx_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_dX, dX);
}
}
// dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
// oH, oW)
// dw convolution double grad: im2col(vol2col) + gemm
if (dW && ddX) {
dW->mutable_data<T>(dev_ctx.GetPlace());
set_zero(dev_ctx, dW, static_cast<T>(0));
DenseTensor dW_arr = *dW;
dW_arr.Resize(filter_matrix_shape);
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor dy_batch =
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; ++g) {
// im2col
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
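        // MatMul is called with beta = 1.0 so dw_slice accumulates
        // contributions across the batch (dW was zero-initialized above).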
blas.MatMul(
dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0));
}
}
}
// ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
// w/ddw(Cout, Cin, kh, kw)
// ddy convolution double grad: im2col(vol2col) + gemm
if (ddY) {
ddY->mutable_data<T>(dev_ctx.GetPlace());
DenseTensor transformed_ddY(ddY->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, ddY, &transformed_ddY);
} else {
transformed_ddY = *ddY;
}
set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; ++i) {
DenseTensor ddy_batch =
transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; ++g) {
// gemm
DenseTensor ddy_slice =
ddy_batch.Slice(g * out_step, (g + 1) * out_step);
if (ddX) {
DenseTensor ddx_batch =
transformed_ddX.Slice(i, i + 1).Resize(input_shape);
DenseTensor ddx_slice =
ddx_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(ddx_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
ddx_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
}
DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0));
}
if (ddW_in) {
DenseTensor x_batch =
transformed_X.Slice(i, i + 1).Resize(input_shape);
DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
DenseTensor ddW;
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
if (!is_expand) {
col.ShareDataWith(x_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
x_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
}
// gemm
DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0));
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_ddY, ddY);
}
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void ConvGradKernel(const Context& dev_ctx,
const DenseTensor& output_grad,
const DenseTensor& input,
const DenseTensor& filter_t,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* input_grad,
DenseTensor* filter_grad) {
// The filter and filter_grad will be reshaped in the calculations,
  // so an assignment is used here,
  // which avoids modifying the variable in the Scope.
if (!input_grad && !filter_grad) return;
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
DenseTensor filter = filter_t;
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
DenseTensor transformed_input(input.type());
DenseTensor transformed_output_grad(output_grad.type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
ResizeToChannelFirst<Context, T>(
dev_ctx, &output_grad, &transformed_output_grad);
TransToChannelFirst<Context, T>(
dev_ctx, &output_grad, &transformed_output_grad);
} else {
transformed_input = input;
transformed_output_grad = output_grad;
}
// update padding and dilation
auto in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation<int>(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
// filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(vectorize(filter.dims()));
// output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(
vectorize(transformed_output_grad.dims()));
// use col_shape in the im2col calculation
// col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
// o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = transformed_input.dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size: (i_c/g * k_h * k_w, o_h * o_w)
// or
// (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
DDim input_shape =
slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_output_grad.dims()[1],
transformed_output_grad.numel() / (transformed_output_grad.dims()[0] *
transformed_output_grad.dims()[1])};
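  // Illustrative shapes: for out_grad of shape {N, o_c, o_h, o_w}, each
  // per-sample slice is viewed as an (o_c, o_h * o_w) matrix in the GEMM
  // calls below.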
// convolution backward input operator: gemm + col2im(or col2vol)
// convolution backward weight operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
DenseTensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DenseTensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>(dev_ctx.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
phi::funcs::SetConstant<Context, T> set_zero;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
if (input_grad) {
input_grad->mutable_data<T>(dev_ctx.GetPlace());
DenseTensor transformed_input_grad(input_grad->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(
dev_ctx, input_grad, &transformed_input_grad);
} else {
transformed_input_grad = *input_grad;
}
// if is_expand is false, the operation of set_zero is unnecessary,
// because math::matmul will reset input_grad.
if (is_expand) {
set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
}
paddle::operators::math::Col2VolFunctor<Context, T> col2vol;
paddle::operators::math::
Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
col2im;
for (int i = 0; i < batch_size; i++) {
DenseTensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor in_grad_batch =
transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
DenseTensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor filter_slice =
filter.Slice(g * out_step, (g + 1) * out_step);
DenseTensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col_matrix.ShareDataWith(in_grad_slice);
col_matrix.Resize(col_matrix_shape);
}
blas.MatMul(filter_slice,
true,
out_grad_slice,
false,
T(1.0),
&col_matrix,
T(0.0));
if (is_expand && data_dim == 2U) {
col2im(dev_ctx,
col,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&in_grad_slice);
} else if (is_expand && data_dim == 3U) {
col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
}
}
}
if (channel_last) {
TransToChannelLast<Context, T>(
dev_ctx, &transformed_input_grad, input_grad);
}
}
if (filter_grad) {
filter_grad->mutable_data<T>(dev_ctx.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
for (int i = 0; i < batch_size; i++) {
DenseTensor out_grad_batch =
transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
DenseTensor in_batch =
transformed_input.Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
DenseTensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
in_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
DenseTensor filter_grad_slice =
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(out_grad_slice,
false,
col_matrix,
true,
T(1.0),
&filter_grad_slice,
T(1.0));
}
}
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
template <typename T, typename Context>
void ConvKernel(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& filter_t,
const std::vector<int>& strides,
const std::vector<int>& paddings_t,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_t,
const std::string& data_format,
bool use_addto,
int workspace_size_MB,
bool exhaustive_search,
DenseTensor* output) {
std::vector<int> paddings = paddings_t;
std::vector<int> dilations = dilations_t;
DenseTensor filter = filter_t;
// The filter will be reshaped in the calculations,
  // so an assignment is used here,
  // which avoids modifying the variable in the Scope.
output->mutable_data<T>(dev_ctx.GetPlace());
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
DenseTensor transformed_input(input.type());
DenseTensor transformed_output(output->type());
if (channel_last) {
ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
ResizeToChannelFirst<Context, T>(dev_ctx, output, &transformed_output);
} else {
transformed_input = input;
transformed_output = *output;
}
// update padding and dilation
auto trans_in_dims = transformed_input.dims();
auto filter_dims = filter.dims();
DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size());
DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
const int batch_size = static_cast<int>(transformed_input.dims()[0]);
// filter_shape_vec:
// {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
std::vector<int64_t> filter_shape_vec(vectorize(filter.dims()));
// output_shape_vec:
// {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
std::vector<int64_t> output_shape_vec(vectorize(transformed_output.dims()));
// use col_shape in the im2col calculation
// col_shape_vec:
// {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w,
// o_d,o_h, o_w}
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = trans_in_dims[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
DDim col_shape(make_ddim(col_shape_vec));
// use col_matrix_shape in the gemm calculation
// size:
// (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
// o_w)
  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
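  // Illustrative example (numbers not taken from this diff): a conv2d with
  // i_c = 8, groups = 2, a 3x3 filter and 32x32 output gives
  // col_shape = {4, 3, 3, 32, 32} and col_matrix_shape = {36, 1024}.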
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
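  // is_expand is false only for a 1x1(x1) filter with unit strides, zero
  // paddings and unit dilations; in that case im2col is an identity mapping,
  // so the input slice is reused directly (see the ShareDataWith branch below).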
DenseTensor col;
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DenseTensor col_matrix;
if (is_expand) {
// col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
col.Resize(col_shape);
col.mutable_data<T>(dev_ctx.GetPlace());
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
DDim in_matrix_shape =
slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DDim output_matrix_shape = {
transformed_output.dims()[1],
transformed_output.numel() /
(transformed_output.dims()[0] * transformed_output.dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
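  // Grouped convolution: input and output channels are split into `groups`
  // equal slices and each group is convolved independently in the loops below.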
paddle::operators::math::Vol2ColFunctor<Context, T> vol2col;
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
im2col;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
for (int i = 0; i < batch_size; i++) {
DenseTensor in_batch =
transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
DenseTensor out_batch =
transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
im2col(dev_ctx,
in_slice,
dilations,
strides,
std::vector<int>{
paddings[0], paddings[2], paddings[1], paddings[3]},
&col);
} else if (data_dim == 3U) {
vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
}
// gemm
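      // Forward GEMM for this group: Out_g = W_g * col_matrix, i.e.
      // (o_c/g, i_c/g*k_h*k_w) x (i_c/g*k_h*k_w, o_h*o_w) -> (o_c/g, o_h*o_w),
      // written with beta = T(0.0) (no accumulation into the output slice).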
DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
blas.MatMul(
filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0));
}
}
if (channel_last) {
TransToChannelLast<Context, T>(dev_ctx, &transformed_output, output);
}
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature Conv2dOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d",
{"Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"Output"});
}
KernelSignature Conv2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d_grad",
{GradVarName("Output"), "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature Conv2dDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv2d_grad_grad",
{"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"DDOutput", "DInput", "DFilter"});
}
} // namespace phi
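// Register the mappings above so the dispatch layer can translate the fluid
// conv2d op's Input/Filter/Output names and attributes into the corresponding
// phi kernel signatures.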
PD_REGISTER_ARG_MAPPING_FN(conv2d, phi::Conv2dOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv2d_grad, phi::Conv2dGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv2d_grad_grad,
phi::Conv2dDoubleGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("conv3d",
{"Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"Output"});
}
KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("conv3d_grad",
{GradVarName("Output"), "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature Conv3dDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("conv3d_grad_grad",
{"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search"},
{"DDOutput", "DInput", "DFilter"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(conv3d, phi::Conv3dOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv3d_grad, phi::Conv3dGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(conv3d_grad_grad,
phi::Conv3dDoubleGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature DepthwiseConv2dOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d",
{"Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search",
"fuse_relu_before_depthwise_conv"},
{"Output"});
}
KernelSignature DepthwiseConv2dGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d_grad",
{GradVarName("Output"), "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search",
"fuse_relu_before_depthwise_conv"},
{GradVarName("Input"), GradVarName("Filter")});
}
KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("depthwise_conv2d_grad_grad",
{"DDInput", "DDFilter", "DOutput", "Input", "Filter"},
{"strides",
"paddings",
"padding_algorithm",
"groups",
"dilations",
"data_format",
"use_addto",
"workspace_size_MB",
"exhaustive_search",
"fuse_relu_before_depthwise_conv"},
{"DDOutput", "DInput", "DFilter"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d,
phi::DepthwiseConv2dOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad,
phi::DepthwiseConv2dGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(depthwise_conv2d_grad_grad,
phi::DepthwiseConv2dDoubleGradOpArgumentMapping);
@@ -230,4 +230,5 @@ def load_tests(loader, standard_tests, pattern):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -18,6 +18,7 @@ import paddle.fluid.dygraph as dg
 import paddle.nn.functional as F
 import paddle.fluid.initializer as I
 import unittest
+import paddle
 def _reverse_repeat_list(t, n):
@@ -284,4 +285,5 @@ def load_tests(loader, standard_tests, pattern):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -604,7 +604,7 @@ class TestWithInput1x1Filter1x1(TestConv2DOp):
         self.groups = 3
-#----------------Conv2DCUDNN----------------
+# #----------------Conv2DCUDNN----------------
 create_test_cudnn_class(TestConv2DOp)
 create_test_cudnn_class(TestWithPad)
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle
 def conv3d_forward_naive(input,
@@ -1001,4 +1002,5 @@ class TestConv3DAPI_Error(unittest.TestCase):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
@@ -44,7 +45,6 @@ class TestConvDoubleGradCheck(unittest.TestCase):
     def test_grad(self):
         places = [fluid.CPUPlace()]
-        places = []
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
@@ -120,7 +120,8 @@ class TestConv3DDoubleGradCheck(unittest.TestCase):
             [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
     def test_grad(self):
-        places = [fluid.CPUPlace()]
+        #places = [fluid.CPUPlace()]
+        places = []
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
@@ -503,4 +504,5 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase):
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
@@ -534,4 +534,5 @@ class TestFunctionalConv2DErrorCase13(TestFunctionalConv2DErrorCase12):
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
@@ -509,4 +509,5 @@ class TestFunctionalConv3DErrorCase12(TestFunctionalConv3DErrorCase11):
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
@@ -117,4 +117,5 @@ class TestMNIST(TestParallelExecutorBase):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
@@ -617,4 +617,5 @@ class TestStarGANWithGradientPenalty(unittest.TestCase):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()