From d7dddf9486bf7e39456be44b5a747db16644e818 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Wed, 9 Feb 2022 13:43:33 +0800
Subject: [PATCH] Move trace op to pten (#39227)

* add trace op

* bug fix

* bug fix; test=develop

* thrust bug fix; test=develop

* remove useless register; test=develop

* fix bug; test=develop

* update trace kernel; test=develop

* move kernel args to trace_sig; test=develop
---
 paddle/fluid/operators/trace_op.cc            |  20 +-
 paddle/fluid/operators/trace_op.cu            |  77 -------
 paddle/pten/kernels/cpu/trace_grad_kernel.cc  |  31 +++
 paddle/pten/kernels/cpu/trace_kernel.cc       |  58 ++++++
 paddle/pten/kernels/gpu/trace_grad_kernel.cu  |  31 +++
 paddle/pten/kernels/gpu/trace_kernel.cu       |  57 ++++++
 .../kernels/impl/trace_kernel_impl.h}         | 191 ++++++++----------
 paddle/pten/kernels/trace_grad_kernel.h       |  30 +++
 paddle/pten/kernels/trace_kernel.h            |  29 +++
 paddle/pten/ops/compat/trace_sig.cc           |  34 ++++
 .../fluid/tests/unittests/test_trace_op.py    |   2 +
 11 files changed, 360 insertions(+), 200 deletions(-)
 delete mode 100644 paddle/fluid/operators/trace_op.cu
 create mode 100644 paddle/pten/kernels/cpu/trace_grad_kernel.cc
 create mode 100644 paddle/pten/kernels/cpu/trace_kernel.cc
 create mode 100644 paddle/pten/kernels/gpu/trace_grad_kernel.cu
 create mode 100644 paddle/pten/kernels/gpu/trace_kernel.cu
 rename paddle/{fluid/operators/trace_op.h => pten/kernels/impl/trace_kernel_impl.h} (50%)
 create mode 100644 paddle/pten/kernels/trace_grad_kernel.h
 create mode 100644 paddle/pten/kernels/trace_kernel.h
 create mode 100644 paddle/pten/ops/compat/trace_sig.cc
diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc
index de71a089b69..aabad64c894 100644
--- a/paddle/fluid/operators/trace_op.cc
+++ b/paddle/fluid/operators/trace_op.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/trace_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
@@ -161,24 +161,6 @@ REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker,
 
 REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad,
                   ops::TraceGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    trace, ops::TraceKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TraceKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TraceKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TraceKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TraceKernel<paddle::platform::CPUDeviceContext,
-                     paddle::platform::complex<float>>,
-    ops::TraceKernel<paddle::platform::CPUDeviceContext,
-                     paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    trace_grad, ops::TraceGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::TraceGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::TraceGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::TraceGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::TraceGradKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<float>>,
-    ops::TraceGradKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<double>>);
 
 /* ==========================  register checkpoint ===========================*/
 REGISTER_OP_VERSION(trace)
diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu
deleted file mode 100644
index 3d8a60dd65f..00000000000
--- a/paddle/fluid/operators/trace_op.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/operators/trace_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class TraceCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<framework::Tensor>("Input");
-    auto* out = context.Output<framework::Tensor>("Out");
-
-    const int64_t offset = context.Attr<int>("offset");
-    const int64_t dim1 = context.Attr<int>("axis1");
-    const int64_t dim2 = context.Attr<int>("axis2");
-
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const framework::Tensor diag =
-        Diagonal<DeviceContext, T>(context, input, offset, dim1, dim2);
-    if (diag.numel() > 0) {
-      auto stream = context.cuda_device_context().stream();
-      std::vector<int> reduce_dims;
-      reduce_dims.push_back(out->dims().size());
-      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
-          reduce_dims, stream);
-    } else {
-      math::SetConstant<DeviceContext, T> functor;
-      functor(context.device_context<DeviceContext>(), out, static_cast<T>(0));
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace platform = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    trace, ops::TraceCUDAKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TraceCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::TraceCUDAKernel<paddle::platform::CUDADeviceContext,
-                         platform::float16>,
-    ops::TraceCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TraceCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TraceCUDAKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<float>>,
-    ops::TraceCUDAKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    trace_grad, ops::TraceGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TraceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::TraceGradKernel<paddle::platform::CUDADeviceContext,
-                         platform::float16>,
-    ops::TraceGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TraceGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TraceGradKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<float>>,
-    ops::TraceGradKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<double>>);
diff --git a/paddle/pten/kernels/cpu/trace_grad_kernel.cc b/paddle/pten/kernels/cpu/trace_grad_kernel.cc
new file mode 100644
index 00000000000..136b941ea8b
--- /dev/null
+++ b/paddle/pten/kernels/cpu/trace_grad_kernel.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/trace_grad_kernel.h"
+#include "paddle/pten/kernels/impl/trace_kernel_impl.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_REGISTER_KERNEL(trace_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::TraceGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   paddle::platform::float16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
diff --git a/paddle/pten/kernels/cpu/trace_kernel.cc b/paddle/pten/kernels/cpu/trace_kernel.cc
new file mode 100644
index 00000000000..4064b752ef4
--- /dev/null
+++ b/paddle/pten/kernels/cpu/trace_kernel.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/trace_kernel.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/impl/trace_kernel_impl.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void TraceKernel(const Context& ctx,
+                 const DenseTensor& x,
+                 int offset,
+                 int axis1,
+                 int axis2,
+                 DenseTensor* out) {
+  auto output_dims = out->dims();
+
+  T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+  const DenseTensor diag = Diagonal<T, Context>(ctx, &x, offset, axis1, axis2);
+  if (diag.numel() > 0) {
+    auto x = paddle::framework::EigenMatrix<T>::Reshape(diag,
+                                                        diag.dims().size() - 1);
+    auto output = paddle::framework::EigenVector<T>::Flatten(*out);
+    auto reduce_dim = Eigen::array<int, 1>({1});
+    output.device(*ctx.eigen_device()) = x.sum(reduce_dim);
+    out->Resize(output_dims);
+  } else {
+    std::fill(out_data, out_data + out->numel(), static_cast<T>(0));
+  }
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(trace,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::TraceKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   paddle::platform::float16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
diff --git a/paddle/pten/kernels/gpu/trace_grad_kernel.cu b/paddle/pten/kernels/gpu/trace_grad_kernel.cu
new file mode 100644
index 00000000000..b1b22e5ce54
--- /dev/null
+++ b/paddle/pten/kernels/gpu/trace_grad_kernel.cu
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/impl/trace_kernel_impl.h"
+#include "paddle/pten/kernels/trace_grad_kernel.h"
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+PT_REGISTER_KERNEL(trace_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::TraceGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   paddle::platform::float16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
diff --git a/paddle/pten/kernels/gpu/trace_kernel.cu b/paddle/pten/kernels/gpu/trace_kernel.cu
new file mode 100644
index 00000000000..b0bfb3a8a51
--- /dev/null
+++ b/paddle/pten/kernels/gpu/trace_kernel.cu
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/gpu/reduce.h"
+#include "paddle/pten/kernels/impl/trace_kernel_impl.h"
+#include "paddle/pten/kernels/trace_kernel.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void TraceKernel(const Context& ctx,
+                 const DenseTensor& x,
+                 int offset,
+                 int axis1,
+                 int axis2,
+                 DenseTensor* out) {
+  T* out_data = out->mutable_data<T>(ctx.GetPlace());
+  auto diag = Diagonal<T, Context>(ctx, &x, offset, axis1, axis2);
+  if (diag.numel() > 0) {
+    auto stream = ctx.stream();
+    std::vector<int> reduce_dims;
+    reduce_dims.push_back(out->dims().size());
+    kernels::
+        TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+            ctx, diag, out, kps::IdentityFunctor<T>(), reduce_dims, stream);
+  } else {
+    paddle::operators::math::SetConstant<Context, T> functor;
+    functor(ctx, out, static_cast<T>(0));
+  }
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(trace,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::TraceKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   paddle::platform::float16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
diff --git a/paddle/fluid/operators/trace_op.h b/paddle/pten/kernels/impl/trace_kernel_impl.h
similarity index 50%
rename from paddle/fluid/operators/trace_op.h
rename to paddle/pten/kernels/impl/trace_kernel_impl.h
index ca9439cbed9..4dbba9bc69e 100644
--- a/paddle/fluid/operators/trace_op.h
+++ b/paddle/pten/kernels/impl/trace_kernel_impl.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,20 +13,26 @@
 // limitations under the License.
 
 #pragma once
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#endif
+
 #include <algorithm>
-#include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
 
-namespace paddle {
-namespace operators {
-
+namespace pten {
 template <typename T>
 struct DiagonalFunctor {
-  DiagonalFunctor(const T* input, const int64_t* diag_stride,
-                  const int64_t* ret_strides, int64_t pos, int64_t dim_size,
+  DiagonalFunctor(const T* input,
+                  const int64_t* diag_stride,
+                  const int64_t* ret_strides,
+                  int64_t pos,
+                  int64_t dim_size,
                   T* diag)
       : input_(input),
         diag_stride_(diag_stride),
@@ -55,9 +61,15 @@ struct DiagonalFunctor {
 
 template <typename T>
 struct TraceGradFunctor {
-  TraceGradFunctor(const T* d_out, const int64_t* out_stride,
-                   const int64_t* x_strides, int64_t pos, int64_t dim_size,
-                   int64_t dim1, int64_t dim2, int64_t diag_size, T* d_x)
+  TraceGradFunctor(const T* d_out,
+                   const int64_t* out_stride,
+                   const int64_t* x_strides,
+                   int64_t pos,
+                   int64_t dim_size,
+                   int64_t dim1,
+                   int64_t dim2,
+                   int64_t diag_size,
+                   T* d_x)
       : d_out_(d_out),
         out_stride_(out_stride),
         x_strides_(x_strides),
@@ -101,10 +113,12 @@ struct TraceGradFunctor {
   T* d_x_;
 };
 
-template <typename DeviceContext, typename T>
-framework::Tensor Diagonal(const framework::ExecutionContext& context,
-                           const framework::Tensor* input, int64_t offset,
-                           int64_t dim1, int64_t dim2) {
+template <typename T, typename DeviceContext>
+DenseTensor Diagonal(const DeviceContext& context,
+                     const DenseTensor* input,
+                     int64_t offset,
+                     int64_t dim1,
+                     int64_t dim2) {
   auto* input_data = input->data<T>();
   auto input_dims = input->dims();
   auto input_stride = framework::stride(input_dims);
@@ -138,7 +152,7 @@ framework::Tensor Diagonal(const framework::ExecutionContext& context,
     }
     ret_strides.push_back(stride1 + stride2);
     ret_dims.push_back(diag_size);
-    framework::Tensor diag;
+    DenseTensor diag;
     framework::DDim diag_dims = framework::make_ddim(ret_dims);
     auto dig_stride = framework::stride(diag_dims);
     auto diag_data = diag.mutable_data<T>(diag_dims, context.GetPlace());
@@ -155,10 +169,10 @@ framework::Tensor Diagonal(const framework::ExecutionContext& context,
     const auto* ret_arr = ret_strides.data();
 #endif
 
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    platform::ForRange<DeviceContext> for_range(dev_ctx, diag.numel());
-    DiagonalFunctor<T> functor(input_data, diag_arr, ret_arr, pos, dim_size,
-                               diag_data);
+    // auto& dev_ctx = context.template device_context<DeviceContext>();
+    paddle::platform::ForRange<DeviceContext> for_range(context, diag.numel());
+    DiagonalFunctor<T> functor(
+        input_data, diag_arr, ret_arr, pos, dim_size, diag_data);
     for_range(functor);
     return diag;
   } else {
@@ -166,99 +180,68 @@ framework::Tensor Diagonal(const framework::ExecutionContext& context,
   }
 }
 
-template <typename DeviceContext, typename T>
-class TraceKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<framework::Tensor>("Input");
-    auto* out = context.Output<framework::Tensor>("Out");
+template <typename T, typename Context>
+void TraceGradKernel(const Context& ctx,
+                     const DenseTensor& out_grad,
+                     const DenseTensor& x,
+                     int offset,
+                     int axis1,
+                     int axis2,
+                     DenseTensor* in_grad) {
+  auto input_dims = in_grad->dims();
+  auto input_stride = framework::stride(input_dims);
+  auto output_dims = out_grad.dims();
+  auto output_stride = framework::stride(output_dims);
 
-    const int64_t offset = context.Attr<int>("offset");
-    const int64_t dim1 = context.Attr<int>("axis1");
-    const int64_t dim2 = context.Attr<int>("axis2");
+  auto* out_data = out_grad.data<T>();
+  T* x_data = in_grad->mutable_data<T>(ctx.GetPlace());
 
-    auto output_dims = out->dims();
+  paddle::operators::math::SetConstant<Context, T> set_zero;
 
-    T* out_data = out->mutable_data<T>(context.GetPlace());
+  set_zero(ctx, in_grad, static_cast<T>(0.0));
+  auto dim1 = axis1;
+  auto dim2 = axis2;
+  auto dim1_ = dim1 < 0 ? input_dims.size() + dim1 : dim1;
+  auto dim2_ = dim2 < 0 ? input_dims.size() + dim2 : dim2;
+  auto len1 = input_dims[std::min(dim1_, dim2_)];
+  auto len2 = input_dims[std::max(dim1_, dim2_)];
+  auto stride1 = input_stride[std::min(dim1_, dim2_)];
+  auto stride2 = input_stride[std::max(dim1_, dim2_)];
 
-    const framework::Tensor diag =
-        Diagonal<DeviceContext, T>(context, input, offset, dim1, dim2);
-    if (diag.numel() > 0) {
-      auto x = framework::EigenMatrix<T>::Reshape(diag, diag.dims().size() - 1);
-      auto output = framework::EigenVector<T>::Flatten(*out);
-      auto& place =
-          *context.template device_context<DeviceContext>().eigen_device();
-      auto reduce_dim = Eigen::array<int, 1>({1});
-      output.device(place) = x.sum(reduce_dim);
-      out->Resize(output_dims);
-    } else {
-      std::fill(out_data, out_data + out->numel(), static_cast<T>(0));
-    }
+  int offset_stride = 0;
+  if (offset >= 0) {
+    offset_stride = stride2;
+    len2 -= offset;
+  } else {
+    offset_stride = stride1;
+    len1 += offset;
   }
-};
-
-template <typename DeviceContext, typename T>
-class TraceGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const auto* d_out =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_x =
-        context.Output<framework::Tensor>(framework::GradVarName("Input"));
-
-    int64_t offset = context.Attr<int>("offset");
-    int64_t dim1 = context.Attr<int>("axis1");
-    int64_t dim2 = context.Attr<int>("axis2");
-
-    auto input_dims = d_x->dims();
-    auto input_stride = framework::stride(input_dims);
-    auto output_dims = d_out->dims();
-    auto output_stride = framework::stride(output_dims);
-
-    auto* out_data = d_out->data<T>();
-    T* x_data = d_x->mutable_data<T>(context.GetPlace());
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    set_zero(dev_ctx, d_x, static_cast<T>(0.0));
-
-    auto dim1_ = dim1 < 0 ? input_dims.size() + dim1 : dim1;
-    auto dim2_ = dim2 < 0 ? input_dims.size() + dim2 : dim2;
-    auto len1 = input_dims[std::min(dim1_, dim2_)];
-    auto len2 = input_dims[std::max(dim1_, dim2_)];
-    auto stride1 = input_stride[std::min(dim1_, dim2_)];
-    auto stride2 = input_stride[std::max(dim1_, dim2_)];
-
-    int offset_stride = 0;
-    if (offset >= 0) {
-      offset_stride = stride2;
-      len2 -= offset;
-    } else {
-      offset_stride = stride1;
-      len1 += offset;
-    }
-    int64_t diag_size = len2 < len1 ? len2 : len1;
-    int64_t pos = std::abs(offset) * offset_stride;
-    if (diag_size > 0) {
+  int64_t diag_size = len2 < len1 ? len2 : len1;
+  int64_t pos = std::abs(offset) * offset_stride;
+  if (diag_size > 0) {
 #if defined(__NVCC__) || defined(__HIPCC__)
-      thrust::device_vector<int64_t> output_vec(vectorize(output_stride));
-      const int64_t* output_arr = thrust::raw_pointer_cast(output_vec.data());
-      thrust::device_vector<int64_t> input_vec(vectorize(input_stride));
-      const int64_t* input_arr = thrust::raw_pointer_cast(input_vec.data());
+    thrust::device_vector<int64_t> output_vec(vectorize(output_stride));
+    const int64_t* output_arr = thrust::raw_pointer_cast(output_vec.data());
+    thrust::device_vector<int64_t> input_vec(vectorize(input_stride));
+    const int64_t* input_arr = thrust::raw_pointer_cast(input_vec.data());
 
 #else
-      const auto* output_arr = output_stride.Get();
-      const auto* input_arr = input_stride.Get();
+    const auto* output_arr = output_stride.Get();
+    const auto* input_arr = input_stride.Get();
 #endif
 
-      platform::ForRange<DeviceContext> for_range(dev_ctx, d_x->numel());
-      TraceGradFunctor<T> functor(out_data, output_arr, input_arr, pos,
-                                  input_dims.size(), dim1_, dim2_, diag_size,
-                                  x_data);
-      for_range(functor);
-    }
+    paddle::platform::ForRange<Context> for_range(ctx, in_grad->numel());
+    TraceGradFunctor<T> functor(out_data,
+                                output_arr,
+                                input_arr,
+                                pos,
+                                input_dims.size(),
+                                dim1_,
+                                dim2_,
+                                diag_size,
+                                x_data);
+    for_range(functor);
   }
-};
+}
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/pten/kernels/trace_grad_kernel.h b/paddle/pten/kernels/trace_grad_kernel.h
new file mode 100644
index 00000000000..9dad4e6ac90
--- /dev/null
+++ b/paddle/pten/kernels/trace_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void TraceGradKernel(const Context& ctx,
+                     const DenseTensor& out_grad,
+                     const DenseTensor& x,
+                     int offset,
+                     int axis1,
+                     int axis2,
+                     DenseTensor* in_grad);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/trace_kernel.h b/paddle/pten/kernels/trace_kernel.h
new file mode 100644
index 00000000000..7a68e32508a
--- /dev/null
+++ b/paddle/pten/kernels/trace_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void TraceKernel(const Context& ctx,
+                 const DenseTensor& x,
+                 int offset,
+                 int axis1,
+                 int axis2,
+                 DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/ops/compat/trace_sig.cc b/paddle/pten/ops/compat/trace_sig.cc
new file mode 100644
index 00000000000..a36beaaee7a
--- /dev/null
+++ b/paddle/pten/ops/compat/trace_sig.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/core/compat/op_utils.h"
+
+namespace pten {
+
+KernelSignature TraceOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "trace", {"Input"}, {"offset", "axis1", "axis2"}, {"Out"});
+}
+
+KernelSignature TraceGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("trace_grad",
+                         {GradVarName("Out"), "Input"},
+                         {"offset", "axis1", "axis2"},
+                         {GradVarName("Input")});
+}
+
+}  // namespace pten
+
+PT_REGISTER_ARG_MAPPING_FN(trace, pten::TraceOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(trace_grad, pten::TraceGradOpArgumentMapping);
diff --git a/python/paddle/fluid/tests/unittests/test_trace_op.py b/python/paddle/fluid/tests/unittests/test_trace_op.py
index 7441ff24329..3320b240e56 100644
--- a/python/paddle/fluid/tests/unittests/test_trace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trace_op.py
@@ -21,6 +21,7 @@ import paddle.nn.functional as F
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.tensor as tensor
+import paddle
 
 
 class TestTraceOp(OpTest):
@@ -86,4 +87,5 @@ class TestTraceAPICase(unittest.TestCase):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
-- 
GitLab