机器未来 / Paddle — forked from PaddlePaddle / Paddle
Commit 5c66338f (unverified)

[pten] trans diagonal kernel into pten (#39575)

* trans diagonal kernel into pten
* fix by code review

Authored by xiongkun on Feb 18, 2022; committed by GitHub on Feb 18, 2022.
Parent commit: 7d6d3848
Showing 11 changed files with 687 additions and 446 deletions (+687 -446).
paddle/fluid/operators/diagonal_op.cc              +2    -10
paddle/fluid/operators/diagonal_op.cu              +0    -273
paddle/fluid/operators/diagonal_op.h               +0    -163
paddle/pten/kernels/cpu/diagonal_grad_kernel.cc    +92   -0
paddle/pten/kernels/cpu/diagonal_kernel.cc         +90   -0
paddle/pten/kernels/diagonal_grad_kernel.h         +29   -0
paddle/pten/kernels/diagonal_kernel.h              +28   -0
paddle/pten/kernels/funcs/diagonal.h               +85   -0
paddle/pten/kernels/gpu/diagonal_grad_kernel.cu    +168  -0
paddle/pten/kernels/gpu/diagonal_kernel.cu         +165  -0
paddle/pten/ops/compat/diagonal_sig.cc             +28   -0
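Every kernel touched by this commit implements the same NumPy-style diagonal rule: an element lies on the selected diagonal when its coordinate along axis2 equals its coordinate along axis1 plus offset. For orientation, here is a self-contained 2-D reference in plain C++ (an editor's illustrative sketch, not Paddle API; the Diagonal2D helper is hypothetical):

#include <cstdio>
#include <vector>

// Gathers x[i, i + offset] for every valid row i of a rows x cols matrix --
// the same membership test the kernels below apply element by element.
std::vector<float> Diagonal2D(const std::vector<float>& x, int rows, int cols,
                              int offset) {
  std::vector<float> out;
  for (int i = 0; i < rows; ++i) {
    int j = i + offset;
    if (j >= 0 && j < cols) out.push_back(x[i * cols + j]);
  }
  return out;
}

int main() {
  std::vector<float> x = {0, 1, 2,  3,  4,  5,
                          6, 7, 8,  9, 10, 11};  // 3 x 4, row-major
  for (float v : Diagonal2D(x, 3, 4, 1)) std::printf("%g ", v);  // 1 6 11
  std::printf("\n");
}

The kernels in the diff below generalize this to ranks 2 through 9 by decomposing each linear element index into per-axis coordinates and testing the axis1/axis2 pair.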
paddle/fluid/operators/diagonal_op.cc  (+2 -10)

@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/diagonal_op.h"
+#include "paddle/fluid/framework/op_registry.h"

 namespace paddle {
 namespace operators {

@@ -169,18 +169,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer,
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker,
                   ops::DiagonalGradOpMaker<paddle::framework::OpDesc>,
                   ops::DiagonalGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(diagonal_grad, ops::DiagonalGradOp,
                   ops::DiagonalGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(diagonal, ops::DiagonalKernel<int>,
-                       ops::DiagonalKernel<int64_t>,
-                       ops::DiagonalKernel<float>,
-                       ops::DiagonalKernel<double>,
-                       ops::DiagonalKernel<bool>);
-REGISTER_OP_CPU_KERNEL(diagonal_grad, ops::DiagonalGradKernel<int>,
-                       ops::DiagonalGradKernel<int64_t>,
-                       ops::DiagonalGradKernel<float>,
-                       ops::DiagonalGradKernel<double>);
paddle/fluid/operators/diagonal_op.cu  (deleted, 100644 → 0, +0 -273)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/diagonal_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {

using platform::PADDLE_CUDA_NUM_THREADS;

template <typename T, int X_DIM_SIZE, int OUT_DIM_SIZE>
__global__ void Diagonal(const T* data1, T* data2, const int64_t offset_,
                         int64_t axis1_, int64_t axis2_, int64_t* x_stride,
                         int64_t* out_stride, int64_t numel, bool is_grad) {
  CUDA_KERNEL_LOOP(idx, numel) {
    int64_t idx_dim[X_DIM_SIZE] = {0};
    int64_t temp = 0;
    for (size_t i = 0; i < X_DIM_SIZE - 1; i++) {
      idx_dim[i] = (idx - temp) / x_stride[i];
      temp = temp + idx_dim[i] * x_stride[i];
    }
    idx_dim[X_DIM_SIZE - 1] = idx - temp;

    int64_t axis1_dim = idx_dim[axis1_];
    int64_t axis2_dim = idx_dim[axis2_];

    int64_t out_dim[OUT_DIM_SIZE] = {0};
    int temp_pos = 0;
    for (int i = 0; i < X_DIM_SIZE; i++) {
      if (i != axis1_ && i != axis2_) {
        out_dim[temp_pos] = idx_dim[i];
        temp_pos++;
      }
    }
    bool flag = false;
    if (offset_ == 0 && axis1_dim == axis2_dim) {
      out_dim[temp_pos] = axis1_dim;
      flag = true;
    } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
      out_dim[temp_pos] = axis1_dim;
      flag = true;
    } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
      out_dim[temp_pos] = axis2_dim;
      flag = true;
    }
    if (!is_grad) {
      if (flag) {
        int64_t idx_output = 0;
        for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) {
          idx_output = idx_output + out_dim[i] * out_stride[i];
        }
        idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1];
        data2[idx_output] = data1[idx];
      }
    } else {
      if (flag) {
        int64_t idx_output = 0;
        for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) {
          idx_output = idx_output + out_dim[i] * out_stride[i];
        }
        idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1];
        data2[idx] = data1[idx_output];
      } else {
        data2[idx] = static_cast<T>(0);
      }
    }
  }
}
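The loop at the top of this kernel recovers per-axis coordinates from a flat element index via repeated division by row-major strides. A host-side trace of exactly that arithmetic (standalone editor's sketch; the shape {2, 3, 4} is an assumed example):

#include <cstdint>
#include <cstdio>

int main() {
  // A {2, 3, 4} tensor has row-major strides {12, 4, 1}.
  const int64_t x_stride[3] = {12, 4, 1};
  int64_t idx = 17, idx_dim[3] = {0}, temp = 0;
  for (int i = 0; i < 3 - 1; i++) {  // same recurrence as in the kernel above
    idx_dim[i] = (idx - temp) / x_stride[i];
    temp = temp + idx_dim[i] * x_stride[i];
  }
  idx_dim[2] = idx - temp;  // the last axis always has stride 1
  // 17 = 1*12 + 1*4 + 1, so this prints "1 1 1".
  std::printf("%lld %lld %lld\n", (long long)idx_dim[0], (long long)idx_dim[1],
              (long long)idx_dim[2]);
}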
template <typename T>
class DiagonalCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<framework::Tensor>("Input");
    const auto* input_data = input->data<T>();
    auto input_dim = input->dims().Get();
    auto input_dim_size = input->dims().size();

    std::vector<int64_t> res_in = vectorize(framework::stride(input->dims()));
    paddle::framework::Tensor input_stride_tensor;
    framework::TensorFromVector<int64_t>(res_in, context.device_context(),
                                         &input_stride_tensor);
    int64_t* input_stride = input_stride_tensor.data<int64_t>();

    auto* output = context.Output<framework::Tensor>("Out");
    auto* output_data = output->mutable_data<T>(context.GetPlace());
    auto output_dim = output->dims().Get();
    auto output_dim_size = output->dims().size();

    std::vector<int64_t> res_out =
        vectorize(framework::stride(output->dims()));
    paddle::framework::Tensor output_stride_tensor;
    framework::TensorFromVector<int64_t>(res_out, context.device_context(),
                                         &output_stride_tensor);
    int64_t* output_stride = output_stride_tensor.data<int64_t>();

    const int64_t offset_ = context.Attr<int>("offset");
    const int64_t axis1 = context.Attr<int>("axis1");
    int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
    const int64_t axis2 = context.Attr<int>("axis2");
    int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2;
    int64_t numel = input->numel();

    int threads = PADDLE_CUDA_NUM_THREADS;
    int blocks = (numel + threads - 1) / threads;

    switch (input_dim_size) {
      case 2:
        Diagonal<T, 2, 1><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      case 3:
        Diagonal<T, 3, 2><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      case 4:
        Diagonal<T, 4, 3><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      case 5:
        Diagonal<T, 5, 4><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      case 6:
        Diagonal<T, 6, 5><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      case 7:
        Diagonal<T, 7, 6><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      case 8:
        Diagonal<T, 8, 7><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      case 9:
        Diagonal<T, 9, 8><<<blocks, threads>>>(
            input_data, output_data, offset_, axis1_, axis2_, input_stride,
            output_stride, numel, false);
        break;
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The rank of input should be less than 10, but received %d.",
            input_dim_size));
    }
  }
};

template <typename T>
class DiagonalGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const auto* dout =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    const auto* dout_data = dout->data<T>();
    auto dout_dim = dout->dims().Get();
    auto dout_dim_size = dout->dims().size();

    std::vector<int64_t> res_dout =
        vectorize(framework::stride(dout->dims()));
    paddle::framework::Tensor dout_stride_tensor;
    framework::TensorFromVector<int64_t>(res_dout, context.device_context(),
                                         &dout_stride_tensor);
    int64_t* dout_stride = dout_stride_tensor.data<int64_t>();

    auto* dx =
        context.Output<framework::Tensor>(framework::GradVarName("Input"));
    auto* dx_data = dx->mutable_data<T>(context.GetPlace());
    auto dx_dim = dx->dims().Get();
    auto dx_dim_size = dx->dims().size();

    std::vector<int64_t> res_dx = vectorize(framework::stride(dx->dims()));
    paddle::framework::Tensor dx_stride_tensor;
    framework::TensorFromVector<int64_t>(res_dx, context.device_context(),
                                         &dx_stride_tensor);
    int64_t* dx_stride = dx_stride_tensor.data<int64_t>();

    const int64_t offset_ = context.Attr<int>("offset");
    const int64_t axis1 = context.Attr<int>("axis1");
    int64_t axis1_ = axis1 < 0 ? dx_dim_size + axis1 : axis1;
    const int64_t axis2 = context.Attr<int>("axis2");
    int64_t axis2_ = axis2 < 0 ? dx_dim_size + axis2 : axis2;

    int64_t numel = dx->numel();

    int threads = PADDLE_CUDA_NUM_THREADS;
    int blocks = (numel + threads - 1) / threads;

    switch (dx_dim_size) {
      case 2:
        Diagonal<T, 2, 1><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      case 3:
        Diagonal<T, 3, 2><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      case 4:
        Diagonal<T, 4, 3><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      case 5:
        Diagonal<T, 5, 4><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      case 6:
        Diagonal<T, 6, 5><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      case 7:
        Diagonal<T, 7, 6><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      case 8:
        Diagonal<T, 8, 7><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      case 9:
        Diagonal<T, 9, 8><<<blocks, threads>>>(
            dout_data, dx_data, offset_, axis1_, axis2_, dx_stride,
            dout_stride, numel, true);
        break;
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The rank of output(input@Grad) should be less than 10, but "
            "received %d.",
            dx_dim_size));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_CUDA_KERNEL(
    diagonal, ops::DiagonalCUDAKernel<int>, ops::DiagonalCUDAKernel<int64_t>,
    ops::DiagonalCUDAKernel<float>, ops::DiagonalCUDAKernel<double>,
    ops::DiagonalCUDAKernel<plat::float16>, ops::DiagonalCUDAKernel<bool>);

REGISTER_OP_CUDA_KERNEL(diagonal_grad, ops::DiagonalGradCUDAKernel<int>,
                        ops::DiagonalGradCUDAKernel<int64_t>,
                        ops::DiagonalGradCUDAKernel<float>,
                        ops::DiagonalGradCUDAKernel<double>,
                        ops::DiagonalGradCUDAKernel<plat::float16>);
paddle/fluid/operators/diagonal_op.h  (deleted, 100644 → 0, +0 -163)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
std::vector<T> ComputeDimStride(const std::vector<T> dim) {
  size_t dim_size = dim.size();
  std::vector<T> dim_strides;
  dim_strides.resize(dim_size);
  for (size_t i = 0; i < dim_size - 1; i++) {
    size_t temp_stride = 1;
    for (size_t j = i + 1; j < dim_size; j++) {
      temp_stride = temp_stride * dim[j];
    }
    dim_strides[i] = temp_stride;
  }
  dim_strides[dim_size - 1] = 1;
  return dim_strides;
}
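ComputeDimStride yields row-major (C-order) strides: stride i is the product of all dimensions after i, and the last stride is 1. A compact standalone equivalent with a quick check (an editor's sketch, written under the assumption that it matches the function above):

#include <cassert>
#include <cstdint>
#include <vector>

// Same contract as ComputeDimStride above, computed right-to-left.
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& dim) {
  std::vector<int64_t> strides(dim.size(), 1);
  for (int i = static_cast<int>(dim.size()) - 2; i >= 0; --i)
    strides[i] = strides[i + 1] * dim[i + 1];
  return strides;
}

int main() {
  assert((RowMajorStrides({2, 3, 4}) == std::vector<int64_t>{12, 4, 1}));
  return 0;
}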
template <typename T>
class DiagonalKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<framework::Tensor>("Input");
    const T* input_data = input->data<T>();
    auto input_dim = vectorize(input->dims());
    auto input_dim_size = input_dim.size();

    auto* output = context.Output<framework::Tensor>("Out");
    T* output_data = output->mutable_data<T>(context.GetPlace());
    auto output_dim = vectorize(output->dims());

    const int64_t offset_ = context.Attr<int>("offset");
    const int64_t axis1 = context.Attr<int>("axis1");
    int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
    const int64_t axis2 = context.Attr<int>("axis2");
    int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2;

    std::vector<int64_t> input_stride = ComputeDimStride(input_dim);
    std::vector<int64_t> output_stride = ComputeDimStride(output_dim);

    int64_t numel = input->numel();

    for (int64_t idx = 0; idx < numel; idx++) {
      std::vector<int64_t> idx_dim(input_dim_size);
      int64_t temp = 0;
      for (size_t i = 0; i < input_dim_size; i++) {
        idx_dim[i] = (idx - temp) / input_stride[i];
        temp = temp + idx_dim[i] * input_stride[i];
      }

      int64_t axis1_dim = idx_dim[axis1_];
      int64_t axis2_dim = idx_dim[axis2_];

      idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_));
      idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));

      bool flag = false;
      if (offset_ == 0 && axis1_dim == axis2_dim) {
        idx_dim.push_back(axis1_dim);
        flag = true;
      } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
        idx_dim.push_back(axis1_dim);
        flag = true;
      } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
        idx_dim.push_back(axis2_dim);
        flag = true;
      }

      if (flag) {
        int64_t idx_output = 0;
        for (size_t i = 0; i < idx_dim.size(); i++) {
          idx_output = idx_output + idx_dim[i] * output_stride[i];
        }
        output_data[idx_output] = input_data[idx];
      }
    }
  }
};

template <typename T>
class DiagonalGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const auto* dout =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    const T* dout_data = dout->data<T>();
    auto dout_dim = vectorize(dout->dims());

    auto* dx =
        context.Output<framework::Tensor>(framework::GradVarName("Input"));
    T* dx_data = dx->mutable_data<T>(context.GetPlace());
    auto dx_dim = vectorize(dx->dims());
    auto dx_dim_size = dx_dim.size();

    const int64_t offset_ = context.Attr<int>("offset");
    const int64_t axis1 = context.Attr<int>("axis1");
    int64_t axis1_ = axis1 < 0 ? dx_dim_size + axis1 : axis1;
    const int64_t axis2 = context.Attr<int>("axis2");
    int64_t axis2_ = axis2 < 0 ? dx_dim_size + axis2 : axis2;

    std::vector<int64_t> dout_stride = ComputeDimStride(dout_dim);
    std::vector<int64_t> dx_stride = ComputeDimStride(dx_dim);

    int64_t numel = dx->numel();

    for (int64_t idx = 0; idx < numel; idx++) {
      std::vector<int64_t> idx_dim(dx_dim_size);
      int64_t temp = 0;
      for (size_t i = 0; i < dx_dim_size; i++) {
        idx_dim[i] = (idx - temp) / dx_stride[i];
        temp = temp + idx_dim[i] * dx_stride[i];
      }

      int64_t axis1_dim = idx_dim[axis1_];
      int64_t axis2_dim = idx_dim[axis2_];

      idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_));
      idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));

      bool flag = false;
      if (offset_ == 0 && axis1_dim == axis2_dim) {
        idx_dim.push_back(axis1_dim);
        flag = true;
      } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
        idx_dim.push_back(axis1_dim);
        flag = true;
      } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
        idx_dim.push_back(axis2_dim);
        flag = true;
      }

      if (flag) {
        int64_t idx_output = 0;
        for (size_t i = 0; i < idx_dim.size(); i++) {
          idx_output = idx_output + idx_dim[i] * dout_stride[i];
        }
        dx_data[idx] = dout_data[idx_output];
      } else {
        dx_data[idx] = static_cast<T>(0);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/pten/kernels/cpu/diagonal_grad_kernel.cc  (new file, 0 → 100644, +92 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/pten/kernels/diagonal_grad_kernel.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/funcs/diagonal.h"

namespace pten {

template <typename T, typename Context>
void DiagonalGradKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& out_grad,
                        int offset,
                        int axis1,
                        int axis2,
                        DenseTensor* in_grad) {
  const auto* dout = &out_grad;
  const T* dout_data = dout->data<T>();
  auto dout_dim = vectorize(dout->dims());

  auto* dx = in_grad;
  T* dx_data = dev_ctx.template Alloc<T>(dx);
  auto dx_dim = vectorize(dx->dims());
  auto dx_dim_size = dx_dim.size();

  const int64_t offset_ = offset;
  int64_t axis1_ = axis1 < 0 ? dx_dim_size + axis1 : axis1;
  int64_t axis2_ = axis2 < 0 ? dx_dim_size + axis2 : axis2;

  std::vector<int64_t> dout_stride = funcs::ComputeDimStride(dout_dim);
  std::vector<int64_t> dx_stride = funcs::ComputeDimStride(dx_dim);

  int64_t numel = dx->numel();

  for (int64_t idx = 0; idx < numel; idx++) {
    std::vector<int64_t> idx_dim(dx_dim_size);
    int64_t temp = 0;
    for (size_t i = 0; i < dx_dim_size; i++) {
      idx_dim[i] = (idx - temp) / dx_stride[i];
      temp = temp + idx_dim[i] * dx_stride[i];
    }

    int64_t axis1_dim = idx_dim[axis1_];
    int64_t axis2_dim = idx_dim[axis2_];

    idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_));
    idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));

    bool flag = false;
    if (offset_ == 0 && axis1_dim == axis2_dim) {
      idx_dim.push_back(axis1_dim);
      flag = true;
    } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
      idx_dim.push_back(axis1_dim);
      flag = true;
    } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
      idx_dim.push_back(axis2_dim);
      flag = true;
    }

    if (flag) {
      int64_t idx_output = 0;
      for (size_t i = 0; i < idx_dim.size(); i++) {
        idx_output = idx_output + idx_dim[i] * dout_stride[i];
      }
      dx_data[idx] = dout_data[idx_output];
    } else {
      dx_data[idx] = static_cast<T>(0);
    }
  }
}

}  // namespace pten

PT_REGISTER_KERNEL(diagonal_grad,
                   CPU,
                   ALL_LAYOUT,
                   pten::DiagonalGradKernel,
                   float,
                   double,
                   int,
                   int64_t) {}
paddle/pten/kernels/cpu/diagonal_kernel.cc  (new file, 0 → 100644, +90 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/pten/kernels/diagonal_kernel.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/funcs/diagonal.h"

namespace pten {

template <typename T, typename Context>
void DiagonalKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int offset,
                    int axis1,
                    int axis2,
                    DenseTensor* out) {
  auto* input = &x;
  const T* input_data = input->data<T>();
  auto input_dim = vectorize(input->dims());
  auto input_dim_size = input_dim.size();

  auto* output = out;
  T* output_data = dev_ctx.template Alloc<T>(output);
  auto output_dim = vectorize(output->dims());

  const int64_t offset_ = offset;
  int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
  int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2;

  std::vector<int64_t> input_stride = funcs::ComputeDimStride(input_dim);
  std::vector<int64_t> output_stride = funcs::ComputeDimStride(output_dim);

  int64_t numel = input->numel();

  for (int64_t idx = 0; idx < numel; idx++) {
    std::vector<int64_t> idx_dim(input_dim_size);
    int64_t temp = 0;
    for (size_t i = 0; i < input_dim_size; i++) {
      idx_dim[i] = (idx - temp) / input_stride[i];
      temp = temp + idx_dim[i] * input_stride[i];
    }

    int64_t axis1_dim = idx_dim[axis1_];
    int64_t axis2_dim = idx_dim[axis2_];

    idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_));
    idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));

    bool flag = false;
    if (offset_ == 0 && axis1_dim == axis2_dim) {
      idx_dim.push_back(axis1_dim);
      flag = true;
    } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
      idx_dim.push_back(axis1_dim);
      flag = true;
    } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
      idx_dim.push_back(axis2_dim);
      flag = true;
    }

    if (flag) {
      int64_t idx_output = 0;
      for (size_t i = 0; i < idx_dim.size(); i++) {
        idx_output = idx_output + idx_dim[i] * output_stride[i];
      }
      output_data[idx_output] = input_data[idx];
    }
  }
}

}  // namespace pten

PT_REGISTER_KERNEL(diagonal,
                   CPU,
                   ALL_LAYOUT,
                   pten::DiagonalKernel,
                   float,
                   double,
                   int,
                   int64_t,
                   bool) {}
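On the CPU path the output index is assembled by deleting the axis1/axis2 coordinates (largest position first, so the smaller index remains valid) and appending the diagonal position. A standalone trace of that bookkeeping, using an assumed input shape {2, 3, 3} with axis1 = 1, axis2 = 2, offset = 0 (editor's sketch, not Paddle code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Input index (1, 2, 2) of a {2, 3, 3} tensor; the output shape is {2, 3}
  // with row-major strides {3, 1}.
  std::vector<int64_t> idx_dim = {1, 2, 2};
  int64_t axis1_ = 1, axis2_ = 2;
  int64_t axis1_dim = idx_dim[axis1_], axis2_dim = idx_dim[axis2_];
  idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_));  // drop axis2
  idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));  // then axis1
  if (axis1_dim == axis2_dim) {      // the offset == 0 membership test
    idx_dim.push_back(axis1_dim);    // idx_dim is now {1, 2}
    const std::vector<int64_t> output_stride = {3, 1};
    int64_t idx_output = 0;
    for (size_t i = 0; i < idx_dim.size(); i++)
      idx_output += idx_dim[i] * output_stride[i];
    std::printf("%lld\n", (long long)idx_output);  // 1*3 + 2 = 5
  }
}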
paddle/pten/kernels/diagonal_grad_kernel.h  (new file, 0 → 100644, +29 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/pten/core/dense_tensor.h"

namespace pten {

template <typename T, typename Context>
void DiagonalGradKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& out_grad,
                        int offset,
                        int axis1,
                        int axis2,
                        DenseTensor* in_grad);

}  // namespace pten
paddle/pten/kernels/diagonal_kernel.h  (new file, 0 → 100644, +28 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/pten/core/dense_tensor.h"

namespace pten {

template <typename T, typename Context>
void DiagonalKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int offset,
                    int axis1,
                    int axis2,
                    DenseTensor* out);

}  // namespace pten
paddle/pten/kernels/funcs/diagonal.h  (+85 -0)

@@ -17,11 +17,13 @@
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include "paddle/pten/kernels/primitive/kernel_primitives.h"
 #endif
+#include <algorithm>
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/pten/core/dense_tensor.h"

 namespace pten {
 namespace funcs {

@@ -126,5 +128,88 @@ DenseTensor Diagonal(const DeviceContext& context,
   }
 }

+template <typename T>
+std::vector<T> ComputeDimStride(const std::vector<T> dim) {
+  size_t dim_size = dim.size();
+  std::vector<T> dim_strides;
+  dim_strides.resize(dim_size);
+  for (size_t i = 0; i < dim_size - 1; i++) {
+    size_t temp_stride = 1;
+    for (size_t j = i + 1; j < dim_size; j++) {
+      temp_stride = temp_stride * dim[j];
+    }
+    dim_strides[i] = temp_stride;
+  }
+  dim_strides[dim_size - 1] = 1;
+  return dim_strides;
+}
+
+#if defined(__NVCC__) || defined(__HIPCC__)
+template <typename T, int X_DIM_SIZE, int OUT_DIM_SIZE>
+__global__ void DiagonalCuda(const T* data1, T* data2, const int64_t offset_,
+                             int64_t axis1_, int64_t axis2_,
+                             int64_t* x_stride, int64_t* out_stride,
+                             int64_t numel, bool is_grad) {
+  CUDA_KERNEL_LOOP(idx, numel) {
+    int64_t idx_dim[X_DIM_SIZE] = {0};
+    int64_t temp = 0;
+    for (size_t i = 0; i < X_DIM_SIZE - 1; i++) {
+      idx_dim[i] = (idx - temp) / x_stride[i];
+      temp = temp + idx_dim[i] * x_stride[i];
+    }
+    idx_dim[X_DIM_SIZE - 1] = idx - temp;
+
+    int64_t axis1_dim = idx_dim[axis1_];
+    int64_t axis2_dim = idx_dim[axis2_];
+
+    int64_t out_dim[OUT_DIM_SIZE] = {0};
+    int temp_pos = 0;
+    for (int i = 0; i < X_DIM_SIZE; i++) {
+      if (i != axis1_ && i != axis2_) {
+        out_dim[temp_pos] = idx_dim[i];
+        temp_pos++;
+      }
+    }
+    bool flag = false;
+    if (offset_ == 0 && axis1_dim == axis2_dim) {
+      out_dim[temp_pos] = axis1_dim;
+      flag = true;
+    } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
+      out_dim[temp_pos] = axis1_dim;
+      flag = true;
+    } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
+      out_dim[temp_pos] = axis2_dim;
+      flag = true;
+    }
+    if (!is_grad) {
+      if (flag) {
+        int64_t idx_output = 0;
+        for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) {
+          idx_output = idx_output + out_dim[i] * out_stride[i];
+        }
+        idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1];
+        data2[idx_output] = data1[idx];
+      }
+    } else {
+      if (flag) {
+        int64_t idx_output = 0;
+        for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) {
+          idx_output = idx_output + out_dim[i] * out_stride[i];
+        }
+        idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1];
+        data2[idx] = data1[idx_output];
+      } else {
+        data2[idx] = static_cast<T>(0);
+      }
+    }
+  }
+}
+#endif
+
 }  // namespace funcs
 }  // namespace pten
paddle/pten/kernels/gpu/diagonal_grad_kernel.cu  (new file, 0 → 100644, +168 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/diagonal_grad_kernel.h"
#include "paddle/pten/kernels/funcs/diagonal.h"

namespace pten {

using paddle::platform::PADDLE_CUDA_NUM_THREADS;

template <typename T, typename Context>
void DiagonalGradKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        const DenseTensor& out_grad,
                        int offset,
                        int axis1,
                        int axis2,
                        DenseTensor* in_grad) {
  const auto* dout = &out_grad;
  const auto* dout_data = dout->data<T>();
  auto dout_dim = dout->dims().Get();
  auto dout_dim_size = dout->dims().size();

  std::vector<int64_t> res_dout = vectorize(framework::stride(dout->dims()));
  DenseTensor dout_stride_tensor;
  paddle::framework::TensorFromVector<int64_t>(res_dout, dev_ctx,
                                               &dout_stride_tensor);
  int64_t* dout_stride = dout_stride_tensor.data<int64_t>();

  auto* dx = in_grad;
  auto* dx_data = dev_ctx.template Alloc<T>(dx);
  auto dx_dim = dx->dims().Get();
  auto dx_dim_size = dx->dims().size();

  std::vector<int64_t> res_dx = vectorize(framework::stride(dx->dims()));
  DenseTensor dx_stride_tensor;
  paddle::framework::TensorFromVector<int64_t>(res_dx, dev_ctx,
                                               &dx_stride_tensor);
  int64_t* dx_stride = dx_stride_tensor.data<int64_t>();

  const int64_t offset_ = offset;
  int64_t axis1_ = axis1 < 0 ? dx_dim_size + axis1 : axis1;
  int64_t axis2_ = axis2 < 0 ? dx_dim_size + axis2 : axis2;

  int64_t numel = dx->numel();

  int threads = PADDLE_CUDA_NUM_THREADS;
  int blocks = (numel + threads - 1) / threads;

  switch (dx_dim_size) {
    case 2:
      funcs::DiagonalCuda<T, 2, 1><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    case 3:
      funcs::DiagonalCuda<T, 3, 2><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    case 4:
      funcs::DiagonalCuda<T, 4, 3><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    case 5:
      funcs::DiagonalCuda<T, 5, 4><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    case 6:
      funcs::DiagonalCuda<T, 6, 5><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    case 7:
      funcs::DiagonalCuda<T, 7, 6><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    case 8:
      funcs::DiagonalCuda<T, 8, 7><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    case 9:
      funcs::DiagonalCuda<T, 9, 8><<<blocks, threads>>>(
          dout_data, dx_data, offset_, axis1_, axis2_, dx_stride, dout_stride,
          numel, true);
      break;
    default:
      PADDLE_THROW(errors::InvalidArgument(
          "The rank of output(input@Grad) should be less than 10, but "
          "received %d.",
          dx_dim_size));
  }
}

}  // namespace pten

PT_REGISTER_KERNEL(diagonal_grad,
                   GPU,
                   ALL_LAYOUT,
                   pten::DiagonalGradKernel,
                   float,
                   double,
                   int,
                   int64_t) {}
paddle/pten/kernels/gpu/diagonal_kernel.cu  (new file, 0 → 100644, +165 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/diagonal_kernel.h"
#include "paddle/pten/kernels/funcs/diagonal.h"

namespace pten {

using paddle::platform::PADDLE_CUDA_NUM_THREADS;

template <typename T, typename Context>
void DiagonalKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    int offset,
                    int axis1,
                    int axis2,
                    DenseTensor* out) {
  auto* input = &x;
  const auto* input_data = input->data<T>();
  auto input_dim = input->dims().Get();
  auto input_dim_size = input->dims().size();

  std::vector<int64_t> res_in = vectorize(framework::stride(input->dims()));
  DenseTensor input_stride_tensor;
  paddle::framework::TensorFromVector<int64_t>(res_in, dev_ctx,
                                               &input_stride_tensor);
  int64_t* input_stride = input_stride_tensor.data<int64_t>();

  auto* output = out;
  auto* output_data = dev_ctx.template Alloc<T>(out);
  auto output_dim = output->dims().Get();
  auto output_dim_size = output->dims().size();

  std::vector<int64_t> res_out = vectorize(framework::stride(output->dims()));
  DenseTensor output_stride_tensor;
  paddle::framework::TensorFromVector<int64_t>(res_out, dev_ctx,
                                               &output_stride_tensor);
  int64_t* output_stride = output_stride_tensor.data<int64_t>();

  const int64_t offset_ = offset;
  int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
  int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2;

  int64_t numel = input->numel();

  int threads = PADDLE_CUDA_NUM_THREADS;
  int blocks = (numel + threads - 1) / threads;

  switch (input_dim_size) {
    case 2:
      funcs::DiagonalCuda<T, 2, 1><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    case 3:
      funcs::DiagonalCuda<T, 3, 2><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    case 4:
      funcs::DiagonalCuda<T, 4, 3><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    case 5:
      funcs::DiagonalCuda<T, 5, 4><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    case 6:
      funcs::DiagonalCuda<T, 6, 5><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    case 7:
      funcs::DiagonalCuda<T, 7, 6><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    case 8:
      funcs::DiagonalCuda<T, 8, 7><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    case 9:
      funcs::DiagonalCuda<T, 9, 8><<<blocks, threads>>>(
          input_data, output_data, offset_, axis1_, axis2_, input_stride,
          output_stride, numel, false);
      break;
    default:
      PADDLE_THROW(errors::InvalidArgument(
          "The rank of input should be less than 10, but received %d.",
          input_dim_size));
  }
}

}  // namespace pten

PT_REGISTER_KERNEL(diagonal,
                   GPU,
                   ALL_LAYOUT,
                   pten::DiagonalKernel,
                   float,
                   double,
                   int,
                   int64_t,
                   bool) {}
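Both GPU kernels launch one thread per input element, rounding the block count up with the usual ceiling division. A quick standalone check (assuming threads = 512; the actual value of PADDLE_CUDA_NUM_THREADS is an assumption here):

#include <cstdio>

int main() {
  long long numel = 1000;
  int threads = 512;  // illustrative stand-in for PADDLE_CUDA_NUM_THREADS
  int blocks = static_cast<int>((numel + threads - 1) / threads);
  std::printf("%d\n", blocks);  // 2 blocks: 1024 threads cover 1000 elements
}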
paddle/pten/ops/compat/diagonal_sig.cc  (new file, 0 → 100644, +28 -0)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/pten/core/compat/op_utils.h"

namespace pten {

KernelSignature DiagonalGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("diagonal_grad",
                         {"Input", GradVarName("Out")},
                         {"offset", "axis1", "axis2"},
                         {GradVarName("Input")});
}

}  // namespace pten

PT_REGISTER_ARG_MAPPING_FN(diagonal_grad,
                           pten::DiagonalGradOpArgumentMapping);