[Phi] Move roi_align grad kernel and infershape from fuild to phi (#40556)

* move roi_align_grad kernel * move roi_align grad kernel and infershape to phi * remove roi_align infershape

[Phi] Move roi_align grad kernel and infershape from fuild to phi (#40556)
* move roi_align_grad kernel * move roi_align grad kernel and infershape to phi * remove roi_align infershape
3898080e · zyfncg · GitHub · 44d46d03 · 3898080e · 3898080e
17 changed file
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -341,7 +341,6 @@ void BuildDygraphPhiKernelContext(
  }
  for (size_t i = 0; i < attr_names.size(); ++i) {
-    VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i];
    if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) {
      if (attrs.find(attr_names[i]) !=
          attrs.end()) {  // shape is in the attribute

--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/roi_align_op.h"
 #include <memory>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 namespace paddle {
 namespace operators {
@@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      platform::errors::NotFound("Input(X) of ROIAlignOp "
-                                                 "is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
-                      platform::errors::NotFound("Input(ROIs) of ROIAlignOp "
-                                                 "is not found."));
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      platform::errors::NotFound("Output(Out) of ROIAlignOp "
-                                                 "is not found."));
-    auto input_dims = ctx->GetInputDim("X");
-    auto rois_dims = ctx->GetInputDim("ROIs");
-    if (ctx->HasInput("RoisNum")) {
-      auto rois_num_dims = ctx->GetInputDim("RoisNum");
-      PADDLE_ENFORCE_EQ(
-          rois_num_dims.size(), 1,
-          platform::errors::InvalidArgument("The size of RoisNum should be 1"
-                                            ", but received size = %d",
-                                            rois_num_dims.size()));
-    }
-    PADDLE_ENFORCE_EQ(
-        input_dims.size(), 4,
-        platform::errors::InvalidArgument(
-            "The format of Input(X) in"
-            "RoIAlignOp is NCHW. And the rank of input must be 4. "
-            "But received rank = %d",
-            input_dims.size()));
-    PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument(
-                                               "The rank of Input(ROIs) "
-                                               "in RoIAlignOp should be 2. "
-                                               "But the rank of RoIs is %d",
-                                               rois_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(rois_dims[1], 4,
-                        platform::errors::InvalidArgument(
-                            "The second dimension "
-                            "of Input(ROIs) should be 4. But received the "
-                            "dimension = %d",
-                            rois_dims[1]));
-    }
-    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
-    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
-    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      platform::errors::InvalidArgument(
-                          "The 'pooled_height' attribute in RoIAlignOp is "
-                          "invalid. The height must be greater than 0. But "
-                          "received 'pooled_height' = %d",
-                          pooled_height));
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      platform::errors::InvalidArgument(
-                          "The 'pooled_width' attribute in RoIAlignOp is "
-                          "invalid. The width must be greater than 0. But "
-                          "received 'pooled_width' = %d",
-                          pooled_width));
-    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      platform::errors::InvalidArgument(
-                          "The 'spatial_scale' attribute in RoIAlignOp is "
-                          "invalid. The scale must be greater than 0. But "
-                          "received 'spatial_scale' = %f",
-                          spatial_scale));
-    auto out_dims = input_dims;
-    out_dims[0] = rois_dims[0];
-    out_dims[1] = input_dims[1];
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    ctx->SetOutputDim("Out", out_dims);
-  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
@@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X");
 }  // namespace paddle
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor,
+                            PD_INFER_META(phi::RoiAlignInferMeta));
 REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker,
                  ops::ROIAlignGradMaker<paddle::framework::OpDesc>,
-                  ops::ROIAlignGradMaker<paddle::imperative::OpBase>);
+                  ops::ROIAlignGradMaker<paddle::imperative::OpBase>,
+                  RoiAlignInferShapeFunctor);
 REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
                  ops::RoiAlignGradNoNeedBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    roi_align_grad,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
 REGISTER_OP_VERSION(roi_align)
    .AddCheckpoint(
        R"ROC(

--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <algorithm>
-#include <limits>
-#include <numeric>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-template <class T>
-void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
-                                   const T out_grad_this_bin, const T count,
-                                   T* batch_grad_data) {
-  int x_low, y_low, x_high, y_high;
-  T w1, w2, w3, w4;
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    w1 = w2 = w3 = w4 = 0;
-    x_low = x_high = y_low = y_high = -1;
-    return;
-  }
-  y = y <= 0 ? 0 : y;
-  x = x <= 0 ? 0 : x;
-  y_low = static_cast<int>(y);
-  x_low = static_cast<int>(x);
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = static_cast<T>(y_low);
-  } else {
-    y_high = y_low + 1;
-  }
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = static_cast<T>(x_low);
-  } else {
-    x_high = x_low + 1;
-  }
-  T ly = y - y_low, lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
-  T diff1 = out_grad_this_bin * w1 / count;
-  T diff2 = out_grad_this_bin * w2 / count;
-  T diff3 = out_grad_this_bin * w3 / count;
-  T diff4 = out_grad_this_bin * w4 / count;
-  if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-    *(batch_grad_data + y_low * width + x_low) += diff1;
-    *(batch_grad_data + y_low * width + x_high) += diff2;
-    *(batch_grad_data + y_high * width + x_low) += diff3;
-    *(batch_grad_data + y_high * width + x_high) += diff4;
-  }
-}
-template <typename DeviceContext, typename T>
-class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
-    auto* out_grad =
-        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
-    auto in_dims = in->dims();
-    auto aligned = ctx.Attr<bool>("aligned");
-    int channels = in_dims[1];
-    int height = in_dims[2];
-    int width = in_dims[3];
-    int rois_num = rois->dims()[0];
-    if (!in_grad) {
-      return;
-    }
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
-    int rois_batch_size;
-    if (ctx.HasInput("RoisNum")) {
-      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
-      rois_batch_size = rois_num_t->numel();
-      auto* rois_num_data = rois_num_t->data<int>();
-      int start = 0;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (int i = start; i < start + rois_num_data[n]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_data[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
-      }
-    }
-    in_grad->mutable_data<T>(ctx.GetPlace());
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    phi::funcs::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, in_grad, static_cast<T>(0));
-    int output_grad_size = out_grad->numel();
-    if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) {
-      return;
-    }
-    const T* rois_data = rois->data<T>();
-    const T* out_grad_data = out_grad->data<T>();
-    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
-    auto in_stride = phi::stride(in->dims());
-    auto roi_stride = phi::stride(rois->dims());
-    auto out_stride = phi::stride(out_grad->dims());
-    T roi_offset = aligned ? T(0.5) : 0;
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_idx = roi_batch_id_data[n];
-      T roi_xmin = rois_data[0] * spatial_scale - roi_offset;
-      T roi_ymin = rois_data[1] * spatial_scale - roi_offset;
-      T roi_xmax = rois_data[2] * spatial_scale - roi_offset;
-      T roi_ymax = rois_data[3] * spatial_scale - roi_offset;
-      T roi_width = roi_xmax - roi_xmin;
-      T roi_height = roi_ymax - roi_ymin;
-      roi_width = std::max(roi_width, static_cast<T>(1.));
-      roi_height = std::max(roi_height, static_cast<T>(1.));
-      if (!aligned) {
-        roi_width = std::max(roi_width, static_cast<T>(1.));
-        roi_height = std::max(roi_height, static_cast<T>(1.));
-      }
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-      for (int c = 0; c < channels; ++c) {
-        T* batch_grad_data =
-            in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1];
-        const T* batch_out_grad_data =
-            out_grad_data + n * out_stride[0] + c * out_stride[1];
-        for (int ph = 0; ph < pooled_height; ++ph) {
-          for (int pw = 0; pw < pooled_width; ++pw) {
-            int pool_index = ph * pooled_width + pw;
-            T out_grad_this_bin = batch_out_grad_data[pool_index];
-            int roi_bin_grid_h = (sampling_ratio > 0)
-                                     ? sampling_ratio
-                                     : ceil(roi_height / pooled_height);
-            int roi_bin_grid_w = (sampling_ratio > 0)
-                                     ? sampling_ratio
-                                     : ceil(roi_width / pooled_width);
-            T count = roi_bin_grid_h * roi_bin_grid_w;
-            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-              const T y = roi_ymin + ph * bin_size_h +
-                          static_cast<T>(iy + .5f) * bin_size_h /
-                              static_cast<T>(roi_bin_grid_h);
-              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-                const T x = roi_xmin + pw * bin_size_w +
-                            static_cast<T>(ix + .5f) * bin_size_w /
-                                static_cast<T>(roi_bin_grid_w);
-                bilinear_interpolate_gradient(height, width, y, x,
-                                              out_grad_this_bin, count,
-                                              batch_grad_data);
-              }
-            }
-          }
-        }
-      }
-      rois_data += roi_stride[0];
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/phi/infermeta/backward.h
+++ b/paddle/phi/infermeta/backward.h
@@ -21,6 +21,10 @@ limitations under the License. */
 namespace phi {
+// Common InferMeta Functions for backward operators.
+//
+// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
 void BilinearTensorProductGradInferMeta(const MetaTensor& x,
                                        const MetaTensor& y,
                                        const MetaTensor& weight,

--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -29,6 +29,8 @@ namespace phi {
 // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
+//
+// The InferMeta Functions in this file are arranged in alphabetic order.
 void AllValueCompareInferMeta(const MetaTensor& x,
                              const MetaTensor& y,

--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -18,6 +18,23 @@ limitations under the License. */
 #include "paddle/phi/core/meta_tensor.h"
 namespace phi {
+// Common InferMeta Functions for multiary operators, The format like:
+//
+//   1. The number of input MetaTensor is more than 3:
+//      void [FunctionDesc|OpName]InferMeta(const MetaTensor& x,
+//                                          const MetaTensor& y,
+//                                          const MetaTensor& z,
+//                                          const MetaTensor& w,
+//                                          ...,
+//                                          MetaTensor* out) {}
+//
+//   2. There are `const vector<MetaTensor*>&` in params:
+//      void [FunctionDesc|OpName]InferMeta(const vector<MetaTensor*>& x,
+//                                          ...,
+//                                          MetaTensor* out) {}
+//
+// NOTE: The InferMeta Functions in this file are arranged in alphabetic order.
 std::vector<DDim> GetMetaTensorsDim(const std::vector<MetaTensor*>& tensors);
 void AdadeltaInferMeta(const MetaTensor& param,

--- a/paddle/phi/infermeta/nullary.h
+++ b/paddle/phi/infermeta/nullary.h
@@ -27,6 +27,8 @@ namespace phi {
 // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
+//
+// The InferMeta Functions in this file are arranged in alphabetic order.
 void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out);

--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input,
  total_weight->set_dtype(input.dtype());
 }
+void RoiAlignInferMeta(const MetaTensor& x,
+                       const MetaTensor& boxes,
+                       paddle::optional<const MetaTensor&> boxes_num,
+                       int pooled_height,
+                       int pooled_width,
+                       float spatial_scale,
+                       int sampling_ratio,
+                       bool aligned,
+                       MetaTensor* out,
+                       MetaConfig config) {
+  auto input_dims = x.dims();
+  auto boxes_dims = boxes.dims();
+  if (boxes_num) {
+    auto boxes_num_dims = boxes_num->dims();
+    PADDLE_ENFORCE_EQ(
+        boxes_num_dims.size(),
+        1,
+        phi::errors::InvalidArgument("The size of RoisNum should be 1"
+                                     ", but received size = %d",
+                                     boxes_num_dims.size()));
+  }
+  PADDLE_ENFORCE_EQ(input_dims.size(),
+                    4,
+                    phi::errors::InvalidArgument(
+                        "The format of Input(X) in"
+                        "RoIAlignOp is NCHW. And the rank of input must be 4. "
+                        "But received rank = %d",
+                        input_dims.size()));
+  PADDLE_ENFORCE_EQ(boxes_dims.size(),
+                    2,
+                    phi::errors::InvalidArgument("The rank of Input(ROIs) "
+                                                 "in RoIAlignOp should be 2. "
+                                                 "But the rank of RoIs is %d",
+                                                 boxes_dims.size()));
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(boxes_dims[1],
+                      4,
+                      phi::errors::InvalidArgument(
+                          "The second dimension "
+                          "of Input(ROIs) should be 4. But received the "
+                          "dimension = %d",
+                          boxes_dims[1]));
+  }
+  PADDLE_ENFORCE_GT(pooled_height,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The 'pooled_height' attribute in RoIAlignOp is "
+                        "invalid. The height must be greater than 0. But "
+                        "received 'pooled_height' = %d",
+                        pooled_height));
+  PADDLE_ENFORCE_GT(pooled_width,
+                    0,
+                    phi::errors::InvalidArgument(
+                        "The 'pooled_width' attribute in RoIAlignOp is "
+                        "invalid. The width must be greater than 0. But "
+                        "received 'pooled_width' = %d",
+                        pooled_width));
+  PADDLE_ENFORCE_GT(spatial_scale,
+                    0.0f,
+                    phi::errors::InvalidArgument(
+                        "The 'spatial_scale' attribute in RoIAlignOp is "
+                        "invalid. The scale must be greater than 0. But "
+                        "received 'spatial_scale' = %f",
+                        spatial_scale));
+  auto out_dims = input_dims;
+  out_dims[0] = boxes_dims[0];
+  out_dims[1] = input_dims[1];
+  out_dims[2] = pooled_height;
+  out_dims[3] = pooled_width;
+  out->set_dims(out_dims);
+  out->set_dtype(x.dtype());
+}
 void ScatterInferMeta(const MetaTensor& x,
                      const MetaTensor& index,
                      const MetaTensor& updates,

--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -30,6 +30,8 @@ namespace phi {
 //   Because functions in this file not only can infer shape, but also need
 //   infer lod or other useful data.
 //
+// The InferMeta Functions in this file are arranged in alphabetic order.
 void AccuracyInferMeta(const MetaTensor& out,
                       const MetaTensor& indice,
                       const MetaTensor& label,
@@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input,
                         MetaTensor* total_weight,
                         MetaConfig config = MetaConfig());
+void RoiAlignInferMeta(const MetaTensor& x,
+                       const MetaTensor& boxes,
+                       paddle::optional<const MetaTensor&> boxes_num,
+                       int pooled_height,
+                       int pooled_width,
+                       float spatial_scale,
+                       int sampling_ratio,
+                       bool aligned,
+                       MetaTensor* out,
+                       MetaConfig config = MetaConfig());
 void ScatterInferMeta(const MetaTensor& x,
                      const MetaTensor& index,
                      const MetaTensor& updates,

--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -31,6 +31,8 @@ class MetaConfig;
 // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 // Because functions in this file not only can infer shape, but also need
 // infer lod or other useful data.
+//
+// The InferMeta Functions in this file are arranged in alphabetic order.
 void ArgMinMaxInferMeta(const MetaTensor& x,
                        int64_t axis,

--- a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/roi_align_grad_kernel.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+namespace phi {
+template <class T>
+void bilinear_interpolate_gradient(const int height,
+                                   const int width,
+                                   T y,
+                                   T x,
+                                   const T out_grad_this_bin,
+                                   const T count,
+                                   T* batch_grad_data) {
+  int x_low, y_low, x_high, y_high;
+  T w1, w2, w3, w4;
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    w1 = w2 = w3 = w4 = 0;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+  y = y <= 0 ? 0 : y;
+  x = x <= 0 ? 0 : x;
+  y_low = static_cast<int>(y);
+  x_low = static_cast<int>(x);
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = static_cast<T>(y_low);
+  } else {
+    y_high = y_low + 1;
+  }
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = static_cast<T>(x_low);
+  } else {
+    x_high = x_low + 1;
+  }
+  T ly = y - y_low, lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+  T diff1 = out_grad_this_bin * w1 / count;
+  T diff2 = out_grad_this_bin * w2 / count;
+  T diff3 = out_grad_this_bin * w3 / count;
+  T diff4 = out_grad_this_bin * w4 / count;
+  if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+    *(batch_grad_data + y_low * width + x_low) += diff1;
+    *(batch_grad_data + y_low * width + x_high) += diff2;
+    *(batch_grad_data + y_high * width + x_low) += diff3;
+    *(batch_grad_data + y_high * width + x_high) += diff4;
+  }
+}
+template <typename T, typename Context>
+void RoiAlignGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& boxes,
+                        paddle::optional<const DenseTensor&> boxes_num,
+                        const DenseTensor& out_grad,
+                        int pooled_height,
+                        int pooled_width,
+                        float spatial_scale,
+                        int sampling_ratio,
+                        bool aligned,
+                        DenseTensor* dx) {
+  auto in_dims = x.dims();
+  int channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  int rois_num = boxes.dims()[0];
+  if (!dx) {
+    return;
+  }
+  DenseTensor roi_batch_id_list = Empty<int>(dev_ctx, {rois_num});
+  int* box_batch_id_data = roi_batch_id_list.data<int>();
+  int boxes_batch_size;
+  if (boxes_num) {
+    boxes_batch_size = boxes_num->numel();
+    auto* boxes_num_data = boxes_num->data<int>();
+    int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (int i = start; i < start + boxes_num_data[n]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+      start += boxes_num_data[n];
+    }
+  } else {
+    auto boxes_lod = boxes.lod().back();
+    boxes_batch_size = boxes_lod.size() - 1;
+    for (int n = 0; n < boxes_batch_size; ++n) {
+      for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
+        box_batch_id_data[i] = n;
+      }
+    }
+  }
+  dev_ctx.template Alloc<T>(dx);
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, dx, static_cast<T>(0));
+  int output_grad_size = out_grad.numel();
+  if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) {
+    return;
+  }
+  const T* boxes_data = boxes.data<T>();
+  const T* out_grad_data = out_grad.data<T>();
+  T* dx_data = dev_ctx.template Alloc<T>(dx);
+  auto in_stride = phi::stride(x.dims());
+  auto roi_stride = phi::stride(boxes.dims());
+  auto out_stride = phi::stride(out_grad.dims());
+  T roi_offset = aligned ? T(0.5) : 0;
+  for (int n = 0; n < rois_num; ++n) {
+    int box_batch_idx = box_batch_id_data[n];
+    T roi_xmin = boxes_data[0] * spatial_scale - roi_offset;
+    T roi_ymin = boxes_data[1] * spatial_scale - roi_offset;
+    T roi_xmax = boxes_data[2] * spatial_scale - roi_offset;
+    T roi_ymax = boxes_data[3] * spatial_scale - roi_offset;
+    T roi_width = roi_xmax - roi_xmin;
+    T roi_height = roi_ymax - roi_ymin;
+    roi_width = std::max(roi_width, static_cast<T>(1.));
+    roi_height = std::max(roi_height, static_cast<T>(1.));
+    if (!aligned) {
+      roi_width = std::max(roi_width, static_cast<T>(1.));
+      roi_height = std::max(roi_height, static_cast<T>(1.));
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+    for (int c = 0; c < channels; ++c) {
+      T* batch_grad_data =
+          dx_data + box_batch_idx * in_stride[0] + c * in_stride[1];
+      const T* batch_out_grad_data =
+          out_grad_data + n * out_stride[0] + c * out_stride[1];
+      for (int ph = 0; ph < pooled_height; ++ph) {
+        for (int pw = 0; pw < pooled_width; ++pw) {
+          int pool_index = ph * pooled_width + pw;
+          T out_grad_this_bin = batch_out_grad_data[pool_index];
+          int roi_bin_grid_h = (sampling_ratio > 0)
+                                   ? sampling_ratio
+                                   : ceil(roi_height / pooled_height);
+          int roi_bin_grid_w = (sampling_ratio > 0)
+                                   ? sampling_ratio
+                                   : ceil(roi_width / pooled_width);
+          T count = roi_bin_grid_h * roi_bin_grid_w;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            const T y = roi_ymin + ph * bin_size_h +
+                        static_cast<T>(iy + .5f) * bin_size_h /
+                            static_cast<T>(roi_bin_grid_h);
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              const T x = roi_xmin + pw * bin_size_w +
+                          static_cast<T>(ix + .5f) * bin_size_w /
+                              static_cast<T>(roi_bin_grid_w);
+              bilinear_interpolate_gradient(height,
+                                            width,
+                                            y,
+                                            x,
+                                            out_grad_this_bin,
+                                            count,
+                                            batch_grad_data);
+            }
+          }
+        }
+      }
+    }
+    boxes_data += roi_stride[0];
+  }
+}
+}  // namespace phi
+PD_REGISTER_KERNEL(roi_align_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::RoiAlignGradKernel,
+                   float,
+                   double,
+                   int) {}
--- a/paddle/phi/kernels/cpu/roi_align_kernel.cc
+++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc
@@ -179,7 +179,7 @@ void AvgPool(const std::vector<T>& interpolated_values,
 }
 template <typename T, typename Context>
-void ROIAlignKernel(const Context& dev_ctx,
+void RoiAlignKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& boxes,
                    paddle::optional<const DenseTensor&> boxes_num,
@@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx,
 }  // namespace phi
 PD_REGISTER_KERNEL(
-    roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {}
+    roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {}
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
-Licensed under the Apache License, Version 2.0 (the "License");
+#include "paddle/phi/kernels/roi_align_grad_kernel.h"
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <vector>
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/roi_align_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-namespace paddle {
+namespace phi {
-namespace operators {
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
@@ -34,10 +36,18 @@ static inline int NumBlocks(const int N) {
 }
 template <class T>
-__device__ void BilinearInterpolateGradient(const int height, const int width,
+__device__ void BilinearInterpolateGradient(const int height,
-                                            T y, T x, T* w1, T* w2, T* w3,
+                                            const int width,
-                                            T* w4, int* x_low, int* x_high,
+                                            T y,
-                                            int* y_low, int* y_high) {
+                                            T x,
+                                            T* w1,
+                                            T* w2,
+                                            T* w3,
+                                            T* w4,
+                                            int* x_low,
+                                            int* x_high,
+                                            int* y_low,
+                                            int* y_high) {
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    return;
  }
@@ -66,12 +76,20 @@ __device__ void BilinearInterpolateGradient(const int height, const int width,
 }
 template <typename T>
-__global__ void GPUROIAlignBackward(
+__global__ void GPURoiAlignBackward(const int nthreads,
-    const int nthreads, const T* input_rois, const T* out_grad,
+                                    const T* input_rois,
-    const int num_rois, const float spatial_scale, const int channels,
+                                    const T* out_grad,
-    const int height, const int width, const int pooled_height,
+                                    const int num_rois,
-    const int pooled_width, const int sampling_ratio, int* roi_batch_id_data,
+                                    const float spatial_scale,
-    T* input_grad, const bool continuous_coordinate) {
+                                    const int channels,
+                                    const int height,
+                                    const int width,
+                                    const int pooled_height,
+                                    const int pooled_width,
+                                    const int sampling_ratio,
+                                    int* roi_batch_id_data,
+                                    T* input_grad,
+                                    const bool continuous_coordinate) {
  CUDA_KERNEL_LOOP(i, nthreads) {
    int pw = i % pooled_width;
    int ph = (i / pooled_width) % pooled_height;
@@ -119,109 +137,124 @@ __global__ void GPUROIAlignBackward(
                        static_cast<T>(roi_bin_grid_w);
        T w1 = 0, w2 = 0, w3 = 0, w4 = 0;
        int x_low = -1, x_high = -1, y_low = -1, y_high = -1;
-        BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4,
+        BilinearInterpolateGradient(height,
-                                    &x_low, &x_high, &y_low, &y_high);
+                                    width,
+                                    y,
+                                    x,
+                                    &w1,
+                                    &w2,
+                                    &w3,
+                                    &w4,
+                                    &x_low,
+                                    &x_high,
+                                    &y_low,
+                                    &y_high);
        T diff1 = out_grad_this_bin * w1 / count;
        T diff2 = out_grad_this_bin * w2 / count;
        T diff3 = out_grad_this_bin * w3 / count;
        T diff4 = out_grad_this_bin * w4 / count;
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
-          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low,
+          paddle::platform::CudaAtomicAdd(
-                                  diff1);
+              offset_input_grad + y_low * width + x_low, diff1);
-          platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high,
+          paddle::platform::CudaAtomicAdd(
-                                  diff2);
+              offset_input_grad + y_low * width + x_high, diff2);
-          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low,
+          paddle::platform::CudaAtomicAdd(
-                                  diff3);
+              offset_input_grad + y_high * width + x_low, diff3);
-          platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high,
+          paddle::platform::CudaAtomicAdd(
-                                  diff4);
+              offset_input_grad + y_high * width + x_high, diff4);
        }
      }
    }
  }
 }
-template <typename Place, typename T>
+template <typename T, typename Context>
-class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
+void RoiAlignGradKernel(const Context& dev_ctx,
- public:
+                        const DenseTensor& x,
-  void Compute(const framework::ExecutionContext& ctx) const override {
+                        const DenseTensor& boxes,
-    auto* in = ctx.Input<Tensor>("X");
+                        paddle::optional<const DenseTensor&> boxes_num,
-    auto* rois = ctx.Input<LoDTensor>("ROIs");
+                        const DenseTensor& out_grad,
+                        int pooled_height,
-    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+                        int pooled_width,
-    auto* in_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+                        float spatial_scale,
+                        int sampling_ratio,
-    auto pooled_height = ctx.Attr<int>("pooled_height");
+                        bool aligned,
-    auto pooled_width = ctx.Attr<int>("pooled_width");
+                        DenseTensor* dx) {
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+  int rois_num = boxes.dims()[0];
-    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
+  int channels = x.dims()[1];
-    auto aligned = ctx.Attr<bool>("aligned");
+  int height = x.dims()[2];
+  int width = x.dims()[3];
-    int rois_num = rois->dims()[0];
-    int channels = in->dims()[1];
+  if (!dx) {
-    int height = in->dims()[2];
+    return;
-    int width = in->dims()[3];
+  }
-    if (!in_grad) {
+  DenseTensor box_batch_id_list;
-      return;
+  box_batch_id_list.Resize({rois_num});
-    }
+  int* box_batch_size = dev_ctx.template HostAlloc<int>(&box_batch_id_list);
-    Tensor roi_batch_id_list;
-    roi_batch_id_list.Resize({rois_num});
+  auto cplace = phi::CPUPlace();
-    auto cplace = platform::CPUPlace();
+  auto gplace = dev_ctx.GetPlace();
-    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
+  if (boxes_num) {
+    int boxes_batch_size = boxes_num->numel();
-    auto& dev_ctx = ctx.cuda_device_context();
+    std::vector<int> boxes_num_list(boxes_batch_size);
-    auto gplace = ctx.GetPlace();
+    paddle::memory::Copy(cplace,
-    if (ctx.HasInput("RoisNum")) {
+                         boxes_num_list.data(),
-      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+                         gplace,
-      int rois_batch_size = rois_num_t->numel();
+                         boxes_num->data<int>(),
-      std::vector<int> rois_num_list(rois_batch_size);
+                         sizeof(int) * boxes_batch_size,
-      memory::Copy(cplace, rois_num_list.data(), gplace,
+                         0);
-                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+    int start = 0;
-      int start = 0;
+    for (int n = 0; n < boxes_batch_size; ++n) {
-      for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = start; i < start + boxes_num_list[n]; ++i) {
-        for (size_t i = start; i < start + rois_num_list[n]; ++i) {
+        box_batch_size[i] = n;
-          roi_batch_id_data[i] = n;
-        }
-        start += rois_num_list[n];
-      }
-    } else {
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          roi_batch_id_data[i] = n;
-        }
      }
+      start += boxes_num_list[n];
    }
-    auto roi_ptr =
+  } else {
-        memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int));
+    auto boxes_lod = boxes.lod().back();
-    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    int boxes_batch_size = boxes_lod.size() - 1;
-    int bytes = roi_batch_id_list.numel() * sizeof(int);
+    for (int n = 0; n < boxes_batch_size; ++n) {
-    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
+      for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) {
-                 dev_ctx.stream());
+        box_batch_size[i] = n;
-    in_grad->mutable_data<T>(ctx.GetPlace());
+      }
-    phi::funcs::SetConstant<Place, T> set_zero;
-    set_zero(dev_ctx, in_grad, static_cast<T>(0));
-    int output_grad_size = out_grad->numel();
-    int blocks = NumBlocks(output_grad_size);
-    int threads = kNumCUDAThreads;
-    if (output_grad_size > 0) {
-      GPUROIAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
-          output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
-          spatial_scale, channels, height, width, pooled_height, pooled_width,
-          sampling_ratio, roi_id_data, in_grad->mutable_data<T>(ctx.GetPlace()),
-          aligned);
    }
  }
-};
+  auto roi_ptr =
+      paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int));
+  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+  int bytes = box_batch_id_list.numel() * sizeof(int);
+  paddle::memory::Copy(
+      gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
+  dev_ctx.template Alloc<T>(dx);
+  phi::funcs::SetConstant<Context, T> set_zero;
+  set_zero(dev_ctx, dx, static_cast<T>(0));
+  int output_grad_size = out_grad.numel();
+  int blocks = NumBlocks(output_grad_size);
+  int threads = kNumCUDAThreads;
+  if (output_grad_size > 0) {
+    GPURoiAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        output_grad_size,
+        boxes.data<T>(),
+        out_grad.data<T>(),
+        rois_num,
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        roi_id_data,
+        dx->data<T>(),
+        aligned);
+  }
+}
-}  // namespace operators
+}  // namespace phi
-}  // namespace paddle
-namespace ops = paddle::operators;
+PD_REGISTER_KERNEL(
-REGISTER_OP_CUDA_KERNEL(
+    roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {}
-    roi_align_grad,
-    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GPUROIAlignGradOpKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/phi/kernels/gpu/roi_align_kernel.cu
+++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu
@@ -71,7 +71,7 @@ __device__ T BilinearInterpolate(
 }
 template <class T>
-__global__ void GPUROIAlignForward(const int nthreads,
+__global__ void GPURoiAlignForward(const int nthreads,
                                   const T* input_data,
                                   const T* input_rois,
                                   const float spatial_scale,
@@ -137,7 +137,7 @@ __global__ void GPUROIAlignForward(const int nthreads,
 }
 template <typename T, typename Context>
-void ROIAlignKernel(const Context& dev_ctx,
+void RoiAlignKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& boxes,
                    paddle::optional<const DenseTensor&> boxes_num,
@@ -233,7 +233,7 @@ void ROIAlignKernel(const Context& dev_ctx,
  int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
  paddle::memory::Copy(
      gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
-  GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+  GPURoiAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
      output_size,
      x.data<T>(),
      boxes.data<T>(),
@@ -252,4 +252,4 @@ void ROIAlignKernel(const Context& dev_ctx,
 }  // namespace phi
 PD_REGISTER_KERNEL(
-    roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {}
+    roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {}
--- a/paddle/phi/kernels/roi_align_grad_kernel.h
+++ b/paddle/phi/kernels/roi_align_grad_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+namespace phi {
+template <typename T, typename Context>
+void RoiAlignGradKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& boxes,
+                        paddle::optional<const DenseTensor&> boxes_num,
+                        const DenseTensor& out_grad,
+                        int pooled_height,
+                        int pooled_width,
+                        float spatial_scale,
+                        int sampling_ratio,
+                        bool aligned,
+                        DenseTensor* dx);
+}  // namespace phi
--- a/paddle/phi/kernels/roi_align_kernel.h
+++ b/paddle/phi/kernels/roi_align_kernel.h
@@ -20,7 +20,7 @@
 namespace phi {
 template <typename T, typename Context>
-void ROIAlignKernel(const Context& dev_ctx,
+void RoiAlignKernel(const Context& dev_ctx,
                    const DenseTensor& x,
                    const DenseTensor& boxes,
                    paddle::optional<const DenseTensor&> boxes_num,

--- a/paddle/phi/ops/compat/roi_align_sig.cc
+++ b/paddle/phi/ops/compat/roi_align_sig.cc
@@ -16,7 +16,7 @@
 namespace phi {
-KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
+KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("roi_align",
                         {"X", "ROIs", "RoisNum"},
                         {"pooled_height",
@@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) {
                         {"Out"});
 }
+KernelSignature RoiAlignGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("roi_align_grad",
+                         {"X", "ROIs", "RoisNum", GradVarName("Out")},
+                         {"pooled_height",
+                          "pooled_width",
+                          "spatial_scale",
+                          "sampling_ratio",
+                          "aligned"},
+                         {GradVarName("X")});
+}
 }  // namespace phi
-PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping);