Unverified commit 0a878be8 authored by F FDInSky and committed by GitHub

modify some op for dyg rcnn (#23648)

* test=develop modify some op for dyg rcnn
Parent ab05cdc4
@@ -341,6 +341,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
lod0.push_back(0);
anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel() / 4, 4});
std::vector<int64_t> tmp_lod;
int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) {
@@ -362,6 +363,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
AppendProposals(rpn_roi_probs, num_proposals, scores);
num_proposals += proposals.dims()[0];
lod0.push_back(num_proposals);
tmp_lod.push_back(num_proposals);
}
if (context.HasOutput("RpnRoisLod")) {
auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod");
rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace());
int64_t *lod_data = rpn_rois_lod->data<int64_t>();
for (int i = 0; i < num; i++) {
lod_data[i] = tmp_lod[i];
}
rpn_rois_lod->Resize({num, 1});
}
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
@@ -464,6 +475,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
"(LoDTensor), Output proposals with shape (rois_num, 4).");
AddOutput("RpnRoiProbs",
"(LoDTensor) Scores of proposals with shape (rois_num, 1).");
AddOutput("RpnRoisLod", "(Tensor), rpn rois's lod info").AsDispensable();
AddAttr<int>("pre_nms_topN", AddAttr<int>("pre_nms_topN",
"Number of top scoring RPN proposals to keep before " "Number of top scoring RPN proposals to keep before "
"applying NMS."); "applying NMS.");
......
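The CPU kernel above now records, for each image, the running number of proposals kept after NMS and writes those counts into the optional RpnRoisLod output. Below is a minimal Python sketch of that bookkeeping, assuming a batch of two images that keep 3 and 5 proposals respectively (the counts are made up for illustration and are not part of this change):

import numpy as np

# Assumed per-image proposal counts after NMS (illustrative only).
proposals_per_image = [3, 5]

num_proposals = 0
lod0 = [0]     # offsets attached to RpnRois / RpnRoiProbs as LoD
tmp_lod = []   # values copied into the new RpnRoisLod tensor

for count in proposals_per_image:
    num_proposals += count
    lod0.append(num_proposals)     # ends up [0, 3, 8]
    tmp_lod.append(num_proposals)  # ends up [3, 8]

rpn_rois_lod = np.asarray(tmp_lod, dtype=np.int64)  # shape (num_images,)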
@@ -416,9 +416,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
T *rpn_roi_probs_data = rpn_roi_probs->data<T>();
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
auto cpu_place = platform::CPUPlace();
int64_t num_proposals = 0;
std::vector<size_t> offset(1, 0);
std::vector<int64_t> tmp_lod;
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
@@ -444,6 +447,15 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
dev_ctx.Wait();
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
tmp_lod.push_back(num_proposals);
}
if (context.HasOutput("RpnRoisLod")) {
auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod");
rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace());
int64_t *lod_data = rpn_rois_lod->data<int64_t>();
memory::Copy(place, lod_data, cpu_place, &tmp_lod[0],
sizeof(int64_t) * num, dev_ctx.stream());
rpn_rois_lod->Resize({num});
}
framework::LoD lod;
lod.emplace_back(offset);
......
@@ -35,6 +35,14 @@ class ROIAlignOp : public framework::OperatorWithKernel {
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
if (ctx->HasInput("RoisLod")) {
auto rois_lod_dims = ctx->GetInputDim("RoisLod");
PADDLE_ENFORCE_EQ(
rois_lod_dims.size(), 1,
platform::errors::InvalidArgument("The RoisLod dimension should be 1"
", but got dim = %d",
rois_lod_dims.size()));
}
PADDLE_ENFORCE_EQ(
input_dims.size(), 4,
platform::errors::InvalidArgument(
@@ -136,6 +144,10 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
"given as [[x1, y1, x2, y2], ...]. "
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the bottom right coordinates.");
AddInput("RoisLod",
"(Tensor), "
"The lod info of rois.")
.AsDispensable();
AddOutput("Out", AddOutput("Out",
"(Tensor), " "(Tensor), "
"The output of ROIAlignOp is a 4-D tensor with shape " "The output of ROIAlignOp is a 4-D tensor with shape "
...@@ -190,6 +202,7 @@ class ROIAlignGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -190,6 +202,7 @@ class ROIAlignGradMaker : public framework::SingleGradOpMaker<T> {
op->SetType("roi_align_grad"); op->SetType("roi_align_grad");
op->SetInput("X", this->Input("X")); op->SetInput("X", this->Input("X"));
op->SetInput("ROIs", this->Input("ROIs")); op->SetInput("ROIs", this->Input("ROIs"));
op->SetInput("RoisLod", this->Input("RoisLod"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
@@ -210,8 +223,10 @@ REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp,
REGISTER_OP_CPU_KERNEL(
roi_align,
ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIAlignOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_CPU_KERNEL(
roi_align_grad,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/roi_align_op.h" #include "paddle/fluid/operators/roi_align_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -258,6 +259,28 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -258,6 +259,28 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
roi_batch_id_list.Resize({rois_num}); roi_batch_id_list.Resize({rois_num});
auto cplace = platform::CPUPlace(); auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace); int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto& dev_ctx = ctx.cuda_device_context();
auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
if (ctx.HasInput("RoisLod")) {
auto* rois_lod = ctx.Input<Tensor>("RoisLod");
int rois_batch_size = rois_lod->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size - 1, batch_size,
platform::errors::InvalidArgument(
"The rois_batch_size and imgs "
"batch_size must be the same. But received rois_batch_size = %d, "
"batch_size = %d",
rois_batch_size, batch_size));
std::vector<int64_t> rois_lod_(rois_batch_size);
memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
sizeof(int64_t) * rois_batch_size, 0);
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto lod = rois->lod();
PADDLE_ENFORCE_EQ(
lod.empty(), false,
@@ -279,11 +302,10 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
roi_batch_id_data[i] = n;
}
}
}
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = memory::Alloc(dev_ctx, bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
@@ -320,6 +342,21 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
roi_batch_id_list.Resize({rois_num});
auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto& dev_ctx = ctx.cuda_device_context();
auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
if (ctx.HasInput("RoisLod")) {
auto* rois_lod = ctx.Input<Tensor>("RoisLod");
int rois_batch_size = rois_lod->numel();
std::vector<int64_t> rois_lod_(rois_batch_size);
memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
sizeof(int64_t) * rois_batch_size, 0);
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
@@ -327,12 +364,11 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
roi_batch_id_data[i] = n;
}
}
}
auto roi_ptr =
memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
int bytes = roi_batch_id_list.numel() * sizeof(int);
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
in_grad->mutable_data<T>(ctx.GetPlace());
......
@@ -12,6 +12,7 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
@@ -163,7 +164,24 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
roi_batch_id_list.Resize({rois_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisLod")) {
auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
rois_batch_size = rois_lod_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size - 1, batch_size,
platform::errors::InvalidArgument(
"The rois_batch_size and imgs "
"batch_size must be the same. But received rois_batch_size = %d, "
"batch_size = %d",
rois_batch_size, batch_size));
auto* rois_lod = rois_lod_t->data<int64_t>();
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto lod = rois->lod();
PADDLE_ENFORCE_EQ(
lod.empty(), false,
@@ -185,6 +203,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
roi_batch_id_data[i] = n;
}
}
}
T* output_data = out->mutable_data<T>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) {
@@ -276,13 +295,25 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisLod")) {
auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
rois_batch_size = rois_lod_t->numel();
auto* rois_lod = rois_lod_t->data<int64_t>();
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
in_grad->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> set_zero;
......
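In both the GPU and CPU ROIAlign kernels above, the new dispensable RoisLod input replaces the LoD carried by the ROIs tensor as the way to map each ROI back to its image in the batch: entries n and n+1 of the lod bound the ROIs belonging to image n. A small NumPy sketch of that mapping, with an assumed lod of [0, 2, 5] for a batch of two images (the values are illustrative, not from the commit):

import numpy as np

rois_lod = np.array([0, 2, 5], dtype=np.int64)  # assumed cumulative ROI offsets
roi_batch_id = np.empty(rois_lod[-1], dtype=np.int32)
for n in range(len(rois_lod) - 1):
    # ROIs with index in [rois_lod[n], rois_lod[n + 1]) belong to image n
    roi_batch_id[rois_lod[n]:rois_lod[n + 1]] = n
# roi_batch_id -> [0, 0, 1, 1, 1]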
@@ -36,7 +36,10 @@ class ROIPoolOp : public framework::OperatorWithKernel {
"Output(Argmax) of ROIPoolOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
if (ctx->HasInput("RoisLod")) {
auto rois_lod_dims = ctx->GetInputDim("RoisLod");
PADDLE_ENFORCE(rois_lod_dims.size() == 1, "");
}
PADDLE_ENFORCE(input_dims.size() == 4,
"The format of input tensor is NCHW.");
PADDLE_ENFORCE(rois_dims.size() == 2,
@@ -115,6 +118,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"Where batch_id is the id of the data, "
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the bottom right coordinates.");
AddInput("RoisLod", "(Tensor), The lod info of rois.").AsDispensable();
AddOutput("Out", AddOutput("Out",
"(Tensor), " "(Tensor), "
"The output of ROIPoolOp is a 4-D tensor with shape " "The output of ROIPoolOp is a 4-D tensor with shape "
...@@ -171,6 +175,7 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -171,6 +175,7 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker<T> {
op->SetType("roi_pool_grad"); op->SetType("roi_pool_grad");
op->SetInput("X", this->Input("X")); op->SetInput("X", this->Input("X"));
op->SetInput("ROIs", this->Input("ROIs")); op->SetInput("ROIs", this->Input("ROIs"));
op->SetInput("RoisLod", this->Input("RoisLod"));
op->SetInput("Argmax", this->Output("Argmax")); op->SetInput("Argmax", this->Output("Argmax"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
...@@ -189,8 +194,10 @@ REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); ...@@ -189,8 +194,10 @@ REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
roi_pool, roi_pool,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>, ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>); ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, int>);
REGISTER_OP_CPU_KERNEL(
roi_pool_grad,
ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, int>);
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/operators/roi_pool_op.h"
#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/cuda_primitives.h"
...@@ -155,6 +155,23 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -155,6 +155,23 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
roi_batch_id_list.Resize({rois_num}); roi_batch_id_list.Resize({rois_num});
auto cplace = platform::CPUPlace(); auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace); int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto& dev_ctx = ctx.cuda_device_context();
auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
if (ctx.HasInput("RoisLod")) {
auto* rois_lod = ctx.Input<Tensor>("RoisLod");
int rois_batch_size = rois_lod->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size - 1, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
std::vector<int64_t> rois_lod_(rois_batch_size);
memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
sizeof(int64_t) * rois_batch_size, 0);
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
@@ -168,12 +185,10 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
roi_batch_id_data[i] = n;
}
}
}
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = memory::Alloc(dev_ctx, bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
@@ -191,6 +206,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<Tensor>("X");
auto* rois = ctx.Input<LoDTensor>("ROIs");
auto* rois_lod = ctx.Input<Tensor>("RoisLod");
auto* argmax = ctx.Input<Tensor>("Argmax");
auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -210,6 +226,22 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
roi_batch_id_list.Resize({rois_num});
auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto& dev_ctx = ctx.cuda_device_context();
auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
if (ctx.HasInput("RoisLod")) {
auto* rois_lod = ctx.Input<Tensor>("RoisLod");
int rois_batch_size = rois_lod->numel();
std::vector<int64_t> rois_lod_(rois_batch_size);
memory::Copy(cplace, rois_lod_.data(), gplace,
rois_lod->data<int64_t>(),
sizeof(int64_t) * rois_batch_size, 0);
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
@@ -217,12 +249,10 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
roi_batch_id_data[i] = n;
}
}
}
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = memory::Alloc(dev_ctx, bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
......
@@ -15,7 +15,9 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
...@@ -55,8 +57,22 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> { ...@@ -55,8 +57,22 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
int* roi_batch_id_data = int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace()); roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisLod")) {
auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
rois_batch_size = rois_lod_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size - 1, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
auto* rois_lod = rois_lod_t->data<int64_t>();
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
@@ -68,6 +84,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
roi_batch_id_data[i] = n;
}
}
}
T* output_data = out->mutable_data<T>(ctx.GetPlace());
int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
@@ -163,13 +180,25 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
int rois_batch_size;
if (ctx.HasInput("RoisLod")) {
auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
rois_batch_size = rois_lod_t->numel();
auto* rois_lod = rois_lod_t->data<int64_t>();
for (int n = 0; n < rois_batch_size - 1; ++n) {
for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
}
const T* rois_data = rois->data<T>();
const T* out_grad_data = out_grad->data<T>();
......
@@ -2779,6 +2779,8 @@ def generate_proposals(scores,
dtype=bbox_deltas.dtype)
rpn_roi_probs = helper.create_variable_for_type_inference(
dtype=scores.dtype)
rpn_rois_lod = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op(
type="generate_proposals",
inputs={
@@ -2795,12 +2797,16 @@ def generate_proposals(scores,
'min_size': min_size,
'eta': eta
},
outputs={
'RpnRois': rpn_rois,
'RpnRoiProbs': rpn_roi_probs,
'RpnRoisLod': rpn_rois_lod
})
rpn_rois.stop_gradient = True
rpn_roi_probs.stop_gradient = True
rpn_rois_lod.stop_gradient = True
return rpn_rois, rpn_roi_probs, rpn_rois_lod
def box_clip(input, im_info, name=None):
......
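With the extra RpnRoisLod output wired into the Python wrapper, fluid.layers.generate_proposals now returns three values instead of two. A minimal call sketch, assuming scores, bbox_deltas, im_info, anchors and variances are built exactly as they were before this change:

rpn_rois, rpn_roi_probs, rpn_rois_lod = fluid.layers.generate_proposals(
    scores=scores,
    bbox_deltas=bbox_deltas,
    im_info=im_info,
    anchors=anchors,
    variances=variances)
# rpn_rois_lod carries the per-image proposal bookkeeping and can be passed
# on to roi_align / roi_pool through their new rois_lod argument.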
@@ -6606,7 +6606,12 @@ def label_smooth(label,
@templatedoc()
def roi_pool(input,
rois,
pooled_height=1,
pooled_width=1,
spatial_scale=1.0,
rois_lod=None):
""" """
This operator implements the roi_pooling layer. This operator implements the roi_pooling layer.
Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7).
...@@ -6622,6 +6627,7 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): ...@@ -6622,6 +6627,7 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
Args: Args:
input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64. input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64.
rois (Variable): ROIs (Regions of Interest) to pool over. 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. rois (Variable): ROIs (Regions of Interest) to pool over. 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates.
rois_lod (Variable): The lod info of rois. Default: None
pooled_height (int, optional): The pooled output height, data type is int32. Default: 1
pooled_width (int, optional): The pooled output width, data type is int32. Default: 1
spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0
@@ -6644,19 +6650,22 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
input_data = np.array([i for i in range(1,17)]).reshape(1,1,4,4).astype(DATATYPE)
roi_data = fluid.create_lod_tensor(np.array([[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(DATATYPE),[[2]], place)
rois_lod_data = np.array([0, 2])
x = fluid.data(name='input', shape=[None,1,4,4], dtype=DATATYPE)
rois = fluid.data(name='roi', shape=[None,4], dtype=DATATYPE)
rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64')
pool_out = fluid.layers.roi_pool(
input=x,
rois=rois,
pooled_height=1,
pooled_width=1,
spatial_scale=1.0,
rois_lod=rois_lod)
exe = fluid.Executor(place)
out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_lod': rois_lod_data}, fetch_list=[pool_out.name])
print(out) #array([[[[11.]]], [[[16.]]]], dtype=float32)
print(np.array(out).shape) # (2, 1, 1, 1)
"""
@@ -6667,7 +6676,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
helper.append_op(
type="roi_pool",
inputs={"X": input,
"ROIs": rois,
"RoisLod": rois_lod},
outputs={"Out": pool_out, outputs={"Out": pool_out,
"Argmax": argmaxes}, "Argmax": argmaxes},
attrs={ attrs={
...@@ -6685,7 +6695,8 @@ def roi_align(input, ...@@ -6685,7 +6695,8 @@ def roi_align(input,
pooled_width=1, pooled_width=1,
spatial_scale=1.0, spatial_scale=1.0,
sampling_ratio=-1, sampling_ratio=-1,
name=None): name=None,
rois_lod=None):
""" """
${comment} ${comment}
...@@ -6696,6 +6707,7 @@ def roi_align(input, ...@@ -6696,6 +6707,7 @@ def roi_align(input,
data type is float32 or float64. Given as [[x1, y1, x2, y2], ...], data type is float32 or float64. Given as [[x1, y1, x2, y2], ...],
(x1, y1) is the top left coordinates, and (x2, y2) is the bottom (x1, y1) is the top left coordinates, and (x2, y2) is the bottom
right coordinates. right coordinates.
rois_lod (Variable): The lod info of rois. Default: None
pooled_height (int32, optional): ${pooled_height_comment} Default: 1
pooled_width (int32, optional): ${pooled_width_comment} Default: 1
spatial_scale (float32, optional): ${spatial_scale_comment} Default: 1.0
@@ -6718,12 +6730,14 @@ def roi_align(input,
name='data', shape=[None, 256, 32, 32], dtype='float32')
rois = fluid.data(
name='rois', shape=[None, 4], dtype='float32')
rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64')
align_out = fluid.layers.roi_align(input=x,
rois=rois,
pooled_height=7,
pooled_width=7,
spatial_scale=0.5,
sampling_ratio=-1,
rois_lod=rois_lod)
""" """
check_variable_and_dtype(input, 'input', ['float32', 'float64'], check_variable_and_dtype(input, 'input', ['float32', 'float64'],
'roi_align') 'roi_align')
...@@ -6734,7 +6748,8 @@ def roi_align(input, ...@@ -6734,7 +6748,8 @@ def roi_align(input,
helper.append_op( helper.append_op(
type="roi_align", type="roi_align",
inputs={"X": input, inputs={"X": input,
"ROIs": rois}, "ROIs": rois,
"RoisLod": rois_lod},
outputs={"Out": align_out}, outputs={"Out": align_out},
attrs={ attrs={
"pooled_height": pooled_height, "pooled_height": pooled_height,
......
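The updated roi_align docstring above shows graph construction but stops before execution; a minimal run sketch under the same variable names follows, with made-up feed data (the shapes and lod values are assumptions for illustration, not part of this change):

import numpy as np

exe = fluid.Executor(fluid.CPUPlace())
out, = exe.run(
    feed={
        'data': np.random.rand(2, 256, 32, 32).astype('float32'),
        'rois': np.random.rand(4, 4).astype('float32'),
        'rois_lod': np.array([0, 2, 4]).astype('int64'),
    },
    fetch_list=[align_out])
# out has shape (4, 256, 7, 7): one pooled feature map per ROI.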
@@ -480,7 +480,7 @@ class TestGenerateProposals(unittest.TestCase):
name='bbox_deltas',
shape=[num_anchors * 4, 8, 8],
dtype='float32')
rpn_rois, rpn_roi_probs, _ = fluid.layers.generate_proposals(
name='generate_proposals',
scores=scores,
bbox_deltas=bbox_deltas,
......
@@ -281,7 +281,9 @@ class TestGenerateProposalsOp(OpTest):
self.outputs = {
'RpnRois': (self.rpn_rois[0], [self.lod]),
'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]),
'RpnRoisLod': (np.asarray(
self.lod, dtype=np.int32))
} }
def test_check_output(self):
......
@@ -3217,7 +3217,9 @@ class TestBook(LayerTest):
x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
rois = layers.data(
name="rois", shape=[4], dtype="float32", lod_level=1)
rois_lod = layers.data(
name="rois_lod", shape=[None, ], dtype="int", lod_level=1)
output = layers.roi_pool(x, rois, 7, 7, 0.6, rois_lod)
return (output)
def test_sequence_enumerate(self):
@@ -3232,7 +3234,10 @@ class TestBook(LayerTest):
x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
rois = layers.data(
name="rois", shape=[4], dtype="float32", lod_level=1)
rois_lod = layers.data(
name="rois_lod", shape=[None, ], dtype="int", lod_level=1)
output = layers.roi_align(x, rois, 14, 14, 0.5, 2, 'roi_align',
rois_lod)
return (output)
def test_roi_perspective_transform(self):
......
@@ -26,7 +26,11 @@ class TestROIAlignOp(OpTest):
self.init_test_case()
self.make_rois()
self.calc_roi_align()
self.inputs = {
'X': self.x,
'ROIs': (self.rois[:, 1:5], self.rois_lod),
}
self.attrs = {
'spatial_scale': self.spatial_scale,
'pooled_height': self.pooled_height,
@@ -170,5 +174,34 @@ class TestROIAlignOp(OpTest):
self.check_grad(['X'], 'Out')
class TestROIAlignInLodOp(TestROIAlignOp):
def set_data(self):
self.init_test_case()
self.make_rois()
self.calc_roi_align()
seq_len = self.rois_lod[0]
cur_len = 0
lod = [cur_len]
for l in seq_len:
cur_len += l
lod.append(cur_len)
self.inputs = {
'X': self.x,
'ROIs': (self.rois[:, 1:5], self.rois_lod),
'RoisLod': np.asarray(lod).astype('int64')
}
self.attrs = {
'spatial_scale': self.spatial_scale,
'pooled_height': self.pooled_height,
'pooled_width': self.pooled_width,
'sampling_ratio': self.sampling_ratio
}
self.outputs = {'Out': self.out_data}
if __name__ == '__main__':
unittest.main()
@@ -28,7 +28,10 @@ class TestROIPoolOp(OpTest):
self.make_rois()
self.calc_roi_pool()
self.inputs = {
'X': self.x,
'ROIs': (self.rois[:, 1:5], self.rois_lod),
}
self.attrs = {
'spatial_scale': self.spatial_scale,
@@ -138,5 +141,33 @@ class TestROIPoolOp(OpTest):
self.check_grad(['X'], 'Out')
class TestROIPoolInLodOp(TestROIPoolOp):
def set_data(self):
self.init_test_case()
self.make_rois()
self.calc_roi_pool()
seq_len = self.rois_lod[0]
cur_len = 0
lod = [cur_len]
for l in seq_len:
cur_len += l
lod.append(cur_len)
self.inputs = {
'X': self.x,
'ROIs': (self.rois[:, 1:5], self.rois_lod),
'RoisLod': np.asarray(lod).astype('int64')
}
self.attrs = {
'spatial_scale': self.spatial_scale,
'pooled_height': self.pooled_height,
'pooled_width': self.pooled_width
}
self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
if __name__ == '__main__':
unittest.main()