diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc
index 85e15d0ed1be83992254b32e4aa178da7fa712a9..b301b3a926f89e2be01f752656303d5e4893fc68 100644
--- a/paddle/fluid/operators/prroi_pool_op.cc
+++ b/paddle/fluid/operators/prroi_pool_op.cc
@@ -39,6 +39,11 @@ class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "where (x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates. "
              "The roi batch index can be calculated from LoD.");
+    AddInput("BatchRoINums",
+             "(Tensor), "
+             "1-D tensor with shape [N], the number of"
+             " rois for each image in batch, where N is the batch size")
+        .AsDispensable();
     AddOutput("Out",
               "(Tensor), "
               "the output of PRROIPoolOp is a 4-D Tensor with shape "
@@ -75,39 +80,57 @@ class PRROIPoolOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of op(PRROIPool) should not be null.");
+                      platform::errors::NotFound(
+                          "Input(X) of op(PRROIPool) should not be null."));
     PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true,
-                      "Input(ROIs) of op(PRROIPool) should not be null.");
+                      platform::errors::NotFound(
+                          "Input(ROIs) of op(PRROIPool) should not be null."));
     PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of op(PRROIPool) should not be null.");
+                      platform::errors::NotFound(
+                          "Output(Out) of op(PRROIPool) should not be null."));
     auto input_dims = ctx->GetInputDim("X");
     auto rois_dims = ctx->GetInputDim("ROIs");
 
     PADDLE_ENFORCE_EQ(input_dims.size(), 4,
-                      "The format of input tensor is NCHW");
-    PADDLE_ENFORCE_EQ(rois_dims.size(), 2,
-                      "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-                      "given as [(x1, y1, x2, y2), ...]");
-    PADDLE_ENFORCE_EQ(rois_dims[1], 4,
-                      "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
-                      "given as [(x1, y1, x2, y2), ...]");
-
+                      platform::errors::InvalidArgument(
+                          "The format of input tensor is NCHW"));
+    PADDLE_ENFORCE_EQ(
+        rois_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+            "given as [(x1, y1, x2, y2), ...]"));
+    PADDLE_ENFORCE_EQ(
+        rois_dims[1], 4,
+        platform::errors::InvalidArgument(
+            "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+            "given as [(x1, y1, x2, y2), ...]"));
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
     float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
 
     PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must be greater than 0");
+                      platform::errors::InvalidArgument(
+                          "The pooled output height must be greater than 0"));
     PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must be greater than 0");
+                      platform::errors::InvalidArgument(
+                          "The pooled output width must be greater than 0"));
     PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
-                      "The spatial scale must greater than 0.");
+                      platform::errors::InvalidArgument(
+                          "The spatial scale must greater than 0."));
 
     auto out_dims = input_dims;
     out_dims[0] = rois_dims[0];
     out_dims[1] = input_dims[1];
     out_dims[2] = pooled_height;
     out_dims[3] = pooled_width;
+
+    if (ctx->HasInput("BatchRoINums")) {
+      auto rois_batch_index = ctx->GetInputDim("BatchRoINums");
+      PADDLE_ENFORCE_EQ(rois_batch_index[0], input_dims[0],
+                        platform::errors::InvalidArgument(
+                            "The length of BatchRoINums should equal to  "
+                            "first dim of inputs(X)"));
+    }
     ctx->SetOutputDim("Out", out_dims);
   }
 
@@ -154,6 +177,7 @@ class PRROIPoolGradMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("X", this->Input("X"));
     op->SetInput("Out", this->Output("Out"));
     op->SetInput("ROIs", this->Input("ROIs"));
+    op->SetInput("BatchRoINums", this->Input("BatchRoINums"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
     op->SetOutput(framework::GradVarName("ROIs"), this->InputGrad("ROIs"));
@@ -172,8 +196,12 @@ REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
     prroi_pool,
     ops::CPUPRROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPRROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::CPUPRROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::CPUPRROIPoolOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CPUPRROIPoolOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     prroi_pool_grad,
     ops::CPUPRROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::CPUPRROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::CPUPRROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::CPUPRROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CPUPRROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu
index 35180dc91ae2f50d21863c3bc8fdbd02b0add62e..caf6892a9879e0e9fe10610a57cbadef108f0f63 100644
--- a/paddle/fluid/operators/prroi_pool_op.cu
+++ b/paddle/fluid/operators/prroi_pool_op.cu
@@ -185,8 +185,8 @@ __global__ void GPUPRROIPoolBackward(
     PrRoIPoolingCoorBackward(
         s_w, e_w, s_h, e_h, width, height, win_start_w, win_start_h, win_end_w,
         win_end_h, pw, ph, pooled_width, pooled_height, win_size, spatial_scale,
-        offset_in_data, offset_out_data, offset_input_grad_data,
-        offset_input_roi_grad_data, GPUAccumulateRois<T>,
+        offset_in_data, offset_out_data, offset_input_roi_grad_data,
+        offset_output_grad_data, GPUAccumulateRois<T>,
         [](const T x, const T y) { return max(x, y); },
         [](const T x, const T y) { return min(x, y); });
   }
@@ -214,41 +214,66 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel<T> {
     int rois_num = rois->dims()[0];
     if (rois_num == 0) return;
 
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and input(X) batch_size must be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
-
     // set rois batch id
     framework::Tensor rois_batch_id_list;
     rois_batch_id_list.Resize({rois_num});
     int* rois_batch_id_data =
         rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        rois_batch_id_data[i] = n;
+
+    if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) {
+      auto* batchroinum = ctx.Input<Tensor>("BatchRoINums");
+      framework::Tensor batch_index_cpu;
+      framework::TensorCopySync(*batchroinum, platform::CPUPlace(),
+                                &batch_index_cpu);
+
+      int rois_batch_size = batchroinum->dims()[0];
+      auto* batch_index = batch_index_cpu.data<int64_t>();
+      size_t c = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int64_t k = 0; k < batch_index[n]; ++k) {
+          rois_batch_id_data[c] = n;
+          c = c + 1;
+        }
       }
-    }
 
-    framework::Tensor rois_batch_id_list_gpu;
-    framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                          ctx.device_context(), &rois_batch_id_list_gpu);
+    } else {
+      auto rois_lod = rois->lod().back();
+      int rois_batch_size = rois_lod.size() - 1;
+      PADDLE_ENFORCE_EQ(
+          rois_batch_size, batch_size,
+          platform::errors::InvalidArgument(
+              "The rois_batch_size and input(X) batch_size must be the same."));
+      int rois_num_with_lod = rois_lod[rois_batch_size];
+      PADDLE_ENFORCE_EQ(
+          rois_num, rois_num_with_lod,
+          platform::errors::InvalidArgument(
+              "The rois_num from input and lod must be the same."));
+
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+      }
+    }
 
     int output_size = out->numel();
     int blocks = NumBlocks(output_size);
     int threads = kNumCUDAThreads;
 
+    auto cplace = platform::CPUPlace();
+    auto& dev_ctx = ctx.cuda_device_context();
+    int bytes = rois_batch_id_list.numel() * sizeof(int);
+    auto roi_ptr = memory::Alloc(dev_ctx, bytes);
+    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    memory::Copy(gplace, roi_id_data, cplace, rois_batch_id_data, bytes,
+                 dev_ctx.stream());
+
     // call cuda kernel function
-    GPUPRROIPoolForward<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+    GPUPRROIPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
         output_size, in->data<T>(), rois->data<T>(), spatial_scale,
         input_channels, height, width, output_channels, pooled_height,
-        pooled_width, rois_batch_id_list_gpu.data<int>(),
-        out->mutable_data<T>(ctx.GetPlace()));
+        pooled_width, roi_id_data, out->mutable_data<T>(ctx.GetPlace()));
   }
 };
 
@@ -275,23 +300,50 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
     int height = in->dims()[2];
     int width = in->dims()[3];
 
-    if (input_grad) {
+    if (input_grad || input_roi_grad) {
       // set roi batch id
       framework::Tensor rois_batch_id_list;
       rois_batch_id_list.Resize({rois_num});
       int* rois_batch_id_data =
           rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
+
+      if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) {
+        auto* batchroinum = ctx.Input<Tensor>("BatchRoINums");
+        framework::Tensor batch_index_cpu;
+        framework::TensorCopySync(*batchroinum, platform::CPUPlace(),
+                                  &batch_index_cpu);
+
+        int rois_batch_size = batchroinum->dims()[0];
+        auto* batch_index = batch_index_cpu.data<int64_t>();
+        size_t c = 0;
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (int64_t k = 0; k < batch_index[n]; ++k) {
+            rois_batch_id_data[c] = n;
+            c = c + 1;
+          }
+        }
+      } else {
+        PADDLE_ENFORCE_EQ(rois->lod().empty(), false,
+                          platform::errors::InvalidArgument(
+                              "the lod of Input ROIs shoule not be empty when "
+                              "BatchRoINums is None!"));
+        auto rois_lod = rois->lod().back();
+        int rois_batch_size = rois_lod.size() - 1;
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+            rois_batch_id_data[i] = n;
+          }
         }
       }
 
-      framework::Tensor rois_batch_id_list_gpu;
-      framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
-                            ctx.device_context(), &rois_batch_id_list_gpu);
+      auto cplace = platform::CPUPlace();
+      auto& dev_ctx = ctx.cuda_device_context();
+      int bytes = rois_batch_id_list.numel() * sizeof(int);
+      auto roi_ptr = memory::Alloc(dev_ctx, bytes);
+      int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+      const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+      memory::Copy(gplace, roi_id_data, cplace, rois_batch_id_data, bytes,
+                   dev_ctx.stream());
 
       input_grad->mutable_data<T>(ctx.GetPlace());
       math::SetConstant<DeviceContext, T> set_zero;
@@ -304,12 +356,10 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
       int threads = kNumCUDAThreads;
 
       if (output_grad_size > 0) {
-        GPUPRROIPoolBackward<
-            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        GPUPRROIPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
             output_grad_size, in->data<T>(), rois->data<T>(),
             output_grad->data<T>(), spatial_scale, input_channels, height,
-            width, output_channels, pooled_height, pooled_width,
-            rois_batch_id_list_gpu.data<int>(),
+            width, output_channels, pooled_height, pooled_width, roi_id_data,
             input_grad->mutable_data<T>(ctx.GetPlace()), out->data<T>(),
             input_roi_grad->mutable_data<T>(ctx.GetPlace()));
       }
diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h
index 641309c730f97c879679440d3e57d13b8009e2a3..25f45d0b2c978df8629b590974a606cd492802b2 100644
--- a/paddle/fluid/operators/prroi_pool_op.h
+++ b/paddle/fluid/operators/prroi_pool_op.h
@@ -81,7 +81,7 @@ inline HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff,
                                                   const T coeff) {
   bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
   if (!overflow) {
-    *(diff + h * width + w) = top_diff * coeff;
+    *(diff + h * width + w) += top_diff * coeff;
   }
 }
 
@@ -179,7 +179,7 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(
     T win_start_h, T win_end_w, T win_end_h, int pw, int ph,
     const int pooled_width, const int pooled_height, T win_size,
     const float spatial_scale, const T* this_bottom_data,
-    const T* this_top_data, T* this_data_grad, T* this_out_grad,
+    const T* this_top_data, T* this_data_grad, const T* this_out_grad,
     Functor functor, MaxFunctor maxFunctor, MinFunctor minFunctor) {
   T g_x1_y = 0.f;
   T g_x2_y = 0.f;
@@ -232,20 +232,19 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(
   partial_y1 = partial_y1 / win_size * spatial_scale;
   partial_y2 = partial_y2 / win_size * spatial_scale;
 
-  this_data_grad[0] = 0;
-  functor(this_data_grad + 1,
+  functor(this_data_grad + 0,
           (partial_x1 * (1.0 - static_cast<T>(pw) / pooled_width) +
            partial_x2 * (1.0 - static_cast<T>(pw + 1) / pooled_width)) *
               (*this_out_grad));
-  functor(this_data_grad + 2,
+  functor(this_data_grad + 1,
           (partial_y1 * (1.0 - static_cast<T>(ph) / pooled_height) +
            partial_y2 * (1.0 - static_cast<T>(ph + 1) / pooled_height)) *
               (*this_out_grad));
-  functor(this_data_grad + 3,
+  functor(this_data_grad + 2,
           (partial_x2 * static_cast<T>(pw + 1) / pooled_width +
            partial_x1 * static_cast<T>(pw) / pooled_width) *
               (*this_out_grad));
-  functor(this_data_grad + 4,
+  functor(this_data_grad + 3,
           (partial_y2 * static_cast<T>(ph + 1) / pooled_height +
            partial_y1 * static_cast<T>(ph) / pooled_height) *
               (*this_out_grad));
@@ -262,7 +261,6 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel<T> {
     auto pooled_height = ctx.Attr<int>("pooled_height");
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
-
     auto in_dims = in->dims();
     int batch_size = in_dims[0];
     int input_channels = in_dims[1];
@@ -270,6 +268,7 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel<T> {
     int height = in_dims[2];
     int width = in_dims[3];
     int rois_num = rois->dims()[0];
+    if (rois_num == 0) return;
 
     auto in_stride = framework::stride(in_dims);
     auto out_stride = framework::stride(out->dims());
@@ -280,26 +279,44 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel<T> {
     rois_batch_id_list.Resize({rois_num});
     int* rois_batch_id_data =
         rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
+    if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) {
+      auto* batchroinum = ctx.Input<framework::Tensor>("BatchRoINums");
+      auto* batch_index = batchroinum->data<int64_t>();
+      int rois_batch_size = batchroinum->dims()[0];
+      size_t c = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int64_t k = 0; k < batch_index[n]; ++k) {
+          rois_batch_id_data[c] = n;
+          c = c + 1;
+        }
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(rois->lod().empty(), false,
+                        platform::errors::InvalidArgument(
+                            "the lod of Input ROIs shoule not be empty when "
+                            "BatchRoINums is None!"));
+      auto rois_lod = rois->lod().back();
+      int rois_batch_size = rois_lod.size() - 1;
+      PADDLE_ENFORCE_EQ(
+          rois_batch_size, batch_size,
+          platform::errors::InvalidArgument("the rois_batch_size and input(X) "
+                                            "batch_size should be the same."));
+      int rois_num_with_lod = rois_lod[rois_batch_size];
+      PADDLE_ENFORCE_EQ(
+          rois_num_with_lod, rois_num,
+          platform::errors::InvalidArgument(
+              "the rois_num from input and lod must be the same"));
 
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "the rois_batch_size and input(X) batch_size should be the same.");
-    int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num,
-                      "the rois_num from input and lod must be the same");
-
-    // calculate batch id index for each roi according to LoD
-    for (int n = 0; n < rois_batch_size; ++n) {
-      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-        rois_batch_id_data[i] = n;
+      // calculate batch id index for each roi according to LoD
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
       }
     }
 
     T* output_data = out->mutable_data<T>(ctx.GetPlace());
     const T* input_rois = rois->data<T>();
-
     // calculate prroipooling, parallel processing can be implemented per ROI
     for (int n = 0; n < rois_num; ++n) {
       // set roi batch id
@@ -390,7 +407,7 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
 
-    if (input_grad && input_roi_grad) {
+    if (input_grad || input_roi_grad) {
       auto in_dims = in->dims();
       auto* in_data = in->data<T>();
       auto* out_data = out->data<T>();
@@ -406,24 +423,42 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
       rois_batch_id_list.Resize({rois_num});
       int* rois_batch_id_data =
           rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
-      auto rois_lod = rois->lod().back();
-      int rois_batch_size = rois_lod.size() - 1;
-      // calculate batch id index for each roi according to LoD
-      for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
-          rois_batch_id_data[i] = n;
+      if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) {
+        auto* batchroinum = ctx.Input<framework::Tensor>("BatchRoINums");
+        auto* batch_index = batchroinum->data<int64_t>();
+        int rois_batch_size = batchroinum->dims()[0];
+        size_t c = 0;
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (int64_t k = 0; k < batch_index[n]; ++k) {
+            rois_batch_id_data[c] = n;
+            c = c + 1;
+          }
+        }
+      } else {
+        auto rois_lod = rois->lod().back();
+        int rois_batch_size = rois_lod.size() - 1;
+        // calculate batch id index for each roi according to LoD
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+            rois_batch_id_data[i] = n;
+          }
         }
       }
 
       const T* input_rois = rois->data<T>();
       const T* output_grad_data = output_grad->data<T>();
-      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-      T* input_roi_grad_data = input_roi_grad->mutable_data<T>(ctx.GetPlace());
 
+      input_grad->mutable_data<T>(ctx.GetPlace());
+      input_roi_grad->mutable_data<T>(ctx.GetPlace());
       // set gradient of X to be 0. before backpropagate.
       math::SetConstant<DeviceContext, T> set_zero;
       set_zero(ctx.template device_context<DeviceContext>(), input_grad,
                static_cast<T>(0));
+      set_zero(ctx.template device_context<DeviceContext>(), input_roi_grad,
+               static_cast<T>(0));
+
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      T* input_roi_grad_data = input_roi_grad->mutable_data<T>(ctx.GetPlace());
 
       // backpropagate gradient per output pixel
       int output_grad_size = output_grad->numel();
@@ -493,7 +528,7 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
             s_w, e_w, s_h, e_h, width, height, win_start_w, win_start_h,
             win_end_w, win_end_h, pw, ph, pooled_width, pooled_height, win_size,
             spatial_scale, offset_in_data, offset_out_data,
-            offset_input_grad_data, offset_input_roi_grad_data,
+            offset_input_roi_grad_data, offset_output_grad_data,
             CPUAccumulateRois<T>,
             [](const T x, const T y) { return std::max(x, y); },
             [](const T x, const T y) { return std::min(x, y); });
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index c1fdf12df73e13230cc1d8cb5f4733f241460416..f61a4d04163d8e3241214599567c480ddf0da84f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -12655,35 +12655,53 @@ def prroi_pool(input,
                spatial_scale=1.0,
                pooled_height=1,
                pooled_width=1,
+               batch_roi_nums=None,
                name=None):
     """
-    The precise roi pooling implementation for paddle?https://arxiv.org/pdf/1807.11590.pdf
+    The precise roi pooling implementation for paddle. Reference: https://arxiv.org/pdf/1807.11590.pdf
 
     Args:
-        input (Variable):The input of Deformable PSROIPooling.The shape of input tensor is
+        input (Variable): The input of precise roi pooling. The shape of input tensor is
                         [N,C,H,W]. Where N is batch size,C is number of input channels,H
                         is height of the feature, and W is the width of the feature.
         rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                        a 2-D LoDTensor of shape (num_rois, 4), the lod level
-                        is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
+                        a 2-D LoDTensor or Tensor of shape (num_rois, 4), the lod level
+                        is 1 when it is LoDTensor. The LoD includes the batch index
+                        information of the rois. If rois is Tensor, its batch index information
+                        should be provided by batch_roi_nums.
+                        Given as [[x1, y1, x2, y2], ...], (x1, y1) is
                         the top left coordinates, and (x2, y2) is the bottom
                         right coordinates.
         spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width).
                              Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
         pooled_height (integer): The pooled output height. Default: 1.
         pooled_width (integer): The pooled output width. Default: 1.
+        batch_roi_nums (Variable): The number of rois for each image in the batch. It
+                         should be a 1-D Tensor with shape [N] and dtype int64,
+                         where N is the batch size. Default: None. Note: the LoD of rois should be
+                         empty when batch_roi_nums is provided.
         name (str, default None): The name of this operation.
 
     Returns:
-        Variable(Tensor): The shape of the returned Tensor is (num_rois, output_channels, pooled_h, pooled_w), with value type float32,float16..
+        Variable(Tensor): The shape of the returned Tensor is (num_rois, C, pooled_height, pooled_width), with value type float32,float16. num_rois is the number of rois and C is the number of channels of input.
 
     Examples:
         .. code-block:: python
 
+            ## prroi_pool without batch_roi_nums
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[490, 28, 28], dtype='float32')
-            rois = fluid.layers.data(name='rois', shape=[4], lod_level=1, dtype='float32')
+            x = fluid.data(name='x', shape=[None, 490, 28, 28], dtype='float32')
+            rois = fluid.data(name='rois', shape=[None, 4], lod_level=1, dtype='float32')
             pool_out = fluid.layers.prroi_pool(x, rois, 1.0, 7, 7)
+            
+            ## prroi_pool with batch_roi_nums
+            batchsize=4
+            x2 = fluid.data(name='x2', shape=[batchsize, 490, 28, 28], dtype='float32')
+            rois2 = fluid.data(name='rois2', shape=[batchsize, 4], dtype='float32')
+            batch_rois_num = fluid.data(name='rois_nums', shape=[batchsize], dtype='int64')
+            pool_out2 = fluid.layers.prroi_pool(x2, rois2, 1.0, 7, 7, batch_roi_nums=batch_rois_num)
+
+
     """
     helper = LayerHelper('prroi_pool', **locals())
     # check attrs
@@ -12695,10 +12713,12 @@ def prroi_pool(input,
         raise TypeError("pooled_width must be int type")
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
+    inputs_op = {'X': input, 'ROIs': rois}
+    if batch_roi_nums is not None:
+        inputs_op['BatchRoINums'] = batch_roi_nums
     helper.append_op(
         type='prroi_pool',
-        inputs={'X': input,
-                'ROIs': rois},
+        inputs=inputs_op,
         outputs={'Out': out},
         attrs={
             'spatial_scale': spatial_scale,
diff --git a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
index e3bfa062a3e1b78cb974f4916f214ae20ff7bb4d..cf9d69247f124112ba784a083c8569481b521bb4 100644
--- a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py
@@ -19,6 +19,7 @@ import unittest
 from py_precise_roi_pool import PyPrRoIPool
 from op_test import OpTest
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid import compiler, Program, program_guard
 
 
@@ -29,7 +30,7 @@ class TestPRROIPoolOp(OpTest):
         self.prRoIPool = PyPrRoIPool()
         self.outs = self.prRoIPool.compute(
             self.x, self.rois, self.output_channels, self.spatial_scale,
-            self.pooled_height, self.pooled_width).astype('float32')
+            self.pooled_height, self.pooled_width).astype('float64')
         self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
         self.attrs = {
             'output_channels': self.output_channels,
@@ -42,17 +43,17 @@ class TestPRROIPoolOp(OpTest):
     def init_test_case(self):
         self.batch_size = 3
         self.channels = 3 * 2 * 2
-        self.height = 6
-        self.width = 4
+        self.height = 12
+        self.width = 16
 
         self.x_dim = [self.batch_size, self.channels, self.height, self.width]
 
-        self.spatial_scale = 1.0 / 4.0
+        self.spatial_scale = 1.0 / 2.0
         self.output_channels = self.channels
-        self.pooled_height = 2
-        self.pooled_width = 2
+        self.pooled_height = 4
+        self.pooled_width = 4
 
-        self.x = np.random.random(self.x_dim).astype('float32')
+        self.x = np.random.random(self.x_dim).astype('float64')
 
     def make_rois(self):
         rois = []
@@ -72,7 +73,7 @@ class TestPRROIPoolOp(OpTest):
                 roi = [bno, x1, y1, x2, y2]
                 rois.append(roi)
         self.rois_num = len(rois)
-        self.rois = np.array(rois).astype('float32')
+        self.rois = np.array(rois).astype('float64')
 
     def setUp(self):
         self.op_type = 'prroi_pool'
@@ -82,17 +83,20 @@ class TestPRROIPoolOp(OpTest):
         self.check_output()
 
     def test_backward(self):
-        for place in self._get_places():
-            self._get_gradient(['X'], place, ["Out"], None)
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.check_grad_with_place(place, ['X'], 'Out')
 
     def run_net(self, place):
         with program_guard(Program(), Program()):
             x = fluid.layers.data(
                 name="X",
                 shape=[self.channels, self.height, self.width],
-                dtype="float32")
+                dtype="float64")
             rois = fluid.layers.data(
-                name="ROIs", shape=[4], dtype="float32", lod_level=1)
+                name="ROIs", shape=[4], dtype="float64", lod_level=1)
             output = fluid.layers.prroi_pool(x, rois, 0.25, 2, 2)
             loss = fluid.layers.mean(output)
             optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
@@ -116,9 +120,127 @@ class TestPRROIPoolOp(OpTest):
     def test_errors(self):
         with program_guard(Program(), Program()):
             x = fluid.layers.data(
-                name="x", shape=[245, 30, 30], dtype="float32")
+                name="x", shape=[245, 30, 30], dtype="float64")
+            rois = fluid.layers.data(
+                name="rois", shape=[4], dtype="float64", lod_level=1)
+            # spatial_scale must be float type
+            self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 2, 7,
+                              7)
+            # pooled_height must be int type
+            self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 0.25,
+                              0.7, 7)
+            # pooled_width must be int type
+            self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 0.25,
+                              7, 0.7)
+
+
+class TestPRROIPoolOpTensorRoIs(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.prRoIPool = PyPrRoIPool()
+        self.outs = self.prRoIPool.compute(
+            self.x, self.rois, self.output_channels, self.spatial_scale,
+            self.pooled_height, self.pooled_width).astype('float64')
+
+        self.rois_index = np.array(self.rois_lod).reshape([-1]).astype(np.int64)
+        self.inputs = {
+            'X': self.x,
+            'ROIs': self.rois[:, 1:5],
+            'BatchRoINums': self.rois_index
+        }
+        self.attrs = {
+            'output_channels': self.output_channels,
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width
+        }
+        self.outputs = {'Out': self.outs}
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.channels = 3 * 2 * 2
+        self.height = 12
+        self.width = 16
+
+        self.x_dim = [self.batch_size, self.channels, self.height, self.width]
+
+        self.spatial_scale = 1.0 / 2.0
+        self.output_channels = self.channels
+        self.pooled_height = 4
+        self.pooled_width = 4
+
+        self.x = np.random.random(self.x_dim).astype('float64')
+
+    def make_rois(self):
+        rois = []
+        self.rois_lod = []
+        for bno in range(self.batch_size):
+            self.rois_lod.append(bno + 1)
+            for i in range(bno + 1):
+                x1 = np.random.uniform(
+                    0, self.width // self.spatial_scale - self.pooled_width)
+                y1 = np.random.uniform(
+                    0, self.height // self.spatial_scale - self.pooled_height)
+
+                x2 = np.random.uniform(x1 + self.pooled_width,
+                                       self.width // self.spatial_scale)
+                y2 = np.random.uniform(y1 + self.pooled_height,
+                                       self.height // self.spatial_scale)
+                roi = [bno, x1, y1, x2, y2]
+                rois.append(roi)
+        self.rois_num = len(rois)
+        self.rois = np.array(rois).astype('float64')
+
+    def setUp(self):
+        self.op_type = 'prroi_pool'
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_backward(self):
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+    def run_net(self, place):
+        with program_guard(Program(), Program()):
+            x = fluid.layers.data(
+                name="X",
+                shape=[self.channels, self.height, self.width],
+                dtype="float64")
+            rois = fluid.layers.data(name="ROIs", shape=[4], dtype="float64")
+            rois_index = fluid.layers.data(
+                name='rois_idx', shape=[], dtype="int64")
+            output = fluid.layers.prroi_pool(
+                x, rois, 0.25, 2, 2, batch_roi_nums=rois_index)
+            loss = fluid.layers.mean(output)
+            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
+            optimizer.minimize(loss)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            exe.run(fluid.default_main_program(), {
+                'X': self.x,
+                "ROIs": self.rois[:, 1:5],
+                "rois_idx": self.rois_index
+            })
+
+    def test_net(self):
+        places = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.run_net(place)
+
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            x = fluid.layers.data(
+                name="x", shape=[245, 30, 30], dtype="float64")
             rois = fluid.layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
+                name="rois", shape=[4], dtype="float64", lod_level=1)
             # spatial_scale must be float type
             self.assertRaises(TypeError, fluid.layers.prroi_pool, x, rois, 2, 7,
                               7)