fix some code issues

ef905598 · wanghaox · wanghaox · 36dd770a · 6ab78aee · ef905598
5 changed files
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
 #include <paddle/capi.h>
 #include <time.h>
+
 #include "../common/common.h"

 #define CONFIG_BIN "./trainer_config.bin"
@@ -27,20 +28,19 @@ int main() {
  CHECK(paddle_arguments_resize(in_args, 1));

  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
                                           /* size */ 784,
                                           /* useGPU */ false);
  srand(time(0));

-  std::vector<paddle_real> input;
-  input.resize(784 * 10);
+  paddle_real* array;
+
+  // Get First row.
+  CHECK(paddle_matrix_get_row(mat, 0, &array));

-  for (int i = 0; i < input.size(); ++i) {
-    input[i] = rand() / ((float)RAND_MAX);
+  for (int i = 0; i < 784; ++i) {
+    array[i] = rand() / ((float)RAND_MAX);
  }
-  
-  // Set value for the input matrix
-  CHECK(paddle_matrix_set_value(mat, input.data()));

  CHECK(paddle_arguments_set_value(in_args, 0, mat));

@@ -53,17 +53,18 @@ int main() {

  CHECK(paddle_arguments_get_value(out_args, 0, prob));

-  std::std::vector<paddle_real> result;
-  int height;
-  int width;
+  uint64_t height;
+  uint64_t width;

-  CHECK(paddle_matrix_get_shape(prob, &height, &width);
-  result.resize(height * width);
-  CHECK(paddle_matrix_get_value(prob, result.data()));
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
+  CHECK(paddle_matrix_get_row(prob, 0, &array));

-  printf("Prob: ");
+  printf("Prob: \n");
  for (int i = 0; i < height * width; ++i) {
-    printf("%.2f ", result[i]);
+    printf("%.4f ", array[i]);
+    if ((i + 1) % width == 0) {
+      printf("\n");
+    }
  }
  printf("\n");


--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -17,24 +17,47 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-class RoiPoolOp : public framework::OperatorWithKernel {
+class ROIPoolOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of RoiPoolOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Rois"),
-                   "Input(Rois) of RoiPoolOp should not be null.");
+                   "Input(X) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of ROIPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of RoiPoolOp should not be null.");
+                   "Output(Out) of ROIPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Argmax"),
-                   "Output(Argmax) of RoiPoolOp should not be null.");
+                   "Output(Argmax) of ROIPoolOp should not be null.");
    auto input_dims = ctx->GetInputDim("X");
-
-    // Initialize the output's dims to maximum,
-    // and re-set to real dims by the value of Rois at kernel
-    ctx->SetOutputDim("Out", input_dims);
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    PADDLE_ENFORCE(rois_dims.size() == 2 && rois_dims[1] == 5,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5) "
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
+
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must be greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must be greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must be greater than 0");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = input_dims[1];
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputDim("Argmax", out_dims);
    }

 protected:
@@ -46,7 +69,7 @@ class RoiPoolOp : public framework::OperatorWithKernel {
  }
 };

-class RoiPoolGradOp : public framework::OperatorWithKernel {
+class ROIPoolGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -67,44 +90,51 @@ class RoiPoolGradOp : public framework::OperatorWithKernel {
  }
 };

-class RoiPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RoiPoolOpMaker(framework::OpProto* proto,
+  ROIPoolOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor), "
-             "the input of RoiPoolOp.");
-    AddInput("Rois",
+             "the input of ROIPoolOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
             "(Tensor), "
-             "RoIs (Regions of Interest) to pool over. "
-             "Should be a 2-D tensor of shape (num_rois, 5)"
-             "given as [[batch_id, x1, y1, x2, y2], …].");
+             "ROIs (Regions of Interest) to pool over. "
+             "should be a 2-D tensor of shape (num_rois, 5)"
+             "given as [[batch_id, x1, y1, x2, y2], …]. "
+             "Where batch_id is the id of the data, "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates.");
    AddOutput("Out",
              "(Tensor), "
-             "RoI pooled output 4-D tensor of shape "
-             "(num_rois, channels, pooled_h, pooled_w).");
+              "The output of ROIPoolOp is a 4-D tensor with shape "
+              "(num_rois, channels, pooled_h, pooled_w).");
    AddOutput("Argmax",
              "(Tensor), "
              "Argmaxes corresponding to indices in X used "
              "for gradient computation. Only output "
              "if arg “is_test” is false.").AsIntermediate();
    AddAttr<float>("spatial_scale",
-                      "(float, default 1.0), "
-                      "Multiplicative spatial scale factor "
-                      "to translate ROI coords from their input scale "
-                      "to the scale used when pooling.")
-                      .SetDefault(1.0);
+                   "(float, default 1.0), "
+                   "Multiplicative spatial scale factor "
+                   "to translate ROI coords from their input scale "
+                   "to the scale used when pooling.")
+                   .SetDefault(1.0);
    AddAttr<int>("pooled_height",
-                      "(int, default 1), "
-                      "The pooled output height.")
-                    .SetDefault(1);
+                 "(int, default 1), "
+                 "The pooled output height.")
+                 .SetDefault(1);
    AddAttr<int>("pooled_width",
-                      "(int, default 1), "
-                      "The pooled output width.")
-                    .SetDefault(1);
+                 "(int, default 1), "
+                 "The pooled output width.")
+                 .SetDefault(1);
    AddComment(R"DOC(
-RoiPool operator
+ROIPool operator

 ROI Pooling for Faster-RCNN. The link below is a further introduction: 
 https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
@@ -116,11 +146,11 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(roi_pool, ops::RoiPoolOp, ops::RoiPoolOpMaker,
-            roi_pool_grad, ops::RoiPoolGradOp);
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
+            roi_pool_grad, ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
    roi_pool,
-    ops::CPURoiPoolOpKernel<paddle::platform::CPUPlace, float>);
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
    roi_pool_grad,
-    ops::CPURoiPoolGradOpKernel<paddle::platform::CPUPlace, float>);
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
@@ -12,91 +12,80 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/platform/cuda_helper.h"
 #include "paddle/operators/roi_pool_op.h"
+#include "paddle/platform/cuda_helper.h"

 namespace paddle {
 namespace operators {

-#define FLT_MAX __FLT_MAX__
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+static constexpr int kROISize = 5;

-constexpr int PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS = 512;
-constexpr int PADDLE_OPERATORS_ROIPOOL_MAXIMUM_NUM_BLOCKS = 4096;
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}

-inline int PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(const int N) {
-  return std::min((N + PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS - 1)
-                  / PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS,
-                  PADDLE_OPERATORS_ROIPOOL_MAXIMUM_NUM_BLOCKS);
-}
+  template <typename T>
+  __global__ void GPUROIPoolForward(
+      const int nthreads, const T* input_data, const int64_t* input_rois,
+      const float spatial_scale, const int channels, const int height,
+      const int width, const int pooled_height, const int pooled_width,
+      T* output_data, int64_t* argmax_data) {
+    // Advance `index` itself per pass: every use below reads it.
+    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+         index += blockDim.x * gridDim.x) {
+      int pw = index % pooled_width;
+      int ph = (index / pooled_width) % pooled_height;
+      int c = (index / pooled_width / pooled_height) % channels;
+      int n = index / pooled_width / pooled_height / channels;

-template <typename T>
-__global__ void GPURoiPoolForward(
-    const int nthreads,
-    const T* input_data,
-    const int64_t* input_rois,
-    const float spatial_scale,
-    const int channels,
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    T* output_data,
-    int64_t* argmax_data) {
-      int index = blockIdx.x * blockDim.x + threadIdx.x;
-      int offset = blockDim.x * gridDim.x;
-      for (size_t i = index; i < nthreads; i += offset) {
-        int pw = index % pooled_width;
-        int ph = (index / pooled_width) % pooled_height;
-        int c = (index / pooled_width / pooled_height) % channels;
-        int n = index / pooled_width / pooled_height / channels;
-
-        const int64_t* offset_input_rois = input_rois + n * 5;
-        int roi_batch_ind = offset_input_rois[0];
-        int roi_start_w = round(offset_input_rois[1] * spatial_scale);
-        int roi_start_h = round(offset_input_rois[2] * spatial_scale);
-        int roi_end_w = round(offset_input_rois[3] * spatial_scale);
-        int roi_end_h = round(offset_input_rois[4] * spatial_scale);
-
-        int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-        int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-        T bin_size_h = static_cast<T>(roi_height)
-                      / static_cast<T>(pooled_height);
-        T bin_size_w = static_cast<T>(roi_width)
-                      / static_cast<T>(pooled_width);
-
-        int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
-        int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
-        int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
-        int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
-
-        hstart = min(max(hstart + roi_start_h, 0), height);
-        hend = min(max(hend + roi_start_h, 0), height);
-        wstart = min(max(wstart + roi_start_w, 0), width);
-        wend = min(max(wend + roi_start_w, 0), width);
-        bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-        T maxval = is_empty ? 0 : -FLT_MAX;
-        int maxidx = -1;
-        const T* offset_input_data =
-            input_data + (roi_batch_ind * channels + c) * height * width;
-        for (int h = hstart; h < hend; ++h) {
-          for (int w = wstart; w < wend; ++w) {
-            int input_data_index = h * width + w;
-            if (offset_input_data[input_data_index] > maxval) {
-              maxval = offset_input_data[input_data_index];
-              maxidx = input_data_index;
-            }
+      const int64_t* offset_input_rois = input_rois + n * kROISize;
+      int roi_batch_ind = offset_input_rois[0];
+      int roi_start_w = round(offset_input_rois[1] * spatial_scale);
+      int roi_start_h = round(offset_input_rois[2] * spatial_scale);
+      int roi_end_w = round(offset_input_rois[3] * spatial_scale);
+      int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+
+      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+      int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+      int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
+      int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
+      int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
+      int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
+
+      hstart = min(max(hstart + roi_start_h, 0), height);
+      hend = min(max(hend + roi_start_h, 0), height);
+      wstart = min(max(wstart + roi_start_w, 0), width);
+      wend = min(max(wend + roi_start_w, 0), width);
+      bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+      T maxval = is_empty ? 0 : -std::numeric_limits<float>::max();
+      int maxidx = -1;
+      const T* offset_input_data =
+          input_data + (roi_batch_ind * channels + c) * height * width;
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          int input_data_index = h * width + w;
+          if (offset_input_data[input_data_index] > maxval) {
+            maxval = offset_input_data[input_data_index];
+            maxidx = input_data_index;
          }
        }
-        output_data[index] = maxval;
-        if (argmax_data) {
-          argmax_data[index] = maxidx;
-        }
+      }
+      output_data[index] = maxval;
+      if (argmax_data) {
+        argmax_data[index] = maxidx;
+      }
    }
  }

 template <typename T>
-__global__ void GPURoiPoolBackward(
+__global__ void GPUROIPoolBackward(
    const int nthreads,
    const int64_t* input_rois,
    const T* output_grad,
@@ -117,7 +106,7 @@ __global__ void GPURoiPoolBackward(
      int c = (index / pooled_width / pooled_height) % channels;
      int n = index / pooled_width / pooled_height / channels;

-      const int64_t* offset_input_rois = input_rois + n * 5;
+      const int64_t* offset_input_rois = input_rois + n * kROISize;
      int roi_batch_ind = offset_input_rois[0];
      int input_offset = (roi_batch_ind * channels + c) * height * width;
      int output_offset = (n * channels + c) * pooled_height * pooled_width;
@@ -135,11 +124,11 @@ __global__ void GPURoiPoolBackward(


 template <typename Place, typename T>
-class GPURoiPoolOpKernel : public framework::OpKernel<T> {
+class GPUROIPoolOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
    auto* out = ctx.Output<Tensor>("Out");
    auto* argmax = ctx.Output<Tensor>("Argmax");

@@ -147,31 +136,17 @@ class GPURoiPoolOpKernel : public framework::OpKernel<T> {
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0,
-                      "The spatial scale must greater than 0");
-
    auto in_dims = in->dims();
    auto in_stride = framework::stride(in_dims);
    int channels = in_dims[1];
    int height = in_dims[2];
    int width = in_dims[3];

-    int rois_num = rois->dims()[0];
-    auto out_dims = in_dims;
-    out_dims[0] = rois_num;
-    out_dims[1] = in_dims[1];
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
+    size_t rois_num = rois->dims()[0];

-    out->Resize(out_dims);
    out->mutable_data<T>(ctx.GetPlace());
    math::SetConstant<Place, T> set_zero;
    set_zero(ctx.device_context(), out, static_cast<T>(0));
-    argmax->Resize(out->dims());
    argmax->mutable_data<int64_t>(ctx.GetPlace());
    math::SetConstant<Place, int64_t> set_init;
    set_init(ctx.device_context(), argmax, static_cast<int64_t>(-1));
@@ -179,10 +154,10 @@ class GPURoiPoolOpKernel : public framework::OpKernel<T> {
    if (rois_num== 0) return;

    int output_size = out->numel();
-    int blocks = PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(output_size);
-    int threads = PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS;
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;

-    GPURoiPoolForward<T>
+    GPUROIPoolForward<T>
      <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
      output_size,
      in->data<T>(),
@@ -195,17 +170,15 @@ class GPURoiPoolOpKernel : public framework::OpKernel<T> {
      pooled_width,
      out->mutable_data<T>(ctx.GetPlace()),
      argmax->mutable_data<int64_t>(ctx.GetPlace()));
-
-      return;
  }
 };

 template <typename Place, typename T>
-class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
+class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
    auto* argmax = ctx.Input<Tensor>("Argmax");

    auto* out_grad =
@@ -217,23 +190,22 @@ class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

-    int rois_num = rois->dims()[0];
+    size_t rois_num = rois->dims()[0];
    int channels = in->dims()[1];
    int height = in->dims()[2];
    int width = in->dims()[3];

    if (x_grad) {
-      x_grad->Resize(in->dims());
      x_grad->mutable_data<T>(ctx.GetPlace());
      math::SetConstant<Place, T> set_zero;
      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));

      int output_grad_size = out_grad->numel();
-      int blocks = PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(output_grad_size);
-      int threads = PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS;
+      int blocks = NumBlocks(output_grad_size);
+      int threads = kNumCUDAThreads;

      if (output_grad_size > 0) {
-        GPURoiPoolBackward<T>
+        GPUROIPoolBackward<T>
          <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
          output_grad_size,
          rois->data<int64_t>(),
@@ -248,7 +220,6 @@ class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
          pooled_width,
          x_grad->mutable_data<T>(ctx.GetPlace()));
        }
-      return;
    }
  }
 };
@@ -259,7 +230,7 @@ class GPURoiPoolGradOpKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
    roi_pool,
-    ops::GPURoiPoolOpKernel<paddle::platform::GPUPlace, float>);
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
    roi_pool_grad,
-    ops::GPURoiPoolGradOpKernel<paddle::platform::GPUPlace, float>);
+    ops::GPUROIPoolGradOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
@@ -25,11 +25,11 @@ using LoDTensor = framework::LoDTensor;
 using LoD = framework::LoD;

 template <typename Place, typename T>
-class CPURoiPoolOpKernel : public framework::OpKernel<T> {
+class CPUROIPoolOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
    auto* out = ctx.Output<Tensor>("Out");
    auto* argmax = ctx.Output<Tensor>("Argmax");

@@ -37,13 +37,6 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

-    PADDLE_ENFORCE_GT(pooled_height, 0,
-                      "The pooled output height must greater than 0");
-    PADDLE_ENFORCE_GT(pooled_width, 0,
-                      "The pooled output width must greater than 0");
-    PADDLE_ENFORCE_GT(spatial_scale, 0,
-                      "The spatial scale must greater than 0");
-
    auto in_dims = in->dims();
    int batch_size = in_dims[0];
    int channels = in_dims[1];
@@ -51,18 +44,10 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {
    int width = in_dims[3];
    int rois_num = rois->dims()[0];

-    auto out_dims = in_dims;
-    out_dims[0] = rois_num;
-    out_dims[1] = channels;
-    out_dims[2] = pooled_height;
-    out_dims[3] = pooled_width;
-    out->Resize(out_dims);
-    argmax->Resize(out->dims());
-
    auto in_stride = framework::stride(in_dims);
    auto argmax_stride = framework::stride(argmax->dims());
    auto roi_stride = framework::stride(rois->dims());
-    auto out_stride = framework::stride(out_dims);
+    auto out_stride = framework::stride(out->dims());

    const T* input_data = in->data<T>();
    const int64_t* rois_data = rois->data<int64_t>();
@@ -124,7 +109,8 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {

            // Define an empty pooling region to be zero
            bool is_empty = (hend <= hstart) || (wend <= wstart);
-            output_data[pool_index] = is_empty ? 0 : -__FLT_MAX__;
+            output_data[pool_index] =
+                is_empty ? 0 : -std::numeric_limits<float>::max();

            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
@@ -150,11 +136,11 @@ class CPURoiPoolOpKernel : public framework::OpKernel<T> {
 };

 template <typename Place, typename T>
-class CPURoiPoolGradOpKernel : public framework::OpKernel<T> {
+class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("Rois");
+    auto* rois = ctx.Input<Tensor>("ROIs");
    auto* argmax = ctx.Input<Tensor>("Argmax");

    auto* out_grad =
@@ -188,9 +174,9 @@ class CPURoiPoolGradOpKernel : public framework::OpKernel<T> {
      for (size_t n = 0; n < rois_num; ++n) {
        size_t roi_batch_idx = rois_data[0];
        T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx;
-        for (size_t c = 0; c < channels; ++c) {
-          for (size_t ph = 0; ph < pooled_height; ++ph) {
-            for (size_t pw = 0; pw < pooled_width; ++pw) {
+        for (int c = 0; c < channels; ++c) {
+          for (int ph = 0; ph < pooled_height; ++ph) {
+            for (int pw = 0; pw < pooled_width; ++pw) {
              size_t pool_index = ph * pooled_width + pw;

              if (argmax_data[pool_index] >= 0) {

--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
@@ -4,8 +4,7 @@ import math
 import sys
 from op_test import OpTest

-
-class TestSequenceSliceOp(OpTest):
+class TestROIPoolOp(OpTest):
    def set_data(self):
        self.init_test_case()
        self.make_rois()
@@ -13,7 +12,7 @@ class TestSequenceSliceOp(OpTest):

        self.inputs = {
            'X': self.x, 
-            'Rois': self.rois}
+            'ROIs': self.rois}
        
        self.attrs = {
            'spatial_scale': self.spatial_scale,