Merge pull request #5831 from wanghaox/roi_pool

Roi pool operator

Merge pull request #5831 from wanghaox/roi_pool
Roi pool operator
0690cca7 · wanghaox · GitHub · 65c859db · cf5b5986 · 0690cca7
4 changed file
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kROISize = 5;
+
+class ROIPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Argmax"),
+                   "Output(Argmax) of ROIPoolOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                   "given as [[batch_id, x1, y1, x2, y2], …].");
+    PADDLE_ENFORCE(rois_dims[1] == kROISize,
+                "ROIs should be a 2-D tensor of shape (num_rois, 5)"
+                "given as [[batch_id, x1, y1, x2, y2], …].");
+
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must greater than 0");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = input_dims[1];
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputDim("Argmax", out_dims);
+    }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ROIPoolOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor), "
+             "the input of ROIPoolOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
+             "(Tensor), "
+             "ROIs (Regions of Interest) to pool over. "
+             "should be a 2-D tensor of shape (num_rois, 5)"
+             "given as [[batch_id, x1, y1, x2, y2], …]. "
+             "Where batch_id is the id of the data, "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates.");
+    AddOutput("Out",
+              "(Tensor), "
+              "The output of ROIPoolOp is a 4-D tensor with shape "
+              "(num_rois, channels, pooled_h, pooled_w).");
+    AddOutput("Argmax",
+              "(Tensor), "
+              "Argmaxes corresponding to indices in X used "
+              "for gradient computation. Only output "
+              "if arg “is_test” is false.").AsIntermediate();
+    AddAttr<float>("spatial_scale",
+                   "(float, default 1.0), "
+                   "Multiplicative spatial scale factor "
+                   "to translate ROI coords from their input scale "
+                   "to the scale used when pooling.")
+                   .SetDefault(1.0);
+    AddAttr<int>("pooled_height",
+                 "(int, default 1), "
+                 "The pooled output height.")
+                 .SetDefault(1);
+    AddAttr<int>("pooled_width",
+                 "(int, default 1), "
+                 "The pooled output width.")
+                 .SetDefault(1);
+    AddComment(R"DOC(
+ROIPool operator
+
+ROI Pooling for Faster-RCNN. The link below is a further introduction: 
+https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
+            roi_pool_grad, ops::ROIPoolGradOp);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool_grad,
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUPlace, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int kROISize = 5;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+  template <typename T>
+  __global__ void GPUROIPoolForward(
+      const int nthreads, const T* input_data, const int64_t* input_rois,
+      const float spatial_scale, const int channels, const int height,
+      const int width, const int pooled_height, const int pooled_width,
+      T* output_data, int64_t* argmax_data) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int offset = blockDim.x * gridDim.x;
+    for (size_t i = index; i < nthreads; i += offset) {
+      int pw = index % pooled_width;
+      int ph = (index / pooled_width) % pooled_height;
+      int c = (index / pooled_width / pooled_height) % channels;
+      int n = index / pooled_width / pooled_height / channels;
+
+      const int64_t* offset_input_rois = input_rois + n * kROISize;
+      int roi_batch_ind = offset_input_rois[0];
+      int roi_start_w = round(offset_input_rois[1] * spatial_scale);
+      int roi_start_h = round(offset_input_rois[2] * spatial_scale);
+      int roi_end_w = round(offset_input_rois[3] * spatial_scale);
+      int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+
+      int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+      int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+      int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
+      int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
+      int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
+      int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
+
+      hstart = min(max(hstart + roi_start_h, 0), height);
+      hend = min(max(hend + roi_start_h, 0), height);
+      wstart = min(max(wstart + roi_start_w, 0), width);
+      wend = min(max(wend + roi_start_w, 0), width);
+      bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+      T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
+      int maxidx = -1;
+      const T* offset_input_data =
+          input_data + (roi_batch_ind * channels + c) * height * width;
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          int input_data_index = h * width + w;
+          if (offset_input_data[input_data_index] > maxval) {
+            maxval = offset_input_data[input_data_index];
+            maxidx = input_data_index;
+          }
+        }
+      }
+      output_data[index] = maxval;
+      if (argmax_data) {
+        argmax_data[index] = maxidx;
+      }
+    }
+  }
+
+template <typename T>
+__global__ void GPUROIPoolBackward(
+    const int nthreads,
+    const int64_t* input_rois,
+    const T* output_grad,
+    const int64_t* argmax_data,
+    const int num_rois,
+    const float spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    T* input_grad) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    int offset = blockDim.x * gridDim.x;
+    for (int i = index; i < nthreads; i += offset) {
+      int pw = index % pooled_width;
+      int ph = (index / pooled_width) % pooled_height;
+      int c = (index / pooled_width / pooled_height) % channels;
+      int n = index / pooled_width / pooled_height / channels;
+
+      const int64_t* offset_input_rois = input_rois + n * kROISize;
+      int roi_batch_ind = offset_input_rois[0];
+      int input_offset = (roi_batch_ind * channels + c) * height * width;
+      int output_offset = (n * channels + c) * pooled_height * pooled_width;
+      const T* offset_output_grad = output_grad + output_offset;
+      T* offset_input_grad = input_grad + input_offset;
+      const int64_t* offset_argmax_data = argmax_data + output_offset;
+
+      int argmax = offset_argmax_data[ph * pooled_width + pw];
+      if (argmax != -1) {
+        platform::CudaAtomicAdd(offset_input_grad + argmax,
+          static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
+      }
+    }
+  }
+
+
+template <typename Place, typename T>
+class GPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto* argmax = ctx.Output<Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    auto in_stride = framework::stride(in_dims);
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+
+    size_t rois_num = rois->dims()[0];
+    if (rois_num== 0) return;
+
+    int output_size = out->numel();
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;
+
+    GPUROIPoolForward<T>
+      <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+      output_size,
+      in->data<T>(),
+      rois->data<int64_t>(),
+      spatial_scale,
+      channels,
+      height,
+      width,
+      pooled_height,
+      pooled_width,
+      out->mutable_data<T>(ctx.GetPlace()),
+      argmax->mutable_data<int64_t>(ctx.GetPlace()));
+  }
+};
+
+template <typename Place, typename T>
+class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* argmax = ctx.Input<Tensor>("Argmax");
+
+    auto* out_grad =
+        ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    size_t rois_num = rois->dims()[0];
+    int channels = in->dims()[1];
+    int height = in->dims()[2];
+    int width = in->dims()[3];
+
+    if (x_grad) {
+      x_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+
+      int output_grad_size = out_grad->numel();
+      int blocks = NumBlocks(output_grad_size);
+      int threads = kNumCUDAThreads;
+
+      if (output_grad_size > 0) {
+        GPUROIPoolBackward<T>
+          <<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+          output_grad_size,
+          rois->data<int64_t>(),
+          out_grad->data<T>(),
+          argmax->data<int64_t>(),
+          rois_num,
+          spatial_scale,
+          channels,
+          height,
+          width,
+          pooled_height,
+          pooled_width,
+          x_grad->mutable_data<T>(ctx.GetPlace()));
+        }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    roi_pool,
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    roi_pool_grad,
+    ops::GPUROIPoolGradOpKernel<paddle::platform::GPUPlace, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class CPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* argmax = ctx.Output<framework::Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    auto in_stride = framework::stride(in_dims);
+    auto argmax_stride = framework::stride(argmax->dims());
+    auto roi_stride = framework::stride(rois->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    const T* input_data = in->data<T>();
+    const int64_t* rois_data = rois->data<int64_t>();
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
+
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      PADDLE_ENFORCE_GE(roi_batch_id, 0);
+      PADDLE_ENFORCE_LT(roi_batch_id, batch_size);
+      rois_data += roi_stride[0];
+    }
+
+    rois_data = rois->data<int64_t>();
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      int roi_start_w = round(rois_data[1] * spatial_scale);
+      int roi_start_h = round(rois_data[2] * spatial_scale);
+      int roi_end_w = round(rois_data[3] * spatial_scale);
+      int roi_end_h = round(rois_data[4] * spatial_scale);
+
+      // Force malformed ROIs to be 1x1
+      int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
+      int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
+
+      const float bin_size_h =
+          static_cast<float>(roi_height) / static_cast<float>(pooled_height);
+      const float bin_size_w =
+          static_cast<float>(roi_width) / static_cast<float>(pooled_width);
+
+      const T* batch_data = input_data + roi_batch_id * in_stride[0];
+
+      for (int c = 0; c < channels; ++c) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            //  Compute pooling region for this output unit:
+            //  start (included) = floor(ph * roi_height / pooled_height_)
+            //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
+            int hstart =
+                static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
+            int wstart =
+                static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
+            int hend =
+                static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
+            int wend =
+                static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
+
+            hstart = std::min(std::max(hstart + roi_start_h, 0), height);
+            hend = std::min(std::max(hend + roi_start_h, 0), height);
+            wstart = std::min(std::max(wstart + roi_start_w, 0), width);
+            wend = std::min(std::max(wend + roi_start_w, 0), width);
+
+            const int pool_index = ph * pooled_width + pw;
+
+            // Define an empty pooling region to be zero
+            bool is_empty = (hend <= hstart) || (wend <= wstart);
+            output_data[pool_index] =
+                is_empty ? 0 : -std::numeric_limits<T>::max();
+            argmax_data[pool_index] = -1;
+
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int index = h * width + w;
+                if (batch_data[index] > output_data[pool_index]) {
+                  output_data[pool_index] = batch_data[index];
+                  argmax_data[pool_index] = index;
+                }
+              }
+            }
+          }
+        }
+
+        batch_data += in_stride[1];
+        output_data += out_stride[1];
+        argmax_data += argmax_stride[1];
+      }
+      // Increment ROI data pointer
+      rois_data += roi_stride[0];
+    }
+    return;
+  }
+};
+
+template <typename Place, typename T>
+class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* argmax = ctx.Input<framework::Tensor>("Argmax");
+
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+
+    if (x_grad) {
+      int channels = in->dims()[1];
+      auto in_stride = framework::stride(in->dims());
+      auto roi_stride = framework::stride(rois->dims());
+
+      const int64_t* rois_data = rois->data<int64_t>();
+      int rois_num = rois->dims()[0];
+
+      T* x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.device_context(), x_grad, static_cast<T>(0));
+
+      size_t roi_offset = roi_stride[0];
+      size_t batch_offset = in_stride[0];
+      size_t channel_offset = in_stride[1];
+
+      const T* out_grad_data = out_grad->data<T>();
+      size_t pool_channel_offset = pooled_height * pooled_width;
+      const int64_t* argmax_data = argmax->data<int64_t>();
+
+      for (size_t n = 0; n < rois_num; ++n) {
+        size_t roi_batch_idx = rois_data[0];
+        T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx;
+        for (int c = 0; c < channels; ++c) {
+          for (int ph = 0; ph < pooled_height; ++ph) {
+            for (int pw = 0; pw < pooled_width; ++pw) {
+              size_t pool_index = ph * pooled_width + pw;
+
+              if (argmax_data[pool_index] >= 0) {
+                size_t index = static_cast<size_t>(argmax_data[pool_index]);
+                batch_grad_data[index] += out_grad_data[pool_index];
+              }
+            }
+          }
+          batch_grad_data += channel_offset;
+          out_grad_data += pool_channel_offset;
+          argmax_data += pool_channel_offset;
+        }
+        rois_data += roi_offset;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+import unittest
+import numpy as np
+import math
+import sys
+from op_test import OpTest
+
+class TestROIPoolOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.calc_roi_pool()
+
+        self.inputs = {
+            'X': self.x, 
+            'ROIs': self.rois}
+        
+        self.attrs = {
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width}
+
+        self.outputs = {
+            'Out': self.outs,
+            'Argmax': self.argmaxes}
+
+    def init_test_case(self):
+        self.batch_size = 5
+        self.channels = 3
+        self.height = 6
+        self.width = 4
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels,
+                      self.height, self.width)
+
+        self.spatial_scale = 1.0/4.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.rois_num = 2
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+    def calc_roi_pool(self):
+        out_data = np.zeros(
+            (self.rois_num, self.channels,
+            self.pooled_height, self.pooled_width))
+        argmax_data = np.zeros(
+            (self.rois_num, self.channels,
+            self.pooled_height, self.pooled_width))
+            
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = roi[0]
+            roi_start_w = int(round(roi[1] * self.spatial_scale))
+            roi_start_h = int(round(roi[2] * self.spatial_scale))
+            roi_end_w = int(round(roi[3] * self.spatial_scale))
+            roi_end_h = int(round(roi[4] * self.spatial_scale))
+
+            roi_height = int(max(roi_end_h - roi_start_h + 1, 1));
+            roi_width = int(max(roi_end_w - roi_start_w + 1, 1));
+
+            x_i = self.x[roi_batch_id]
+
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+
+            for c in range(self.channels):
+                for ph in range(self.pooled_height):
+                    for pw in range(self.pooled_width):
+                        hstart = int(math.floor(ph * bin_size_h))
+                        wstart = int(math.floor(pw * bin_size_w))
+                        hend = int(math.ceil((ph + 1) * bin_size_h))
+                        wend = int(math.ceil((pw + 1) * bin_size_w))
+
+                        hstart = min(max(hstart + roi_start_h, 0), self.height)
+                        hend = min(max(hend + roi_start_h, 0), self.height)
+                        wstart = min(max(wstart + roi_start_w, 0), self.width)
+                        wend = min(max(wend + roi_start_w, 0), self.width)
+
+                        is_empty = (hend <= hstart) or (wend <= wstart)
+                        if is_empty:
+                            out_data[i, c, ph, pw] = 0
+                        else:
+                            out_data[i, c, ph, pw] = -sys.float_info.max
+
+                        argmax_data[i, c, ph, pw] = -1
+                        
+                        for h in range(hstart, hend):
+                            for w in range(wstart, wend):
+                                if x_i[c, h, w] > out_data[i, c, ph, pw]:
+                                    out_data[i, c, ph, pw] = x_i[c, h, w]
+                                    argmax_data[i, c, ph, pw] = h * \
+                                        self.width + w
+
+        self.outs = out_data.astype('float32')
+        self.argmaxes = argmax_data.astype('int64')
+
+    def make_rois(self):
+        rois = []
+        batch_ids = np.random.randint(0, self.batch_size, size=self.rois_num)
+        for i in range(self.rois_num):
+            x1 = np.random.random_integers(
+                0, self.width / self.spatial_scale - self.pooled_width)
+            y1 = np.random.random_integers(
+                0, self.height / self.spatial_scale - self.pooled_height)
+
+            x2 = np.random.random_integers(
+                x1 + self.pooled_width, self.width / self.spatial_scale)
+            y2 = np.random.random_integers(
+                y1 + self.pooled_height, self.height / self.spatial_scale)
+            
+            roi = [batch_ids[i], x1, y1, x2, y2]
+            rois.append(roi)
+        self.rois = np.array(rois).astype("int64")
+
+    def setUp(self):
+        self.op_type = "roi_pool"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+if __name__ == '__main__':
+    unittest.main()