diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index 7f0cacc400e5c937f0d331337e27667e689743f0..156db9358689c90293311b8f08a7576b680c9472 100755 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -17,6 +17,10 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + +static constexpr int kROISize = 5; + class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -38,6 +42,9 @@ class ROIPoolOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D tensor of shape (num_rois, 5)" "given as [[batch_id, x1, y1, x2, y2], …]."); + PADDLE_ENFORCE(rois_dims[1] == kROISize, + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); @@ -150,7 +157,9 @@ REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( roi_pool, - ops::CPUROIPoolOpKernel); + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); REGISTER_OP_CPU_KERNEL( roi_pool_grad, - ops::CPUROIPoolGradOpKernel); + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolGradOpKernel); diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu index e405d9beda7555428e077d5f0702dbe9569a4109..97df45f1b5779d5e28e36814450a9577edf85135 100755 --- a/paddle/operators/roi_pool_op.cu +++ b/paddle/operators/roi_pool_op.cu @@ -18,6 +18,8 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; static constexpr int kROISize = 5; @@ -25,7 +27,7 @@ static constexpr int kROISize = 5; static inline int NumBlocks(const int N) { return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, kNumMaxinumNumBlocks); - } +} template __global__ void GPUROIPoolForward( @@ -64,7 +66,7 @@ static inline int NumBlocks(const int N) { wend = min(max(wend + roi_start_w, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); - T maxval = is_empty ? 0 : -std::numeric_limits::max(); + T maxval = is_empty ? 0 : -std::numeric_limits::max(); int maxidx = -1; const T* offset_input_data = input_data + (roi_batch_ind * channels + c) * height * width; @@ -143,14 +145,6 @@ class GPUROIPoolOpKernel : public framework::OpKernel { int width = in_dims[3]; size_t rois_num = rois->dims()[0]; - - out->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), out, static_cast(0)); - argmax->mutable_data(ctx.GetPlace()); - math::SetConstant set_init; - set_init(ctx.device_context(), argmax, static_cast(-1)); - if (rois_num== 0) return; int output_size = out->numel(); @@ -230,7 +224,9 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( roi_pool, - ops::GPUROIPoolOpKernel); + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); REGISTER_OP_GPU_KERNEL( roi_pool_grad, - ops::GPUROIPoolGradOpKernel); + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolGradOpKernel); diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h index 4eb81b527347276a17486500a34fa33abc4da091..bd7736d63125f1be57c8af5141208f66d0592adb 100755 --- a/paddle/operators/roi_pool_op.h +++ b/paddle/operators/roi_pool_op.h @@ -15,23 +15,18 @@ limitations under the License. 
*/ #pragma once #include "paddle/framework/op_registry.h" #include "paddle/operators/math/math_function.h" -#include "paddle/operators/strided_memcpy.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using LoD = framework::LoD; - template class CPUROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); @@ -54,11 +49,6 @@ class CPUROIPoolOpKernel : public framework::OpKernel { T* output_data = out->mutable_data(ctx.GetPlace()); int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), out, static_cast(0)); - math::SetConstant set_init; - set_init(ctx.device_context(), argmax, static_cast(-1)); - for (int n = 0; n < rois_num; ++n) { int roi_batch_id = rois_data[0]; PADDLE_ENFORCE_GE(roi_batch_id, 0); @@ -83,7 +73,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel { const float bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - const float* batch_data = input_data + roi_batch_id * in_stride[0]; + const T* batch_data = input_data + roi_batch_id * in_stride[0]; for (int c = 0; c < channels; ++c) { for (int ph = 0; ph < pooled_height; ++ph) { @@ -110,7 +100,8 @@ class CPUROIPoolOpKernel : public framework::OpKernel { // Define an empty pooling region to be zero bool is_empty = (hend <= hstart) || (wend <= wstart); output_data[pool_index] = - is_empty ? 0 : -std::numeric_limits::max(); + is_empty ? 
0 : -std::numeric_limits::max(); + argmax_data[pool_index] = -1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -139,14 +130,14 @@ template class CPUROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* argmax = ctx.Input("Argmax"); + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* argmax = ctx.Input("Argmax"); auto* out_grad = - ctx.Input(framework::GradVarName("Out")); + ctx.Input(framework::GradVarName("Out")); auto* x_grad = - ctx.Output(framework::GradVarName("X")); + ctx.Output(framework::GradVarName("X")); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py index af35bcced830bc1f37b5549cf54d701d4a559c92..7cedb930ca861aed95c355931d80cb4d265c8235 100644 --- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py +++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py @@ -77,7 +77,12 @@ class TestROIPoolOp(OpTest): wstart = min(max(wstart + roi_start_w, 0), self.width) wend = min(max(wend + roi_start_w, 0), self.width) - out_data[i, c, ph, pw] = 0 + is_empty = (hend <= hstart) or (wend <= wstart) + if is_empty: + out_data[i, c, ph, pw] = 0 + else: + out_data[i, c, ph, pw] = -sys.float_info.max + argmax_data[i, c, ph, pw] = -1 for h in range(hstart, hend):