diff --git a/paddle/operators/spp_op.cc b/paddle/operators/spp_op.cc
index 62fc2112a8bf267a57ddbca780b70b93144b2077..ff607c57699349871a86e6b8f7c17eba3cccdcd1 100644
--- a/paddle/operators/spp_op.cc
+++ b/paddle/operators/spp_op.cc
@@ -29,28 +29,22 @@ class SppOpMaker : public framework::OpProtoAndCheckerMaker {
               "(Tensor) The output tensor of spp operator."
               "N * M."
               "M = C * H * W");
-    AddAttr<int>("pyramid_height", ">= 1");
+    AddAttr<int>("pyramid_height", "int");
     AddComment(R"DOC(
-        "Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        "Does spatial pyramid pooling on the input image by taking the max,
+        etc. within regions so that the result vector of different sized
+        images are of the same size
+        Input shape: $(N, C_{in}, H_{in}, W_{in})$
         Output shape: $(H_{out}, W_{out})$
         Where
           $$
-            H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
-            W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+            H_{out} = N \\
+            W_{out} = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * C_{in}
           $$
         )DOC");
   }
 };
 
-int OutputSize(int pyramid_level, int input_size) {
-  int bins = std::pow(2, pyramid_level);
-  int ksize = std::ceil(input_size / static_cast<double>(bins));
-  int padding = (ksize * bins - input_size + 1) / 2;
-  int output_size = (input_size - ksize + 2 * padding) / ksize + 1;
-  // output_size = bins
-  return output_size;
-}
-
 class SppOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -64,13 +58,7 @@ class SppOp : public framework::OperatorWithKernel {
     int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
     PADDLE_ENFORCE(in_x_dims.size() == 4,
                    "Spping intput must be of 4-dimensional.");
-    int outlen = 0;
-    for (int p = 0; p < pyramid_height; ++p) {
-      int outh = OutputSize(p, in_x_dims[2]);
-      int outw = OutputSize(p, in_x_dims[3]);
-      int p_level_outlen = outh * outw * in_x_dims[1];
-      outlen += p_level_outlen;
-    }
+    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
     std::vector<int64_t> output_shape({in_x_dims[0], outlen});
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
   }
diff --git a/paddle/operators/spp_op.cu.cc b/paddle/operators/spp_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a7057907cea2eed21753efeb399874f01533cd53
--- /dev/null
+++ b/paddle/operators/spp_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_GPU_KERNEL(spp, ops::SppKernel<paddle::platform::GPUPlace, float>,
+                       ops::SppKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(spp_grad,
+                       ops::SppGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::SppGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/spp_op.h b/paddle/operators/spp_op.h
index 2a2824bb3108fd1470437a774b909a4233c1a7b7..7a385352a0d563467b07cd0ac1c9ae2f7e373805 100644
--- a/paddle/operators/spp_op.h
+++ b/paddle/operators/spp_op.h
@@ -42,34 +42,36 @@ class SppKernel : public framework::OpKernel<T> {
       std::vector<int> strides({ksize_h, ksize_w});
       std::vector<int> paddings({padding_h, padding_w});
       // pooling output shape
+      framework::Tensor out_level;
       std::vector<int64_t> output_shape_vec({in_x->dims()[0], in_x->dims()[1]});
       output_shape_vec.push_back((input_h - ksize_h + 2 * padding_h) / ksize_h +
                                  1);
       output_shape_vec.push_back((input_w - ksize_w + 2 * padding_w) / ksize_w +
                                  1);
       framework::DDim output_shape(framework::make_ddim(output_shape_vec));
-      // flatten pooling output shape
-      int output_flatten_w = in_x->dims()[1] * bins * bins;
-      std::vector<int64_t> output_flatten_shape_vec(
-          {in_x->dims()[0], output_flatten_w});
-      framework::DDim output_flatten_shape(
-          framework::make_ddim(output_flatten_shape_vec));
-      framework::Tensor out_level;
-      framework::Tensor out_flatten_level;
       out_level.mutable_data<T>(output_shape, context.GetPlace());
       // pooling
       math::Pool2dFunctor<Place, math::MaxPool<T>, T> pool_forward;
       math::MaxPool<T> max_process;
       pool_forward(context.device_context(), *in_x, ksize, strides, paddings,
                    max_process, &out_level);
+      // flatten pooling output shape
+      framework::Tensor out_flatten_level;
+      int output_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> output_flatten_shape_vec(
+          {in_x->dims()[0], output_flatten_w});
+      framework::DDim output_flatten_shape(
+          framework::make_ddim(output_flatten_shape_vec));
       out_flatten_level.ShareDataWith(out_level);
       out_flatten_level.Resize(output_flatten_shape);
-      auto in_stride = framework::stride(out_flatten_level.dims());
-      const T* src_data = out_flatten_level.data<T>();
-      StridedMemcpy<T>(context.device_context(), src_data, in_stride,
-                       out_flatten_level.dims(), out_stride,
-                       out->data<T>() + output_offset);
-      output_offset += out_flatten_level.dims()[1] * in_stride[1];
+      // concat
+      auto out_flatten_level_stride =
+          framework::stride(out_flatten_level.dims());
+      StridedMemcpy<T>(context.device_context(), out_flatten_level.data<T>(),
+                       out_flatten_level_stride, out_flatten_level.dims(),
+                       out_stride, out->data<T>() + output_offset);
+      output_offset +=
+          out_flatten_level.dims()[1] * out_flatten_level_stride[1];
     }
   }
 };
@@ -83,12 +85,11 @@ class SppGradKernel : public framework::OpKernel<T> {
         context.Input<framework::Tensor>(framework::GradVarName("Out"));
     framework::Tensor* in_x_grad =
         context.Output<framework::Tensor>(framework::GradVarName("X"));
+    int pyramid_height = context.template Attr<int>("pyramid_height");
     auto& device_ctx = context.device_context();
     math::SetConstant<Place, T> zero;
     in_x_grad->mutable_data<T>(context.GetPlace());
     zero(device_ctx, in_x_grad, static_cast<T>(0));
-    int pyramid_height = context.template Attr<int>("pyramid_height");
-    auto outgrad_stride = framework::stride(out_grad->dims());
     auto out_stride = framework::stride(out->dims());
     int input_h = in_x->dims()[2];
     int input_w = in_x->dims()[3];
@@ -102,26 +103,17 @@ class SppGradKernel : public framework::OpKernel<T> {
       std::vector<int> ksize({ksize_h, ksize_w});
       std::vector<int> strides({ksize_h, ksize_w});
       std::vector<int> paddings({padding_h, padding_w});
-      // split outgrad and get flatten
-      std::vector<int64_t> out_shape_vec({in_x->dims()[0], in_x->dims()[1]});
-      out_shape_vec.push_back((input_h - ksize_h + 2 * padding_h) / ksize_h +
-                              1);
-      out_shape_vec.push_back((input_w - ksize_w + 2 * padding_w) / ksize_w +
-                              1);
-      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      // copy this level's slice of out and outgrad into flattened tensors
+      framework::Tensor out_flatten_level;
+      framework::Tensor outgrad_flatten_level;
       int out_flatten_w = in_x->dims()[1] * bins * bins;
       std::vector<int64_t> out_flatten_shape_vec(
           {in_x->dims()[0], out_flatten_w});
       framework::DDim out_flatten_shape(
           framework::make_ddim(out_flatten_shape_vec));
-      framework::Tensor out_level;
-      framework::Tensor outgrad_level;
-      framework::Tensor out_flatten_level;
-      framework::Tensor outgrad_flatten_level;
       out_flatten_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
       outgrad_flatten_level.mutable_data<T>(out_flatten_shape,
                                             context.GetPlace());
-
       auto flatten_stride = framework::stride(out_flatten_level.dims());
       // memcpy
       StridedMemcpy<T>(context.device_context(), out->data<T>() + out_offset,
@@ -129,15 +121,24 @@ class SppGradKernel : public framework::OpKernel<T> {
                        out_flatten_level.data<T>());
 
       StridedMemcpy<T>(context.device_context(),
-                       out_grad->data<T>() + out_offset, outgrad_stride,
+                       out_grad->data<T>() + out_offset, out_stride,
                        outgrad_flatten_level.dims(), flatten_stride,
                        outgrad_flatten_level.data<T>());
       out_offset += out_flatten_level.dims()[1] * out_stride[1];
-      // flatten backward
+      // flatten backward to nchw
+      framework::Tensor out_level;
+      framework::Tensor outgrad_level;
+      std::vector<int64_t> out_shape_vec({in_x->dims()[0], in_x->dims()[1]});
+      out_shape_vec.push_back((input_h - ksize_h + 2 * padding_h) / ksize_h +
+                              1);
+      out_shape_vec.push_back((input_w - ksize_w + 2 * padding_w) / ksize_w +
+                              1);
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
       out_level.ShareDataWith(out_flatten_level);
       out_level.Resize(out_shape);
       outgrad_level.ShareDataWith(outgrad_flatten_level);
       outgrad_level.Resize(out_shape);
+      // pooling backward
       math::MaxPool2dGradFunctor<Place, T> pool2d_backward;
       pool2d_backward(context.device_context(), *in_x, *&out_level,
                       *&outgrad_level, ksize, strides, paddings, in_x_grad);
diff --git a/python/paddle/v2/fluid/tests/test_spp_op.py b/python/paddle/v2/fluid/tests/test_spp_op.py
index 806d5e7736bbf78002f475de736793e41af676b1..89b12e885c7665b5bc74a31bd901a33dd85fee85 100644
--- a/python/paddle/v2/fluid/tests/test_spp_op.py
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
@@ -37,11 +37,11 @@ class TestSppOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', max_relative_error=0.05)
 
     def init_test_case(self):
-        self.shape = [1, 1, 2, 2]
-        self.pyramid_height = 2
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
 
 
 if __name__ == '__main__':