diff --git a/paddle/operators/spp_op.cc b/paddle/operators/spp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b1807b62616b80ea8a9e48409e0760c1c7b36a38
--- /dev/null
+++ b/paddle/operators/spp_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+namespace paddle {
+namespace operators {
+
+/**
+ * Declares the proto of the spatial pyramid pooling ("spp") operator: its
+ * input X, its output Out, the attributes pyramid_height and pooling_type,
+ * and the user-facing operator documentation.
+ */
+class SppOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SppOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of spp operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    // The width documented here must agree with SppOp::InferShape, which sets
+    // Out to {N, C * (4^pyramid_height - 1) / (4 - 1)}.
+    AddOutput("Out",
+              "(Tensor) The output tensor of spp operator. "
+              "The shape is N * M, where "
+              "M = C * (4^pyramid_height - 1) / (4 - 1).");
+    AddAttr<int>("pyramid_height", "(int), multi level pooling");
+    // InEnum rejects any pooling type other than "max" and "avg".
+    AddAttr<std::string>(
+        "pooling_type",
+        "(string), pooling type, can be \"max\" for max-pooling "
+        "and \"avg\" for average-pooling.")
+        .InEnum({"max", "avg"});
+    AddComment(R"DOC(
+        With spatial pyramid pooling, the input image can
+        be of any sizes. This not only allows arbitrary aspect
+        ratios, but also allows arbitrary scales. We can resize
+        the input image to any scale (e.g., min(w, h)=180, 224,
+        ...) and apply the same deep network. When the
+        input image is at different scales, the network (with
+        the same filter sizes) will extract features at different
+        scales. The scales play important roles in traditional
+        methods.
+        Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = N \\
+            W_{out} = (((4^{pyramid\_height}) - 1) / (4 - 1)) * C_{in}
+          $$
+        paper https://arxiv.org/pdf/1406.4729v4.pdf
+        )DOC");
+  }
+};
+
+/**
+ * Forward spatial pyramid pooling operator.
+ *
+ * InferShape requires input X and output Out to be present, X to be 4-D and
+ * pyramid_height to be positive. It then sets Out to the 2-D shape
+ * {X.dims[0], X.dims[1] * (4^0 + 4^1 + ... + 4^(pyramid_height - 1))}.
+ */
+class SppOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SppOp "
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SppOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Spping input must be of 4-dimensional.");
+    // A non-positive height would produce an empty or negative output width.
+    PADDLE_ENFORCE(pyramid_height > 0,
+                   "The pyramid_height of SppOp should be greater than 0.");
+    // Sum the per-level bin counts 4^p with integer arithmetic. The result
+    // equals (4^pyramid_height - 1) / (4 - 1) without a round trip through
+    // floating point.
+    int64_t bins_per_channel = 0;
+    int64_t level_bins = 1;
+    for (int p = 0; p < pyramid_height; ++p) {
+      bins_per_channel += level_bins;
+      level_bins *= 4;
+    }
+    int64_t outlen = bins_per_channel * in_x_dims[1];
+    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+/**
+ * Gradient operator of spp.
+ *
+ * InferShape requires input X and output X@GRAD to be present and gives
+ * X@GRAD the same dimensions as X.
+ */
+class SppOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    // X@GRAD is an output of the gradient op, so report it as one.
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the forward op "spp" together with its proto maker, and the
+// gradient op "spp_grad".
+REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+// CPU forward kernels, instantiated for float and double.
+REGISTER_OP_CPU_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
+// CPU backward kernels, instantiated for float and double.
+REGISTER_OP_CPU_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/spp_op.cu.cc b/paddle/operators/spp_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..761e4d6c4a9639898ba548d56bed3c8817720c1b
--- /dev/null
+++ b/paddle/operators/spp_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+
+namespace ops = paddle::operators;
+// CUDA forward kernels of the "spp" op, instantiated for float and double.
+REGISTER_OP_CUDA_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppKernel<paddle::platform::CUDADeviceContext, double>);
+// CUDA backward kernels of the "spp_grad" op, instantiated for float and
+// double.
+REGISTER_OP_CUDA_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/spp_op.h b/paddle/operators/spp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f35b305d02c73bcae6e72b8afa5ce55148ea98b8
--- /dev/null
+++ b/paddle/operators/spp_op.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+/**
+ * Forward kernel of the spp operator.
+ *
+ * For every pyramid level p in [0, pyramid_height) the input X is pooled
+ * into a {N, C, 2^p, 2^p} tensor. The kernel size equals the stride, which
+ * is ceil(input / 2^p), and padding covers the rounding slack. Each level is
+ * then viewed as {N, C * 2^p * 2^p} and copied into Out at a running column
+ * offset, so the levels end up concatenated along the second dimension.
+ *
+ * pooling_type selects max or average pooling. The op maker restricts the
+ * attribute to "max" and "avg" through InEnum.
+ */
+template <typename DeviceContext, typename T>
+class SppKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    out->mutable_data<T>(context.GetPlace());
+    auto out_stride = framework::stride(out->dims());
+    // Loop-invariant input extents.
+    const int64_t batch_size = in_x->dims()[0];
+    const int64_t channels = in_x->dims()[1];
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t output_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      // Level p splits each spatial axis into 2^p bins.
+      int bins = 1 << p;
+      // Integer form of ceil(input / bins).
+      int kernel_size_h = (input_h + bins - 1) / bins;
+      int kernel_size_w = (input_w + bins - 1) / bins;
+      // Half of the slack between bins * kernel and the input, rounded up.
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      // Stride equals the kernel size, so pooling windows do not overlap.
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // pooling output shape
+      framework::Tensor out_level;
+      std::vector<int64_t> output_shape_vec(
+          {batch_size, channels, bins, bins});
+      framework::DDim output_shape(framework::make_ddim(output_shape_vec));
+      out_level.mutable_data<T>(output_shape, context.GetPlace());
+      // pooling
+      if (pooling_type == "max") {
+        math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
+        math::MaxPool<T> max_process;
+        pool_forward(device_ctx, *in_x, kernel_size, strides, paddings,
+                     max_process, &out_level);
+      } else if (pooling_type == "avg") {
+        math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
+        math::AvgPool<T> avg_process;
+        pool_forward(device_ctx, *in_x, kernel_size, strides, paddings,
+                     avg_process, &out_level);
+      }
+      // flatten pooling output shape
+      int64_t output_flatten_w = channels * bins * bins;
+      std::vector<int64_t> output_flatten_shape_vec(
+          {batch_size, output_flatten_w});
+      framework::DDim output_flatten_shape(
+          framework::make_ddim(output_flatten_shape_vec));
+      out_level.Resize(output_flatten_shape);
+      // concat: copy this level into Out starting at the running offset
+      auto out_level_stride = framework::stride(out_level.dims());
+      StridedMemcpy<T>(device_ctx, out_level.data<T>(), out_level_stride,
+                       out_level.dims(), out_stride,
+                       out->data<T>() + output_offset);
+      output_offset += out_level.dims()[1] * out_level_stride[1];
+    }
+  }
+};
+/**
+ * Backward kernel of the spp operator.
+ *
+ * Reads X, Out and Out@GRAD and writes X@GRAD. X@GRAD is zero-filled once.
+ * Then, for every pyramid level p, the level's column slice of Out and
+ * Out@GRAD is copied into temporary tensors, viewed as {N, C, 2^p, 2^p}
+ * (the layout the forward kernel pooled into), and passed to the matching
+ * pooling backward functor together with the shared X@GRAD tensor.
+ * NOTE(review): this relies on the pooling backward functors accumulating
+ * into X@GRAD rather than overwriting it -- confirm in math/pooling.
+ */
+template <typename DeviceContext, typename T>
+class SppGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    zero(device_ctx, in_x_grad, static_cast<T>(0));
+    // Out@GRAD is sliced with the strides of Out, i.e. both are expected to
+    // have the same dimensions.
+    auto out_stride = framework::stride(out->dims());
+    // Loop-invariant input extents.
+    const int64_t batch_size = in_x->dims()[0];
+    const int64_t channels = in_x->dims()[1];
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t out_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      // Same pooling geometry as the forward kernel: 2^p bins per axis,
+      // kernel == stride == ceil(input / bins), padding covers the slack.
+      int bins = 1 << p;
+      int kernel_size_h = (input_h + bins - 1) / bins;
+      int kernel_size_w = (input_w + bins - 1) / bins;
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // split out and outgrad  ...  to flatten
+      framework::Tensor out_level;
+      framework::Tensor outgrad_level;
+      int64_t out_flatten_w = channels * bins * bins;
+      std::vector<int64_t> out_flatten_shape_vec({batch_size, out_flatten_w});
+      framework::DDim out_flatten_shape(
+          framework::make_ddim(out_flatten_shape_vec));
+      out_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      auto flatten_stride = framework::stride(out_level.dims());
+      // memcpy: extract this level's columns from Out and Out@GRAD
+      StridedMemcpy<T>(device_ctx, out->data<T>() + out_offset, out_stride,
+                       out_level.dims(), flatten_stride, out_level.data<T>());
+
+      StridedMemcpy<T>(device_ctx, out_grad->data<T>() + out_offset,
+                       out_stride, outgrad_level.dims(), flatten_stride,
+                       outgrad_level.data<T>());
+      out_offset += out_flatten_w * out_stride[1];
+      // flatten backward to nchw
+      // Use the forward kernel's {N, C, bins, bins} layout directly. The
+      // temporaries hold exactly N * C * bins * bins elements, so the view
+      // can never exceed the allocated buffer.
+      std::vector<int64_t> out_shape_vec({batch_size, channels, bins, bins});
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      out_level.Resize(out_shape);
+      outgrad_level.Resize(out_shape);
+      // pooling backward
+      if (pooling_type == "max") {
+        math::MaxPool2dGradFunctor<DeviceContext, T> pool2d_backward;
+        pool2d_backward(device_ctx, *in_x, out_level, outgrad_level,
+                        kernel_size, strides, paddings, in_x_grad);
+      } else if (pooling_type == "avg") {
+        math::Pool2dGradFunctor<DeviceContext, math::AvgPoolGrad<T>, T>
+            pool_backward;
+        math::AvgPoolGrad<T> avg_process;
+        pool_backward(device_ctx, *in_x, out_level, outgrad_level,
+                      kernel_size, strides, paddings, avg_process, in_x_grad);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/v2/fluid/tests/test_spp_op.py b/python/paddle/v2/fluid/tests/test_spp_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..007723f0e35ad194c427401337bc9b13756576de
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
@@ -0,0 +1,68 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_pool2d_op import max_pool2D_forward_naive
+from test_pool2d_op import avg_pool2D_forward_naive
+
+
+class TestSppOp(OpTest):
+    def setUp(self):
+        self.op_type = "spp"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = input.shape
+        out_level_flatten = []
+        for i in xrange(self.pyramid_height):
+            bins = np.power(2, i)
+            kernel_size = [0, 0]
+            padding = [0, 0]
+            kernel_size[0] = np.ceil(hsize /
+                                     bins.astype("double")).astype("int32")
+            padding[0] = (
+                (kernel_size[0] * bins - hsize + 1) / 2).astype("int32")
+
+            kernel_size[1] = np.ceil(wsize /
+                                     bins.astype("double")).astype("int32")
+            padding[1] = (
+                (kernel_size[1] * bins - wsize + 1) / 2).astype("int32")
+            out_level = self.pool2D_forward_naive(input, kernel_size,
+                                                  kernel_size, padding)
+            out_level_flatten.append(
+                out_level.reshape(nsize, bins * bins * csize))
+            if i == 0:
+                output = out_level_flatten[i]
+            else:
+                output = np.concatenate((output, out_level_flatten[i]), 1)
+        # output = np.concatenate(out_level_flatten.tolist(), 0);
+        self.inputs = {'X': input.astype('float32'), }
+        self.attrs = {
+            'pyramid_height': self.pyramid_height,
+            'pooling_type': self.pool_type
+        }
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "avg":
+            self.check_grad(['X'], 'Out', max_relative_error=0.05)
+
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.pool_type = "max"
+
+
+class TestCase2(TestSppOp):
+    """Same test as TestSppOp, but configured for average pooling."""
+
+    def init_test_case(self):
+        # Same input shape and pyramid height as the base class; only the
+        # reference pooling function and the pooling_type attribute differ.
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.pool_type = "avg"
+
+
+# Run the unittest test cases when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()