diff --git a/doc/design/python_api.md b/doc/design/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..5c68354274d577c29e054fdd1d3b17c7ea9b4cf9
--- /dev/null
+++ b/doc/design/python_api.md
@@ -0,0 +1,216 @@
+# Design Doc: Python API
+
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+
+Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
+
+## Core Concepts
+
+### Program
+
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator needs to be able to access variables in its ancessor blocks.
+
+Whenever we create a block, we need set its parent block to the current block, so the Python class `Program` needs to maintain a data member `current_block`.
+
+```python
+class Program(objects):
+    def __init__(self):
+        self.proto = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+
+    def global_block():
+        return self.blocks[0]
+
+    def current_block():
+        return self.get_block(self.current_block)
+
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+
+### Block
+
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+
+```python
+class Block(objects):
+    def __init__(self, program, parent_idx):
+        self.proto = core.NewBlock(program.proto)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+
+    def create_var(self, ...):
+        return Variable(self, ...)
+
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+
+    def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+
+`create_parameter` is necessary because parameters are global variables, those defined in the global block, but can be created in some sub-blocks, e.g., an FC layer in the step block of an RNN operator.
+
+`prepand_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+
+### Operator
+
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer output shape from input shape.
+
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<stirng, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
+        core.infer_shape(self.proto, inputs, outputs)
+
+    def type(self):
+        return self.proto.type()
+```
+
+`Operator` creates the `OpDesc` message in C++ space, so could it call the `InferShape` function, which is in C++.
+
+### Variable
+
+Operators take Variables as its inputs and outputs.
+
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
+        self.writer = None
+```
+
+Please be aware of `self.writer`, that tracks operator who creates the variable.  It possible that there are more than one operators who write a variable, but in Python space, each writes to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+
+### Parameter
+
+A parameter is a global variable with an initializer (or load) operator.
+
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+
+When users create a parameter, s/he can call
+
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+
+In above example, `init_attr.type` names an initialize operator.  It can also name the load operator
+
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+
+## Layer Functions
+
+A layer is a Python function that creates some operators and variables.  Layers simplify the work of application programmers.
+
+### Data Layer
+
+```python
+def data_layer(name, type, column_name):
+    block = the_current_program.glolal_block()
+    var = block.create_global_var(
+            name=name,
+            shape=[None] + type.dims(),
+            dtype=type.dtype)
+    block.prepend_operator(block,
+                           type="Feed",
+                           inputs = None,
+                           outputs = [var],
+                           {column_name: column_name})
+    return var
+```
+
+The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md).
+
+### FC Layer
+
+```python
+def fc_layer(input, size, ...):
+    block = program.current_block()
+    w = block.create_parameter(...)
+    b = block.create_parameter(...)
+    out = block.create_var()
+    op = block.append_operator("FC", X=input, W=w, b=b, out=out)
+    out.writer = op
+    return out
+```
diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..12b04fb2713d8bb862c08ddb538ddf860bb2983c
--- /dev/null
+++ b/doc/design/register_grad_op.md
@@ -0,0 +1,67 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance.
+
+However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message, and returns a corresponding message.
+
+More than that, the new registration mechanism need to support the fact that an operators' gradient computation might be a composition of operators.
+
+## Current Implementation
+
+OpInfos store in a association map which key is the operator type. The `grad_op_type` indicate associated gradient operator type. Operator can create gradient operator by `OpInfo::creator_` of gradient. The pseudo code is
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+map<string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+using GradOpDescMaker = std::function<std::vector<OpDesc>(const OpDesc&)>;
+```
+
+The function take a `OpDesc` of the forward operator and return one or many gradient operator descriptions.
+
+The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+  GradOpDescMaker grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_ ` is `nullptr` if the operator does not have associated gradient operators.
+
+We should chagne register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`.
+
+The user interface should be
+
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+
+The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside.
+
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index df79bc0e8f1444ebc0b5eafcb2379cdb555fce83..1168fc38af43d528ad03a03eae04d7a396344216 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -29,7 +29,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator proto_desc)
 cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
+cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry sum_op)
 
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
index d09892f81bea34415d454b017258fd2a0d4575db..55c5fa420e2441260999ba15cf86484f9fd6e890 100644
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -3,7 +3,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 
-USE_OP(add);
+USE_OP(sum);
 
 namespace paddle {
 namespace framework {
@@ -41,17 +41,24 @@ namespace f = paddle::framework;
 
 TEST(GradOpBuilder, AddTwo) {
   std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
+      "sum", {{"X", {"x", "y"}}}, {{"Out", {"out"}}}, {}));
   std::shared_ptr<f::OperatorBase> grad_add_op =
       f::OpRegistry::CreateGradOp(*add_op);
-  EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
-  EXPECT_EQ(grad_add_op->Outputs().size(), 2UL);
-  EXPECT_EQ(grad_add_op->Input("X"), "x");
-  EXPECT_EQ(grad_add_op->Input("Y"), "y");
-  EXPECT_EQ(grad_add_op->Input("Out"), "out");
+
+  EXPECT_EQ(grad_add_op->Inputs().size(), 1UL);
+  EXPECT_EQ(grad_add_op->Outputs().size(), 1UL);
   EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("X")), f::GradVarName("x"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
+  auto &outputs = grad_add_op->Outputs(f::GradVarName("X"));
+  EXPECT_EQ(2UL, outputs.size());
+  auto in_output = [&outputs](const std::string &name) {
+    for (auto &output_name : outputs) {
+      if (output_name == name) return true;
+    }
+    return false;
+  };
+
+  EXPECT_TRUE(in_output(f::GradVarName("x")));
+  EXPECT_TRUE(in_output(f::GradVarName("y")));
 }
 
 REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 310d68d7c1baac231a2f1709af28bfb58ae1a436..0af527c88c753f8545184d4175114c17abfba0a6 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -334,6 +334,32 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
     return var != nullptr;
   }
 
+  bool HasInputs(const std::string& name) const {
+    auto inputs = op_.Inputs(name);
+    if (inputs.size() == 0UL) {
+      return false;
+    }
+    for (auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool HasOutputs(const std::string& name) const {
+    auto outputs = op_.Outputs(name);
+    if (outputs.size() == 0UL) {
+      return false;
+    }
+    for (auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   DDim GetInputDim(const std::string& name) const {
     return GetDim(op_.Input(name));
   }
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index b07fc788124413f728c713027609d9d2d1c39538..bc8af0eb3ec7e8685eb7d2734e9b8f75372d1309 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -24,6 +24,10 @@ class InferShapeContextBase {
   virtual ~InferShapeContextBase() {}
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
+
+  virtual bool HasInputs(const std::string &name) const = 0;
+  virtual bool HasOutputs(const std::string &name) const = 0;
+
   virtual framework::DDim GetInputDim(const std::string &name) const = 0;
   std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
     const std::vector<std::string> &names = Inputs(name);
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 87efb900cd59e6adeb051e0e458f2b86c1b510c9..43eb4de2c1d4ff61f295c05af9dfd5f26c2ba365 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -55,6 +55,12 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
+    if ("${TARGET}" STREQUAL "pool_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
+    endif()
+
     # activation_op contains several operators
     if ("${TARGET}" STREQUAL "activation_op")
         set(pybind_flag 1)
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
deleted file mode 100644
index 3914d1323083ede6a7ea07e7b4ef76b9e4afd26d..0000000000000000000000000000000000000000
--- a/paddle/operators/add_op.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/operators/add_op.h"
-
-namespace paddle {
-namespace operators {
-
-class AddOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of AddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of AddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of AddOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims,
-                      "Two input of Add Op's dimension must be same.");
-    ctx->SetOutputDim("Out", x_dims);
-  }
-};
-
-class AddOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  AddOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of add op");
-    AddInput("Y", "The second input of add op");
-    AddOutput("Out", "The output of add op");
-    AddComment(R"DOC(
-Two Element Add Operator.
-
-The equation is: Out = X + Y
-)DOC");
-  }
-};
-
-class AddOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad);
-
-REGISTER_OP_CPU_KERNEL(add, ops::AddKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
deleted file mode 100644
index d9c6d20a6c320b59e57ed25da3dd8b093833f8c7..0000000000000000000000000000000000000000
--- a/paddle/operators/add_op.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/add_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(add, ops::AddKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
deleted file mode 100644
index 75163032a1ff11a1f18cfd0a4ff7289ff0cb66bf..0000000000000000000000000000000000000000
--- a/paddle/operators/add_op.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename Place, typename T>
-class AddKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Y");
-    auto* output = context.Output<Tensor>("Out");
-
-    output->mutable_data<T>(context.GetPlace());
-
-    auto X = EigenVector<T>::Flatten(*input0);
-    auto Y = EigenVector<T>::Flatten(*input1);
-    auto Z = EigenVector<T>::Flatten(*output);
-
-    auto place = context.GetEigenDevice<Place>();
-
-    Z.device(place) = X + Y;
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index b39d4f0ac27bf0a8378344f852a602c5ecf4cf6a..a0ceb029e3abee2fe591325ffa3100168c3aa8e3 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,12 +1,10 @@
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
-               im2col.cu DEPS cblas device_context operator)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator)
     nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc
-               DEPS cblas device_context operator)
+    cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
     cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
     cc_library(softmax SRCS softmax.cc DEPS operator)
     cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b706529d8f1ed0d673904b81047a5614bd4cf23
--- /dev/null
+++ b/paddle/operators/math/pooling.cc
@@ -0,0 +1,463 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+
+            T ele = pool_process.initial();
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_process.compute(ele, input_data[h * input_width + w]);
+              }
+            }
+            int pool_size = (hend - hstart) * (wend - wstart);
+            pool_process.finalize(ele, (static_cast<T>(pool_size)));
+            output_data[ph * output_width + pw] = ele;
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+      }
+    }
+  }
+};
+
+template <typename PoolProcess, class T>
+class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            float scale = 1.0 / pool_size;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_grad_process.compute(
+                    input_data[h * input_width + w],
+                    output_data[ph * output_width + pw],
+                    output_grad_data[ph * output_width + pw],
+                    input_grad_data[h * input_width + w],
+                    static_cast<T>(scale));
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template <class T>
+class MaxPool2dGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+
+            bool stop = false;
+            for (int h = hstart; h < hend && !stop; ++h) {
+              for (int w = wstart; w < wend && !stop; ++w) {
+                int input_idx = h * input_width + w;
+                int output_idx = ph * output_width + pw;
+                if (input_data[input_idx] == output_data[output_idx]) {
+                  input_grad_data[input_idx] += output_grad_data[output_idx];
+                  stop = true;
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool2dGradFunctor<platform::CPUPlace, float>;
+// template class MaxPool2dGradFunctor<platform::CPUPlace, double>;
+
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+              int output_idx = (pd * output_height + ph) * output_width + pw;
+              T ele = pool_process.initial();
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    pool_process.compute(
+                        ele,
+                        input_data[(d * input_height + h) * input_width + w]);
+                  }
+                }
+              }
+              int pool_size =
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              pool_process.finalize(ele, static_cast<T>(pool_size));
+              output_data[output_idx] = ele;
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+      }
+    }
+  }
+};
+
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+
+              int pool_size =
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              float scale = 1.0 / pool_size;
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+                    pool_grad_process.compute(
+                        input_data[input_idx], output_data[output_idx],
+                        output_grad_data[output_idx],
+                        input_grad_data[input_idx], static_cast<T>(scale));
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template <class T>
+class MaxPool3dGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+              bool stop = false;
+              for (int d = dstart; d < dend && !stop; ++d) {
+                for (int h = hstart; h < hend && !stop; ++h) {
+                  for (int w = wstart; w < wend && !stop; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+
+                    if (input_data[input_idx] == output_data[output_idx]) {
+                      input_grad_data[input_idx] +=
+                          output_grad_data[output_idx];
+                      stop = true;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool3dGradFunctor<platform::CPUPlace, float>;
+// template class MaxPool3dGradFunctor<platform::CPUPlace, double>;
+
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::CPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<
+    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aeccd1f8e8855c51ad85016f0cb239b4c9c8fb0
--- /dev/null
+++ b/paddle/operators/math/pooling.cu
@@ -0,0 +1,635 @@
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2D(const int nthreads, const T* input_data,
+                             T* output_data, const int channels,
+                             const int input_height, const int input_width,
+                             const int output_height, const int output_width,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_height, const int stride_width,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    input_data += (batch_idx * channels + c) * input_height * input_width;
+    T ele = pool_process.initial();
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        pool_process.compute(ele, input_data[h * input_width + w]);
+      }
+    }
+    int pool_size = (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, (static_cast<T>(pool_size)));
+    output_data[index] = ele;
+  }
+}
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetC = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int phend = min(offsetH / stride_height + 1, output_height);
+    int pwend = min(offsetW / stride_width + 1, output_width);
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx =
+        (batch_idx * channels + offsetC) * output_height * output_width;
+    output_data += output_idx;
+    output_grad += output_idx;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        int hstart = ph * stride_height - padding_height;
+        int wstart = pw * stride_width - padding_width;
+        int hend = min(hstart + ksize_height, input_height);
+        int wend = min(wstart + ksize_width, input_width);
+        hstart = max(hstart, 0);
+        wstart = max(wstart, 0);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        int output_sub_idx = ph * output_width + pw;
+        pool_process.compute(input, output_data[output_sub_idx],
+                             output_grad[output_sub_idx], gradient,
+                             static_cast<T>(1.0 / pool_size));
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+template <typename T>
+__global__ void KernelMaxPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    input_data += (batch_idx * channels + c) * input_height * input_width;
+    input_grad += (batch_idx * channels + c) * input_height * input_width;
+
+    T ele = output_data[index];
+    int maxIndex = -1;
+    bool stop = false;
+    for (int h = hstart; h < hend && !stop; ++h) {
+      for (int w = wstart; w < wend && !stop; ++w) {
+        if (ele == input_data[h * input_width + w]) {
+          maxIndex = h * input_width + w;
+          stop = true;
+        }
+      }
+    }
+
+    if (maxIndex != -1) {
+      // atomic add
+      atomicAdd(input_grad + maxIndex, output_grad[index]);
+    }
+  }
+}
+
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool2D<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(nthreads, input_data, output_data, input_channels,
+                              input_height, input_width, output_height,
+                              output_width, ksize_height, ksize_width,
+                              stride_height, stride_width, padding_height,
+                              padding_width, pool_process);
+  }
+};
+
+template <typename PoolProcess, typename T>
+class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool2DGrad<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, output_height, output_width,
+        ksize_height, ksize_width, stride_height, stride_width, padding_height,
+        padding_width, pool_process);
+  }
+};
+
+template <typename T>
+class MaxPool2dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2DGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, output_height, output_width,
+        ksize_height, ksize_width, stride_height, stride_width, padding_height,
+        padding_width);
+  }
+};
+
+template class MaxPool2dGradFunctor<platform::GPUPlace, float>;
+// template class MaxPool2dGradFunctor<platform::GPUPlace, double>; // The
+// 64-bit floating-point version of atomicAdd() is only supported by devices of
+// compute capability 6.x and higher.
+
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool2dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3D(
+    const int nthreads, const T* input_data, T* output_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = pool_process.initial();
+    input_data +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          pool_process.compute(
+              ele, input_data[(d * input_height + h) * input_width + w]);
+        }
+      }
+    }
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, static_cast<T>(pool_size));
+    output_data[index] = ele;
+  }
+}
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetD =
+        (index / input_width / input_height) % input_depth + padding_depth;
+    int offsetC = (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+
+    int pdstart = (offsetD < ksize_depth)
+                      ? 0
+                      : (offsetD - ksize_depth) / stride_depth + 1;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int pdend = min((offsetD) / stride_depth + 1, output_depth);
+    int phend = min((offsetH) / stride_height + 1, output_height);
+    int pwend = min((offsetW) / stride_width + 1, output_width);
+
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx = (batch_idx * channels + offsetC) * output_depth *
+                     output_height * output_width;
+    output_data += output_idx;
+    output_grad += output_idx;
+
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * stride_depth - padding_depth;
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int dend = min(dstart + ksize_depth, input_depth);
+          int hend = min(hstart + ksize_height, input_height);
+          int wend = min(wstart + ksize_width, input_width);
+          dstart = max(dstart, 0);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
+          pool_process.compute(input, output_data[output_sub_idx],
+                               output_grad[output_sub_idx], gradient,
+                               static_cast<T>(1.0 / pool_size));
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+template <typename T>
+__global__ void KernelMaxPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = output_data[index];
+    bool stop = false;
+    int maxIdx = -1;
+    input_data +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    input_grad +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+
+    for (int d = dstart; d < dend && !stop; ++d) {
+      for (int h = hstart; h < hend && !stop; ++h) {
+        for (int w = wstart; w < wend && !stop; ++w) {
+          if (ele == input_data[(d * input_height + h) * input_width + w]) {
+            stop = true;
+            maxIdx = (d * input_height + h) * input_width + w;
+          }
+        }
+      }
+    }
+    if (maxIdx != -1) {
+      // atomic add
+      atomicAdd(input_grad + maxIdx, output_grad[index]);
+    }
+  }
+}
+
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool3D<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, input_channels, input_depth,
+        input_height, input_width, output_depth, output_height, output_width,
+        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        pool_process);
+  }
+};
+
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    int nthreads =
+        batch_size * input_channels * input_depth * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool3DGrad<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_depth, input_height, input_width, output_depth,
+        output_height, output_width, ksize_depth, ksize_height, ksize_width,
+        stride_depth, stride_height, stride_width, padding_depth,
+        padding_height, padding_width, pool_process);
+  }
+};
+
+template <class T>
+class MaxPool3dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool3DGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_depth, input_height, input_width, output_depth,
+        output_height, output_width, ksize_depth, ksize_height, ksize_width,
+        stride_depth, stride_height, stride_width, padding_depth,
+        padding_height, padding_width);
+  }
+};
+
+template class MaxPool3dGradFunctor<platform::GPUPlace, float>;
+// template class MaxPool3dGradFunctor<platform::GPUPlace, double>;  // The
+// 64-bit floating-point version of atomicAdd() is only supported by devices of
+// compute capability 6.x and higher.
+
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::GPUPlace,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
+template class Pool3dGradFunctor<
+    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..d214c689235ad4233d3e4e1c2aa0fdc993bf20c6
--- /dev/null
+++ b/paddle/operators/math/pooling.h
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+//////////////////////
+#define FLT_MAX __FLT_MAX__  //
+
+template <class T>
+class MaxPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
+  DEVICE inline void finalize(T& y, const T& poo_size) {}
+};
+
+template <class T>
+class AvgPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(0); }
+  DEVICE inline void compute(T& y, const T& x) { y += x; }
+  DEVICE inline void finalize(T& y, const T& poo_size) { y /= poo_size; }
+};
+template <class T>
+class MaxPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += dy * (x == y);
+  }
+};
+
+template <class T>
+class AvgPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += (scale * dy);
+  }
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);
+};
+
+template <typename Place, class T>
+class MaxPool2dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);
+};
+
+template <typename Place, class T>
+class MaxPool3dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c29f51f05613832c838400eb114465c81290ea58
--- /dev/null
+++ b/paddle/operators/pool_op.cc
@@ -0,0 +1,195 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+int OutputSizePool(int input_size, int filter_size, int padding, int stride) {
+  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  return output_size;
+}
+
+class PoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Out(Output) of Pooling should not be null.");
+
+    auto in_x_dims = ctx->GetInputDim("X");
+
+    std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+    PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
+                   "pooling_type should be 'max' or 'avg'");
+    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                   "Pooling intput should be 4-D or 5-D");
+
+    if (ctx->Attrs().Get<bool>("globalPooling")) {
+      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+    }
+
+    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                   "Input size and Pooling size should be consistent.");
+    PADDLE_ENFORCE(ksize.size() == 2 || ksize.size() == 3,
+                   "Pooling size should be 2 elements. or 3 elements.");
+    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                      "strides size and pooling size should be the same.");
+    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                      "paddings size and pooling size should be the same.");
+
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class PoolOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContextBase *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input@Grad of Pooling should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool2dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "The input tensor of pooling operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "The output tensor of pooling operator."
+              "The format of output tensor is also NCHW.");
+
+    AddAttr<std::string>("poolingType",
+                         "PoolingType of pooling operator."
+                         "Str constant equal to 'max' or 'avg'.")
+        .InEnum({"max", "avg"});
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "Pooling size(depth, height, width) of pooling operator."
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Add checker)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling."
+        "Bool constant equal to false or true."
+        "Default false."
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>("strides",
+                              "Strides(height, width) of pooling operator."
+                              "Default {1,1}")
+        .SetDefault({1, 1});  // TODO(Add checker)
+    AddAttr<std::vector<int>>("paddings",
+                              "Paddings(height, width) of pooling operator."
+                              "Default {0,0}.")
+        .SetDefault({0, 0});  // TODO(Add checker)
+    AddComment(R"DOC(
+The pooling2d operation calculates the output based on
+the input, poolingType and ksize, strides, paddings parameters.
+)DOC");
+  }
+};
+
+class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input tensor of pooling operator. "
+             "The format of input tensor is NCDHW. Where N is batch size, C is "
+             "the "
+             "number of channels, D, H and W is the depth, height and width of "
+             "feature.");
+    AddOutput("Out",
+              "The output tensor of pooling operator."
+              "The format of output tensor is also NCDHW.");
+
+    AddAttr<std::string>("poolingType",
+                         "PoolingType of pooling operator."
+                         "str constant equal to 'max' or 'avg'.")
+        .InEnum({"max", "avg"});
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "Pooling size(depth, height, width) of pooling operator."
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Add checker)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling."
+        "Bool constant equal to false or true."
+        "Default false."
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "strides",
+        "Strides(depth, height, width) of pooling operator."
+        "Default {1,1,1}.")
+        .SetDefault({1, 1, 1});  // TODO(Add checker)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "Paddings(depth, height, width) of pooling operator."
+        "Default {0,0,0}.")
+        .SetDefault({0, 0, 0});  // TODO(Add checker)
+    AddComment(R"DOC(
+The pooling3d operation calculates the output based on
+the input, poolingType and ksize, strides, paddings parameters.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
+
+REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/pool_op.cu b/paddle/operators/pool_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0e3b80868f7b9d1697d619889160856d65ad59a3
--- /dev/null
+++ b/paddle/operators/pool_op.cu
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+
+REGISTER_OP_GPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2bc358def42959f2cc8f61cb00436fae1b7514b
--- /dev/null
+++ b/paddle/operators/pool_op.h
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class PoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+
+    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("globalPooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+
+    switch (ksize.size()) {
+      case 2: {
+        if (pooling_type == "max") {
+          paddle::operators::math::Pool2dFunctor<
+              Place, paddle::operators::math::MaxPool<T>, T>
+              pool2d_forward;
+          paddle::operators::math::MaxPool<T> pool_process;
+          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+
+        } else if (pooling_type == "avg") {
+          paddle::operators::math::Pool2dFunctor<
+              Place, paddle::operators::math::AvgPool<T>, T>
+              pool2d_forward;
+          paddle::operators::math::AvgPool<T> pool_process;
+          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        }
+      } break;
+      case 3: {
+        if (pooling_type == "max") {
+          paddle::operators::math::Pool3dFunctor<
+              Place, paddle::operators::math::MaxPool<T>, T>
+              pool3d_forward;
+          paddle::operators::math::MaxPool<T> pool_process;
+          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        } else if (pooling_type == "avg") {
+          paddle::operators::math::Pool3dFunctor<
+              Place, paddle::operators::math::AvgPool<T>, T>
+              pool3d_forward;
+          paddle::operators::math::AvgPool<T> pool_process;
+          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        }
+      } break;
+    }
+  }
+};
+
+template <typename Place, typename T>
+class PoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+
+    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    if (context.Attr<bool>("globalPooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
+      temp.device(context.GetEigenDevice<Place>()) =
+          temp.constant(static_cast<T>(0));
+
+      switch (ksize.size()) {
+        case 2: {
+          if (pooling_type == "max") {
+            paddle::operators::math::MaxPool2dGradFunctor<Place, T>
+                pool2d_backward;
+            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings);
+          } else if (pooling_type == "avg") {
+            paddle::operators::math::Pool2dGradFunctor<
+                Place, paddle::operators::math::AvgPoolGrad<T>, T>
+                pool2d_backward;
+            paddle::operators::math::AvgPoolGrad<T> pool_process;
+            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings, pool_process);
+          }
+        } break;
+        case 3: {
+          if (pooling_type == "max") {
+            paddle::operators::math::MaxPool3dGradFunctor<Place, T>
+                pool3d_backward;
+            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings);
+          } else if (pooling_type == "avg") {
+            paddle::operators::math::Pool3dGradFunctor<
+                Place, paddle::operators::math::AvgPoolGrad<T>, T>
+                pool3d_backward;
+            paddle::operators::math::AvgPoolGrad<T> pool_process;
+            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings, pool_process);
+          }
+        } break;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 8f62a9f4db8d39edc11949df513aebf4fa257d45..5d76313aeb96c0c8204f64aee1057f753ec85d6b 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -43,8 +43,10 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SumOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
-    AddOutput("Out", "the output tensor of sum operator.");
+    AddInput("X", "the input tensors of sum operator.")
+        .AsDuplicable()
+        .NotInGradient();
+    AddOutput("Out", "the output tensor of sum operator.").NotInGradient();
     AddComment(R"DOC(
 Sum the input tensors.
 
diff --git a/paddle/platform/hostdevice.h b/paddle/platform/hostdevice.h
index e7de86b7b2f75d206e730ec409bbee5d0a08942e..eb2df291cceef553d6422e6166e1fef2c63e2a47 100644
--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
@@ -2,8 +2,10 @@
 
 #ifdef __CUDACC__
 #define HOSTDEVICE __host__ __device__
+#define DEVICE __device__
 #define HOST __host__
 #else
 #define HOSTDEVICE
+#define DEVICE
 #define HOST
 #endif
diff --git a/python/paddle/v2/framework/tests/test_add_op.py b/python/paddle/v2/framework/tests/test_add_op.py
deleted file mode 100644
index 3ca34d9b9fc2b7b54cc25ca0e0d1a08a71e37c52..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_add_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestAddOp(OpTest):
-    def setUp(self):
-        self.op_type = "add"
-        self.inputs = {
-            'X': np.random.random((102, 105)).astype("float32"),
-            'Y': np.random.random((102, 105)).astype("float32")
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/framework/tests/test_cond_op.py
index 3698ce9c8ed5c021826af622a53ee742e9b22552..76323b5e10c59822b4de82a70ebd57b3e57c8392 100644
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
@@ -15,7 +15,7 @@ class PySimpleCond(object):
         for i in range(1, 10, 2):
             array[i] = 0
         self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1))
+        self.x = np.ones(shape=(10, 1)).astype("float32")
 
     def forward(self):
         self.index_t = np.where(self.cond == 1)
diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py
deleted file mode 100644
index 85117bf9600975ea5d61dfb5b34335792bf6d8b2..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import unittest
-import numpy as np
-import paddle.v2.framework.core as core
-from op_test import get_numeric_gradient
-from op_test import create_op
-
-
-class GetNumericGradientTest(unittest.TestCase):
-    def test_add_op(self):
-        x = np.random.random((10, 1)).astype("float32")
-        y = np.random.random((10, 1)).astype("float32")
-        z = x + y
-        scope = core.Scope()
-        add_op = create_op(scope, "add", {'X': x, 'Y': y}, {'Out': z}, dict())
-        arr = get_numeric_gradient(scope, add_op, {'X': x,
-                                                   'Y': y}, 'X', ['Out'])
-        self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4)
-
-    def test_softmax_op(self):
-        def stable_softmax(x):
-            """Compute the softmax of vector x in a numerically stable way."""
-            shiftx = x - np.max(x)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
-        def label_softmax_grad(Y, dY):
-            dX = Y * 0.0
-            for i in range(Y.shape[0]):
-                d = np.dot(Y[i, :], dY[i, :])
-                dX[i, :] = Y[i, :] * (dY[i, :] - d)
-            return dX
-
-        X = np.random.random((2, 2)).astype("float32")
-        Y = np.apply_along_axis(stable_softmax, 1, X)
-        dY = np.ones(Y.shape)
-        dX = label_softmax_grad(Y, dY)
-
-        scope = core.Scope()
-        softmax_op = create_op(scope, "softmax", {"X": X}, {"Y": Y}, dict())
-
-        arr = get_numeric_gradient(scope, softmax_op, {"X": X}, "X", "Y")
-        np.testing.assert_almost_equal(arr, dX, decimal=1e-2)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py
index 50cfb855f2b01d8fd32342855d46716da7e07856..8503257feb8e1a5802f3f889f72c559a2aaa583a 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -15,7 +15,7 @@ def fc(X, W, Y):
 class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
-        op1 = Operator("add", X="X", Y="Y", Out="Out")
+        op1 = Operator("sum", X=["X", "Y"], Out="Out")
         net.append_op(op1)
 
         net2 = core.Net.create()
@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase):
 
         expected = '''
 Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
+    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
     Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
         Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
             Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/framework/tests/test_operator.py
index 040556322d79cbb594eb9af585a5b9920d7ab625..98f6b2f5ee639120557cb85b3ada6d2931f7d0d2 100644
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase):
 
 class TestOpCreations(unittest.TestCase):
     def test_all(self):
-        add_op = op.Operator("add", X="a", Y="b", Out="z")
+        add_op = op.Operator("sum", X=["a", "b"], Out="z")
         self.assertIsNotNone(add_op)
         # Invoke C++ DebugString()
-        self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
+        self.assertEqual('Op(sum), inputs:{X[a, b]}, outputs:{Out[z]}.',
                          str(add_op))
 
 
diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/framework/tests/test_pool2d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2941fda81b23998072810d8c6f6597a6f3db7e30
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
@@ -0,0 +1,144 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+    return out
+
+
+def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+
+            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
+                (r_end - r_start) * (c_end - c_start))
+    return out
+
+
+class TestPool2d_Op(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool2D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings, self.global_pool)
+        self.inputs = {'X': input}
+
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
+        }
+
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase1(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase2(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+class TestCase3(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase4(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase5(TestPool2d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/framework/tests/test_pool3d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8792b492e3da6541f71185be82b8bfc4f52d821d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
@@ -0,0 +1,152 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in xrange(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in xrange(H_out):
+            h_start = np.max((i * strides[0] - paddings[0], 0))
+            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            for j in xrange(W_out):
+                w_start = np.max((j * strides[1] - paddings[1], 0))
+                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+
+                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+    return out
+
+
+def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in xrange(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in xrange(H_out):
+            h_start = np.max((i * strides[0] - paddings[0], 0))
+            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            for j in xrange(W_out):
+                w_start = np.max((j * strides[1] - paddings[1], 0))
+                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+
+                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / (
+                    (d_end - d_start) * (h_end - h_start) * (w_end - w_start))
+    return out
+
+
+class TestPool3d_Op(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool3D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings, self.global_pool)
+        self.inputs = {'X': input}
+
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
+        }
+
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase1(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase2(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+class TestCase3(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase4(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase5(TestPool3d_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+if __name__ == '__main__':
+    unittest.main()