Commit 83627d3e authored by: D dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lstm

......@@ -105,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON)
"different Python interpreter from compiling.")
endif()
if(MOBILE_INFERENCE)
set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
########################################################################################
include(external/mklml) # download mklml package
......
......@@ -8,7 +8,7 @@ ExternalProject_Add(
extern_eigen3
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
GIT_TAG "master"
GIT_TAG 4e79cb69b9425f5f8c3a84be4350d4ab75b5fd9d
PREFIX ${EIGEN_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
......@@ -36,6 +36,7 @@ ExternalProject_Add(
# change this back to the official Github repo once my PR is
# merged.
GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git"
GIT_TAG 986964c07427ecb9cdb5bd73f73ebbd40e54dadb
PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......@@ -45,11 +46,11 @@ ExternalProject_Add(
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
......
......@@ -31,6 +31,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS gflags
GIT_REPOSITORY "https://github.com/google/glog.git"
GIT_TAG v0.3.5
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......@@ -43,12 +44,12 @@ ExternalProject_Add(
-DWITH_GFLAGS=ON
-Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
......
......@@ -56,11 +56,11 @@ IF(WITH_TESTING)
-DBUILD_GMOCK=ON
-Dgtest_disable_pthreads=ON
-Dgtest_force_shared_crt=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
......
......@@ -191,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
${OPTIONAL_CACHE_ARGS}
......
......@@ -35,6 +35,7 @@ ExternalProject_Add(
extern_warpctc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/gangliao/warp-ctc.git"
GIT_TAG b63a0644654a3e0ed624c85a1767bc8193aead09
PREFIX ${WARPCTC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......@@ -48,9 +49,9 @@ ExternalProject_Add(
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
......
......@@ -42,11 +42,11 @@ ExternalProject_Add(
-DBUILD_SHARED_LIBS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_MACOSX_RPATH=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
LIST(APPEND external_project_dependencies zlib)
......
......@@ -243,7 +243,7 @@ class SymbolTable {
// TODO determine whether name is generated by python or C++.
// Currently assume that a unique name will be generated by C++ if the
// argument name is left default.
VarDesc* NewVar(const string& name="");
VarDesc* Var(const string& name="");
// find a VarDesc by name, if recursive is true, find parent's SymbolTable
// recursively.
......
......@@ -33,7 +33,6 @@ digraph ImageClassificationGraph {
cost -> MSE_Grad [color=red];
d_cost -> MSE_Grad [color=red];
x -> MSE_Grad [color=red];
l -> MSE_Grad [color=red];
y -> MSE_Grad -> d_y [color=red];
......
......@@ -3,15 +3,17 @@
## The Problem Posed
In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance.
Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes a C++ operator instance and returns the corresponding gradient operator instance.
However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message and return a corresponding message.
However, we noticed two problems with the current design:
More than that, the new registration mechanism needs to support the fact that an operator's gradient computation might be a composition of operators.
1. As we decided to separate the *compilation* and *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
## Current Implementation
1. Some operators' gradient computation requires more than one gradient operator. For example, the gradient of *minus* consists of two operators -- an identity operator and a scale operator. So we need to make the registration mechanism support the mapping from one operator to a set of operators for gradient computation, as sketched below.
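A minimal sketch of what this one-to-many mapping could look like (the names `OpDescSketch`, `GradMakerFn`, and `MinusGradMaker` are illustrative stand-ins, not the real PaddlePaddle API):

```cpp
#include <functional>
#include <string>
#include <vector>

// Illustrative stand-in for an operator description; inputs, outputs and
// attributes are omitted for brevity.
struct OpDescSketch {
  std::string type;  // e.g. "minus", "identity", "scale"
};

// The key change: one forward op description maps to a *vector* of gradient
// op descriptions rather than to a single one.
using GradMakerFn =
    std::function<std::vector<OpDescSketch>(const OpDescSketch&)>;

// For Out = X - Y: dX = dOut (identity), dY = -dOut (scale by -1).
std::vector<OpDescSketch> MinusGradMaker(const OpDescSketch& /*fwd*/) {
  return {OpDescSketch{"identity"}, OpDescSketch{"scale"}};
}
```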
`OpInfo` objects are stored in an association map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create its gradient operator through the gradient's `OpInfo::creator_`. The pseudo code is
## The Current Implementation
The C++ class `OpInfo` is stored in an association map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create its gradient operator through the gradient's `OpInfo::creator_`. The pseudo code is
```cpp
struct OpInfo {
......
......@@ -37,7 +37,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
```cpp
class Scope {
public:
Variable* NewVar(const std::string& name);
Variable* Var(const std::string& name);
const Variable* FindVar(const std::string& name) const;
private:
......@@ -98,7 +98,7 @@ class Scope {
Variable* FindVar(const std::string& name) const;
// return if already contains same name variable.
Variable* NewVar(const std::string& name);
Variable* Var(const std::string& name);
private:
std::shared_ptr<Scope> parent_;
......@@ -107,7 +107,7 @@ class Scope {
```
## Only scope can create a variable
To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function and make `Scope` a friend class of `Variable`. Then only `NewVar` can construct a `Variable`.
To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function and make `Scope` a friend class of `Variable`. Then only `Var` can construct a `Variable`.
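A minimal C++ sketch of this friendship arrangement (a hedged illustration with trimmed-down class bodies, not the real `Variable` and `Scope`):

```cpp
#include <map>
#include <memory>
#include <string>

class Variable {
 private:
  Variable() = default;  // private: only friends can construct a Variable
  friend class Scope;    // Scope::Var becomes the single creation path
};

class Scope {
 public:
  // Get-or-create; the private Variable constructor is reachable here only
  // because Scope is a friend of Variable.
  Variable* Var(const std::string& name) {
    auto& slot = vars_[name];
    if (slot == nullptr) slot.reset(new Variable());
    return slot.get();
  }

 private:
  std::map<std::string, std::unique_ptr<Variable>> vars_;
};
```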
## When a scope is destroyed, all variables inside it should be destroyed together
......@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
## Orthogonal interface
`FindVar` will return `nullptr` when `name` is not found, so it can be used as a `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. By combining `FindVar` and `NewVar`, we can implement `NewVar` easily.
`FindVar` will return `nullptr` when `name` is not found, so it can be used as a `Contains` method. `Var` will return an `Error` when there is a name conflict locally. By combining `FindVar` and `Var`, we can implement `Var` easily.
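A small self-contained usage sketch of this orthogonality (the `MiniScope` type below is a stand-in for the `Scope` interface above, not the real class):

```cpp
#include <cassert>
#include <map>
#include <string>

class Variable {};

class MiniScope {
 public:
  // Get-or-create, built on top of lookup.
  Variable* Var(const std::string& name) { return &vars_[name]; }

  // Returns nullptr on miss, so it doubles as a Contains check.
  const Variable* FindVar(const std::string& name) const {
    auto it = vars_.find(name);
    return it == vars_.end() ? nullptr : &it->second;
  }

 private:
  std::map<std::string, Variable> vars_;
};

int main() {
  MiniScope scope;
  assert(scope.FindVar("w") == nullptr);  // not present yet
  scope.Var("w");                         // create on first use
  assert(scope.FindVar("w") != nullptr);  // subsequent lookups succeed
  return 0;
}
```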
......@@ -161,7 +161,7 @@ class TensorArray:
@name: str
the name of the variable to output.
'''
tensor = NewVar(name)
tensor = Var(name)
tensor_array_stack(self.name, tensor)
return tensor
......
......@@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep
## Definition of VarDesc
A VarDesc should have a name and a value; in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time, we add a LoDTensorDesc to represent it.
A VarDesc should have a name and a value. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`.
```proto
message VarDesc {
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
enum VarType {
LOD_TENSOR = 0;
SELECTED_ROWS = 1;
}
required VarType type = 2;
optional LoDTensorDesc lod_desc = 3;
optional TensorDesc selected_rows_desc = 4;
optional bool persistable = 5 [ default = false ];
}
```
## Definition of LodTensorDesc
## Definition of TensorDesc
```proto
enum DataType {
......@@ -38,87 +45,25 @@ enum DataType {
FP64 = 6;
}
message LoDTensorDesc {
message TensorDesc {
required DataType data_type = 1;
repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [default=0];
repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
}
```
## Definition of Variable in Python
In the Python API, a layer will take a Variable as input and return a Variable as output. There should be a class `Variable` in Python to help create and manage Variables.
```python
image = Variable(dims=[-1, 640, 480])
# fc1 and fc2 are both Variable
fc1 = layer.fc(input=image, output_size=10)
fc2 = layer.fc(input=fc1, output_size=20)
```
### What should class `Variable` have
1. `name`. A name of string type is used to mark the value of the Variable.
1. `initializer`. Since our Tensor does not have a value, we will always use some Operator to fill it at run time. So we should have an initialize method to help add the init operator.
1. `operator`. Variable should record which operator produces itself. The reason is:
- we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable.
In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
```python
import VarDesc
import LoDTensorDesc
import framework
def AddInitialOperator(variable, initializer):
# add an initialize Operator to block to init this Variable
class Variable(object):
def __init__(self, name, dims, type, initializer):
self._block = get_default_block()
self._name = name
self.op = None
tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
_var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
self._var = framework.CreateVar(_var_desc)
self._block.add_var(self)
A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).
# add initial op according to initializer
if initializer is not None:
AddInitialOperator(self, initializer)
def dims(self):
return self._var.dims()
def data_type(self):
return self._var.data_type()
## Definition of LodTensorDesc
def to_proto(self):
pass
```proto
message LoDTensorDesc {
required TensorDesc tensor = 1;
optional int32 lod_level = 2;
}
```
Then we can use this Variable to create a fc layer in Python.
A LoDTensorDesc contains a tensor and a lod_level.
```python
import paddle as pd
def flatten_size(X, num_flatten_dims):
prod = 1 # of last num_flatten_dims
for i in xrange(num_flatten_dims):
prod = prod * X.dims[-i-1]
return prod
def layer.fc(X, output_size, num_flatten_dims):
W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
out = Variable(type=FP32)
y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
pd.InferShape(y)
return out
x = Variable(dims=[-1, 640, 480])
y = layer.fc(x, output_size=100)
z = layer.fc(y, output_size=200)
## Definition of Variable in Python
paddle.eval(targets=[z], ...)
print(z)
```
For Variable in Python, please refer to [`Python API`](./python_api.md).
......@@ -26,7 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic")
SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
paddle_parameter
......
......@@ -19,10 +19,10 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(framework_proto SRCS framework.proto)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
......@@ -42,12 +42,17 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward ${GLOB_OP_LIB})
#if(WITH_GPU)
# nv_test(executor_test SRCS executor_test.cc DEPS executor)
#else()
# cc_test(executor_test SRCS executor_test.cc DEPS executor)
#endif()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward)
set(EXECUTOR_TEST_OP elementwise_add_op gaussian_random_op feed_op fetch_op
mul_op sum_op squared_l2_distance_op fill_constant_op sgd_op mean_op)
if(WITH_GPU)
nv_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
else()
cc_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
endif()
cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......@@ -28,14 +28,15 @@ namespace paddle {
namespace framework {
static inline std::unique_ptr<OperatorBase> CreateGradOp(
const OperatorBase& op) {
const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var) {
OpDescBind op_desc;
op_desc.SetInputMap(op.Inputs());
op_desc.SetOutputMap(op.Outputs());
op_desc.SetType(op.Type());
op_desc.SetAttrMap(op.Attrs());
auto& info = OpInfoMap::Instance().Get(op.Type());
auto grad_descs = info.GradOpMaker()(op_desc);
auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var);
std::vector<std::unique_ptr<OperatorBase>> grad_ops;
grad_ops.reserve(grad_descs.size());
std::transform(grad_descs.begin(), grad_descs.end(),
......@@ -98,7 +99,9 @@ static std::unique_ptr<OperatorBase> NOP() {
// See Backward.h for details
static std::unique_ptr<OperatorBase> BackwardRecursive(
const OperatorBase& forwardOp,
std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
std::unordered_set<std::string>& no_grad_names,
std::unordered_map<std::string, std::string>* grad_to_var,
size_t& uniq_id) {
// If all input gradients of forwarding operator do not need to calculate,
// just return an NOP. Not return null ptr because NOP does not take
// too much time for calculation, but it is useful for simplifying logic.
......@@ -136,7 +139,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
++it, ++local_op_id) {
auto& fwd = *it;
auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id);
ForEachVarName(bwd->Outputs(),
[&dup_output_ops, local_op_id](const std::string& out) {
dup_output_ops[out].emplace_back(local_op_id);
......@@ -187,7 +190,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
net->InsertOp(pos.first + 1, std::move(pos.second));
}
} else {
std::unique_ptr<OperatorBase> grad_op(CreateGradOp(forwardOp));
std::unique_ptr<OperatorBase> grad_op(
CreateGradOp(forwardOp, no_grad_names, grad_to_var));
ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
const std::string& grad_input) {
......@@ -226,7 +230,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
*static_cast<const OperatorBase*>(&rnnop.stepnet());
// create stepnet's gradient op
rnn_grad_op->set_stepnet(
BackwardRecursive(stepnet_op, no_grad_names, uniq_id));
BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
}
if (net->ops_.empty()) { // Current no aux op is added to network
......@@ -253,7 +257,8 @@ std::unique_ptr<OperatorBase> Backward(
no_grad_names.insert(name + kGradVarSuffix);
}
size_t uid = 0;
return BackwardRecursive(forwardOp, no_grad_names, uid);
std::unordered_map<std::string, std::string> grad_to_var;
return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid);
}
// ==================================== //
......@@ -268,30 +273,61 @@ static bool AllGradInSet(const std::vector<std::string>& names,
return true;
}
static void CreateGradVarInBlock(
size_t grad_op_start_index,
const std::unordered_map<std::string, std::string>& param_name_map,
BlockDescBind* block_desc,
std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
auto ops = block_desc->AllOps();
for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) {
ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) {
if (block_desc->HasVar(grad_var_name)) {
return false;
}
block_desc->Var(grad_var_name);
auto it = param_name_map.find(grad_var_name);
if (it == param_name_map.end()) {
return false;
}
auto param_var_name = it->second;
auto& grad_record = (*grad_var_record)[param_var_name];
grad_record.name_ = grad_var_name;
grad_record.block_idx_ = block_desc->ID();
grad_record.op_idx_ = static_cast<int>(op_index);
return false; /* not break */
});
}
}
std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
const std::unique_ptr<OpDescBind>& op_desc,
std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) {
std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
// All input gradients of forwarding operator do not need to calculat.
// All input gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
if (AllGradInSet(inputs, no_grad_vars)) {
if (AllGradInSet(inputs, *no_grad_vars)) {
return grad_op_descs; // empty vector
}
// All output gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
if (AllGradInSet(outputs, no_grad_vars)) {
if (AllGradInSet(outputs, *no_grad_vars)) {
for (const std::string& name : inputs) {
no_grad_vars.insert(GradVarName(name));
no_grad_vars->insert(GradVarName(name));
}
return grad_op_descs; // empty vector
}
grad_op_descs = OpRegistry::CreateGradOpDescs(op_desc.get());
grad_op_descs = OpInfoMap::Instance()
.Get(op_desc->Type())
.GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var);
std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
for (auto& desc : grad_op_descs) {
for (const std::string& in_name : desc->InputArgumentNames()) {
if (no_grad_vars.count(in_name)) {
if (no_grad_vars->count(in_name)) {
std::string prefix = in_name.substr(
0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
std::string new_name = prefix + kZeroVarSuffix;
......@@ -301,11 +337,6 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
}
}
for (const std::string& out_name : desc->OutputArgumentNames()) {
if (no_grad_vars.count(out_name)) {
desc->Rename(out_name, kEmptyVarName);
}
}
}
for (auto& p : pending_fill_zeros_ops) {
......@@ -316,23 +347,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx,
std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) {
BlockDescBind* cur_block = program_desc.Block(block_idx);
std::deque<std::unique_ptr<OpDescBind>>& op_descs = cur_block->ops_;
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
size_t grad_desc_idx = 0;
std::vector<std::unique_ptr<OpDescBind>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
std::vector<std::unique_ptr<OpDescBind>> op_grads =
MakeOpGrad(*it, no_grad_vars);
MakeOpGrad(*it, no_grad_vars, grad_to_var);
if ((*it)->Type() == "recurrent") {
PADDLE_ENFORCE_EQ(
op_grads.size(), size_t(1),
op_grads.size(), static_cast<size_t>(1),
"rnn_op's gradient process should contain only one op.");
int step_block_idx = (*it)->GetBlockAttr("stop_block");
auto backward_block_op_descs =
MakeBlockBackward(program_desc, step_block_idx, no_grad_vars);
int step_block_idx = (*it)->GetBlockAttr("step_block");
auto backward_block_op_descs = MakeBlockBackward(
program_desc, step_block_idx, no_grad_vars, grad_to_var);
BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
for (auto& ptr : backward_block_op_descs) {
backward_block->ops_.push_back(std::move(ptr));
......@@ -376,24 +409,56 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
backward_descs.insert(backward_descs.begin() + p.first + 1,
std::move(p.second));
}
return backward_descs;
}
void AppendBackward(ProgramDescBind& program_desc,
const std::unordered_set<std::string>& no_grad_vars) {
ParamGradInfoMap AppendBackward(
ProgramDescBind& program_desc, const VarDescBind& target,
const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_var_names;
no_grad_var_names.reserve(no_grad_vars.size() + 1);
no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
for (auto& name : no_grad_vars) {
no_grad_var_names.insert(GradVarName(name));
}
const int root_block_idx = 0;
auto backward_op_descs =
MakeBlockBackward(program_desc, root_block_idx, no_grad_var_names);
auto& forw_op_descs = program_desc.Block(root_block_idx)->ops_;
auto root_block = program_desc.Block(root_block_idx);
auto& all_ops = root_block->ops_;
// insert fill one op for target
std::string fill_one_op_out = GradVarName(target.Name());
std::unique_ptr<OpDescBind> fill_one_op(
new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
{{"shape", std::vector<int>{1}},
{"value", static_cast<float>(1.0)},
{"dataType", framework::DataType::FP32}}));
all_ops.push_back(std::move(fill_one_op));
size_t forward_op_num = all_ops.size();
size_t forward_block_num = program_desc.Size();
// Insert backward operators
std::unordered_map<std::string, std::string> grad_to_var;
auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
&no_grad_var_names, &grad_to_var);
std::unordered_map<std::string, GradVarInfo> retv;
// Create Variable
for (auto& ptr : backward_op_descs) {
forw_op_descs.push_back(std::move(ptr));
all_ops.push_back(std::move(ptr));
}
root_block->Var(fill_one_op_out);
// create grad_var for all blocks in this program
CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
for (size_t block_index = forward_block_num;
block_index < program_desc.Size(); ++block_index) {
CreateGradVarInBlock(0, grad_to_var, program_desc.Block(block_index),
&retv);
}
return retv;
}
} // namespace framework
......
......@@ -14,7 +14,10 @@
#pragma once
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
......@@ -27,10 +30,27 @@ extern std::unique_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
// TODO(jiayi): Add target as parameter and generate backward op
// according to target.
void AppendBackward(ProgramDescBind& program_desc,
const std::unordered_set<std::string>& no_grad_vars);
struct GradVarInfo {
GradVarInfo() {}
GradVarInfo(const std::string& name, int block_idx, int op_idx)
: name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
bool operator==(const GradVarInfo& b) const {
return name_ == b.name_ && block_idx_ == b.block_idx_ &&
op_idx_ == b.op_idx_;
}
std::string name_;
int block_idx_;
int op_idx_;
};
using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
GradVarInfo /*grad_var_info*/>;
ParamGradInfoMap AppendBackward(
ProgramDescBind& program_desc, const VarDescBind& target,
const std::unordered_set<std::string>& no_grad_vars);
} // namespace framework
} // namespace paddle
......@@ -18,6 +18,7 @@
#include "paddle/framework/block_desc.h"
#include "paddle/framework/op_desc.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/var_desc.h"
#include "paddle/operators/net_op.h"
namespace paddle {
......@@ -169,6 +170,45 @@ class MultInOutOpMaker : public OpProtoAndCheckerMaker {
}
};
class MinusGradOpDescMaker : public GradOpDescMakerBase {
public:
using GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
std::vector<std::unique_ptr<OpDescBind>> retv;
auto x_g = InputGrad("X");
if (!x_g.empty()) {
auto *op_desc = new OpDescBind();
op_desc->SetType("scale");
op_desc->SetInput("X", OutputGrad("Out"));
op_desc->SetOutput("Out", x_g);
op_desc->SetAttr("scale", 1.0f);
retv.emplace_back(op_desc);
}
auto y_g = InputGrad("Y");
if (!y_g.empty()) {
auto *op_desc = new OpDescBind();
op_desc->SetType("scale");
op_desc->SetInput("X", OutputGrad("Out"));
op_desc->SetOutput("Out", y_g);
op_desc->SetAttr("scale", -1.0f);
retv.emplace_back(op_desc);
}
return retv;
}
};
class MinusOpMaker : public OpProtoAndCheckerMaker {
public:
MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "");
AddInput("Y", "");
AddOutput("Out", "");
AddComment("minus for unittest");
}
};
} // namespace framework
} // namespace paddle
......@@ -187,6 +227,7 @@ REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
f::NOP);
REGISTER_OP(mult_in_out, f::NOP, f::MultInOutOpMaker, mult_in_out_grad, f::NOP);
REGISTER_OPERATOR(minus, f::NOP, f::MinusOpMaker, f::MinusGradOpDescMaker);
TEST(Backward, simple_op_not_need_grad) {
auto fwd = f::OpRegistry::CreateOp(
......@@ -395,12 +436,13 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
2UL /* external input number */
+ 1UL /* external output number*/
+ 1UL /* number of gradient of external output*/
+ 2U /* internal variable number*/);
+ 2UL /* internal variable number*/
);
EXPECT_EQ(grad_fc.Outputs(all).size(),
2UL /* input number of mul*/
+ 2UL /* input number of rowwise_add
*/
+ 1UL /* input number of sigmod */);
+ 2UL /* input number of rowwise_add*/
+ 1UL /* input number of sigmod */
- 1UL /* out2 is not needed*/);
EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
......@@ -427,10 +469,14 @@ TEST(Backward, simple_single_op) {
op->SetInput("b", {"b"});
op->SetOutput("Out", {"out"});
AppendBackward(program, {});
auto target = f::VarDescBind("out");
auto var_to_grad = AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 3UL);
f::OpDescBind *fill_op = block->AllOps()[1];
EXPECT_EQ(fill_op->Type(), "fill_constant");
ASSERT_EQ(block->AllOps().size(), 2UL);
f::OpDescBind *grad_op = block->AllOps()[1];
f::OpDescBind *grad_op = block->AllOps()[2];
EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op->InputNames().size(), 1UL);
ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
......@@ -440,6 +486,13 @@ TEST(Backward, simple_single_op) {
std::vector<std::string>({f::GradVarName("x")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b")}));
EXPECT_EQ(var_to_grad.size(), 2UL);
EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
EXPECT_TRUE(block->HasVar(f::GradVarName("b")));
EXPECT_TRUE(block->HasVar(f::GradVarName("x")));
}
TEST(Backward, default_attribute) {
......@@ -451,14 +504,19 @@ TEST(Backward, default_attribute) {
op->SetInput("X", {"x"});
op->SetInput("Y", {"y"});
op->SetOutput("Out", {"out"});
op->CheckAttrs();
AppendBackward(program, {});
auto target = f::VarDescBind("out");
AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 2UL);
ASSERT_EQ(block->AllOps().size(), 3UL);
EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
f::OpDescBind *grad_op = block->AllOps()[1];
f::OpDescBind *fill_op = block->AllOps()[1];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op = block->AllOps()[2];
ASSERT_EQ(grad_op->Type(), "mul_grad");
EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
......@@ -486,10 +544,15 @@ TEST(Backward, simple_mult_op) {
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
AppendBackward(program, {});
auto target = f::VarDescBind("out3");
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 6UL);
f::OpDescBind *grad_op1 = block->AllOps()[5];
ASSERT_EQ(block->AllOps().size(), 6UL + 1);
f::OpDescBind *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op1 = block->AllOps()[6];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
......@@ -500,7 +563,7 @@ TEST(Backward, simple_mult_op) {
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
f::OpDescBind *grad_op2 = block->AllOps()[4];
f::OpDescBind *grad_op2 = block->AllOps()[5];
EXPECT_EQ(grad_op2->Type(), "mul_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
......@@ -514,7 +577,7 @@ TEST(Backward, simple_mult_op) {
EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
f::OpDescBind *grad_op3 = block->AllOps()[3];
f::OpDescBind *grad_op3 = block->AllOps()[4];
EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
......@@ -524,6 +587,23 @@ TEST(Backward, simple_mult_op) {
std::vector<std::string>({f::GradVarName("out2")}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b3")}));
EXPECT_EQ(var_to_grad.size(), 6UL);
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
EXPECT_EQ(var_to_grad.at("out1"),
f::GradVarInfo(f::GradVarName("out1"), 0, 5));
EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
EXPECT_EQ(var_to_grad.at("out2"),
f::GradVarInfo(f::GradVarName("out2"), 0, 4));
EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out2")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
}
TEST(Backward, intermedia_var_no_grad) {
......@@ -554,10 +634,15 @@ TEST(Backward, intermedia_var_no_grad) {
op4->SetInput("Y", {"out3"});
op4->SetOutput("Out", {"out4"});
AppendBackward(program, {"out3"});
auto target = f::VarDescBind("out4");
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"out3"});
ASSERT_EQ(block->AllOps().size(), 6UL);
f::OpDescBind *grad_op1 = block->AllOps()[5];
ASSERT_EQ(block->AllOps().size(), 7UL);
f::OpDescBind *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op1 = block->AllOps()[6];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
......@@ -568,7 +653,7 @@ TEST(Backward, intermedia_var_no_grad) {
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
f::OpDescBind *grad_op4 = block->AllOps()[4];
f::OpDescBind *grad_op4 = block->AllOps()[5];
EXPECT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
......@@ -579,8 +664,17 @@ TEST(Backward, intermedia_var_no_grad) {
std::vector<std::string>({f::GradVarName("out4")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
std::vector<std::string>({f::kEmptyVarName}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());
EXPECT_EQ(var_to_grad.size(), 3UL);
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
EXPECT_EQ(var_to_grad.at("out1"),
f::GradVarInfo(f::GradVarName("out1"), 0, 5));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
}
TEST(Backward, var_no_grad) {
......@@ -601,10 +695,15 @@ TEST(Backward, var_no_grad) {
op2->SetOutput("Y", {"y2"});
op2->SetOutput("Z", {"z2"});
AppendBackward(program, {"z1"});
auto target = f::VarDescBind("z2");
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"z1"});
ASSERT_EQ(block->AllOps().size(), 6UL);
f::OpDescBind *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
ASSERT_EQ(block->AllOps().size(), 5UL);
f::OpDescBind *grad_op2 = block->AllOps()[2];
f::OpDescBind *grad_op2 = block->AllOps()[3];
ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
......@@ -618,10 +717,9 @@ TEST(Backward, var_no_grad) {
std::vector<std::string>({f::GradVarName("z2")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("y1")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("H")),
std::vector<std::string>({f::kEmptyVarName}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
f::OpDescBind *fill_zero_op = block->AllOps()[3];
f::OpDescBind *fill_zero_op = block->AllOps()[4];
ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
......@@ -629,7 +727,7 @@ TEST(Backward, var_no_grad) {
EXPECT_EQ(fill_zero_op->Output("Y"),
std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
f::OpDescBind *grad_op1 = block->AllOps()[4];
f::OpDescBind *grad_op1 = block->AllOps()[5];
ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
......@@ -645,6 +743,15 @@ TEST(Backward, var_no_grad) {
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
std::vector<std::string>({f::GradVarName("h1")}));
EXPECT_EQ(var_to_grad.size(), 3UL);
EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
EXPECT_TRUE(block->HasVar(f::GradVarName("y1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("h1")));
}
TEST(Backward, shared_var) {
......@@ -669,10 +776,15 @@ TEST(Backward, shared_var) {
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
AppendBackward(program, {});
auto target = f::VarDescBind("out3");
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 7UL);
f::OpDescBind *grad_op3 = block->AllOps()[3];
ASSERT_EQ(block->AllOps().size(), 8UL);
f::OpDescBind *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op3 = block->AllOps()[4];
ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
......@@ -683,7 +795,7 @@ TEST(Backward, shared_var) {
EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b3")}));
f::OpDescBind *grad_op4 = block->AllOps()[4];
f::OpDescBind *grad_op4 = block->AllOps()[5];
ASSERT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
......@@ -697,7 +809,7 @@ TEST(Backward, shared_var) {
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
f::OpDescBind *sum_op = block->AllOps()[5];
f::OpDescBind *sum_op = block->AllOps()[6];
ASSERT_EQ(sum_op->Type(), "sum");
ASSERT_EQ(sum_op->InputNames().size(), 1UL);
ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
......@@ -707,7 +819,7 @@ TEST(Backward, shared_var) {
EXPECT_EQ(sum_op->Output("Out"),
std::vector<std::string>({f::GradVarName("out1")}));
f::OpDescBind *grad_op1 = block->AllOps()[6];
f::OpDescBind *grad_op1 = block->AllOps()[7];
ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
......@@ -717,4 +829,41 @@ TEST(Backward, shared_var) {
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
}
\ No newline at end of file
EXPECT_EQ(var_to_grad.size(), 5UL);
EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
EXPECT_EQ(var_to_grad.at("out1"),
f::GradVarInfo(f::GradVarName("out1"), 0, 6));
EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7));
EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7));
EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
}
TEST(Backward, half_backward) {
f::ProgramDesc *program_desc = GetNewProgramDesc();
f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
f::BlockDescBind *block = program.Block(0);
auto *op1 = block->AppendOp();
op1->SetType("minus");
op1->SetInput("X", {"a"});
op1->SetInput("Y", {"b"});
op1->SetOutput("Out", {"out"});
auto target = f::VarDescBind("out");
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"b"});
f::OpDescBind *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant");
auto ops = block->AllOps();
ASSERT_EQ(3UL, ops.size());
EXPECT_EQ(var_to_grad.size(), 1UL);
EXPECT_EQ(var_to_grad.at("a"),
f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
}
......@@ -18,19 +18,22 @@ limitations under the License. */
namespace paddle {
namespace framework {
VarDescBind *BlockDescBind::NewVar(const std::string &name) {
VarDescBind *BlockDescBind::Var(const std::string &name) {
need_update_ = true;
auto it = vars_.find(name);
PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name);
auto var = new VarDescBind(name);
if (it != vars_.end()) {
return it->second.get();
}
auto *var = new VarDescBind(name);
vars_[name].reset(var);
return var;
}
VarDescBind *BlockDescBind::Var(const std::string &name) const {
VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
auto it = vars_.find(name);
PADDLE_ENFORCE(it != vars_.end(),
"Can not find variable %s in current block.", name);
if (it == vars_.end()) {
return nullptr;
}
return it->second.get();
}
......@@ -66,7 +69,7 @@ std::vector<OpDescBind *> BlockDescBind::AllOps() const {
return res;
}
void BlockDescBind::Sync() {
void BlockDescBind::Flush() {
if (need_update_) {
auto &op_field = *this->desc_->mutable_ops();
op_field.Clear();
......@@ -91,9 +94,10 @@ BlockDescBind *BlockDescBind::ParentBlock() const {
return prog_->Block(static_cast<size_t>(this->desc_->parent_idx()));
}
void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
BlockDesc *desc = block.RawPtr();
this->attrs_[name] = desc;
BlockDesc *BlockDescBind::Proto() {
Flush();
return desc_;
}
} // namespace framework
} // namespace paddle
......@@ -33,14 +33,6 @@ class ProgramDescBind;
class BlockDescBind {
public:
friend std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind &program_desc, int block_idx,
std::unordered_set<std::string> &no_grad_vars);
friend void AppendBackward(
ProgramDescBind &program_desc,
const std::unordered_set<std::string> &no_grad_vars);
BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
: prog_(prog), desc_(desc), need_update_(false) {}
......@@ -48,9 +40,9 @@ class BlockDescBind {
int32_t Parent() const { return desc_->parent_idx(); }
VarDescBind *NewVar(const std::string &name_bytes);
VarDescBind *Var(const std::string &name_bytes);
VarDescBind *Var(const std::string &name_bytes) const;
VarDescBind *FindVar(const std::string &name_bytes) const;
bool HasVar(const std::string &var_name) const;
......@@ -64,11 +56,13 @@ class BlockDescBind {
std::vector<OpDescBind *> AllOps() const;
void Sync();
void Flush();
BlockDesc *RawPtr() { return desc_; }
BlockDesc *Proto();
private:
// FIXME(yuyang18): backward will access private data of BlockDesc.
// Mark it public temporary. We can fix it later.
public:
ProgramDescBind *prog_; // not_own
BlockDesc *desc_; // not_own
bool need_update_;
......
......@@ -97,8 +97,11 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
template <typename T>
struct OpInfoFiller<T, kGradOpDescMaker> {
void operator()(const char* op_type, OpInfo* info) const {
info->grad_op_maker_ = [](const OpDescBind& fwd_op) {
T maker(fwd_op);
info->grad_op_maker_ = [](
const OpDescBind& fwd_op,
const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var) {
T maker(fwd_op, no_grad_set, grad_to_var);
return maker();
};
}
......
......@@ -66,7 +66,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
// Instantiate all the vars in the global scope
for (auto& var : block.vars()) {
scope->NewVar(var.name());
scope->Var(var.name());
}
Scope& local_scope = scope->NewScope();
......@@ -78,7 +78,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
for (auto& var : block.ops(i).outputs()) {
for (auto& argu : var.arguments()) {
if (local_scope.FindVar(argu) == nullptr) {
local_scope.NewVar(argu);
local_scope.Var(argu);
}
}
}
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <memory>
#include <vector>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/framework/attribute.h"
#include "paddle/framework/backward.h"
......@@ -25,6 +26,17 @@ limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
USE_OP(elementwise_add);
USE_OP(gaussian_random);
USE_OP(feed);
USE_OP(fetch);
USE_OP(mul);
USE_OP(sum);
USE_OP(squared_l2_distance);
USE_OP(fill_constant);
USE_OP(mean);
USE_OP(sgd);
using namespace paddle::platform;
using namespace paddle::framework;
......@@ -34,8 +46,16 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
// insert output
for (auto kv : outputs) {
for (auto v : kv.second) {
auto var = block->NewVar(v);
var->SetDataType(paddle::framework::DataType::FP32);
// <<<<<<< HEAD
// auto var = block->Var(v);
// var->SetType(VarDesc::LOD_TENSOR);
// var->SetDataType(paddle::framework::DataType::FP32);
// =======
if (!block->HasVar(v)) {
auto var = block->Var(v);
var->SetDataType(paddle::framework::DataType::FP32);
}
// >>>>>>> origin/develop
}
}
......@@ -49,6 +69,7 @@ void AddOp(const std::string& type, const VariableNameMap& inputs,
op->SetOutput(kv.first, kv.second);
}
op->SetAttrMap(attrs);
op->CheckAttrs();
}
// Tensors in feed value variable will only be in CPUPlace
......@@ -134,12 +155,12 @@ class ExecutorTesterRandom : public ::testing::Test {
AddOp("squared_l2_distance", {{"X", {"a"}}, {"Y", {"a_out"}}},
{{"Out", {"l2_distance"}}, {"sub_result", {"l2_distance_sub"}}}, {},
root_block);
AddOp("mean", {{"X", {"l2_distance"}}}, {{"Out", {"mean_out"}}}, {},
root_block);
// backward
AddOp("fill_constant", {}, {{"Out", {"l2_distance@GRAD"}}},
{{"shape", std::vector<int>{batch_size, 1}}, {"value", float(1.0)}},
root_block);
AppendBackward(program, {});
auto target = VarDescBind("mean_out");
AppendBackward(program, target, {});
// update
AddOp("fill_constant", {}, {{"Out", {"learning_rate"}}},
......@@ -305,4 +326,14 @@ TEST_F(ExecutorTesterFeedAndFetch, GPU) {
}
}
}
DECLARE_double(fraction_of_gpu_memory_to_use);
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
// Use less GPU memory for unittest.
FLAGS_fraction_of_gpu_memory_to_use = 0.25;
return RUN_ALL_TESTS();
}
#endif
......@@ -97,16 +97,26 @@ enum DataType {
FP64 = 6;
}
message LoDTensorDesc {
message TensorDesc {
required DataType data_type = 1;
repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [ default = 0 ];
}
message LoDTensorDesc {
required TensorDesc tensor = 1;
optional int32 lod_level = 2 [ default = 0 ];
}
message VarDesc {
enum VarType {
LOD_TENSOR = 1;
SELECTED_ROWS = 2;
}
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
optional bool persistable = 3 [ default = false ];
required VarType type = 2;
optional LoDTensorDesc lod_tensor = 3;
optional TensorDesc selected_rows = 4;
optional bool persistable = 5 [ default = false ];
}
message BlockDesc {
......
......@@ -13,6 +13,8 @@
limitations under the License. */
#pragma once
#include <string>
#include <unordered_set>
#include "paddle/framework/op_desc.h"
#include "paddle/framework/operator.h"
......@@ -21,27 +23,50 @@ namespace framework {
class GradOpDescMakerBase {
public:
explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {}
explicit GradOpDescMakerBase(
const OpDescBind& fwd_op,
const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var)
: fwd_op_(fwd_op), no_grad_set_(no_grad_set), grad_to_var_(grad_to_var) {}
virtual ~GradOpDescMakerBase() = default;
virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
protected:
static std::vector<std::string> ToGradNames(
const std::vector<std::string>& var_names) {
std::vector<std::string> InputGrad(const std::string& name,
bool drop_empty_grad = true) const {
std::vector<std::string> ret_val;
auto var_names = this->Input(name);
ret_val.reserve(var_names.size());
std::transform(var_names.begin(), var_names.end(),
std::back_inserter(ret_val), GradVarName);
return ret_val;
}
std::vector<std::string> InputGrad(const std::string& name) const {
return ToGradNames(fwd_op_.Input(name));
std::back_inserter(ret_val),
[this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name);
if (no_grad_set_.count(g_name)) {
return kEmptyVarName;
} else {
(*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name;
}
});
if (!drop_empty_grad) {
return ret_val;
}
std::vector<std::string> dropped_ret_val;
dropped_ret_val.reserve(ret_val.size());
std::copy_if(ret_val.begin(), ret_val.end(),
std::back_inserter(dropped_ret_val),
[](const std::string& str) { return str != kEmptyVarName; });
return dropped_ret_val;
}
std::vector<std::string> OutputGrad(const std::string& name) const {
return ToGradNames(fwd_op_.Output(name));
std::vector<std::string> ret_val;
auto onames = this->Output(name);
ret_val.reserve(onames.size());
std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
GradVarName);
return ret_val;
}
std::vector<std::string> InputNames() const {
......@@ -75,6 +100,8 @@ class GradOpDescMakerBase {
private:
const OpDescBind& fwd_op_;
const std::unordered_set<std::string>& no_grad_set_;
std::unordered_map<std::string, std::string>* grad_to_var_;
};
class SingleGradOpDescMaker : public GradOpDescMakerBase {
......@@ -91,6 +118,7 @@ class SingleGradOpDescMaker : public GradOpDescMakerBase {
virtual std::unique_ptr<OpDescBind> Apply() const = 0;
};
template <bool DropEmptyIG = true>
class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
public:
using SingleGradOpDescMaker::SingleGradOpDescMaker;
......@@ -102,7 +130,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
for (auto& input_param : this->InputNames()) {
grad->SetInput(input_param, this->Input(input_param));
grad->SetOutput(GradVarName(input_param), this->InputGrad(input_param));
grad->SetOutput(GradVarName(input_param),
this->InputGrad(input_param, DropEmptyIG));
}
for (auto& output_param : this->OutputNames()) {
......
......@@ -32,7 +32,7 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
}
OpDesc *OpDescBind::Proto() {
Sync();
Flush();
return &op_desc_;
}
......@@ -100,6 +100,12 @@ void OpDescBind::SetAttr(const std::string &name, const Attribute &v) {
need_update_ = true;
}
void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
BlockDesc *desc = block.Proto();
this->attrs_[name] = desc;
need_update_ = true;
}
void OpDescBind::SetAttrMap(
const std::unordered_map<std::string, Attribute> &attr_map) {
attrs_ = attr_map;
......@@ -159,7 +165,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
};
void OpDescBind::Sync() {
void OpDescBind::Flush() {
if (need_update_) {
this->op_desc_.mutable_inputs()->Clear();
for (auto &ipt : inputs_) {
......
......@@ -89,8 +89,6 @@ class OpDescBind {
this->need_update_ = true;
}
void Sync();
const VariableNameMap &Inputs() const { return inputs_; }
const VariableNameMap &Outputs() const { return outputs_; }
......@@ -104,6 +102,8 @@ class OpDescBind {
void InferShape(const BlockDescBind &block) const;
void Flush();
private:
template <typename MapType>
static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
......
......@@ -59,16 +59,5 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
op_desc.GetAttrMap());
}
std::vector<std::unique_ptr<OpDescBind>> OpRegistry::CreateGradOpDescs(
OpDescBind* op_desc) {
auto& info = OpInfoMap::Instance().Get(op_desc->Type());
if (info.Checker() != nullptr) {
info.Checker()->Check(*op_desc->MutableAttrMap());
}
return info.grad_op_maker_(*op_desc);
}
} // namespace framework
} // namespace paddle
......@@ -79,9 +79,6 @@ class OpRegistry {
static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
static std::vector<std::unique_ptr<OpDescBind>> CreateGradOpDescs(
OpDescBind* op_desc);
static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
};
......@@ -160,17 +157,18 @@ class OpKernelRegistrar : public Registrar {
/**
* Macro to register Operator.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \
class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker { \
using ::paddle::framework::DefaultGradOpDescMaker::DefaultGradOpDescMaker; \
\
protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \
}; \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \
class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker<true> { \
using ::paddle::framework::DefaultGradOpDescMaker< \
true>::DefaultGradOpDescMaker; \
\
protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \
}; \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
op_maker_class);
#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
......
......@@ -289,6 +289,15 @@ class ExecutionContext {
return device_context_;
}
#ifdef PADDLE_WITH_CUDA
const platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
auto cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_);
return *cuda_ctx;
}
#endif
private:
const OperatorBase& op_;
const Scope& scope_;
......@@ -394,11 +403,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
private:
DDim GetDim(const std::string& name) const override {
return framework::make_ddim(block_.Var(name)->Shape());
return framework::make_ddim(block_.FindVar(name)->Shape());
}
void SetDim(const std::string& name, const DDim& dim) override {
block_.Var(name)->SetShape(framework::vectorize(dim));
block_.FindVar(name)->SetShape(framework::vectorize(dim));
}
const OpDescBind& op_;
......
......@@ -84,7 +84,7 @@ TEST(OperatorBase, all) {
paddle::framework::Scope scope;
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
scope.NewVar("OUT1");
scope.Var("OUT1");
ASSERT_EQ(paddle::framework::op_run_num, 0);
op->Run(scope, device_context);
ASSERT_EQ(paddle::framework::op_run_num, 1);
......@@ -237,12 +237,12 @@ TEST(OpKernel, multi_inputs) {
paddle::platform::CPUDeviceContext cpu_device_context;
paddle::framework::Scope scope;
scope.NewVar("x0")->GetMutable<Tensor>();
scope.NewVar("x1")->GetMutable<Tensor>();
scope.NewVar("x2")->GetMutable<Tensor>();
scope.NewVar("k0")->GetMutable<Tensor>();
scope.NewVar("y0")->GetMutable<Tensor>();
scope.NewVar("y1")->GetMutable<Tensor>();
scope.Var("x0")->GetMutable<Tensor>();
scope.Var("x1")->GetMutable<Tensor>();
scope.Var("x2")->GetMutable<Tensor>();
scope.Var("k0")->GetMutable<Tensor>();
scope.Var("y0")->GetMutable<Tensor>();
scope.Var("y1")->GetMutable<Tensor>();
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cpu_device_context);
......
......@@ -45,7 +45,7 @@ BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
ProgramDesc *ProgramDescBind::Proto() {
for (auto &block : blocks_) {
block->Sync();
block->Flush();
}
return prog_;
}
......
......@@ -31,7 +31,7 @@ Scope& Scope::NewScope() const {
return *kids_.back();
}
Variable* Scope::NewVar(const std::string& name) {
Variable* Scope::Var(const std::string& name) {
auto iter = vars_.find(name);
if (iter != vars_.end()) {
return iter->second;
......@@ -42,8 +42,8 @@ Variable* Scope::NewVar(const std::string& name) {
return v;
}
Variable* Scope::NewVar() {
return NewVar(string::Sprintf("%p.%d", this, vars_.size()));
Variable* Scope::Var() {
return Var(string::Sprintf("%p.%d", this, vars_.size()));
}
Variable* Scope::FindVar(const std::string& name) const {
......@@ -71,8 +71,8 @@ framework::Scope& GetGlobalScope() {
static std::unique_ptr<framework::Scope> g_scope{nullptr};
std::call_once(feed_variable_flag, [&]() {
g_scope.reset(new framework::Scope());
g_scope->NewVar("feed_value");
g_scope->NewVar("fetch_value");
g_scope->Var("feed_value");
g_scope->Var("fetch_value");
});
return *(g_scope.get());
}
......
......@@ -45,10 +45,10 @@ class Scope {
Scope& NewScope() const;
/// Create a variable with given name if it doesn't exist.
Variable* NewVar(const std::string& name);
Variable* Var(const std::string& name);
/// Create a variable with a scope-unique name.
Variable* NewVar();
Variable* Var();
/// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find.
......
......@@ -23,8 +23,8 @@ TEST(Scope, VarsShadowing) {
Scope& ss1 = s.NewScope();
Scope& ss2 = s.NewScope();
Variable* v0 = s.NewVar("a");
Variable* v1 = ss1.NewVar("a");
Variable* v0 = s.Var("a");
Variable* v1 = ss1.Var("a");
EXPECT_NE(v0, v1);
......@@ -40,7 +40,7 @@ TEST(Scope, FindVar) {
EXPECT_EQ(nullptr, s.FindVar("a"));
EXPECT_EQ(nullptr, ss.FindVar("a"));
ss.NewVar("a");
ss.Var("a");
EXPECT_EQ(nullptr, s.FindVar("a"));
EXPECT_NE(nullptr, ss.FindVar("a"));
......@@ -49,7 +49,7 @@ TEST(Scope, FindVar) {
TEST(Scope, FindScope) {
Scope s;
Scope& ss = s.NewScope();
Variable* v = s.NewVar("a");
Variable* v = s.Var("a");
EXPECT_EQ(&s, s.FindScope(v));
EXPECT_EQ(&s, ss.FindScope(v));
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/selected_rows.h"
namespace paddle {
namespace framework {} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
namespace paddle {
namespace framework {
class SelectedRows {
public:
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) {
value_.reset(new Tensor());
}
SelectedRows() { value_.reset(new Tensor()); }
platform::Place place() const { return value_->place(); }
const Tensor& value() const { return *value_; }
Tensor* mutable_value() { return value_.get(); }
int64_t height() const { return height_; }
void set_height(int64_t height) { height_ = height; }
const std::vector<int64_t>& rows() const { return rows_; }
void set_rows(const std::vector<int64_t>& rows) { rows_ = rows; }
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
return make_ddim(dims);
}
private:
std::vector<int64_t> rows_;
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/selected_rows.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
class SelectedRowsTester : public ::testing::Test {
public:
virtual void SetUp() override {
std::vector<int64_t> rows{0, 4, 7};
int64_t height = 10;
int64_t row_numel = 100;
selected_rows_.reset(new SelectedRows(rows, height));
Tensor* value = selected_rows_->mutable_value();
value->mutable_data<float>(
make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
}
protected:
platform::CPUPlace place_;
std::unique_ptr<SelectedRows> selected_rows_{nullptr};
};
TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); }
TEST_F(SelectedRowsTester, dims) {
ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100}));
}
TEST_F(SelectedRowsTester, complete_dims) {
ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
}
} // namespace framework
} // namespace paddle
......@@ -19,9 +19,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
// TODO(longfei): Once after both CompileTimeInferShapeContext and
// RuntimeInferShapeContext get merged, we can rename InferShapeContext into
// InferShapeContext so to replace the current InferShapeContext.
class InferShapeContext {
public:
virtual ~InferShapeContext() {}
......
......@@ -76,6 +76,17 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
const std::vector<DySeqMeta>& meta, const LoD& lod,
size_t level);
std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) {
// collect the indices that need to be copied into the batch
std::vector<size_t> indice;
for (const auto& seq : meta) {
size_t id = seq.begin + batch_id;
if (id >= seq.end) break;
indice.push_back(id);
}
return indice;
}
} // namespace detail
const LoDTensor& TensorArray::Read(size_t index) const {
......@@ -113,8 +124,8 @@ LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
return detail::PackDynamicBatch(values_, meta, lod, level);
}
std::vector<DySeqMeta> TensorArray::Unpack(const LoDTensor& source, int level,
bool length_desend) {
DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level,
bool length_desend) {
detail::DynamicBatchUnpacker unpacker(source, level,
length_desend /*descend*/);
......@@ -129,6 +140,7 @@ std::vector<DySeqMeta> TensorArray::Unpack(const LoDTensor& source, int level,
Write(batch_id, unpacker.GetBatch(batch_id));
}
PADDLE_ENFORCE(!unpacker.meta.empty());
return unpacker.meta;
}
......@@ -218,13 +230,7 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
PADDLE_ENFORCE(!meta.empty(), "should build meta first");
LoDTensor result;
// collect indice need to copy to the batch
std::vector<size_t> indice;
for (const auto& seq : meta) {
size_t id = seq.begin + index;
if (id >= seq.end) break;
indice.push_back(id);
}
auto indice = detail::GenDyBatchIndice(meta, index);
PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index);
// copy the indice of records in LoDTensor
......@@ -237,9 +243,9 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
for (size_t i = 0; i < indice.size(); i++) {
auto index = indice[i];
auto target = result.Slice<value_type>(i, i + 1);
auto source_ = source->Slice<value_type>(index, index + 1);
auto slice = source->Slice<value_type>(index, index + 1);
target.CopyFrom<value_type>(source_, platform::CPUPlace(),
target.CopyFrom<value_type>(slice, platform::CPUPlace(),
platform::CPUDeviceContext());
}
......
......@@ -34,6 +34,13 @@ struct DySeqMeta {
size_t ori_idx;
};
using DySeqMetaBatch = std::vector<DySeqMeta>;
/*
* Extract the indices of instances.
*/
std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id);
/*
* TensorArray is a C-array-like array of tensors, it is meant to be used with
* dynamic iteration primitives such as while_loop. It is used to segment inputs
......@@ -69,7 +76,7 @@ class TensorArray {
* Recover the original LoD-arranged LoDTensor with the `values`, `level` and
* `indice_map`.
*/
LoDTensor Pack(size_t level, const std::vector<DySeqMeta> &meta,
LoDTensor Pack(size_t level, const DySeqMetaBatch &meta,
const LoD &lod) const;
/*
......@@ -77,8 +84,7 @@ class TensorArray {
* `values`, if set `desend`, will sort by length in descending order else in
* ascending order.
*/
std::vector<DySeqMeta> Unpack(const LoDTensor &source, int level,
bool length_desend);
DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend);
/*
* Pack the values into a tensor with rank one higher than each tensor in
......
......@@ -36,8 +36,9 @@ using OpCreator = std::function<OperatorBase*(
const std::string& /*type*/, const VariableNameMap& /*inputs*/,
const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
using GradOpMakerFN =
std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>;
using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDescBind>>(
const OpDescBind&, const std::unordered_set<std::string>& /*no_grad_set*/,
std::unordered_map<std::string, std::string>* /*grad_to_var*/)>;
} // namespace framework
} // namespace paddle
......@@ -13,32 +13,58 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/var_desc.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
VectorToRepeated(dims, desc_.mutable_lod_tensor()->mutable_dims());
VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
}
void VarDescBind::SetDataType(DataType data_type) {
desc_.mutable_lod_tensor()->set_data_type(data_type);
mutable_tensor_desc()->set_data_type(data_type);
}
std::vector<int64_t> VarDescBind::Shape() const {
return RepeatedToVector(desc_.lod_tensor().dims());
return RepeatedToVector(tensor_desc().dims());
}
DataType VarDescBind::GetDataType() const {
return desc_.lod_tensor().data_type();
}
DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
void VarDescBind::SetLoDLevel(int32_t lod_level) {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
desc_.mutable_lod_tensor()->set_lod_level(lod_level);
}
int32_t VarDescBind::GetLodLevel() const {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
return desc_.lod_tensor().lod_level();
}
const TensorDesc &VarDescBind::tensor_desc() const {
PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
switch (desc_.type()) {
case VarDesc::SELECTED_ROWS:
return desc_.selected_rows();
case VarDesc::LOD_TENSOR:
return desc_.lod_tensor().tensor();
default:
PADDLE_THROW("Unexpected branch.");
}
}
TensorDesc *VarDescBind::mutable_tensor_desc() {
PADDLE_ENFORCE(desc_.has_type(),
"invoke MutableTensorDesc must after set type");
switch (desc_.type()) {
case VarDesc::SELECTED_ROWS:
return desc_.mutable_selected_rows();
case VarDesc::LOD_TENSOR:
return desc_.mutable_lod_tensor()->mutable_tensor();
default:
PADDLE_THROW("Unexpected branch.");
}
}
} // namespace framework
} // namespace paddle
......@@ -34,6 +34,7 @@ inline std::vector<T> RepeatedToVector(
template <typename T, typename RepeatedField>
inline void VectorToRepeated(const std::vector<T> &vec,
RepeatedField *repeated_field) {
repeated_field->Clear();
repeated_field->Reserve(vec.size());
for (const auto &elem : vec) {
*repeated_field->Add() = elem;
......@@ -44,6 +45,7 @@ inline void VectorToRepeated(const std::vector<T> &vec,
template <typename RepeatedField>
inline void VectorToRepeated(const std::vector<bool> &vec,
RepeatedField *repeated_field) {
repeated_field->Clear();
repeated_field->Reserve(vec.size());
for (auto elem : vec) {
*repeated_field->Add() = elem;
......@@ -52,7 +54,10 @@ inline void VectorToRepeated(const std::vector<bool> &vec,
class VarDescBind {
public:
explicit VarDescBind(const std::string &name) { desc_.set_name(name); }
explicit VarDescBind(const std::string &name) {
desc_.set_name(name);
desc_.set_type(VarDesc::LOD_TENSOR);
}
VarDesc *Proto() { return &desc_; }
......@@ -70,7 +75,14 @@ class VarDescBind {
int32_t GetLodLevel() const;
VarDesc::VarType GetType() const { return desc_.type(); }
void SetType(VarDesc::VarType type) { desc_.set_type(type); }
private:
const TensorDesc &tensor_desc() const;
TensorDesc *mutable_tensor_desc();
VarDesc desc_;
};
} // namespace framework
......
......@@ -462,8 +462,8 @@ void LambdaCost::calcGrad(const real* outputScore,
real score_j = score[index_j];
real dcgDif = 0;
if (j < sortSize) {
dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) /
(std::log(i + 2) - std::log(j + 2));
dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
(1 / std::log(i + 2) - 1 / std::log(j + 2));
} else {
dcgDif =
(std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
......
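The corrected term above matches the usual LambdaRank ΔDCG form: the gain difference is multiplied by the difference of the reciprocal log-position discounts, rather than divided by the difference of the logs. A minimal standalone C++ sketch contrasting the old and corrected expressions (the scores and ranks below are assumed purely for illustration):
// standalone sketch: compare the old and the corrected dcgDif expressions
#include <cmath>
#include <cstdio>
int main() {
  double score_i = 3.0, score_j = 1.0;  // assumed relevance scores
  int i = 0, j = 2;                     // assumed ranks
  double gain = std::pow(2, score_i) - std::pow(2, score_j);
  double old_dcg_dif = gain / (std::log(i + 2) - std::log(j + 2));
  double new_dcg_dif = gain * (1 / std::log(i + 2) - 1 / std::log(j + 2));
  std::printf("old: %f  corrected: %f\n", old_dcg_dif, new_dcg_dif);
  return 0;
}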
......@@ -86,6 +86,7 @@ protected:
/// Also used in 'use_mkldnn' case.
std::vector<Argument> outputOtherDevice_;
/// If there are several outputs, map them by each name.
/// MKLDNNLayer uses it only to merge the output grad
std::map<std::string, Argument*> outputMap_;
/// Used to merge grad on different devices.
MatrixPtr tmpGrad_;
......@@ -325,6 +326,11 @@ public:
outputMap_[name] = output;
}
/**
* Get the output map size, if the layer has multiple outputs.
*/
size_t getOutputMapSize() { return outputMap_.size(); }
/**
* Get the output based on layer's name.
*/
......
......@@ -225,8 +225,6 @@ void MKLDNNConvLayer::resetFwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
pipeline.clear();
if (cvtInVal_) {
pipeline.push_back(*cvtInVal_);
}
......@@ -245,7 +243,7 @@ void MKLDNNConvLayer::resetFwdPipeline(
void MKLDNNConvLayer::resetInValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
const MatrixPtr& inMat = inputLayers_[0]->getOutput().value;
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
// create buffer and reorder if input value do not match
......@@ -310,15 +308,20 @@ void MKLDNNConvLayer::resetOutValue(
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) {
out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc());
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be emptry";
CHECK(cvtOutVal_) << "should not be empty";
} else {
// CPU output share the same data of MKLDNN output
cpuOut->setData(out->getData());
cpuOutVal_ = out;
}
// when the output is on the cpu device, change the mkldnn output value and
// make them share the same data. Then if the next layer uses
// inputlayer->getOutputValue() to fetch its input value, it gets the right data.
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
return;
}
output_.value = std::dynamic_pointer_cast<Matrix>(out);
}
void MKLDNNConvLayer::resetBwdWgtPD(
......@@ -412,8 +415,6 @@ void MKLDNNConvLayer::resetBwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
pipeline.clear();
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
}
......@@ -446,28 +447,27 @@ void MKLDNNConvLayer::resetBwdPipeline(
void MKLDNNConvLayer::resetOutGrad(
std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
const MatrixPtr& outMat = output_.grad;
out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
CHECK(outVal_ != nullptr &&
out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
<< "primitive desc of out grad and value should be equal";
// TODO(TJ): merge outgrad
// create reorder if has output grad does not match
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
CHECK(outVal_ != nullptr &&
outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
<< "primitive desc of out grad and value should be equal";
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
outMat->setData(cpuOut->getData());
// same PrimitiveDesc with cpuInVal_
CHECK(cpuOutVal_);
cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
out = cpuOutGrad_;
} else {
out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
// create reorder if primitive desc does not match
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_);
} else {
// share the same data of CPU output
output_.grad->setData(cpuOut->getData());
out = cpuOutGrad_;
}
}
}
......@@ -496,32 +496,30 @@ void MKLDNNConvLayer::resetWgtBiasGrad(
void MKLDNNConvLayer::resetInGrad(
std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
MKLDNNMatrixPtr& in) {
in = nullptr;
cpuInGrad_ = nullptr;
cvtInGrad_ = nullptr;
if (dataPD == nullptr) {
return;
}
// TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
dataPD->diff_src_primitive_desc());
CHECK(nullptr != inVal_ &&
in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
<< "primitive desc of input grad and value should be equal";
// create reorder if has output grad does not match
cpuInGrad_ = nullptr;
cvtInGrad_ = nullptr;
if (!inputIsOnlyMKLDNN()) {
if (inputIsOnlyMKLDNN()) {
MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
CHECK(nullptr != inVal_ &&
in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
<< "primitive desc of input grad and value should be equal";
} else {
const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
// same PrimitiveDesc with cpuInVal_
CHECK(cpuInVal_);
cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
in = cpuInGrad_;
// create reorder if PrimitiveDesc does not match
if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
dataPD->diff_src_primitive_desc());
cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
CHECK(cvtInGrad_);
} else {
in = cpuInGrad_;
}
}
}
......
......@@ -180,10 +180,10 @@ void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
if (!outputIsOnlyMKLDNN()) {
// fc cpu output value do not need create convert
// just share point
// fc cpu output value do not need create convert, just share data
getOutput(CPU_DEVICE).value->setData(out->getData());
}
output_.value = std::dynamic_pointer_cast<Matrix>(out);
}
void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
......@@ -214,8 +214,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
pipeline.clear();
if (bias) {
fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
} else {
......@@ -237,19 +235,14 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
}
void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
// TODO(TJ): merge outgrad
int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
output_.grad->setData(getOutput(device).grad->getData());
// for MKLDNN device:
// can not directly cast outputgrad to mkldnnmatrix,
// since each layer can not write the inputgrad to mkldnn inputgrad.
// So just create from matrix with outputvalue format.
// for CPU device:
// fc do not need to convert from cpu device since output is always nc format
// only need create from cpu device
CHECK(outVal_);
out =
MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
output_.grad->setData(cpuOut->getData());
out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
}
}
void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
......@@ -267,13 +260,11 @@ void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
if (inGrad == nullptr) {
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
// TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
CHECK(inVal_);
in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNFcLayer::resetBwdWgtPD(
......@@ -314,7 +305,6 @@ void MKLDNNFcLayer::resetBwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
pipeline.clear();
CHECK(inVal_);
if (bias) {
bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
......
......@@ -65,6 +65,17 @@ protected:
MKLDNNMatrixPtr biasVal_;
MKLDNNMatrixPtr biasGrad_;
// merge grad primitive
std::shared_ptr<mkldnn::primitive> mergeGrad_;
std::vector<mkldnn::primitive> pipelineMergeGrad_;
// tmp input argument to save input grad, only used to merge grad
Argument tmpInArg_;
// since mkldnn sum does not support different formats
// (see https://github.com/01org/mkl-dnn/issues/134),
// a reorder has to be created manually and kept in a tmp MKLDNNMatrix
MKLDNNMatrixPtr tmpOutGrad_;
std::shared_ptr<mkldnn::primitive> tmpCvt_;
public:
explicit MKLDNNLayer(const LayerConfig& config)
: Layer(config),
......@@ -99,6 +110,7 @@ public:
if (!Layer::init(layerMap, parameterMap)) {
return false;
}
setOutputMap();
checkCPUOutputsNumber();
stream_.reset(new MKLDNNStream());
......@@ -118,12 +130,9 @@ public:
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
// reset when input total sizes changed, not only the batchsize
inputElemenCnt_ = elemenCnt;
pipelineFwd_.clear();
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
if (outVal_) {
// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
}
convertWeightsFromPaddle();
needResetBwd_ = true;
}
......@@ -144,9 +153,18 @@ public:
void backward(const UpdateCallback& callback) override {
if (needResetBwd_) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
pipelineBwd_.clear();
pipelineMergeGrad_.clear();
mergeGrad_ = nullptr;
resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
needResetBwd_ = false;
}
// merging grads must happen before the backward activation
if (mergeGrad_) {
REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
stream_->submit(pipelineMergeGrad_);
}
{
REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
backwardActivation();
......@@ -247,6 +265,76 @@ protected:
}
}
/**
* reset the output grad matrix from primitive desc.
* and reset the merge grad primitive if needed.
* note: when this layer has several outputs,
*       it cannot be mixed with the cpu device,
*       since the memory desc cannot be obtained from the cpu device.
*/
virtual void resetOutGrad(MKLDNNMatrixPtr& out,
mkldnn::memory::primitive_desc pd) {
CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
mergeGrad_ = nullptr;
pipelineMergeGrad_.clear();
out = MKLDNNMatrix::create(output_.grad, pd);
if (outputMap_.size() <= 1) {
return;
}
std::vector<double> scales(outputMap_.size(), 1.0);
std::vector<mkldnn::memory::primitive_desc> srcPDs;
std::vector<mkldnn::primitive::at> srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
MKLDNNMatrixPtr src =
std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
CHECK(src) << "should be MKLDNNMatrix";
auto srcDims = src->getDims();
auto dstDims = out->getDims();
CHECK_EQ(srcDims.size(), dstDims.size());
for (size_t i = 0; i < srcDims.size(); ++i) {
CHECK_EQ(srcDims[i], dstDims[i]);
}
srcPDs.push_back(src->getPrimitiveDesc());
srcs.push_back(*src);
}
// TODO(TJ): remove me when mkldnn sum support different formats
for (size_t i = 1; i < srcPDs.size(); ++i) {
CHECK(srcPDs[0] == srcPDs[i]);
}
tmpOutGrad_ = nullptr;
tmpCvt_ = nullptr;
if (out->getPrimitiveDesc() != srcPDs[0]) {
tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
CHECK(tmpCvt_);
pipelineMergeGrad_.push_back(*tmpCvt_);
} else {
tmpOutGrad_ = out;
}
auto sumPD = mkldnn::sum::primitive_desc(
tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
}
/**
* reset input grad from primitive desc.
* this function is available when the input is mkldnn only
* or the input does not care about the cpu device
*/
virtual void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc pd) {
LayerPtr& input = inputLayers_[0];
const MatrixPtr& grad =
input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
in = MKLDNNMatrix::create(grad, pd);
Argument& arg = input->getOutput(this->getName());
arg.grad = std::dynamic_pointer_cast<Matrix>(in);
}
/**
* print info about sizes
*/
......@@ -334,6 +422,16 @@ private:
}
}
/**
* Set output map of prev layers.
*/
void setOutputMap() {
outputMap_.clear();
for (size_t i = 0; i < inputLayers_.size(); ++i) {
inputLayers_[i]->setOutput(getName(), &tmpInArg_);
}
}
/**
* Check the cpu device number of outputOtherDevice_.
* should have only one at most.
......
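The new mergeGrad_ path exists because one MKLDNN layer's output can feed several following layers; the gradient w.r.t. that shared output is then the elementwise sum of every consumer's output grad, which is what the mkldnn::sum primitive (all scales 1.0) computes after the optional reorder. A plain-C++ sketch of just that arithmetic, with assumed toy gradients and no mkldnn calls:
// plain-C++ sketch of what the merge-grad sum computes: the gradient of a
// shared output is the elementwise sum of the grads from every consumer.
#include <cstdio>
#include <vector>
int main() {
  std::vector<std::vector<float>> consumer_grads = {  // assumed toy gradients
      {0.1f, 0.2f, 0.3f}, {1.0f, 1.0f, 1.0f}};
  std::vector<float> merged(consumer_grads[0].size(), 0.f);
  for (const auto& g : consumer_grads)
    for (size_t i = 0; i < merged.size(); ++i) merged[i] += g[i];
  for (float v : merged) std::printf("%.1f ", v);  // prints 1.1 1.2 1.3
  return 0;
}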
......@@ -142,14 +142,16 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be emptry";
} else {
// CPU output share the same data of MKLDNN output
cpuOut->setData(out->getData());
cpuOutVal_ = out;
}
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
return;
}
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
}
void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
......@@ -187,7 +189,6 @@ void MKLDNNPoolLayer::resetFwdPipeline(
std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
pipeline.clear();
fwd_ = workspace_
? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
: std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
......@@ -205,17 +206,17 @@ void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
resetInGrad(in);
}
void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
CHECK(outVal_) << "Should have output value";
out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
// create reorder if output value has cpu device and pd do not match
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
CHECK(outVal_);
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
cpuOutGrad_ = MKLDNNMatrix::create(
cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_) << "should not be emptry";
} else {
......@@ -228,12 +229,11 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
if (inGrad == nullptr) {
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
CHECK(inVal_);
in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
......@@ -261,7 +261,6 @@ void MKLDNNPoolLayer::resetBwdPipeline(
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
pipeline.clear();
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
}
......
......@@ -124,8 +124,8 @@ void MKLDNNTester::randomTopDiffs() {
void MKLDNNTester::checkForward() {
VLOG(MKLDNN_ALL) << "Check Forward";
printTopDatas();
double delta = compareMatrix(dnnLayer_->getOutput(CPU_DEVICE).value,
refLayer_->getOutputValue());
double delta =
compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
EXPECT_LE(fabs(delta), eps_);
}
......
......@@ -84,8 +84,9 @@ function(op_library TARGET)
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "OperatorWithKernel" regex_result "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
......
......@@ -338,6 +338,38 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
}
};
template <typename AttrType>
class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public:
HardSigmoidOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of HardSigmoid operator");
AddOutput("Y", "Output of HardSigmoid operator");
AddComment(R"DOC(
Hard Sigmoid activation operator.
Segment-wise linear approximation of sigmoid[1].
This is much faster than sigmoid.
hard_sigmoid = max(0, min(1, slope * x + shift))
The slope should be positive. The offset can be either positive or negative.
The default slope and shift are set from [1].
It is recommended to use the defaults for this activation.
References:
[1] Noisy Activation Functions
(https://arxiv.org/abs/1603.00391)
)DOC");
AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
.SetDefault(static_cast<AttrType>(0.2));
AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
.SetDefault(static_cast<AttrType>(0.5));
}
};
} // namespace operators
} // namespace paddle
......@@ -413,6 +445,9 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp,
ops::ThresholdedReluOpMaker<float>, thresholded_relu_grad,
ops::ActivationOpGrad);
REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
hard_sigmoid_grad, ops::ActivationOpGrad);
#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
......
......@@ -616,30 +616,63 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
template <typename Device, typename X, typename Y>
void operator()(Device d, X x, Y y) const {
auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
y.device(d) = temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
}
};
template <typename T>
struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
template <typename Device, typename X, typename Y, typename dY, typename dX>
void operator()(Device d, X x, Y y, dY dy, dX dx) const {
dx.device(d) =
dy *
((y > static_cast<T>(0)) * (y < static_cast<T>(1))).template cast<T>() *
static_cast<T>(slope);
}
};
} // namespace operators
} // namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
__macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \
__macro(square, SquareFunctor, SquareGradFunctor); \
__macro(brelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(pow, PowFunctor, PowGradFunctor); \
__macro(stanh, STanhFunctor, STanhGradFunctor); \
__macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \
__macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \
__macro(relu6, Relu6Functor, Relu6GradFunctor); \
__macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \
__macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
__macro(elu, ELUFunctor, ELUGradFunctor); \
__macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
__macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \
__macro(square, SquareFunctor, SquareGradFunctor); \
__macro(brelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(pow, PowFunctor, PowGradFunctor); \
__macro(stanh, STanhFunctor, STanhGradFunctor); \
__macro(softplus, SoftplusFunctor, SoftplusGradFunctor); \
__macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \
__macro(relu6, Relu6Functor, Relu6GradFunctor); \
__macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \
__macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
__macro(elu, ELUFunctor, ELUGradFunctor); \
__macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \
__macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
__macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
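For reference, a standalone sketch of the hard_sigmoid math implemented by HardSigmoidFunctor / HardSigmoidGradFunctor above, using the default slope 0.2 and offset 0.5 (the input values are assumed):
// hard_sigmoid forward/backward math: y = clamp(slope * x + offset, 0, 1),
// dy/dx = slope on the linear segment 0 < y < 1 and 0 elsewhere.
#include <algorithm>
#include <cstdio>
int main() {
  const float slope = 0.2f, offset = 0.5f;
  float xs[] = {-5.f, -1.f, 0.f, 1.f, 5.f};  // assumed sample inputs
  for (float x : xs) {
    float y = std::min(1.f, std::max(0.f, slope * x + offset));
    float dydx = (y > 0.f && y < 1.f) ? slope : 0.f;
    std::printf("x=%+.1f  y=%.2f  dy/dx=%.2f\n", x, y, dydx);
  }
  return 0;
}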
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/adam_op.h"
namespace paddle {
namespace operators {
class AdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment1"),
"Input(Moment1) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment2"),
"Input(Moment2) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
"Input(Beta1Pow) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
"Input(Beta2Pow) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
"Output(Moment1Out) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
"Output(Moment2Out) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
"Output(Beta1PowOut) of AdamOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Beta2PowOut"),
"Output(Beta2PowOut) of AdamOp should not be null.");
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 dimension");
auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
"Beta1 power accumulator should have 1 dimension");
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
"Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
"Param and Moment input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
"Param and InfNorm input of AdamOp should have same dimension");
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
ctx->SetOutputDim("Moment2Out", param_dims);
ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims);
}
};
class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("LearningRate", "(Tensor) Learning rate");
AddInput("Moment1", "(Tensor) Input first moment");
AddInput("Moment2", "(Tensor) Input second moment");
AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment");
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f);
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f);
AddAttr<float>("epsilon",
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddComment(R"DOC(
Adam Updates Operator.
This implements the Adam optimizer from Section 2 of the Adam
paper[1]. Adam is a first-order gradient-based optimization
method based on adaptive estimates of lower-order moments.
Adam updates:
moment1_out = beta1 * moment1 + (1 − beta1) * grad
moment2_out = beta2 * moment2 + (1 − beta2) * grad * grad
beta1_pow_out = beta1_pow * beta1
beta2_pow_out = beta2_pow * beta2
learning_rate_t = learning_rate_t *
sqrt(1 - beta2_pow_out) / (1 - beta1_pow_out)
param_out = param - learning_rate_t * moment1/ (sqrt(moment2) + epsilon)
References:
[1] Adam: A Method for Stochastic Optimization
(https://arxiv.org/abs/1412.6980)
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
REGISTER_OP_CPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/adam_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class AdamOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment1_out_tensor = ctx.Output<framework::Tensor>("Moment1Out");
auto moment2_out_tensor = ctx.Output<framework::Tensor>("Moment2Out");
auto beta1_pow_out_tensor = ctx.Output<framework::Tensor>("Beta1PowOut");
auto beta2_pow_out_tensor = ctx.Output<framework::Tensor>("Beta2PowOut");
param_out_tensor->mutable_data<T>(ctx.GetPlace());
moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
beta1_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
beta2_pow_out_tensor->mutable_data<T>(ctx.GetPlace());
float beta1 = ctx.Attr<float>("beta1");
float beta2 = ctx.Attr<float>("beta2");
float epsilon = ctx.Attr<float>("epsilon");
auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param"));
auto grad = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Grad"));
auto moment1 = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Moment1"));
auto moment2 = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Moment2"));
auto lr = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("LearningRate"));
auto beta1_pow = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Beta1Pow"));
auto beta2_pow = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Beta2Pow"));
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment1_out = framework::EigenVector<T>::Flatten(*moment1_out_tensor);
auto moment2_out = framework::EigenVector<T>::Flatten(*moment2_out_tensor);
auto beta1_pow_out =
framework::EigenVector<T>::Flatten(*beta1_pow_out_tensor);
auto beta2_pow_out =
framework::EigenVector<T>::Flatten(*beta2_pow_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
beta1_pow_out.device(place) = beta1_pow * beta1;
beta2_pow_out.device(place) = beta2_pow * beta2;
// All of these are tensors of 1 element
auto lr_t = lr * (1 - beta2_pow_out).sqrt() / (1 - beta1_pow_out);
// Eigen does not support automatic broadcast
// Get dimensions of moment vector to broadcast lr_t
Eigen::DSizes<int, 1> m_dsize(moment1_out_tensor->numel());
param_out.device(place) =
param -
lr_t.broadcast(m_dsize) *
(moment1_out / (moment2_out.sqrt() + epsilon));
}
};
} // namespace operators
} // namespace paddle
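A scalar sketch of a single Adam step with assumed values, following the same element-wise formulas as AdamOpKernel above (the real kernel applies them per element over Eigen-flattened tensors):
// one scalar Adam step, mirroring the update in the AdamOpMaker DOC string
#include <cmath>
#include <cstdio>
int main() {
  float beta1 = 0.9f, beta2 = 0.999f, epsilon = 1.0e-8f, lr = 0.001f;
  float param = 1.0f, grad = 0.2f;                       // assumed values
  float moment1 = 0.f, moment2 = 0.f, beta1_pow = 1.f, beta2_pow = 1.f;
  moment1 = beta1 * moment1 + (1 - beta1) * grad;
  moment2 = beta2 * moment2 + (1 - beta2) * grad * grad;
  beta1_pow *= beta1;
  beta2_pow *= beta2;
  float lr_t = lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
  param = param - lr_t * moment1 / (std::sqrt(moment2) + epsilon);
  std::printf("param after one step: %f\n", param);  // about 0.999
  return 0;
}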
......@@ -134,7 +134,7 @@ void CondOp::PrepareDataForSubnet(
for (int i = 0; i < BRANCH_NUM; ++i) {
for (auto& output : (*sub_net_op_[i]).Outputs()) {
for (auto& var_name : output.second) {
sub_scopes[i]->NewVar(var_name);
sub_scopes[i]->Var(var_name);
}
}
}
......
......@@ -12,111 +12,91 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gemm_conv2d_op.h"
#include "paddle/operators/conv2d_op.h"
namespace paddle {
namespace operators {
int outputSize(int input_size, int filter_size, int padding, int stride) {
int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
return output_size;
void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of Conv2DOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of Conv2DOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of Conv2DOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int groups = ctx->Attrs().Get<int>("groups");
int input_channels = in_dims[1];
int output_channels = filter_dims[0];
PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
"The number of input channels should be equal to filter "
"channels * groups.");
PADDLE_ENFORCE_EQ(
output_channels % groups, 0,
"The number of output channels should be divided by groups.");
auto output_height =
OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
auto output_width =
OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
ctx->SetOutputDim("Output",
{in_dims[0], filter_dims[0], output_height, output_width});
}
class Conv2DOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of Conv2DOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of Conv2DOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of Conv2DOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int groups = ctx->Attrs().Get<int>("groups");
int input_channels = in_dims[1];
int output_channels = filter_dims[0];
PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
"The number of input channels should be equal to filter "
"channels * groups.");
PADDLE_ENFORCE_EQ(
output_channels % groups, 0,
"The number of output channels should be divided by groups.");
auto output_height =
outputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
auto output_width =
outputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
ctx->SetOutputDim(
"Output", {in_dims[0], filter_dims[0], output_height, output_width});
}
};
class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv2DOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
"The input tensor of convolution operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of image.");
AddInput(
"Filter",
"The filter tensor of convolution operator."
"The format of the filter tensor is MCHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"H and W is height and width of filter. "
"If the groups attribute is greater than 1, C equal the number of "
"input image channels divided by the groups.");
AddOutput("Output",
"The output tensor of convolution operator."
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings", "paddings of convolution operator.")
.SetDefault({0, 0});
AddAttr<int>(
"groups",
"group size of convolution operator. "
"Refer to grouped convolution in Alex Krizhevsky's paper: "
"when group=2, the first half of the filters are only connected to the "
"first half of the input channels, and the second half only connected "
"to the second half.")
.SetDefault(1);
AddComment(R"DOC(
Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
"The input tensor of convolution operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of image.");
AddInput("Filter",
"The filter tensor of convolution operator."
"The format of the filter tensor is MCHW, where M is the number of "
"output image channels, C is the number of input image channels, "
"H and W is height and width of filter. "
"If the groups attribute is greater than 1, C equal the number of "
"input image channels divided by the groups.");
AddOutput("Output",
"The output tensor of convolution operator."
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings", "paddings of convolution operator.")
.SetDefault({0, 0});
AddAttr<int>(
"groups",
"group size of convolution operator. "
"Refer to grouped convolution in Alex Krizhevsky's paper: "
"when group=2, the first half of the filters are only connected to the "
"first half of the input channels, and the second half only connected "
"to the second half.")
.SetDefault(1);
AddComment(R"DOC(
The convolution operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
)DOC");
}
};
class Conv2DOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
}
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
}
void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
};
if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
}
}
} // namespace operators
} // namespace paddle
......
......@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gemm_conv2d_op.h"
#include "paddle/operators/conv2d_op.h"
namespace ops = paddle::operators;
......
......@@ -24,6 +24,38 @@ namespace operators {
using Tensor = framework::Tensor;
// Base convolution operator definitions for other conv-like
// operators to reuse the implementation.
inline int OutputSize(int input_size, int filter_size, int padding,
int stride) {
int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
return output_size;
}
// Define Op classes in .h file so that other conv
// operator implementations can reuse the code.
class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv2DOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker);
};
class Conv2DOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override;
};
class Conv2DOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override;
};
template <typename Place, typename T>
class GemmConv2DKernel : public framework::OpKernel<T> {
public:
......@@ -74,7 +106,6 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
framework::DDim output_matrix_shape = {output_channels,
output_height * output_width};
// convolution operator: im2col + gemm
int in_step = input_channels / groups;
int out_step = output_channels / groups;
......
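A quick standalone check of the OutputSize arithmetic shared by Conv2DOp::InferShape and the cudnn variant (the sample sizes below are assumed):
// output = (input - filter + 2 * padding) / stride + 1
#include <cstdio>
static int OutputSize(int input_size, int filter_size, int padding, int stride) {
  return (input_size - filter_size + 2 * padding) / stride + 1;
}
int main() {
  std::printf("%d\n", OutputSize(32, 3, 1, 1));  // 32: "same" padding, stride 1
  std::printf("%d\n", OutputSize(7, 3, 0, 2));   // 3: no padding, stride 2
  return 0;
}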
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2d_op.h"
namespace paddle {
namespace operators {
class CudnnConvOpMaker : public Conv2DOpMaker {
public:
CudnnConvOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: Conv2DOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault(std::vector<int>{1, 1});
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardward. This size should be carefully setted.")
.SetDefault(4096);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
ops::Conv2DOpGrad);
REGISTER_OP_CPU_KERNEL(
conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv_cudnn_grad,
ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
#include "paddle/operators/conv2d_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
using CUDADeviceContext = platform::CUDADeviceContext;
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;
// NOTE: framework::vectorize converts dims to type int64_t,
// which does not fit the int type expected by cudnn inputs.
std::vector<int> Dims2Vector(const framework::DDim& dims) {
std::vector<int> ret;
for (int i = 0; i < dims.size(); i++) {
ret.push_back(dims[i]);
}
return ret;
}
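// Illustrative usage (assumed shape): Dims2Vector(framework::make_ddim({2, 3, 32, 32}))
// yields the std::vector<int> {2, 3, 32, 32}, narrowing each int64_t extent to the
// int type expected by the cudnn descriptor APIs.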
template <typename T>
class CudnnConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
cudnnTensorDescriptor_t cudnn_input_desc =
input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
cudnnTensorDescriptor_t cudnn_output_desc =
output_desc.descriptor<T>(layout, Dims2Vector(output->dims()), groups);
cudnnFilterDescriptor_t cudnn_filter_desc =
filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
int input_channels = input->dims()[1];
int input_height = input->dims()[2];
int input_width = input->dims()[3];
int output_channels = output->dims()[1];
int output_height = output->dims()[2];
int output_width = output->dims()[3];
int group_offset_in = input_channels / groups * input_height * input_width;
int group_offset_out =
output_channels / groups * output_height * output_width;
int group_offset_filter = filter->numel() / groups;
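// Illustrative example (assumed shapes): for an NCHW input of {N, 64, 28, 28}
// with groups = 2, group_offset_in = 32 * 28 * 28, i.e. the distance between
// the data of two consecutive groups within one sample.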
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
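// Illustrative arithmetic: with the default workspace_size_MB of 4096, the
// limit becomes 4096 * 1024 * 1024 bytes, i.e. 4 GiB.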
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t algo;
auto handle = ctx.cuda_device_context().cudnn_handle();
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
// get the workspace size required by the chosen algorithm
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate GPU memory for the workspace
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv forward ---------------------
T alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
template <typename T>
class CudnnConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const T* input_data = input->data<T>();
const T* output_grad_data = output_grad->data<T>();
const T* filter_data = filter->data<T>();
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_grad_desc;
ScopedTensorDescriptor input_grad_desc;
ScopedFilterDescriptor filter_desc;
ScopedFilterDescriptor filter_grad_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
cudnnTensorDescriptor_t cudnn_input_desc =
input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
cudnnTensorDescriptor_t cudnn_output_grad_desc =
output_grad_desc.descriptor<T>(layout, Dims2Vector(output_grad->dims()),
groups);
cudnnFilterDescriptor_t cudnn_filter_desc =
filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
int input_channels = input->dims()[1];
int input_height = input->dims()[2];
int input_width = input->dims()[3];
int output_grad_channels = filter->dims()[0];
int output_grad_height = output_grad->dims()[2];
int output_grad_width = output_grad->dims()[3];
int group_offset_in = input_channels / groups * input_height * input_width;
int group_offset_out =
output_grad_channels / groups * output_grad_height * output_grad_width;
int group_offset_filter = filter->numel() / groups;
// ------------------- cudnn backward algorithm ---------------------
cudnnConvolutionBwdDataAlgo_t data_algo;
cudnnConvolutionBwdFilterAlgo_t filter_algo;
size_t workspace_size_in_bytes = 0, tmp_size = 0;
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
auto handle = ctx.cuda_device_context().cudnn_handle();
if (input_grad) {
cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
layout, Dims2Vector(input_grad->dims()), groups);
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
// dyDesc: Handle to the previously initialized input differential
// tensor descriptor.
cudnn_output_grad_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_input_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_output_grad_desc,
cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
if (filter_grad) {
cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
layout, Dims2Vector(filter_grad->dims()), groups);
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
cudnn_filter_desc, filter_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
}
}
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*filter_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
cudnn_conv_desc, filter_algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
filter_grad_data + i * group_offset_filter));
}
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
paddle::operators::CudnnConvGradOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/decayed_adagrad_op.h"
namespace paddle {
namespace operators {
class DecayedAdagradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of DecayedAdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of DecayedAdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment"),
"Input(Moment) of DecayedAdagradOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("LearningRate"),
"Input(LearningRate) of DecayedAdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of DecayedAdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
"Output(MomentOut) of DecayedAdagradOp should not be null.");
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"LearningRate should have one element");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of DecayedAdagradOp should have "
"the same dimension.");
PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"),
"Param and Moment input of DecayedAdagradOp should have "
"the same dimension.");
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("MomentOut", param_dims);
}
};
class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
public:
DecayedAdagradOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("Moment", "(Tensor) Second moment");
AddInput("LearningRate", "(Tensor) Learning rate");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("MomentOut", "(Tensor) Output second moment");
AddAttr<float>("decay",
"(float, default 0.95) "
"Discounting factor for coming gradient")
.SetDefault(0.95);
AddAttr<float>("epsilon",
"(float, default 1.0e-6) "
"Constant for numerical stability")
.SetDefault(1.0e-6f);
AddComment(R"DOC(
Decayed Adagrad
moment_out = decay * moment + (1 - decay) * grad * grad
param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp,
ops::DecayedAdagradOpMaker);
REGISTER_OP_CPU_KERNEL(
decayed_adagrad,
ops::DecayedAdagradOpKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/decayed_adagrad_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
decayed_adagrad,
ops::DecayedAdagradOpKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class DecayedAdagradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
param_out_tensor->mutable_data<T>(ctx.GetPlace());
moment_out_tensor->mutable_data<T>(ctx.GetPlace());
float decay = ctx.Attr<float>("decay");
float epsilon = ctx.Attr<float>("epsilon");
auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param"));
auto grad = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Grad"));
auto moment = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Moment"));
auto lr = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("LearningRate"));
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
moment_out.device(place) = decay * moment + (1 - decay) * grad * grad;
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
param_out.device(place) =
param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
}
};
} // namespace operators
} // namespace paddle
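For reference, here is a minimal scalar sketch of the update rule applied element-wise by the kernel above; the standalone helper and its name are illustrative only and are not part of the operator.

#include <cmath>

// Scalar form of the Decayed Adagrad update (illustrative sketch only).
inline void DecayedAdagradUpdate(float* param, float* moment, float grad,
                                 float lr, float decay = 0.95f,
                                 float epsilon = 1.0e-6f) {
  // moment_out = decay * moment + (1 - decay) * grad * grad
  *moment = decay * (*moment) + (1.0f - decay) * grad * grad;
  // param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
  *param -= lr * grad / (std::sqrt(*moment) + epsilon);
}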
......@@ -23,13 +23,37 @@ using framework::Scope;
using framework::TensorArray;
using framework::LoDTensor;
using framework::Variable;
using framework::DySeqMetaBatch;
namespace detail {
inline void CreateVariables(Scope& scope,
const std::vector<std::string>& var_names) {
for (const auto& name : var_names) {
scope.NewVar(name);
scope.Var(name);
}
}
/*
* Sequence inputs are reordered when they are split, so the boot_states
* should be reordered in the same order.
*
* NOTE This means the `pre_state` of the first time step should copy the
* `boot_state` rather than reference it, because the content must be
* reordered while the RNN op should not modify `boot_state`, which is an
* input variable.
*/
template <typename T>
inline void ReorderBootState(const DySeqMetaBatch& metas,
const LoDTensor& boot_state, LoDTensor* tensor,
const platform::Place& dst_place) {
for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
auto slice = tensor->Slice<T>(seq_id, seq_id + 1);
auto boot_slice =
boot_state.Slice<T>(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
// TODO(superjom) pass in device context as an argument
slice.template CopyFrom<T>(boot_slice, dst_place,
platform::CPUDeviceContext());
}
}
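// Illustrative sketch (hypothetical values): if the sequences were reordered by
// descending length so that metas = {{ori_idx: 2}, {ori_idx: 0}, {ori_idx: 1}},
// then row 0 of `tensor` receives row 2 of `boot_state`, row 1 receives row 0,
// and row 2 receives row 1.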
......@@ -69,6 +93,7 @@ void DynamicRecurrentOp::Run(const Scope& scope,
CreateScopes();
WriteStepInputs();
InitStates();
WriteStepOutputs();
// call stepnet in all the time steps
for (size_t step = 0; step < cache_.num_steps; step++) {
......@@ -76,7 +101,6 @@ void DynamicRecurrentOp::Run(const Scope& scope,
stepnet_->Run(step_scope, dev_ctx);
}
WriteStepOutputs();
ConcatOutputs();
}
......@@ -84,11 +108,11 @@ void DynamicRecurrentOp::SplitInputs() const {
// TODO(superjom) make level a config
// TODO(superjom) check all the inputs has the same LoD
int level = 0;
const auto& inlinks = cache_.inlinks;
for (const auto& item : inlinks) {
for (const auto& item : cache_.inlinks) {
const auto& var = item.second;
const auto& tensor = var->Get<LoDTensor>();
TensorArray& ta = step_inputs_[item.first];
dy_seq_metas_[item.first] =
ta.Unpack(tensor, level, true /*length_descend*/);
......@@ -112,7 +136,7 @@ void DynamicRecurrentOp::WriteStepInputs() const {
auto& step_scope = cache_.GetScope(step);
Variable* var = step_scope.FindVar(item.first);
if (var == nullptr) {
var = step_scope.NewVar(item.first);
var = step_scope.Var(item.first);
}
var->GetMutable<LoDTensor>()->ShareDataWith<value_type>(tensor);
}
......@@ -120,17 +144,11 @@ void DynamicRecurrentOp::WriteStepInputs() const {
}
void DynamicRecurrentOp::WriteStepOutputs() const {
for (size_t step = 0; step < cache_.scopes->size(); step++) {
auto& scope = cache_.GetScope(step);
for (auto& item : step_outputs_) {
auto* var = scope.FindVar(item.first);
if (var == nullptr) {
var = scope.NewVar(item.first);
}
auto* tensor = var->GetMutable<LoDTensor>();
item.second.WriteShared(step, *tensor);
}
// initialize step outputs
for (const auto& item : cache_.outlinks) {
step_outputs_.emplace(item.first, TensorArray());
}
PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL);
}
void DynamicRecurrentOp::CreateScopes() const {
......@@ -145,12 +163,18 @@ void DynamicRecurrentOp::CreateScopes() const {
PADDLE_ENFORCE_NOT_NULL(stepnet_, "stepnet should be set first");
std::vector<std::string> memories;
std::vector<std::string> pre_memories;
std::vector<std::string> stepnet_outputs;
std::transform(arg_.memories.begin(), arg_.memories.end(),
std::back_inserter(memories),
[](const rnn::MemoryAttr& m) { return m.var; });
std::transform(arg_.memories.begin(), arg_.memories.end(),
std::back_inserter(pre_memories),
[](const rnn::MemoryAttr& m) { return m.pre_var; });
for (const auto& item : stepnet_->Outputs()) {
for (const auto& var : item.second) {
stepnet_outputs.push_back(var);
}
}
for (size_t step = 0; step < cache_.num_steps; step++) {
auto& scope = cache_.GetScope(step);
......@@ -158,60 +182,88 @@ void DynamicRecurrentOp::CreateScopes() const {
detail::CreateVariables(scope, arg_.outlinks);
detail::CreateVariables(scope, memories);
detail::CreateVariables(scope, pre_memories);
detail::CreateVariables(scope, stepnet_outputs);
}
}
void DynamicRecurrentOp::ConcatOutputs() const {
// TODO(superjom) transform this to a config
int level = 0;
// TODO(superjom) pass in some lod
// just a placeholder
framework::LoD lod;
for (size_t step = 0; step < cache_.num_steps; step++) {
auto& scope = cache_.GetScope(step);
for (auto& item : step_outputs_) {
auto* var = scope.FindVar(item.first);
PADDLE_ENFORCE_NOT_NULL(var);
auto* tensor = var->GetMutable<LoDTensor>();
tensor->mutable_data<value_type>(platform::CPUPlace());
item.second.WriteShared(step, *tensor);
}
}
// the inlinks' LoDs should be the same, so just pick one of them.
const auto& some_lod =
cache_.scope->FindVar(arg_.inlinks.front())->Get<LoDTensor>().lod();
const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
for (auto& item : step_outputs_) {
auto tensor = item.second.Pack(level, dy_seq_metas_[item.first], lod);
auto& output = cache_.outlinks[item.first]->Get<LoDTensor>();
const_cast<LoDTensor*>(&output)->ShareDataWith<value_type>(tensor);
auto tensor = item.second.Pack(level, some_meta, some_lod);
auto* output = cache_.outlinks[item.first]->GetMutable<LoDTensor>();
const_cast<LoDTensor*>(output)->ShareDataWith<value_type>(tensor);
}
}
void DynamicRecurrentOp::InitStates() const {
// init the first state
// TODO(superjom) prepare for the scenario where the boot state does not exist
for (auto memory : arg_.memories) {
auto* boot_state_var = cache_.scope->FindVar(memory.boot_var);
PADDLE_ENFORCE_NOT_NULL(boot_state_var);
auto& boot_state = boot_state_var->Get<LoDTensor>();
const auto& dims = boot_state.dims();
for (size_t step = 0; step < cache_.num_steps; step++) {
auto& cur_scope = cache_.GetScope(step);
// link pre-state to boot_state
// init state and pre-state
auto* pre_state = cur_scope.FindVar(memory.pre_var);
PADDLE_ENFORCE_NOT_NULL(pre_state);
pre_state->GetMutable<LoDTensor>();
auto* state = cur_scope.FindVar(memory.var);
PADDLE_ENFORCE_NOT_NULL(state);
state->GetMutable<LoDTensor>()->Resize(dims);
state->GetMutable<LoDTensor>()->mutable_data<value_type>(
platform::CPUPlace());
if (step == 0) {
auto* pre_state_tensor = pre_state->GetMutable<LoDTensor>();
pre_state_tensor->Resize(boot_state.dims());
pre_state_tensor->ShareDataWith<value_type>(boot_state);
} else {
auto& pre_scope = cache_.GetScope(step - 1);
auto* state_pre = pre_scope.FindVar(memory.var);
PADDLE_ENFORCE_NOT_NULL(state_pre);
pre_state->GetMutable<LoDTensor>()->ShareDataWith<value_type>(
*state_pre->GetMutable<LoDTensor>());
}
for (size_t step = 0; step < cache_.num_steps; step++) {
for (const auto& memory : arg_.memories) {
CreateState(memory, step);
LinkState(memory, step);
}
}
}
void DynamicRecurrentOp::CreateState(const rnn::MemoryAttr& memory,
size_t step) const {
auto& scope = cache_.GetScope(step);
auto& state = *cache_.GetTensor(scope, memory.var);
auto& boot_state = *cache_.GetTensor(*cache_.scope, memory.boot_var);
size_t num_instances =
step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
auto dims = boot_state.dims();
dims[0] = num_instances;
state.Resize(dims);
state.mutable_data<value_type>(platform::CPUPlace());
states_[memory.var].WriteShared(step, state);
}
void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory,
size_t step) const {
auto& scope = cache_.GetScope(step);
auto& state_pre = *cache_.GetTensor(scope, memory.pre_var);
// all the step_inputs' metas should be the same, so just pick one
// and use its dyseq meta.
const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
size_t num_instances =
step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
LoDTensor* pre_state{nullptr};
if (step == 0) {
pre_state = cache_.GetTensor(*cache_.scope, memory.boot_var);
pre_state->mutable_data<float>(platform::CPUPlace());
// allocate memory
state_pre.Resize(pre_state->dims());
state_pre.mutable_data<value_type>(platform::CPUPlace());
detail::ReorderBootState<value_type>(some_meta, *pre_state, &state_pre,
pre_state->place());
} else {
pre_state = cache_.GetTensor(cache_.GetScope(step - 1), memory.var);
}
// shrink and share from the previous state
auto shrinked_pre_state = pre_state->Slice<value_type>(0, num_instances);
state_pre.ShareDataWith<value_type>(shrinked_pre_state);
}
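// Illustrative flow (hypothetical shapes): with a boot state of {4, 20}, step 0's
// pre-state becomes a reordered copy of the boot state, while a later step whose
// batch has shrunk to 2 active instances shares a {2, 20} slice of the previous
// step's state.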
void DynamicRecurrentOp::ArgCache::Init(
const rnn::ArgumentName& name, const paddle::framework::OperatorBase& op,
const paddle::framework::Scope& scope, rnn::Argument* arg) {
......@@ -261,6 +313,12 @@ Variable* DynamicRecurrentOp::ArgCache::GetVariable(const Scope& scope,
return var;
}
LoDTensor* DynamicRecurrentOp::ArgCache::GetTensor(
const framework::Scope& scope, const std::string& name) {
auto* var = GetVariable(scope, name);
return var->GetMutable<LoDTensor>();
}
const rnn::ArgumentName DynamicRecurrentOp::kArgName{
"step_net", "step_scopes", "inlinks", "outlinks",
"memories", "pre_memories", "boot_memories"};
......
......@@ -77,6 +77,17 @@ class DynamicRecurrentOp : public framework::OperatorBase {
*/
void InitStates() const;
/*
* Create state variables for each time step.
*/
void CreateState(const rnn::MemoryAttr& memory, size_t step) const;
/*
* Link pre-state variable in current scope to the state variable in the
* previous time step (scope).
*/
void LinkState(const rnn::MemoryAttr& memory, size_t step) const;
/*
* Concatenate outputs in each time step and generate a LoDTensor.
*/
......@@ -91,6 +102,16 @@ class DynamicRecurrentOp : public framework::OperatorBase {
}
const OperatorBase& GetStepNet() const { return *stepnet_; }
const framework::TensorArray& state(const std::string& name) const {
return states_[name];
}
const framework::TensorArray& step_input(const std::string& name) const {
return step_inputs_[name];
}
const framework::TensorArray& step_output(const std::string& name) const {
return step_outputs_[name];
}
protected:
struct ArgCache {
framework::Scope const* scope;
......@@ -108,6 +129,9 @@ class DynamicRecurrentOp : public framework::OperatorBase {
return *scopes->at(index);
}
framework::LoDTensor* GetTensor(const framework::Scope& scope,
const std::string& name);
private:
void InitArgument(const rnn::ArgumentName& name, const OperatorBase& op,
rnn::Argument* arg);
......@@ -122,7 +146,7 @@ class DynamicRecurrentOp : public framework::OperatorBase {
private:
std::unique_ptr<OperatorBase> stepnet_;
mutable framework::TensorArray states_;
mutable std::map<std::string, framework::TensorArray> states_;
mutable std::map<std::string, framework::TensorArray> step_inputs_;
mutable std::map<std::string, framework::TensorArray> step_outputs_;
mutable std::map<std::string, std::vector<framework::DySeqMeta>>
......
......@@ -36,7 +36,7 @@ void OpDescNewVar(const std::string& param_name,
// create a LoD tensor in scope with specific dims
LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims,
const platform::Place& place) {
auto* var = scope.NewVar(name);
auto* var = scope.Var(name);
auto* tensor = var->GetMutable<LoDTensor>();
tensor->Resize(dims);
tensor->mutable_data<float>(place);
......@@ -85,9 +85,8 @@ class DynamicRecurrentOpTestHelper : public ::testing::Test {
void CreateGlobalVariables() {
platform::CPUPlace place;
scope.NewVar("step_scopes");
scope.Var("step_scopes");
CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place);
// auto* out0 =
CreateVar(scope, "out0", framework::make_ddim({10, 20}), place);
auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place);
// 10 instances in 4 sentences, with lengths 4, 3, 2 and 1 respectively.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gru_unit_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class GRUUnitOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of GRUUnitOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
"Input(%s) of GRUUnitOp should not be null.", "HiddenPrev");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of GRUUnitOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasOutput("Gate"),
"Output(%s) of GRUUnitOp should not be null.", "Gate");
PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"),
"Output(%s) of GRUUnitOp should not be null.",
"ResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(%s) of GRUUnitOp should not be null.", "Hidden");
auto input_dims = ctx->GetInputDim("Input");
auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
auto weight_dims = ctx->GetInputDim("Weight");
int batch_size = input_dims[0];
int input_size = input_dims[1];
int frame_size = hidden_prev_dims[1];
int weight_height = weight_dims[0];
int weight_width = weight_dims[1];
PADDLE_ENFORCE_EQ(
input_size, frame_size * 3,
"The input_size must be 3 times of frame_size in GRUUnitOp.");
PADDLE_ENFORCE_EQ(
weight_height, frame_size,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
auto bias = Input("Bias");
if (bias != framework::kEmptyVarName) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
}
ctx->SetOutputDim("Gate", {batch_size, frame_size * 3});
ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size});
ctx->SetOutputDim("Hidden", {batch_size, frame_size});
}
};
class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
public:
GRUUnitOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
"input.");
AddInput("HiddenPrev",
"(Tensor) Matrix with shape [batch_size, frame_size] for the "
"states of previous time step.");
AddInput("Weight",
"(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
"The elements continuous in memory can be divided into two parts. "
"The first part are weights of the update gate and reset gate "
"with shape [frame_size, frame_size * 2], and the second part are "
"weights of output candidate with shape [frame_size, frame_size]");
AddInput("Bias",
"(Tensor) Bias vector with shape [1, frame_size * 3] concating "
"bias of the update gate, reset gate and output candidate.");
AddOutput("Gate",
"(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
"output of update gate, reset gate and output candidate")
.AsIntermediate();
AddOutput("ResetHiddenPrev",
"(Tensor) Matrix with shape [batch_size, frame_size] for the "
"reseted hidden state of previous time step.")
.AsIntermediate();
AddOutput("Hidden",
"(Tensor) The GRU hidden state of the current time step "
"with shape [batch_size, frame_size].");
AddAttr<int>("activation",
"(enum int, default tanh) "
"The activation type used for output candidate {h}_t.")
.SetDefault(tanh)
.InEnum({identity, sigmoid, tanh, relu});
AddAttr<int>("gate_activation",
"(enum int, default sigmoid) "
"The activation type used in update gate and reset gate.")
.SetDefault(sigmoid)
.InEnum({identity, sigmoid, tanh, relu});
AddComment(R"DOC(
GRUUnitOp implements part of the GRU unit calculations as follows:
\f[
update \ gate: u_t = actGate(xu_t + W_u * hidden_prev + bias_u) \\
reset \ gate: r_t = actGate(xr_t + W_r * hidden_prev + bias_r) \\
output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, hidden_prev) + bias_c) \\
output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_prev)
\f]
The rest of the GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
)DOC");
}
};
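// Illustrative scalar sketch of the computation documented above (assumes
// frame_size == 1, sigmoid gate activation and tanh candidate activation;
// the variable names are hypothetical):
//   u = sigmoid(x_u + w_u * h_prev + b_u);     // update gate
//   r = sigmoid(x_r + w_r * h_prev + b_r);     // reset gate
//   c = tanh(x_c + w_c * (r * h_prev) + b_c);  // output candidate
//   h = u * h_prev + (1 - u) * c;              // final output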
class GRUUnitGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(%s) of GRUUnitGradOp should not be null.", "Input");
PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
"Input(%s) of GRUUnitGradOp should not be null.",
"HiddenPrev");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(%s) of GRUUnitGradOp should not be null.", "Weight");
PADDLE_ENFORCE(ctx->HasInput("Gate"),
"Input(%s) of GRUUnitGradOp should not be null.", "Gate");
PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"),
"Input(%s) of GRUUnitGradOp should not be null.",
"ResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasInput("Hidden"),
"Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
"Gate");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
"ResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
"Hidden");
auto input_dims = ctx->GetInputDim("Input");
auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
auto weight_dims = ctx->GetInputDim("Weight");
// int batch_size = input_dims[0];
int input_size = input_dims[1];
int frame_size = hidden_prev_dims[1];
int weight_height = weight_dims[0];
int weight_width = weight_dims[1];
PADDLE_ENFORCE_EQ(
input_size, frame_size * 3,
"The input_size must be 3 times of frame_size in GRUUnitOp.");
PADDLE_ENFORCE_EQ(
weight_height, frame_size,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
auto bias = Input("Bias");
if (bias != framework::kEmptyVarName) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
PADDLE_ENFORCE_EQ(bias_height, 1,
"The shape of Bias must be [1, frame_size * 3].");
PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
"The shape of Bias must be [1, frame_size * 3].");
auto bias_grad_name = framework::GradVarName("Bias");
if (ctx->HasOutput(bias_grad_name))
ctx->SetOutputDim(bias_grad_name, bias_dims);
}
auto input_grad_name = framework::GradVarName("Input");
if (ctx->HasOutput(input_grad_name))
ctx->SetOutputDim(input_grad_name, input_dims);
auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev");
if (ctx->HasOutput(hidden_prev_grad_name))
ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims);
auto weight_grad_name = framework::GradVarName("Weight");
if (ctx->HasOutput(weight_grad_name))
ctx->SetOutputDim(weight_grad_name, weight_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
ops::GRUUnitGradOp);
REGISTER_OP_CPU_KERNEL(gru_unit,
ops::GRUUnitKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/gru_unit_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gru_unit,
ops::GRUUnitKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/operators/activation_op.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
template <typename Place, typename T>
class GRUUnitKernel : public framework::OpKernel<T> {
public:
template <typename Device, typename X, typename Y>
void ActCompute(const int act_type, const Device& d, X x, Y y) const {
if (act_type == identity)
y.device(d) = x;
else if (act_type == sigmoid)
SigmoidFunctor<T>()(d, x, y);
else if (act_type == tanh)
TanhFunctor<T>()(d, x, y);
else if (act_type == relu)
ReluFunctor<T>()(d, x, y);
else
PADDLE_THROW("unsupported activation type");
}
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>("Input");
auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
auto* weight = context.Input<Tensor>("Weight");
auto* bias = context.Input<Tensor>("Bias");
auto* gate = context.Output<Tensor>("Gate");
gate->mutable_data<T>(context.GetPlace());
auto* reset_hidden_prev = context.Output<Tensor>("ResetHiddenPrev");
reset_hidden_prev->mutable_data<T>(context.GetPlace());
auto* hidden = context.Output<Tensor>("Hidden");
hidden->mutable_data<T>(context.GetPlace());
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
auto x = EigenMatrix<T>::From(*input);
auto h_p = EigenMatrix<T>::From(*hidden_prev);
auto g = EigenMatrix<T>::From(*gate);
auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
auto h = EigenMatrix<T>::From(*hidden);
auto place = context.GetEigenDevice<Place>();
// calculate unactivated gate outputs
if (bias) {
auto b = EigenMatrix<T>::From(*bias);
g.device(place) = x +
b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
.broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
} else {
g.device(place) = x;
}
const T* hidden_prev_data = hidden_prev->data<T>();
const T* weight_data = weight->data<T>();
T* gate_data = gate->data<T>();
T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
math::gemm<Place, T>(context.device_context(), false, false, batch_size,
2 * frame_size, frame_size, 1, hidden_prev_data,
frame_size, weight_data, frame_size * 2, 1, gate_data,
frame_size * 3);
// calculate activated gates
Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}});
ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(u_offsets, extents), g.slice(u_offsets, extents));
auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}});
ActCompute(context.Attr<int>("gate_activation"), place,
g.slice(r_offsets, extents), g.slice(r_offsets, extents));
auto r = g.slice(r_offsets, extents); // reset gate
r_h_p.device(place) = r * h_p; // reset previous hidden state
math::gemm<Place, T>(context.device_context(), false, false, batch_size,
frame_size, frame_size, 1, reset_hidden_prev_data,
frame_size, weight_data + frame_size * frame_size * 2,
frame_size, 1, gate_data + frame_size * 2,
frame_size * 3);
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
ActCompute(context.Attr<int>("activation"), place,
g.slice(c_offsets, extents), g.slice(c_offsets, extents));
auto c = g.slice(c_offsets, extents); // output candidate
// calculate final output
h.device(place) = u * (h_p - c) + c;
}
};
template <typename Place, typename T>
class GRUUnitGradKernel : public framework::OpKernel<T> {
public:
template <typename Device, typename X, typename Y, typename DX, typename DY>
void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx,
DY dy) const {
// x is a dummy and won't be used even in Relu (use y instead)
if (act_type == identity)
dx.device(d) = dy;
else if (act_type == sigmoid)
SigmoidGradFunctor<T>()(d, x, y, dy, dx);
else if (act_type == tanh)
TanhGradFunctor<T>()(d, x, y, dy, dx);
else if (act_type == relu)
ReluGradFunctor<T>()(d, x, y, dy, dx);
else
PADDLE_THROW("unsupported activation type");
}
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>("Input");
auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
auto* weight = context.Input<Tensor>("Weight");
auto* gate = context.Input<Tensor>("Gate");
auto* reset_hidden_prev = context.Input<Tensor>("ResetHiddenPrev");
auto* hidden_grad = context.Input<Tensor>(framework::GradVarName("Hidden"));
auto* input_grad = context.Output<Tensor>(framework::GradVarName("Input"));
auto* hidden_prev_grad =
context.Output<Tensor>(framework::GradVarName("HiddenPrev"));
auto* weight_grad =
context.Output<Tensor>(framework::GradVarName("Weight"));
auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
input_grad->mutable_data<T>(context.GetPlace());
hidden_prev_grad->mutable_data<T>(context.GetPlace());
weight_grad->mutable_data<T>(context.GetPlace());
Tensor gate_grad;
gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
Tensor reset_hidden_prev_grad;
reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
context.GetPlace());
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
const T* hidden_prev_data = hidden_prev->data<T>();
T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
const T* weight_data = weight->data<T>();
T* weight_grad_data = weight_grad->data<T>();
T* gate_grad_data = gate_grad.data<T>();
const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>();
auto h_p = EigenMatrix<T>::From(*hidden_prev);
auto g = EigenMatrix<T>::From(*gate);
auto d_h = EigenMatrix<T>::From(*hidden_grad);
auto d_x = EigenMatrix<T>::From(*input_grad);
auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
auto d_g = EigenMatrix<T>::From(gate_grad);
auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
auto place = context.GetEigenDevice<Place>();
Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}});
auto u = g.slice(u_offsets, extents); // update gate
Eigen::array<int, 2> r_offsets({{0, frame_size}});
auto r = g.slice(r_offsets, extents); // reset gate
Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
auto c = g.slice(c_offsets, extents); // output candidate
// backward for unactivated update gate
ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
d_g.slice(u_offsets, extents), d_h * (h_p - c));
// backward for unactivated output candidate
ActGradCompute(context.Attr<int>("activation"), place, c, c,
d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
// backward for reset_hidden_prev
math::gemm<Place, T>(context.device_context(), false, true, batch_size,
frame_size, frame_size, 1,
gate_grad_data + frame_size * 2, frame_size * 3,
weight_data + frame_size * frame_size * 2, frame_size,
0, reset_hidden_prev_grad_data, frame_size);
// backward for state_weight
math::gemm<Place, T>(
context.device_context(), true, false, frame_size, frame_size,
batch_size, 1, reset_hidden_prev_data, frame_size,
gate_grad_data + frame_size * 2, frame_size * 3, 0,
weight_grad_data + frame_size * frame_size * 2, frame_size);
// backward for unactivated reset gate
ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
d_g.slice(r_offsets, extents), d_r_h_p * h_p);
// backward for update_gate_weight and reset_gate_weight
math::gemm<Place, T>(context.device_context(), true, false, frame_size,
frame_size * 2, batch_size, 1, hidden_prev_data,
frame_size, gate_grad_data, frame_size * 3, 0,
weight_grad_data, frame_size * 2);
// backward for hidden_prev
d_h_p.device(place) = d_r_h_p * r + d_h * u;
math::gemm<Place, T>(context.device_context(), false, true, batch_size,
frame_size, frame_size * 2, 1, gate_grad_data,
frame_size * 3, weight_data, frame_size * 2, 1,
hidden_prev_grad_data, frame_size);
// backward for input
d_x.device(place) = d_g;
// backward for bias
if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace());
auto d_b = EigenMatrix<T>::From(*bias_grad);
d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
}
}
};
} // namespace operators
} // namespace paddle
......@@ -78,7 +78,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place);
input.CopyFrom<float>(input_tmp, *place, *context);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
......@@ -93,7 +93,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
output_tmp.CopyFrom<float>(output, paddle::platform::CPUPlace());
output_tmp.CopyFrom<float>(output, paddle::platform::CPUPlace(), *context);
out_cfo_ptr = output_tmp.data<float>();
}
......@@ -107,7 +107,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place);
input.CopyFrom<float>(input_tmp, *place, *context);
}
paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
......@@ -118,7 +118,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
input_tmp.CopyFrom<float>(input, paddle::platform::CPUPlace());
input_tmp.CopyFrom<float>(input, paddle::platform::CPUPlace(), *context);
in_ptr = input_tmp.data<float>();
}
......
......@@ -115,8 +115,9 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
ops::MultiplexGradOp);
REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
paddle::framework::DefaultGradOpDescMaker<false>);
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
REGISTER_OP_CPU_KERNEL(
multiplex, ops::MultiplexCPUKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
......
......@@ -70,14 +70,14 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
// the weights are located in the parent scope
for (auto& var_name : input.second) {
if (!step_scope.FindVar(var_name)) {
step_scope.NewVar(var_name)->GetMutable<LoDTensor>();
step_scope.Var(var_name)->GetMutable<LoDTensor>();
}
}
}
// create stepnet's outputs
for (const auto& output : (*stepnet_)->Outputs()) {
for (auto& var_name : output.second) {
step_scope.NewVar(var_name);
step_scope.Var(var_name);
}
}
step_scopes->emplace_back(&step_scope);
......@@ -87,7 +87,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope,
void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
for (auto& attr : arg_->memories) {
auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>();
auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable<LoDTensor>();
PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
"memory [%s]'s boot variable [%s] not exists", attr.var,
attr.boot_var);
......@@ -167,9 +167,9 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
"memory variable [%s] does not exists", attr.var);
PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
"boot variable [%s] does not exists", attr.boot_var);
auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>();
auto* mem_grad = step_scope->Var(attr.var)->GetMutable<LoDTensor>();
auto* boot_mem_grad =
step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>();
step_scope->Var(attr.boot_var)->GetMutable<LoDTensor>();
boot_mem_grad->Resize(mem_grad->dims());
boot_mem_grad->ShareDataWith<float>(*mem_grad);
}
......
......@@ -40,7 +40,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
f::DDim step_dims = slice_ddim(dims, 1, dims.size());
for (size_t j = 0; j < seq_len; j++) {
Tensor* step_input =
step_scopes[j]->NewVar(inlinks[i])->GetMutable<Tensor>();
step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
// The input of each step's operators is a Tensor here.
// The Slice function may need to be modified.
*step_input = input->Slice<float>(j, j + 1);
......
......@@ -34,7 +34,7 @@ class SumOp : public framework::OperatorWithKernel {
auto in_dim = x_dims[0];
for (size_t i = 1; i < N; i++) {
auto dim = x_dims[i];
PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape");
PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape");
}
ctx->SetOutputDim("Out", in_dim);
ctx->ShareLoD("X", /*->*/ "Out");
......
......@@ -54,7 +54,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(
ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
"uniform_random's min must less then max");
auto dims = Attr<std::vector<int>>("dims");
auto& dims = ctx->Attrs().Get<std::vector<int>>("dims");
std::vector<int64_t> temp;
temp.reserve(dims.size());
for (auto dim : dims) {
......
......@@ -71,23 +71,32 @@ class ScopedTensorDescriptor {
inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format,
const cudnnDataType_t type,
const std::vector<int>& dims) {
// the format is not used now, but it may be a useful feature
const std::vector<int>& dims,
const int groups = 1) {
// the format is not used now; support will be added later
std::vector<int> strides(dims.size());
strides[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; i--) {
strides[i] = dims[i + 1] * strides[i + 1];
}
// Update tensor descriptor dims setting if groups > 1
// FIXME(typhoonzero): Assume using NCHW order
std::vector<int> dims_with_group(dims.begin(), dims.end()); // copy
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
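// Illustrative example (assumed shape): an NCHW tensor of {32, 64, 28, 28} with
// groups = 2 is described per group as {32, 32, 28, 28}, while the strides are
// still computed from the full 64-channel layout.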
PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
desc_, type, dims.size(), dims.data(), strides.data()));
desc_, type, dims_with_group.size(), dims_with_group.data(),
strides.data()));
return desc_;
}
template <typename T>
inline cudnnTensorDescriptor_t descriptor(const DataLayout& order,
const std::vector<int>& dims) {
return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type,
dims);
const std::vector<int>& dims,
const int groups = 1) {
return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type, dims,
groups);
}
private:
......@@ -106,18 +115,29 @@ class ScopedFilterDescriptor {
inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format,
const cudnnDataType_t type,
const std::vector<int>& kernel) {
// filter layout: output input spatial_dim_y spatial_dim_x
const std::vector<int>& kernel,
const int groups = 1) {
// filter layout: MCHW, where M is the number of
// output image channels, C is the number of input image channels,
// and H and W are the height and width of the filter.
std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
if (groups > 1) {
// M /= groups
kernel_with_group[0] /= groups;
// NOTE: the input channel dimension (C) of the filter is already asserted to be C/groups.
}
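// Illustrative example (assumed shape): a filter of {64, 16, 3, 3} with
// groups = 2 (64 output channels, each group seeing 16 input channels) is
// described per group as {32, 16, 3, 3}.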
PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
desc_, type, format, kernel.size(), kernel.data()));
desc_, type, format, kernel_with_group.size(),
kernel_with_group.data()));
return desc_;
}
template <typename T>
inline cudnnFilterDescriptor_t descriptor(const DataLayout& order,
const std::vector<int>& kernel) {
const std::vector<int>& kernel,
const int groups = 1) {
return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type,
kernel);
kernel, groups);
}
private:
......
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc
DEPS pybind python backward proto_desc tensor_array
DEPS pybind python backward proto_desc tensor_array paddle_memory executor
${GLOB_OP_LIB})
endif(WITH_PYTHON)
......@@ -118,12 +118,35 @@ void BindProgramDesc(py::module &m) {
.def("append_block", &ProgramDescBind::AppendBlock,
py::return_value_policy::reference)
.def("append_backward",
[](ProgramDescBind &program_desc,
[](ProgramDescBind &program_desc, const VarDescBind &target,
const std::unordered_set<std::string> &no_grad_vars) {
AppendBackward(program_desc, no_grad_vars);
ParamGradInfoMap param_grad_map =
AppendBackward(program_desc, target, no_grad_vars);
std::unordered_map<
std::string, std::tuple<std::string /* grad_var_name */,
int /* block_idx */, int /* op_idx */>>
retv;
for (auto it = param_grad_map.begin(); it != param_grad_map.end();
++it) {
const auto &grad_info = it->second;
retv[it->first] = std::make_tuple(
grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
}
return retv;
})
.def("block", &ProgramDescBind::Block, py::return_value_policy::reference)
.def("num_blocks", &ProgramDescBind::Size);
.def("num_blocks", &ProgramDescBind::Size)
.def("serialize_to_string",
[](ProgramDescBind &program_desc) -> py::bytes {
const ProgramDesc *desc = program_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"ProgramDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize ProgramDesc Error. This could be a bug of Paddle.");
return res;
});
}
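// Illustrative result of append_backward (hypothetical names): the returned map
// might contain {"fc.w": ("fc.w@GRAD", 0, 7)}, i.e. the gradient variable's
// name, the index of the block it lives in, and the index of the op that
// produces it.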
void BindBlockDesc(py::module &m) {
......@@ -134,22 +157,32 @@ void BindBlockDesc(py::module &m) {
py::return_value_policy::reference)
.def("prepend_op", &BlockDescBind::PrependOp,
py::return_value_policy::reference)
.def("new_var",
.def("var",
[](BlockDescBind &self, py::bytes byte_name) {
std::string name = byte_name;
return self.NewVar(name);
return self.Var(name);
},
py::return_value_policy::reference)
.def("var",
.def("find_var",
[](BlockDescBind &self, py::bytes byte_name) {
std::string name = byte_name;
return self.Var(name);
return self.FindVar(name);
},
py::return_value_policy::reference)
.def("all_vars", &BlockDescBind::AllVars,
py::return_value_policy::reference)
.def("all_ops", &BlockDescBind::AllOps,
py::return_value_policy::reference);
py::return_value_policy::reference)
.def("serialize_to_string", [](BlockDescBind &block_desc) -> py::bytes {
const BlockDesc *desc = block_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"BlockDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize BlockDesc Error. This could be a bug of Paddle.");
return res;
});
}
void BindVarDsec(py::module &m) {
......@@ -162,7 +195,8 @@ void BindVarDsec(py::module &m) {
.value("FP32", DataType::FP32)
.value("FP64", DataType::FP64);
py::class_<VarDescBind>(m, "VarDesc", "")
py::class_<VarDescBind> var_desc(m, "VarDesc", "");
var_desc
.def("name",
[](const VarDescBind &self) {
py::bytes name = self.Name();
......@@ -174,7 +208,23 @@ void BindVarDsec(py::module &m) {
.def("shape", &VarDescBind::Shape, py::return_value_policy::reference)
.def("data_type", &VarDescBind::GetDataType)
.def("lod_level", &VarDescBind::GetLodLevel)
.def("set_lod_level", &VarDescBind::SetLoDLevel);
.def("set_lod_level", &VarDescBind::SetLoDLevel)
.def("type", &VarDescBind::GetType)
.def("set_type", &VarDescBind::SetType)
.def("serialize_to_string", [](VarDescBind &var_desc) -> py::bytes {
const VarDesc *desc = var_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"VarDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize VarDesc Error. This could be a bug of Paddle.");
return res;
});
py::enum_<VarDesc::VarType>(var_desc, "VarType", "")
.value("LOD_TENSOR", VarDesc::LOD_TENSOR)
.value("SELECTED_ROWS", VarDesc::SELECTED_ROWS);
}
void BindOpDesc(py::module &m) {
......@@ -204,9 +254,19 @@ void BindOpDesc(py::module &m) {
.def("set_attr", &OpDescBind::SetAttr)
.def("attr", &OpDescBind::GetAttr)
.def("set_block_attr", &OpDescBind::SetBlockAttr)
.def("get_block_attr", &OpDescBind::GetBlockAttr)
.def("block_attr", &OpDescBind::GetBlockAttr)
.def("check_attrs", &OpDescBind::CheckAttrs)
.def("infer_shape", &OpDescBind::InferShape);
.def("infer_shape", &OpDescBind::InferShape)
.def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes {
const OpDesc *desc = op_desc.Proto();
PADDLE_ENFORCE(desc->IsInitialized(),
"OpDesc has not been initialized.");
std::string res;
PADDLE_ENFORCE(
desc->SerializeToString(&res),
"Serialize OpDesc Error. This could be a bug of Paddle.");
return res;
});
}
} // namespace pybind
......
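The four `serialize_to_string` bindings above all follow the same pattern: they sync the C++ desc into its protobuf form, enforce that it is initialized, and return the serialized bytes. Below is a minimal sketch of the intended use from the Python side; the module paths and the `prog_desc` handle are assumptions, only the method names come from this diff.

import paddle.v2.framework.proto.framework_pb2 as framework_pb2   # generated protobuf module (assumed path)

# prog_desc: a core.ProgramDesc handle obtained elsewhere (not shown in this diff).
binary = prog_desc.serialize_to_string()      # raises if the underlying proto is uninitialized
desc = framework_pb2.ProgramDesc()
desc.ParseFromString(binary)                  # the bytes round-trip through the generated classes

# The same method is now exposed on blocks, variables and ops:
block_binary = prog_desc.block(0).serialize_to_string()
var_binary = prog_desc.block(0).var("w").serialize_to_string()    # "w" is a placeholder name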
......@@ -15,9 +15,11 @@ limitations under the License. */
#include "paddle/pybind/protobuf.h"
#include "paddle/framework/backward.h"
#include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor_array.h"
#include "paddle/operators/cond_op.h"
#include "paddle/operators/dynamic_recurrent_op.h"
#include "paddle/operators/net_op.h"
#include "paddle/operators/recurrent_op.h"
#include "paddle/platform/enforce.h"
......@@ -163,9 +165,9 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference);
py::class_<Scope>(m, "Scope", "")
.def("new_var",
.def("var",
[](Scope &self, const std::string &name) -> Variable * {
return self.NewVar(name);
return self.Var(name);
},
py::return_value_policy::reference)
.def("find_var", &Scope::FindVar, py::return_value_policy::reference)
......@@ -341,6 +343,33 @@ All parameter, weight, gradient are variables in Paddle.
self.set_stepnet(net.Clone());
});
py::class_<operators::DynamicRecurrentOp, OperatorBase>(m,
"DynamicRecurrentOp")
.def_static("create",
[](py::bytes protobin) -> operators::DynamicRecurrentOp * {
OpDesc desc;
PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
"Cannot parse user input to OpDesc");
PADDLE_ENFORCE(desc.IsInitialized(),
"User OpDesc is not initialized, reason %s",
desc.InitializationErrorString());
auto rnn_op = OpRegistry::CreateOp(desc);
return static_cast<operators::DynamicRecurrentOp *>(
rnn_op.release());
})
.def("set_stepnet",
[](operators::DynamicRecurrentOp &self, const operators::NetOp &net)
-> void { self.SetStepNet(net.Clone()); })
.def("get_state",
[](operators::DynamicRecurrentOp &self, const std::string &name)
-> const TensorArray & { return self.state(name); })
.def("get_step_input",
[](operators::DynamicRecurrentOp &self, const std::string &name)
-> const TensorArray & { return self.step_input(name); })
.def("get_step_output",
[](operators::DynamicRecurrentOp &self, const std::string &name)
-> const TensorArray & { return self.step_output(name); });
// cond_op
py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
.def_static("create",
......@@ -363,6 +392,14 @@ All parameter, weight, gradient are variables in Paddle.
self.set_falsenet(net.Clone());
});
py::class_<framework::Executor>(m, "Executor")
.def(py::init<std::vector<platform::Place> &>())
.def("run",
[](Executor &self, const ProgramDesc &program_desc, int block_id) {
framework::Scope &global_scope = GetGlobalScope();
self.Run(program_desc, &global_scope, block_id);
});
m.def("unique_integer", UniqueIntegerGenerator);
m.def("is_compile_gpu", IsCompileGPU);
......
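For context, a minimal sketch of how the new Executor binding might be driven from Python. The module path, the way `prog_desc` is obtained, and the implicit conversion of `CPUPlace` to the bound place type are assumptions; only the `Executor` constructor and `run` signature come from this diff.

import paddle.v2.framework.core as core       # assumed module path

place = core.CPUPlace()                       # CPUPlace is already exposed by the bindings
exe = core.Executor([place])                  # matches py::init<std::vector<platform::Place>&>
# prog_desc: a ProgramDesc obtained elsewhere (not shown in this diff)
exe.run(prog_desc, 0)                         # runs block 0 against the global scope (GetGlobalScope)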
......@@ -39,15 +39,18 @@ add_test(NAME test_CompareTwoNets
################ test_CompareMKLDNNandCPU ######################
if(WITH_MKLDNN)
add_unittest_without_exec(test_CompareMKLDNNandCPU
test_CompareTwoNets.cpp)
add_test(NAME test_CompareMKLDNNandCPU
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
--config_file_a=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_a=True
--config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
--use_gpu=False
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
macro(gen_command VAR_NAME CONFIG_FILE)
set(${VAR_NAME} "${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh" "-d" "${PADDLE_SOURCE_DIR}/python/"
"${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU --use_gpu=False"
"--config_file_a=trainer/tests/${CONFIG_FILE} --use_mkldnn_a=True"
"--config_file_b=trainer/tests/${CONFIG_FILE} --use_mkldnn_b=False"
"WORKING_DIRECTORY" "${PADDLE_SOURCE_DIR}/paddle/")
endmacro()
add_unittest_without_exec(test_CompareMKLDNNandCPU test_CompareTwoNets.cpp)
gen_command(compare_simple_net "sample_trainer_config_simple_net.conf")
gen_command(compare_branch_net "sample_trainer_config_branch_net.conf")
add_test(NAME test_CompareMKLDNNandCPU_simple_net COMMAND ${compare_simple_net})
add_test(NAME test_CompareMKLDNNandCPU_branch_net COMMAND ${compare_branch_net})
endif()
############### test_CompareTwoOpts ###################
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
################################### Data Configuration ###################################
TrainData(ProtoData(files = "trainer/tests/mnist.list"))
################################### Algorithm Configuration ###################################
settings(batch_size = 256,
learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
tmp = img_conv_layer(input=data,
num_channels=1,
filter_size=3,
num_filters=32,
padding=1,
shared_biases=True,
act=ReluActivation())
a1 = img_conv_layer(input=tmp,
filter_size=1,
num_filters=32,
padding=0,
shared_biases=True,
act=ReluActivation())
a2 = img_conv_layer(input=tmp,
filter_size=3,
num_filters=32,
padding=1,
shared_biases=True,
act=ReluActivation())
tmp = concat_layer(input=[a1, a2])
tmp = img_pool_layer(input=tmp,
num_channels=64,
pool_size=3,
stride=2,
padding=1,
pool_type=AvgPooling())
b1 = img_conv_layer(input=tmp,
filter_size=3,
num_filters=64,
padding=1,
shared_biases=True,
act=ReluActivation())
b1 = img_pool_layer(input=b1,
pool_size=3,
stride=1,
padding=1,
pool_type=MaxPooling())
b2 = img_conv_layer(input=tmp,
filter_size=5,
num_filters=64,
padding=2,
shared_biases=True,
act=ReluActivation())
b2 = img_pool_layer(input=b2,
pool_size=5,
stride=1,
padding=2,
pool_type=MaxPooling())
tmp = addto_layer(input=[b1, b2],
act=ReluActivation(),
bias_attr=False)
tmp = img_pool_layer(input=tmp,
pool_size=3,
stride=2,
padding=1,
pool_type=MaxPooling())
tmp = fc_layer(input=tmp, size=64,
bias_attr=False,
act=TanhActivation())
output = fc_layer(input=tmp, size=10,
bias_attr=True,
act=SoftmaxActivation())
lbl = data_layer(name ="label", size=10)
cost = classification_cost(input=output, label=lbl)
outputs(cost)
......@@ -5,7 +5,7 @@ Default scope function.
A thread-local stack of Scopes. The top of the stack is the current scope; the
bottom of the stack is the parent of all scopes.
Invoking `new_var/find_var` can create/find a variable in the current scope.
Invoking `var/find_var` can create/find a variable in the current scope.
Invoking `enter_local_scope/leave_local_scope` can create or destroy a local
scope.
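A short usage sketch of the renamed helpers, assuming the module import path below; only the function names come from this file.

from paddle.v2.framework.default_scope_funcs import (    # assumed module path
    var, find_var, enter_local_scope, leave_local_scope)

v = var("hidden")                    # create "hidden" in the current (top-of-stack) scope
assert find_var("hidden") is not None

enter_local_scope()                  # push a child scope onto the thread-local stack
tmp = var("tmp")                     # created only in the child scope
leave_local_scope()                  # pop the child scope and drop its variables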
......@@ -19,7 +19,7 @@ import threading
__tl_scope__ = threading.local()
__all__ = [
'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var',
'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'var',
'find_var', 'scoped_function'
]
......@@ -54,11 +54,11 @@ def leave_local_scope():
get_cur_scope().drop_kids()
def new_var(name):
def var(name):
"""
create variable in current scope.
"""
return get_cur_scope().new_var(name)
return get_cur_scope().var(name)
def find_var(name):
......
......@@ -219,6 +219,27 @@ class __RecurrentOp__(object):
return core.RecurrentOp.create(proto.SerializeToString())
class __DynamicRecurrentOp__(object):
__proto__ = None
type = "dynamic_recurrent"
def __init__(self):
# cache recurrent_op's proto
if self.__proto__ is None:
for op_proto in get_all_op_protos():
if op_proto.type == self.type:
self.__proto__ = op_proto
def __call__(self, *args, **kwargs):
if self.type not in args and "type" not in kwargs:
kwargs["type"] = self.type
# create proto
create_method = OpDescCreationMethod(self.__proto__)
proto = create_method(*args, **kwargs)
# create rnnop
return core.DynamicRecurrentOp.create(proto.SerializeToString())
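A hedged sketch of how this factory pairs with the DynamicRecurrentOp bindings added above; `op_kwargs`, `step_net`, and the state/input names are placeholders, since the `dynamic_recurrent` op's proto and the step-net construction are not shown in this excerpt.

rnn_op = DynamicRecurrentOp(**op_kwargs)     # builds an OpDesc proto and calls core.DynamicRecurrentOp.create
rnn_op.set_stepnet(step_net)                 # step_net: a NetOp describing one time step (cloned on the C++ side)
state = rnn_op.get_state("mem")              # "mem" is a placeholder; returns the bound TensorArray
step_in = rnn_op.get_step_input("x")         # likewise for per-step inputs and outputs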
class __CondOp__(object):
__proto__ = None
type = "cond"
......@@ -242,4 +263,5 @@ class __CondOp__(object):
Operator = OperatorFactory() # The default global factory
RecurrentOp = __RecurrentOp__()
DynamicRecurrentOp = __DynamicRecurrentOp__()
CondOp = __CondOp__()
......@@ -14,7 +14,7 @@ def create_op(scope, op_type, inputs, outputs, attrs):
kwargs = dict()
def __create_var__(name, var_name):
scope.new_var(var_name)
scope.var(var_name)
kwargs[name].append(var_name)
for in_name, in_dup in Operator.get_op_inputs(op_type):
......@@ -71,7 +71,7 @@ def set_input(scope, op, inputs, place):
def set_output_grad(scope, op, outputs, place):
def __set_tensor__(name):
out_tensor = scope.find_var(name).get_tensor()
grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
grad_tensor = scope.var(grad_var_name(name)).get_tensor()
out_dtype = out_tensor.dtype()
if out_dtype == core.DataType.FP64:
data = np.ones(out_tensor.shape(), dtype=np.float64)
......@@ -169,10 +169,10 @@ def get_numeric_gradient(scope,
def get_backward_op(scope, op, no_grad_set):
backward_op = core.Operator.backward(op, no_grad_set)
for input in backward_op.input_vars():
var = scope.new_var(input)
var = scope.var(input)
var.get_tensor()
for output in backward_op.output_vars():
var = scope.new_var(output)
var = scope.var(output)
var.get_tensor()
return backward_op
......
......@@ -384,5 +384,33 @@ class TestThresholdedRelu(OpTest):
self.check_grad(['X'], 'Y', max_relative_error=self.relative_error)
class TestHardSigmoid(OpTest):
def setUp(self):
self.op_type = "hard_sigmoid"
self.relative_error = 0.002
X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
slope = 0.2
offset = 0.5
lower_threshold = -offset / slope
upper_threshold = (1 - offset) / slope
self.inputs = {'X': X}
# Same reason as TestAbs: shift inputs away from the two kink points
# (lower_threshold and upper_threshold), where the numeric gradient check is unreliable.
X[np.abs(X - lower_threshold) < self.relative_error] = \
lower_threshold + 0.2
X[np.abs(X - upper_threshold) < self.relative_error] = \
upper_threshold - 0.2
temp = X * slope + offset
self.outputs = {'Y': np.maximum(0.0, np.minimum(1.0, temp))}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Y', max_relative_error=0.002)
if __name__ == "__main__":
unittest.main()
This diff is collapsed.
......@@ -39,7 +39,7 @@ class PySimpleCondTest(unittest.TestCase):
def create_tensor(scope, name, shape, np_data):
tensor = scope.new_var(name).get_tensor()
tensor = scope.var(name).get_tensor()
tensor.set_dims(shape)
tensor.set(np_data, core.CPUPlace())
return tensor
......@@ -74,9 +74,9 @@ class TestCondOp(unittest.TestCase):
create_tensor(self.scope, "X", [10, 1], x_np_data)
cond_np_data = self.py_cond.cond.astype("int32")
create_tensor(self.scope, "cond", [10, 1], cond_np_data)
self.scope.new_var("SubScopes")
self.scope.new_var("IndexTensors")
self.scope.new_var("Out")
self.scope.var("SubScopes")
self.scope.var("IndexTensors")
self.scope.var("Out")
def create_cond_op(self):
self.condop = CondOp(
......
import unittest
import numpy as np
from op_test import OpTest
class TestDecayedAdagradOp1(OpTest):
''' Test DecayedAdagrad operator with explicit attributes
'''
def setUp(self):
self.op_type = "decayed_adagrad"
param = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32")
lr = 0.01
decay = 0.80
epsilon = 1e-8
self.inputs = {
'Param': param,
'Grad': grad,
'Moment': moment,
'LearningRate': np.array([lr]).astype("float32")
}
self.attrs = {'decay': decay, 'epsilon': epsilon}
moment_out = decay * moment + (1 - decay) * grad * grad
param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
def test_check_output(self):
self.check_output()
class TestDecayedAdagradOp2(OpTest):
''' Test DecayedAdagrad operator with default attributes
'''
def setUp(self):
self.op_type = "decayed_adagrad"
param = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32")
lr = 0.01
decay = 0.95
epsilon = 1e-6
self.inputs = {
'Param': param,
'Grad': grad,
'Moment': moment,
'LearningRate': np.array([lr]).astype("float32")
}
self.attrs = {'decay': decay, 'epsilon': epsilon}
moment_out = decay * moment + (1 - decay) * grad * grad
param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
......@@ -10,7 +10,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):
self.assertIsNone(find_var("test"))
def test_create_var_get_var(self):
var_a = new_var("var_a")
var_a = var("var_a")
self.assertIsNotNone(var_a)
self.assertIsNotNone(get_cur_scope().find_var('var_a'))
enter_local_scope()
......@@ -19,7 +19,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):
def test_var_get_int(self):
def __new_scope__():
i = new_var("var_i")
i = var("var_i")
self.assertFalse(i.is_int())
i.set_int(10)
self.assertTrue(i.is_int())
......
......@@ -14,7 +14,7 @@ class TestGaussianRandomOp(unittest.TestCase):
def gaussian_random_test(self, place):
scope = core.Scope()
scope.new_var('Out').get_tensor()
scope.var('Out').get_tensor()
op = Operator(
"gaussian_random",
......
This diff is collapsed.
This diff is collapsed.