diff --git a/CMakeLists.txt b/CMakeLists.txt
index b174831109372cb014741d63032fa6a470e74042..c7d743e193e7d32dbc0b56f3bcb05b6c61f85f1d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 534be0abe246ac70950d85ad05441825c8ca768a..41b9b5928958ae31799c396a8d77fd7cff557905 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
     endif()
     
     # cpplint code style
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+    foreach(source_file ${cc_library_SRCS})
+      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+      endif()
+    endforeach()
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
 
   else(cc_library_SRCS)
     if (cc_library_DEPS)
@@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
         target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
       endif()
+      # cpplint code style: lint each source plus its same-named .h, if present
+      foreach(source_file ${nv_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
     else(nv_library_SRCS)
       if (nv_library_DEPS)
         merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 4b06966fba2bc9f92756be0cb8110bbcd5272423..f8a88cf317aee6c5dd25e4cc25d588c6c50fcbce 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -15,7 +15,6 @@ if(Boost_FOUND)
   add_subdirectory(platform)
   add_subdirectory(framework)
   add_subdirectory(operators)
-  add_subdirectory(pybind)
 endif()
 
 if(WITH_C_API)
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 12a3a00bba35d476fca9c9fb47ac20b87e6f53f2..9c39430835d37d5dfbe4031f29e5a6216ed8b67f 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -31,8 +31,14 @@ py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 
-cc_library(net SRCS net.cc DEPS op_registry)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net)
-
-cc_library(backward SRCS backward.cc DEPS net)
+cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
+cc_library(paddle_pybind SHARED
+    SRCS pybind.cc
+    DEPS pybind python backward
+	fc_op
+	sgd_op
+	add_op
+	mean_op
+	cross_entropy_op
+	recurrent_op)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index e784bb2b7d7ddd087a75371e508840e15c952473..c034e265fe4837ca22ab969b0e6952677904e05c 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -14,8 +14,8 @@
 
 #include "paddle/framework/backward.h"
 #include <list>
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace framework {
@@ -32,7 +32,7 @@ static bool AllInSet(const std::vector<std::string>& names,
 }
 
 static std::shared_ptr<OperatorBase> NOP() {
-  auto net_op = std::make_shared<NetOp>();
+  auto net_op = std::make_shared<operators::NetOp>();
   net_op->type_ = "@NOP@";
   net_op->CompleteAddOp();
   return net_op;
@@ -42,9 +42,9 @@ static std::shared_ptr<OperatorBase> NOP() {
 //
 //  no_grad_names the gradient variable names without gradient calculating.
 //
-//  uniq_id is a unique index used inside recursively calling BackwardRecursive.
-//  use `uid = uniq_id++;` to get the unique index, and pass `uniq_id` through
-//  recursive calling.
+//  uniq_id is a unique index shared across the recursive calls of
+//  BackwardRecursive. Use `uid = uniq_id++;` to get the unique index, and
+//  pass `uniq_id` through the recursive calls.
 //
 //  returns The backward operator. For simple situation, it is a simple
 //  operator. For complex situation, it is a NetOp.
@@ -64,8 +64,8 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
     return NOP();
   }
 
-  //  All output gradients of forwarding operator do not need to calculate. Then
-  //  all input gradients cannot be computed at all, and we put them into
+  //  None of the forward operator's output gradients need to be calculated.
+  //  Then all input gradients cannot be computed at all, and we put them into
   //  `no_grad_names` set. Return an NOP.
   if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(),
                no_grad_names)) {
@@ -77,14 +77,14 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
   }
 
   // Returned gradient network
-  auto net = std::make_shared<NetOp>();
+  auto net = std::make_shared<operators::NetOp>();
 
   if (forwardOp.IsNetOp()) {
     // Because forwardOp is a net op, it can static_cast.
-    auto& forwardNet = static_cast<const NetOp&>(forwardOp);
+    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
 
-    // Map from output gradient variable name to operator's indices in backward
-    // net. That operator generates that variable.
+    // Map from an output gradient variable name to the indices of the
+    // operators in the backward net that generate that variable.
     std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
 
     size_t local_op_id = 0;
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index b095c2c3d5dbf21b5ea70e17475a4aaad9b1db44..8f437e68041188831a17217099e0b0c96432cda4 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -15,8 +15,9 @@
 #include "paddle/framework/backward.h"
 
 #include <gtest/gtest.h>
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/operators/type_alias.h"
 
 namespace paddle {
 namespace framework {
@@ -70,7 +71,7 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class FcOp : public NetOp {
+class FcOp : public ops::NetOp {
  public:
   void Init() override {
     AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
@@ -161,8 +162,8 @@ TEST(Backward, simple_op_grad) {
   auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
   ASSERT_NE(fwd, nullptr);
   auto gop = f::OpRegistry::CreateGradOp(*fwd);
-  ASSERT_EQ(1UL, gop->inputs_.size());
-  ASSERT_EQ("Out" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->inputs_[0]);
+  ASSERT_EQ(4UL, gop->inputs_.size());
+  ASSERT_EQ(f::OperatorBase::EMPTY_VAR_NAME(), gop->inputs_[0]);
   ASSERT_EQ("rowwise_add_grad", gop->type_);
   ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[0]);
   ASSERT_EQ("b" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[1]);
@@ -182,7 +183,8 @@ TEST(Backward, simple_op_not_need_grad) {
   auto no_input_gop = f::Backward(*fwd, {"X", "b"});
   ASSERT_NE(no_input_gop, nullptr);
   ASSERT_TRUE(no_input_gop->IsNetOp());
-  ASSERT_EQ(0UL, std::static_pointer_cast<f::NetOp>(no_input_gop)->ops_.size());
+  ASSERT_EQ(0UL,
+            std::static_pointer_cast<ops::NetOp>(no_input_gop)->ops_.size());
 }
 
 TEST(Backward, net_fc_backward_normal) {
@@ -191,7 +193,7 @@ TEST(Backward, net_fc_backward_normal) {
   ASSERT_NE(fwd, nullptr);
   std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
   ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<f::NetOp *>(gop.get());
+  auto net = static_cast<ops::NetOp *>(gop.get());
 
   ASSERT_NO_THROW(net->DebugString());
 
@@ -214,7 +216,7 @@ TEST(Backward, net_fc_backward_not_have_b) {
   ASSERT_NE(fwd, nullptr);
   std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
   ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<f::NetOp *>(gop.get());
+  auto net = static_cast<ops::NetOp *>(gop.get());
 
   ASSERT_NO_THROW(net->DebugString());
 
@@ -228,7 +230,7 @@ TEST(Backward, net_fc_backward_not_have_b) {
 }
 
 TEST(Backward, net_input_of_network_not_need_grad) {
-  f::NetOp net;
+  ops::NetOp net;
   net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"},
                                     {"mul_tmp_0", "add_tmp_0", "hidden0"}, {}));
   net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"},
@@ -236,7 +238,7 @@ TEST(Backward, net_input_of_network_not_need_grad) {
   net.CompleteAddOp();
   auto bwd = Backward(net, {"X"});  // X@GRAD is not need.
   ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
 
   std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
       bwd_net->outputs_.begin(), bwd_net->outputs_.end());
@@ -253,7 +255,7 @@ TEST(Backward, net_input_of_network_not_need_grad) {
 
   ASSERT_EQ(2UL, bwd_net->ops_.size());
   ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
-  auto first_fc_grad = static_cast<f::NetOp *>(bwd_net->ops_[1].get());
+  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
   ASSERT_EQ(3UL, first_fc_grad->ops_.size());
   ASSERT_EQ(
       f::OperatorBase::EMPTY_VAR_NAME(),
@@ -261,14 +263,14 @@ TEST(Backward, net_input_of_network_not_need_grad) {
 }
 
 TEST(Backward, net_shared_weight) {
-  f::NetOp net;
+  ops::NetOp net;
   net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {}));
   net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {}));
   net.CompleteAddOp();
 
   auto bwd = f::Backward(net, {});
   ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
   ASSERT_EQ(3UL, bwd_net->ops_.size());
   ASSERT_EQ("add", bwd_net->ops_[2]->type_);
 }
@@ -285,7 +287,7 @@ TEST(Backward, op_all_input_are_not_need) {
   auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
   auto backward = f::Backward(*fwd, {"X", "b"});
   ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<f::NetOp *>(backward.get());
+  auto net = static_cast<ops::NetOp *>(backward.get());
   ASSERT_TRUE(net->ops_.empty());
 }
 
@@ -293,7 +295,7 @@ TEST(Backward, op_all_output_are_not_need) {
   auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
   auto backward = f::Backward(*fwd, {"Out"});
   ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<f::NetOp *>(backward.get());
+  auto net = static_cast<ops::NetOp *>(backward.get());
   ASSERT_TRUE(net->ops_.empty());
 }
 
@@ -301,7 +303,7 @@ TEST(Backward, op_part_of_output_are_not_need) {
   auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {});
   auto backward = f::Backward(*fwd, {"Z"});
   ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<f::NetOp *>(backward.get());
+  auto net = static_cast<ops::NetOp *>(backward.get());
   ASSERT_EQ(net->ops_.size(), 2UL);
 
   auto &fill_zero = *net->ops_[0];
@@ -341,7 +343,7 @@ TEST(Backward, op_part_of_input_are_not_need) {
 }
 
 TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
-  f::NetOp net;
+  ops::NetOp net;
   net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"},
                                     {"mul_out1", "add_out1", "out1"}, {}));
   net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"},
@@ -351,14 +353,13 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
   net.CompleteAddOp();
   auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
   ASSERT_TRUE(backward->IsNetOp());
-  auto bwd_net = static_cast<f::NetOp *>(backward.get());
+  auto bwd_net = static_cast<ops::NetOp *>(backward.get());
   ASSERT_EQ(bwd_net->ops_.size(), 3UL);
   auto &grad_fc = *bwd_net->ops_[0];
   EXPECT_EQ(grad_fc.inputs_.size(),
             3UL       /* external input number */
                 + 1UL /* external output number*/
                 + 1UL /* number of gradient of external output*/
-                - 1UL /*ignoreGradient varable number*/
                 + 2U /* internal variable number*/);
   EXPECT_EQ(grad_fc.outputs_.size(), 2UL       /* input number of mul*/
                                          + 2UL /* input number of rowwise_add */
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 9fcc657edcd5459d0a42a64d708603a4bcd53cf0..5aa5af0c19be5a209c760282cb1a090fc57a53ad 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -25,18 +25,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-namespace {
-typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
-                       Dim<8>, Dim<9>>
-    DDimVar;
-}
-
 /**
  * \brief A dynamically sized dimension.
  *
  * The number of dimensions must be between [1, 9].
  */
 struct DDim {
+  typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
+                         Dim<8>, Dim<9>>
+      DDimVar;
   DDimVar var;
 
   DDim() : var(Dim<1>()) {}
diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc
index dd686cc78246f06cdc3ec7d013086863d7e8fac0..ea5e939c6e26514c2f3c515da5581b29103f75b6 100644
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -8,107 +8,97 @@ You may obtain a copy of the License at
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+express or implied. See the License for the specific language governing
+permissions and limitations under the License. */
 
 #include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace framework {
 
-OperatorBase* GradOpBuilder::Build() {
-  BuildOpInOutArgList();
-  std::string grad_op_type = OpRegistry::grad_ops().at(op_.type_);
-  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
-  grad_op->type_ = grad_op_type;
-  CompleteGradOp(grad_op);
-  return grad_op;
-}
+class OpRegistry;
+
+using VarIndexMap = std::unordered_map<std::string, int>;
 
-OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var,
-                                    const VarIndexMap& var_map,
-                                    const std::vector<int>& format,
-                                    InOutType type) {
-  int idx = var_map.at(var.name());
-  int begin_idx = format.empty() ? idx : format.at(idx);
-  int end_idx = format.empty() ? idx + 1 : format.at(idx + 1);
-  return new OpInOutArg(var.name(), type, !var.ignore_gradient(), begin_idx,
-                        end_idx);
+enum class OpArgType { IN, OUT };
+
+static std::vector<int>* GetOpFormat(OperatorBase* op, const OpArgType& type) {
+  std::string key = type == OpArgType::IN ? "input_format" : "output_format";
+  return op->attrs_.count(key)
+             ? &boost::get<std::vector<int>>(op->attrs_.at(key))
+             : nullptr;
 }
 
-void GradOpBuilder::BuildOpInOutArgList() {
-  const OpProto& op_proto = OpRegistry::protos().at(op_.type_);
-  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_.type_));
-  const std::vector<int>& in_format =
-      op_.attrs_.count("input_format")
-          ? op_.GetAttr<std::vector<int>>("input_format")
-          : std::vector<int>();
-  const std::vector<int>& out_format =
-      op_.attrs_.count("output_format")
-          ? op_.GetAttr<std::vector<int>>("output_format")
-          : std::vector<int>();
-  for (const auto& var : op_proto.inputs()) {
-    arg_list_.emplace_back(
-        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, in_format, IN)));
-  }
-  for (const auto& var : op_proto.outputs()) {
-    arg_list_.emplace_back(
-        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, out_format, OUT)));
-  }
+static const std::vector<int>* GetOpFormat(const OperatorBase* op,
+                                           const OpArgType& type) {
+  std::string key = type == OpArgType::IN ? "input_format" : "output_format";
+  return op->attrs_.count(key)
+             ? &boost::get<std::vector<int>>(op->attrs_.at(key))
+             : nullptr;
 }
 
-void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
-                                     std::vector<std::string>& in_out,
-                                     std::vector<int>& format,
-                                     VarIndexMap* varmap, int& idx,
-                                     bool is_grad) const {
-  std::string var_name = arg->proto_name_;
-  if (is_grad) {
-    var_name += OperatorBase::GRAD_VAR_SUFFIX();
-  }
-  (*varmap)[var_name] = idx++;
-  size_t pre_sz = in_out.size();
-  auto base_it = arg->type_ == IN ? op_.inputs_.begin() : op_.outputs_.begin();
-  std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_,
-            std::back_inserter(in_out));
-  if (is_grad) {
-    for (size_t i = pre_sz; i < in_out.size(); ++i) {
-      in_out[i] += OperatorBase::GRAD_VAR_SUFFIX();
+static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
+                       const OpArgType& src_type, const OpArgType& dst_type,
+                       int& idx, bool is_grad) {
+  const std::vector<std::string>& src_inout =
+      src_type == OpArgType::IN ? src_op->inputs_ : src_op->outputs_;
+  const std::vector<int>* src_format = GetOpFormat(src_op, src_type);
+
+  std::vector<std::string>& dst_inout =
+      dst_type == OpArgType::IN ? dst_op->inputs_ : dst_op->outputs_;
+  std::vector<int>* dst_format = GetOpFormat(dst_op, dst_type);
+  const OpProto& proto = OpRegistry::protos().at(src_op->type_);
+  const auto& src_arg_list =
+      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
+
+  for (const auto& arg : src_arg_list) {
+    std::string src_name = arg.name();
+    std::string dst_name =
+        is_grad ? src_name + OperatorBase::GRAD_VAR_SUFFIX() : src_name;
+    (*dst_op->in_out_idxs_)[dst_name] = idx++;
+    int src_arg_idx = src_op->in_out_idxs_->at(src_name);
+    int src_begin =
+        src_format == nullptr ? src_arg_idx : src_format->at(src_arg_idx);
+    int src_end = src_format == nullptr ? src_arg_idx + 1
+                                        : src_format->at(src_arg_idx + 1);
+    for (int i = src_begin; i < src_end; ++i) {
+      std::string s = is_grad ? src_inout[i] + OperatorBase::GRAD_VAR_SUFFIX()
+                              : arg.ignore_gradient()
+                                    ? OperatorBase::EMPTY_VAR_NAME()
+                                    : src_inout[i];
+      dst_inout.emplace_back(s);
+    }
+    if (dst_format != nullptr) {
+      dst_format->push_back(dst_inout.size());
     }
   }
-  format.push_back(in_out.size());
 }
 
-void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const {
-  grad_op->attrs_ = op_.attrs_;
+OperatorBase* BuildGradOp(const OperatorBase* op) {
+  std::string grad_op_type = OpRegistry::grad_ops().at(op->type_);
+  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
+  grad_op->type_ = grad_op_type;
+  grad_op->attrs_ = op->attrs_;
   grad_op->attrs_.erase("input_format");
   grad_op->attrs_.erase("output_format");
-  VarIndexMap* grad_varmap = new VarIndexMap();
+  if (GetOpFormat(op, OpArgType::IN) != nullptr) {
+    grad_op->attrs_["output_format"] = std::vector<int>({0});
+  }
+  if (GetOpFormat(op, OpArgType::IN) != nullptr ||
+      GetOpFormat(op, OpArgType::OUT) != nullptr) {
+    grad_op->attrs_["input_format"] = std::vector<int>({0});
+  }
+  grad_op->in_out_idxs_.reset(new VarIndexMap());
   int in_idx = 0;
   int out_idx = 0;
-  std::vector<int> in_format({0});
-  std::vector<int> out_format({0});
-  for (const auto& arg : arg_list_) {
-    // op_'s inputs_ and outputs_
-    if (arg->needed_in_grad_) {
-      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
-                       in_idx, false);
-    }
-    if (arg->type_ == IN) {
-      // gradients of op_'s inputs_
-      AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap,
-                       out_idx, true);
-    } else {
-      // gradients of op_'s outputs_
-      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
-                       in_idx, true);
-    }
-  }
-  grad_op->attrs_["input_format"] = in_format;
-  grad_op->attrs_["output_format"] = out_format;
-  grad_op->in_out_idxs_.reset(grad_varmap);
+  TransOpArg(op, grad_op, OpArgType::IN, OpArgType::IN, in_idx, false);   // I
+  TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, false);  // G
+  TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, true);   // OG
+  TransOpArg(op, grad_op, OpArgType::IN, OpArgType::OUT, out_idx, true);  // IG
+  return grad_op;
 }
 
 }  // namespace framework
diff --git a/paddle/framework/grad_op_builder.h b/paddle/framework/grad_op_builder.h
index cc7a76f3726e00a08fbe06bca4c9b9f5bad466b4..998f8ebbb5f2f4fb8b7e938b5916afd0f8a7930d 100644
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/grad_op_builder.h
@@ -1,48 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once
 
-#include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/operator.h"
 
 namespace paddle {
 namespace framework {
-class OpRegistry;
-
-enum InOutType { IN, OUT };
-
-struct OpInOutArg {
-  OpInOutArg(const std::string& proto_name, const InOutType& type,
-             bool needed_in_grad, size_t begin_idx, size_t end_idx)
-      : proto_name_(proto_name),
-        type_(type),
-        needed_in_grad_(needed_in_grad),
-        begin_idx_(begin_idx),
-        end_idx_(end_idx) {}
-
-  std::string proto_name_;
-  InOutType type_;
-  bool needed_in_grad_;
-  size_t begin_idx_;
-  size_t end_idx_;
-};
-
-class GradOpBuilder {
-  using VarIndexMap = std::unordered_map<std::string, int>;
-
- public:
-  GradOpBuilder(const OperatorBase& op) : op_(op) {}
-  OperatorBase* Build();
-
- private:
-  OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map,
-                       const std::vector<int>& format, InOutType type);
-  void BuildOpInOutArgList();
-  void AddArgIntoGradOp(const OpInOutArg* arg, std::vector<std::string>& in_out,
-                        std::vector<int>& format, VarIndexMap* varmap, int& idx,
-                        bool is_grad) const;
-  void CompleteGradOp(OperatorBase* grad_op) const;
-  const OperatorBase& op_;
-  std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
-};
+
+OperatorBase* BuildGradOp(const OperatorBase* op);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
index e9cf3b9798db2cbfb8d26259ae9a6741fbae8278..96d7f309d67b15c000ab8ce3769931322fbca880 100644
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -8,10 +8,49 @@ USE_OP(add_two);
 namespace paddle {
 namespace framework {
 
+class NOP : public OperatorBase {
+ public:
+  void InferShape(const Scope &scope) const override {}
+  void Run(const Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {}
+};
+
+class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("In1", "a single input");
+    AddInput("In2_mult", "a multiple input").SetMultiple();
+    AddInput("In3", "another single input");
+    AddOutput("Out1", "a single output");
+    AddOutput("Out2_mult", "a multiple output").SetMultiple();
+    AddComment("test op with multiple inputs and outputs");
+  }
+};
+
+class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("In1", "a single input");
+    AddInput("In2_mult", "a multiple input").SetMultiple().IgnoreGradient();
+    AddInput("In3_mult", "another multiple input").SetMultiple();
+    AddOutput("Out1_mult", "a multiple output").SetMultiple();
+    AddOutput("Out2", "a single output").IgnoreGradient();
+    AddComment("op with inputs and outputs ignored in gradient calculating");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+namespace f = paddle::framework;
+
 TEST(GradOpBuilder, AddTwo) {
-  std::shared_ptr<OperatorBase> add_op(
-      OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
-  std::shared_ptr<OperatorBase> grad_add_op = OpRegistry::CreateGradOp(*add_op);
+  std::shared_ptr<f::OperatorBase> add_op(
+      f::OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
+  std::shared_ptr<f::OperatorBase> grad_add_op =
+      f::OpRegistry::CreateGradOp(*add_op);
   EXPECT_EQ(static_cast<int>(grad_add_op->inputs_.size()), 4);
   EXPECT_EQ(static_cast<int>(grad_add_op->outputs_.size()), 2);
   EXPECT_EQ(grad_add_op->Input("X"), "x");
@@ -22,5 +61,85 @@ TEST(GradOpBuilder, AddTwo) {
   EXPECT_EQ(grad_add_op->Output("Y@GRAD"), "y@GRAD");
 }
 
-}  // namespace framework
-}  // namespace paddle
\ No newline at end of file
+REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker);
+REGISTER_GRADIENT_OP(mult_io, mult_io_grad, f::NOP);
+REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker);
+REGISTER_GRADIENT_OP(io_ignored, io_ignored_grad, f::NOP);
+
+TEST(GradOpBuilder, MutiInOut) {
+  f::AttributeMap attrs{{"input_format", std::vector<int>{0, 1, 4, 5}},
+                        {"output_format", std::vector<int>{0, 1, 3}}};
+  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
+      "mult_io", {"in1", "in2_1", "in2_2", "in2_3", "in3"},
+      {"out1", "out2_1", "out2_2"}, attrs));
+  std::shared_ptr<f::OperatorBase> grad_test_op =
+      f::OpRegistry::CreateGradOp(*test_op);
+
+  ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
+  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
+  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
+            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
+  EXPECT_EQ(grad_test_op->Input("In3"), "in3");
+  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
+  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
+            std::vector<std::string>({"out2_1", "out2_2"}));
+  EXPECT_EQ(grad_test_op->Input("Out1" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "out1" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(
+      grad_test_op->Inputs("Out2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+      std::vector<std::string>(
+          {"out2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+           "out2_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
+
+  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
+  EXPECT_EQ(grad_test_op->Output("In1" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "in1" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(
+      grad_test_op->Outputs("In2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+      std::vector<std::string>({"in2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+                                "in2_2" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+                                "in2_3" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
+  EXPECT_EQ(grad_test_op->Output("In3" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "in3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+}
+
+TEST(GradOpBuilder, IOIgnoredInGradient) {
+  f::AttributeMap attrs{{"input_format", std::vector<int>{0, 1, 3, 5}},
+                        {"output_format", std::vector<int>{0, 2, 3}}};
+  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
+      "io_ignored", {"in1", "in2_1", "in2_2", "in3_1", "in3_2"},
+      {"out1_1", "out1_2", "out2"}, attrs));
+  std::shared_ptr<f::OperatorBase> grad_test_op =
+      f::OpRegistry::CreateGradOp(*test_op);
+
+  // 'In2_mult' and 'Out2' are ignored in gradient calculating
+  ASSERT_EQ(grad_test_op->inputs_.size(), 5UL + 3UL + 3UL);
+  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
+  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
+            std::vector<std::string>({f::OperatorBase::EMPTY_VAR_NAME(),
+                                      f::OperatorBase::EMPTY_VAR_NAME()}));
+  EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
+            std::vector<std::string>({"in3_1", "in3_2"}));
+  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
+            std::vector<std::string>({"out1_1", "out1_2"}));
+  EXPECT_EQ(grad_test_op->Input("Out2"), f::OperatorBase::EMPTY_VAR_NAME());
+  EXPECT_EQ(
+      grad_test_op->Inputs("Out1_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+      std::vector<std::string>(
+          {"out1_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+           "out1_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
+  EXPECT_EQ(grad_test_op->Input("Out2" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "out2" + f::OperatorBase::GRAD_VAR_SUFFIX());
+
+  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
+  EXPECT_EQ(grad_test_op->Output("In1" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "in1" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(
+      grad_test_op->Outputs("In2_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+      std::vector<std::string>({"in2_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+                                "in2_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
+  EXPECT_EQ(
+      grad_test_op->Outputs("In3_mult" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+      std::vector<std::string>({"in3_1" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+                                "in3_2" + f::OperatorBase::GRAD_VAR_SUFFIX()}));
+}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 3e72e391266066de9e4114e68b43b066c15254db..1af894612c526fd57b5b6f1d26d934aac27493a9 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -306,8 +306,7 @@ class OpRegistry {
   static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
     PADDLE_ENFORCE(!op.IsNetOp(),
                    "Use framework::Backward to get backward ops");
-    GradOpBuilder builder(op);
-    std::shared_ptr<OperatorBase> grad_op(builder.Build());
+    std::shared_ptr<OperatorBase> grad_op(BuildGradOp(&op));
     grad_op->Init();
     return grad_op;
   }
@@ -315,7 +314,7 @@ class OpRegistry {
   static std::unordered_map<std::string, OpProto>& protos() {
     static std::unordered_map<std::string, OpProto> protos_;
     return protos_;
-  };
+  }
 
   static std::unordered_map<std::string, std::string>& grad_ops() {
     static std::unordered_map<std::string, std::string> grad_ops_;
@@ -337,7 +336,7 @@ class OpRegistry {
   static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
     static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
     return op_checkers_;
-  };
+  }
 
   static void GenerateTempVariableName(OperatorBase* op) {
     static std::atomic<size_t> gUniqId(0UL);
@@ -354,7 +353,7 @@ class OpRegistry {
 template <typename OpType, typename ProtoMakerType>
 class OpRegisterHelper {
  public:
-  OpRegisterHelper(const char* op_type) {
+  explicit OpRegisterHelper(const char* op_type) {
     OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
   }
 };
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 55435103489ace11868eed61c38018d8ba357e65..fbf9113e5677080e6573ba3ff1e1deb36a2889e9 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -55,6 +55,10 @@ class OperatorBase {
   /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
   static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
 
+  static std::string GRAD_VAR_NAME(const std::string& name) {
+    return name + GRAD_VAR_SUFFIX();
+  }
+
   /// Variables with this suffix are supposed to be filled up with zeros.
   static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }
 
@@ -280,7 +284,7 @@ class OperatorWithKernel : public OperatorBase {
     platform::Place place_;
 
     OpKernelKey() = default;
-    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+    explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
       place_ = dev_ctx.GetPlace();
     }
 
diff --git a/paddle/pybind/pybind.cc b/paddle/framework/pybind.cc
similarity index 59%
rename from paddle/pybind/pybind.cc
rename to paddle/framework/pybind.cc
index 40ff164497f627c0b562b6d33bfb4bec590e4c85..b9889e483e27e9dad3310a34b5306f073ad1887d 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,19 +17,19 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/backward.h"
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
+#include "paddle/framework/tensor_py.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/operators/type_alias.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
-#include "paddle/pybind/tensor_bind.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 
 namespace py = pybind11;
-namespace pd = paddle::framework;
 
 USE_OP(add_two);
 USE_OP(onehot_cross_entropy);
@@ -41,17 +41,18 @@ USE_OP(sigmoid);
 USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP_WITHOUT_KERNEL(recurrent_op);
-
+namespace paddle {
+namespace framework {
 template <typename ClassType>
-void ExposeOperator(ClassType& m) {
+void ExposeOperator(ClassType &m) {
   m.def("infer_shape", &ClassType::type::InferShape)
       .def("run", &ClassType::type::Run)
       .def("type",
-           [](const typename ClassType::type& op) -> std::string {
+           [](const typename ClassType::type &op) -> std::string {
              return op.type_;
            })
       .def("outputs",
-           [](const typename ClassType::type& op) -> std::vector<std::string> {
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
              return op.outputs_;
            })
       .def("__str__", &ClassType::type::DebugString);
@@ -73,80 +74,81 @@ bool IsCompileGPU() {
 PYBIND11_PLUGIN(core) {
   py::module m("core", "C++ core of PaddlePaddle");
 
-  py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
-      .def_buffer([](pd::Tensor& self) -> py::buffer_info {
-        return paddle::pybind::CastToPyBuffer(self);
-      })
+  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
+      .def_buffer(
+          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
       .def("get_dims",
-           [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
+           [](const Tensor &self) { return vectorize(self.dims()); })
       .def("set_dims",
-           [](pd::Tensor& self, const std::vector<int>& dim) {
-             self.Resize(pd::make_ddim(dim));
+           [](Tensor &self, const std::vector<int> &dim) {
+             self.Resize(make_ddim(dim));
            })
       .def("alloc_float",
-           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+           [](Tensor &self, paddle::platform::GPUPlace &place) {
              self.mutable_data<float>(place);
            })
       .def("alloc_float",
-           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<float>(place);
            })
       .def("alloc_int",
-           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<int>(place);
            })
       .def("alloc_int",
-           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+           [](Tensor &self, paddle::platform::GPUPlace &place) {
              self.mutable_data<int>(place);
            })
-      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
+      .def("set", PyCPUTensorSetFromArray<float>)
+      .def("set", PyCPUTensorSetFromArray<int>)
 #ifndef PADDLE_ONLY_CPU
-      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
-      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
+      .def("set", PyCUDATensorSetFromArray<float>)
+      .def("set", PyCUDATensorSetFromArray<int>)
 #endif
-      .def("shape",
-           [](pd::Tensor& self) { return pd::vectorize(self.dims()); });
+      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
+      .def("set_float_element",
+           [](Tensor &self, size_t offset, float f) {
+             // TODO(yuyang18): Only support CPU now.
+             self.data<float>()[offset] = f;
+           })
+      .def("get_float_element", [](Tensor &self, size_t offset) -> float {
+        // TODO(yuyang18): Only support CPU now.
+        return self.data<float>()[offset];
+      });
 
-  py::class_<pd::Variable>(m, "Variable", R"DOC(Variable Class.
+  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
 
 All parameter, weight, gradient are variables in Paddle.
 )DOC")
-      .def("is_int", [](const pd::Variable& var) { return var.IsType<int>(); })
+      .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
       .def("set_int",
-           [](pd::Variable& var, int val) -> void {
-             *var.GetMutable<int>() = val;
-           })
-      .def("get_int",
-           [](const pd::Variable& var) -> int { return var.Get<int>(); })
+           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
+      .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
       .def("get_tensor",
-           [](pd::Variable& self) -> pd::Tensor* {
-             return self.GetMutable<pd::Tensor>();
-           },
+           [](Variable &self) -> Tensor * { return self.GetMutable<Tensor>(); },
            py::return_value_policy::reference)
       .def("get_net",
-           [](pd::Variable& self) -> pd::NetOp* {
-             return self.GetMutable<pd::NetOp>();
+           [](Variable &self) -> ops::NetOp * {
+             return self.GetMutable<ops::NetOp>();
            },
            py::return_value_policy::reference);
 
-  py::class_<pd::Scope>(m, "Scope", "")
+  py::class_<Scope>(m, "Scope", "")
       .def("new_var",
-           [](pd::Scope& self, const std::string& name) -> pd::Variable* {
+           [](Scope &self, const std::string &name) -> Variable * {
              return self.NewVar(name);
            },
            py::return_value_policy::reference)
-      .def("find_var", &pd::Scope::FindVar, py::return_value_policy::reference)
+      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
       .def(py::init<>())
-      .def("new_scope",
-           [](pd::Scope& self) -> pd::Scope* { return &self.NewScope(); },
+      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
            py::return_value_policy::reference)
-      .def("drop_kids", &pd::Scope::DropKids);
+      .def("drop_kids", &Scope::DropKids);
 
   //! @note: Be careful! PyBind will return std::string as an unicode, not
   //! Python str. If you want a str object, you should cast them in Python.
   m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
-    auto& protos = pd::OpRegistry::protos();
+    auto &protos = OpRegistry::protos();
     std::vector<py::bytes> ret_values;
     for (auto it = protos.begin(); it != protos.end(); ++it) {
       PADDLE_ENFORCE(it->second.IsInitialized(),
@@ -161,8 +163,8 @@ All parameter, weight, gradient are variables in Paddle.
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
-      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
-      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+      .def("empty", OperatorBase::EMPTY_VAR_NAME)
+      .def("temp", OperatorBase::TMP_VAR_NAME);
   // clang-format off
   py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
       .def_static("create",
@@ -185,43 +187,45 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
 
-  py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
+  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
       m, "Operator");
 
   operator_base.def_static("create", [](py::bytes protobin) {
-    pd::OpDesc desc;
+    OpDesc desc;
     PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                    "Cannot parse user input to OpDesc");
     PADDLE_ENFORCE(desc.IsInitialized(),
                    "User OpDesc is not initialized, reason %s",
                    desc.InitializationErrorString());
-    return pd::OpRegistry::CreateOp(desc);
+    return OpRegistry::CreateOp(desc);
   });
 
   operator_base.def("backward",
-                    [](const pd::OperatorBase& forwardOp,
-                       const std::unordered_set<std::string>& no_grad_vars) {
-                      return pd::Backward(forwardOp, no_grad_vars);
+                    [](const OperatorBase &forwardOp,
+                       const std::unordered_set<std::string> &no_grad_vars) {
+                      return Backward(forwardOp, no_grad_vars);
                     });
 
   ExposeOperator(operator_base);
 
-  py::class_<pd::NetOp, std::shared_ptr<pd::NetOp>> net(m, "Net");
+  py::class_<ops::NetOp, std::shared_ptr<ops::NetOp>> net(m, "Net");
 
   net.def_static("create",
-                 []() -> std::shared_ptr<pd::NetOp> {
-                   auto retv = std::make_shared<pd::NetOp>();
+                 []() -> std::shared_ptr<ops::NetOp> {
+                   auto retv = std::make_shared<ops::NetOp>();
                    retv->type_ = "plain_net";
                    return retv;
                  })
-      .def("add_op", &pd::NetOp::AddOp)
-      .def("add_op",
-           [](pd::NetOp& self, const std::shared_ptr<pd::NetOp>& net) -> void {
-             self.AddOp(std::static_pointer_cast<pd::OperatorBase>(net));
-           })
-      .def("complete_add_op", &pd::NetOp::CompleteAddOp)
+      .def("add_op", &ops::NetOp::AddOp)
+      .def(
+          "add_op",
+          [](ops::NetOp &self, const std::shared_ptr<ops::NetOp> &net) -> void {
+            self.AddOp(std::static_pointer_cast<OperatorBase>(net));
+          })
+      .def("complete_add_op", &ops::NetOp::CompleteAddOp)
       .def("complete_add_op",
-           [](std::shared_ptr<pd::NetOp>& self) { self->CompleteAddOp(); });
+           [](std::shared_ptr<ops::NetOp> &self) { self->CompleteAddOp(); });
+
   ExposeOperator(net);
 
   m.def("unique_integer", UniqueIntegerGenerator);
@@ -230,3 +234,5 @@ All parameter, weight, gradient are variables in Paddle.
 
   return m.ptr();
 }
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 85af0e20a4174344716452bc03dcb1d5e596fe8d..4c3b14b83d841e88683a13634c93f51c012128b6 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -26,19 +26,17 @@ limitations under the License. */
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
-namespace pybind {
-namespace details {  // forward declare
-template <bool less, size_t i, typename... args>
-struct CastToPyBufferImpl;
-}  // namespace details
-}  // namespace pybind
 
 namespace framework {
+namespace details {
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}
 
 class Tensor {
  public:
   template <bool less, size_t i, typename... args>
-  friend struct paddle::pybind::details::CastToPyBufferImpl;
+  friend struct details::CastToPyBufferImpl;
 
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
diff --git a/paddle/pybind/tensor_bind.h b/paddle/framework/tensor_py.h
similarity index 92%
rename from paddle/pybind/tensor_bind.h
rename to paddle/framework/tensor_py.h
index def37219ccefd5435f1212c4e4daac5a351d76f4..4e1ab77b157fe1adaeac55c271c056236f2d40de 100644
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/framework/tensor_py.h
@@ -23,7 +23,7 @@ namespace py = pybind11;
 
 namespace paddle {
 
-namespace pybind {
+namespace framework {
 
 namespace details {
 
@@ -63,11 +63,8 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
       }
       return py::buffer_info(
           dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
-          sizeof(CUR_TYPE),
-          py::format_descriptor<CUR_TYPE>::format(),
-          (size_t)framework::arity(dst_tensor.dims()),
-          dims_outside,
-          strides);
+          sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
@@ -110,8 +107,8 @@ void PyCUDATensorSetFromArray(
 
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
-  paddle::platform::GpuMemcpySync(
-      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
+  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+                                  cudaMemcpyHostToDevice);
 }
 #endif
 
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 9ddd449de7500f5682d59469328f06971c6e83bf..f98bf95064fa539b990309dfe0bff10c1e99d096 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -967,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() {
   size_t numSequences = getGenBatchSize();
 
   resizeBootFrame(numSequences);
-  // We create only two sub-network in generation for alternate use.
-  // Thus, we can reduce total memory of output_ in layer forward.
+  // We create only two sub-networks in generation: one stores the states of
+  // all layers at the previous time step, and the other stores the states at
+  // the current time step.
   resizeOrCreateFrames(2);
 
   // outFrameLines_.size() > 1UL
@@ -1001,10 +1002,9 @@ void RecurrentGradientMachine::generateSequence() {
 
   // init outArg
   size_t resultNum = generator_.config.num_results_per_sample();
-  IVector::resizeOrCreate(
-      generator_.outArg.ids,
-      generator_.config.max_num_frames() * numSequences * resultNum,
-      false);
+  size_t maxGenWordCount =
+      generator_.config.max_num_frames() * numSequences * resultNum;
+  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
   if (resultNum > 1) {
     CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
     Matrix::resizeOrCreate(generator_.outArg.in,
@@ -1012,6 +1012,11 @@ void RecurrentGradientMachine::generateSequence() {
                            /* width */ resultNum,
                            false,
                            /* useGpu */ false);
+    Matrix::resizeOrCreate(generator_.outArg.value,
+                           /* height */ maxGenWordCount,
+                           /* width */ 1,
+                           false,
+                           /* useGpu */ false);
   }
   ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
                                 numSequences + 1,
@@ -1313,13 +1318,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
   starts[0] = 0;
   if (numResults > 1) {
     real* probs = generator_.outArg.in->getData();
+    real* idsProb = generator_.outArg.value->getData();
+    size_t curPos = 0;
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
         Path& path = finalPaths_[i][j];
-        generator_.ids.push_back(path.ids.size());  // sequence size
+        size_t genLen = path.ids.size();
+        generator_.ids.push_back(genLen);  // sequence size
         generator_.ids.insert(
             generator_.ids.end(), path.ids.begin(), path.ids.end());
         generator_.ids.push_back(-1);  // end of sequence
+
+        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
+        curPos += genLen;
+        idsProb[curPos++] = -1.0;
         probs[i * numResults + j] = path.logProb;
 
         if (!j && dataArgsSize_) {
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index f245620cf668bb341df99cf498105cbd996a6b24..fb3fc5877ac96323e891f800db80af83b6809831 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -189,6 +189,11 @@ public:
      */
     std::vector<int> ids;
 
+    /**
+     * @brief idsProb, log probability of each generated word.
+     */
+    std::vector<real> idsProb;
+
     /**
      * @brief logProb, current probability of path.
      */
@@ -228,11 +233,13 @@ public:
      */
     Path(Path& old, int newId, real logProb, int machineId, int topIndex)
         : ids(old.ids),
+          idsProb(old.idsProb),
           logProb(old.logProb + logProb),
           machineId(machineId),
           topIndex(topIndex),
           seqId(old.seqId) {
       ids.push_back(newId);
+      idsProb.push_back(logProb);
       if (!old.probHistory.empty()) {
         this->probHistory = old.probHistory;
         // probHistory store current prob, not sum
@@ -411,8 +418,9 @@ protected:
 
   struct Generator {
     GeneratorConfig config;
-    std::vector<int> ids;  // store generated sequences
-    Argument outArg;       // final output argument
+    std::vector<int> ids;       // store generated sequences
+    std::vector<real> idsProb;  // log probability of each generated word
+    Argument outArg;            // final output argument
   };
   bool generating_;
   Generator generator_;
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 4fa3fb0ee5f826d2b084c0ba184c505aee3acc48..9c41378483993101a098fc4ad1068c1ef908e566 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -39,7 +39,7 @@ class BuddyAllocator {
 
  public:
   void* Alloc(size_t unaligned_size);
-  void Free(void*);
+  void Free(void* ptr);
   size_t Used();
 
  public:
diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h
index ca0789779e273fb71c3d6282c0a921cda2d776cc..cf5815644284c23a1d2abc904f8c5053ce107a72 100644
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
@@ -33,17 +33,17 @@ namespace detail {
  */
 class MetadataCache {
  public:
-  MetadataCache(bool uses_gpu);
+  explicit MetadataCache(bool uses_gpu);
 
  public:
   /*! \brief Load the associated metadata for the specified memory block. */
-  Metadata load(const MemoryBlock*);
+  Metadata load(const MemoryBlock* memory_block);
 
   /*! \brief Store the associated metadata for the specified memory block. */
-  void store(MemoryBlock*, const Metadata&);
+  void store(MemoryBlock* memory_block, const Metadata& meta_data);
 
   /*! \brief Indicate that the specified metadata will no longer be used. */
-  void invalidate(MemoryBlock*);
+  void invalidate(MemoryBlock* memory_block);
 
  public:
   MetadataCache(const MetadataCache&) = delete;
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 44f567caf9c19775f17988b5142b7693b41a126d..72351b9dfa63513713463bb47a3684f0dfd84ad3 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -68,7 +68,7 @@ class PODDeleter {
   static_assert(std::is_pod<T>::value, "T must be POD");
 
  public:
-  PODDeleter(Place place) : place_(place) {}
+  explicit PODDeleter(Place place) : place_(place) {}
   void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
 
  private:
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 6465deeec93100f0238ac850b92f7f7c5a60b795..96c76e22e9814682008f2e6c7ae98e2599d391c2 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -41,6 +41,9 @@ function(op_library TARGET)
     endif()
 endfunction()
 
+cc_library(net_op SRCS net_op.cc DEPS op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
+
 op_library(add_op SRCS add_op.cc add_op.cu)
 cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
 
@@ -59,6 +62,6 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
 
 op_library(fc_op
     SRCS fc_op.cc
-    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net)
-op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net)
+    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op)
+op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net_op)
 cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
index f961b37565f400b5c26844b9e7a3cff5e682340b..9bd08634da96c5595d6dd702ad9afafb94632b03 100644
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 926a0c616b957d8e542c1f3dee227a718fb29f07..2f453f8379ca7ce0612fed757719acb2d2cf0ad8 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -1,5 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
 
 REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
-                       ops::OnehotCrossEntropyOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
+                       ops::OnehotCrossEntropyOpKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu
index 55ad58f4f17cd4a3e737c01b001675d2690d273e..ed1068219c8fee8c6e8809f450a9d38c8226f317 100644
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu
@@ -1,6 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_zeros_like_op.h"
 
 REGISTER_OP_GPU_KERNEL(
     fill_zeros_like,
-    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
+    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu
index e15de2fd0dd84e4015ee0e3b5343d7651b027a88..8b97b0154ccdc8c41a90f7580af829c5c8663b60 100644
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -1,6 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #define EIGEN_USE_GPU
 
 #include "paddle/operators/mean_op.h"
 
 REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::GPUPlace, float>);
\ No newline at end of file
+REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
index dc9236701627dc9335b844d2a82e18eb1f7dfd42..1dc04c4297daed7a7861a09cf6b99446c296ffa5 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -15,4 +15,4 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 
-REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
\ No newline at end of file
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
diff --git a/paddle/framework/net.cc b/paddle/operators/net_op.cc
similarity index 96%
rename from paddle/framework/net.cc
rename to paddle/operators/net_op.cc
index 2cd378c6b21303d1a24206ba3010b0d035aaa766..fbc98e09923bda7f3baee04e02df9076247bff0b 100644
--- a/paddle/framework/net.cc
+++ b/paddle/operators/net_op.cc
@@ -14,11 +14,11 @@
   limitations under the License.
 */
 
-#include "paddle/framework/net.h"
+#include "paddle/operators/net_op.h"
 #include "paddle/framework/op_registry.h"
 
 namespace paddle {
-namespace framework {
+namespace operators {
 
 void NetOp::CompleteAddOp(bool calc) {
   add_op_done_ = true;
@@ -74,5 +74,5 @@ std::string NetOp::DebugString() const {
 
 bool NetOp::IsNetOp() const { return true; }
 
-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
diff --git a/paddle/framework/net.h b/paddle/operators/net_op.h
similarity index 89%
rename from paddle/framework/net.h
rename to paddle/operators/net_op.h
index acf1a69da9fd8adce1bd89367c882eade052e725..13611e1ee83170db43e17d6088e4b04588ce6255 100644
--- a/paddle/framework/net.h
+++ b/paddle/operators/net_op.h
@@ -14,15 +14,17 @@ limitations under the License. */
 
 #pragma once
 
-#include <paddle/framework/op_desc.pb.h>
-#include <paddle/framework/operator.h>
+#include "paddle/framework/op_desc.pb.h"
 #include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
+#include "paddle/operators/type_alias.h"
 #include "paddle/platform/device_context.h"
 
 namespace paddle {
-namespace framework {
+namespace operators {
+
 /**
  * @brief Network is also a type of Operator
  *
@@ -37,13 +39,13 @@ namespace framework {
  * This is the base class of network, all the networks should implement the APIs
  * it defines.
  */
-class NetOp : public OperatorBase {
- public:
+class NetOp : public framework::OperatorBase {
+public:
   /**
    * Infer all the operators' input and output variables' shapes, will be called
    * before every mini-batch
    */
-  void InferShape(const Scope& scope) const override {
+  void InferShape(const framework::Scope& scope) const override {
     for (auto& op : ops_) {
       op->InferShape(scope);
     }
@@ -56,7 +58,7 @@ class NetOp : public OperatorBase {
    * scope will be used instead. If no OpContext is provicded, default context
    * will be used.
    */
-  void Run(const Scope& scope,
+  void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {
     for (auto& op : ops_) {
       op->Run(scope, dev_ctx);
@@ -88,7 +90,7 @@ class NetOp : public OperatorBase {
 
   std::vector<std::shared_ptr<OperatorBase>> ops_;
 
- private:
+private:
   bool add_op_done_{false};
 
   template <typename T, typename KeyType>
@@ -97,5 +99,5 @@ class NetOp : public OperatorBase {
   }
 };
 
-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
diff --git a/paddle/framework/net_design.md b/paddle/operators/net_op_design.md
similarity index 100%
rename from paddle/framework/net_design.md
rename to paddle/operators/net_op_design.md
diff --git a/paddle/framework/net_op_test.cc b/paddle/operators/net_op_test.cc
similarity index 91%
rename from paddle/framework/net_op_test.cc
rename to paddle/operators/net_op_test.cc
index f32e456e5d142bf8203f9ec03e8059772c4f5c99..18c5c60eb43250c23e2819a3c79ab8a96fec103e 100644
--- a/paddle/framework/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -1,16 +1,18 @@
+#include "paddle/operators/net_op.h"
+
 #include <gtest/gtest.h>
-#include <paddle/framework/net.h>
-#include <paddle/framework/op_registry.h>
-#include <paddle/framework/operator.h>
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
 
 namespace paddle {
-namespace framework {
+namespace operators {
 
 static int infer_shape_cnt = 0;
 static int run_cnt = 0;
 
 class TestOp : public OperatorBase {
- public:
+public:
   void InferShape(const framework::Scope& scope) const override {
     ++infer_shape_cnt;
   }
@@ -21,7 +23,7 @@ class TestOp : public OperatorBase {
 };
 
 class EmptyOp : public OperatorBase {
- public:
+public:
   void InferShape(const Scope& scope) const override {}
   void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {}
@@ -73,7 +75,7 @@ TEST(OpKernel, all) {
   ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
 }
 
-TEST(Net, insert_op) {
+TEST(NetOp, insert_op) {
   NetOp net;
   auto op1 = std::make_shared<EmptyOp>();
   op1->inputs_ = {"x", "w1", "b1"};
@@ -85,5 +87,5 @@ TEST(Net, insert_op) {
   ASSERT_EQ(3UL, net.ops_.size());
 }
 
-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index e5b76e3724b5b0287071c90d26235b8e1a1d80cf..aeb95569b728f53b288a0c9a28220be8b5f7aaa4 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -18,8 +18,8 @@
 #include <cstring>
 #include <sstream>
 
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index 2a0964fff326500b6215dd4afac63c75d64c4a06..35e6d9d50dd04048da7ffb384014d5909cd659a4 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {
 
-using namespace paddle::framework;
+using namespace paddle::framework;  // NOLINT
 
 namespace rnn {
 
@@ -94,7 +94,7 @@ void InitArgument(const ArgumentName& name, Argument* arg);
 };  // namespace rnn
 
 // The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
-// TODO:
+// TODO(Yan Chunwei):
 // 1. No-padding computing for sequences with indifinite length in one batch.
 // 2. Hierarchical RNN for sequence with sub-sequence.
 // 3. Internal Memory.
@@ -172,12 +172,10 @@ public:
   /**
    * InferShape must be called before Run.
    */
-  virtual void InferShape(const Scope& scope) const override {
-    alg_.InferShape(scope);
-  }
+  void InferShape(const Scope& scope) const override { alg_.InferShape(scope); }
 
-  virtual void Run(const Scope& scope,
-                   const platform::DeviceContext& dev_ctx) const override {
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
     alg_.Run(scope, dev_ctx);
   }
 
@@ -194,12 +192,10 @@ public:
   /**
    * InferShape must be called before Run.
    */
-  virtual void InferShape(const Scope& scope) const override {
-    alg_.InferShape(scope);
-  }
+  void InferShape(const Scope& scope) const override { alg_.InferShape(scope); }
 
-  virtual void Run(const Scope& scope,
-                   const platform::DeviceContext& dev_ctx) const override {
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
     alg_.Run(scope, dev_ctx);
   }
 
diff --git a/paddle/operators/recurrent_op_test.cc b/paddle/operators/recurrent_op_test.cc
index 91f2972ca49953fd7a627289fa37db32916d85cd..08a6d9fe5681fdea180de2e9361734ade8564775 100644
--- a/paddle/operators/recurrent_op_test.cc
+++ b/paddle/operators/recurrent_op_test.cc
@@ -11,14 +11,15 @@
   limitations under the License.
 */
 
+#include "paddle/operators/recurrent_op.h"
+
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/tensor.h"
-#include "paddle/operators/recurrent_op.h"
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index 82338ceccc06653791b26472e18d804f62735649..f76faa0a3a93a1ac277a1d1aa83c3fa6c3944648 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"
 
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index d79258cbf13c699cfb2afaee229cf96a3e377b5e..72629ccfbb8bc8ec53045289bd985c721c62fa10 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -1,4 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
 
-REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
+REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu
index c9d11a2e1f9dcc563765c9e8cc1bae6beff57f18..2123b17e4b5e90c22c2d6e9177f2a8956f8a4ac9 100644
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"
 
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 5b59fad7d5f9729b0862f8cd78cb32f94f87f513..5cbb96ab754467ea6ddab9380ca25987c9376980 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -1,16 +1,17 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
 #include "paddle/operators/softmax_op.h"
 
 namespace paddle {
@@ -19,12 +20,13 @@ namespace operators {
 class SoftmaxOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputSize() == 1, "Only one input is need for softmax");
-    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
+    PADDLE_ENFORCE(ctx.InputSize() == 1UL,
+                   "Only one input is need for softmax");
+    PADDLE_ENFORCE(ctx.Input<Tensor>("X")->dims().size() == 2UL,
                    "The input of softmax op must be matrix");
-    PADDLE_ENFORCE(ctx.OutputSize() == 1,
+    PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
                    "Only one output is need for softmax");
-    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+    ctx.Output<Tensor>("Y")->Resize(ctx.Input<Tensor>("X")->dims());
   }
 };
 
@@ -40,10 +42,19 @@ public:
 
 class SoftmaxOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
-  std::string DebugString() const override {
-    LOG(INFO) << "SoftmaxOpGrad";
-    return "";
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 3UL,
+                   "Input of SoftmaxOpGrad should be 3, X, Y, YG");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
+                   "Output of SoftmaxOpGrad should be 1");
+    PADDLE_ENFORCE(ctx.InputVar("Y") != nullptr, "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx.InputVar(GRAD_VAR_NAME("Y")) != nullptr,
+                   "Input(Y@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx.Input<Tensor>("Y")->dims() ==
+                       ctx.Input<Tensor>(GRAD_VAR_NAME("Y"))->dims(),
+                   "the shapes of Input(Y) and Input(Y@GRAD) should match");
+    ctx.Output<Tensor>(GRAD_VAR_NAME("X"))
+        ->Resize(ctx.Input<Tensor>("Y")->dims());
   }
 };
 
@@ -51,5 +62,7 @@ protected:
 }  // namespace paddle
 
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
+REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
+REGISTER_OP_CPU_KERNEL(softmax_grad,
+                       ops::SoftmaxGradKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
index ddf8f6e913ccf450185f377f531bf978f69ed1fc..b79228580a7ea0f70b62eb2dc7a61cf85bc0b5fb 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@@ -1,5 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
 
 REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(softmax_grad,
+                       ops::SoftmaxGradKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 75c5197697dada58e09f4cda41cea13af56e79a3..13e74a79077982e9fba5d90f40986e699c1ed897 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -1,19 +1,22 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
 #include "paddle/operators/type_alias.h"
 
 namespace paddle {
@@ -23,8 +26,8 @@ template <typename Place, typename T>
 class SoftmaxKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext& context) const override {
-    auto input = context.Input<Tensor>(0);
-    auto output = context.Output<Tensor>(0);
+    auto input = context.Input<Tensor>("X");
+    auto output = context.Output<Tensor>("Y");
     output->mutable_data<T>(context.GetPlace());
 
     auto logits = EigenMatrix<T>::From(*input);
@@ -57,5 +60,38 @@ public:
              .broadcast(one_by_class));
   }
 };
+
+// Backward kernel of softmax. For every row of the batch it evaluates
+// dX = (dY - sum(Y * dY)) * Y, with the sum taken over the class dimension.
+template <typename Place, typename T>
+class SoftmaxGradKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto Y = context.Input<Tensor>("Y");
+    auto dY = context.Input<Tensor>(OperatorBase::GRAD_VAR_NAME("Y"));
+    auto dX = context.Output<Tensor>(OperatorBase::GRAD_VAR_NAME("X"));
+    dX->mutable_data<T>(context.GetPlace());
+
+    const int batch_size = Y->dims()[0];
+    const int class_num = Y->dims()[1];
+
+    Eigen::DSizes<int, 1> along_class(1);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, class_num);
+
+    auto Y_eigen = EigenMatrix<T>::From(*Y);
+    auto dY_eigen = EigenMatrix<T>::From(*dY);
+    auto dX_eigen = EigenMatrix<T>::From(*dX);
+    auto place = context.GetEigenDevice<Place>();
+
+    auto dot = (Y_eigen * dY_eigen)
+                   .sum(along_class)
+                   .eval()
+                   .reshape(batch_by_one)
+                   .broadcast(one_by_class);
+    dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h
index 9049ffda1da5408411687474c5ed0c76c2394623..931740e150946a939b8656be5a30185c6ee1cb8f 100644
--- a/paddle/operators/type_alias.h
+++ b/paddle/operators/type_alias.h
@@ -15,13 +15,14 @@
 #pragma once
 
 #include "paddle/framework/eigen.h"
-#include "paddle/framework/net.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
 
 using OpKernel = framework::OpKernel;
+using OperatorBase = framework::OperatorBase;
 using InferShapeContext = framework::InferShapeContext;
 using ExecutionContext = framework::ExecutionContext;
 using Variable = framework::Variable;
@@ -43,15 +44,16 @@ template <typename T,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using Tensor = framework::Tensor;
+using Scope = framework::Scope;
 using OperatorWithKernel = framework::OperatorWithKernel;
+using OperatorBase = framework::OperatorBase;
 using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
 using OpProto = framework::OpProto;
 using OpAttrChecker = framework::OpAttrChecker;
 using CPUPlace = platform::CPUPlace;
 using GPUPlace = platform::GPUPlace;
-using NetOp = framework::NetOp;
 using OpRegistry = framework::OpRegistry;
-using OperatorBase = framework::OperatorBase;
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 2038fafe2e15ec2631726643695ac6cbc317fed9..48b9f5dcb5cc578f9e70ed7abe076b66b68dc719 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -40,7 +40,7 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {
  public:
   CPUDeviceContext();
-  CPUDeviceContext(CPUPlace);
+  explicit CPUDeviceContext(CPUPlace);
   virtual ~CPUDeviceContext() {}
 
   Eigen::DefaultDevice* eigen_device() const;
@@ -55,7 +55,7 @@ class CPUDeviceContext : public DeviceContext {
 
 class CUDADeviceContext : public DeviceContext {
  public:
-  explicit CUDADeviceContext(GPUPlace);
+  CUDADeviceContext(GPUPlace);  // NOLINT
   virtual ~CUDADeviceContext();
 
   /*! \brief  Wait for all operations completion in the stream. */
@@ -69,10 +69,10 @@ class CUDADeviceContext : public DeviceContext {
 
   // clang-format off
   /*! \brief  Return cublas handle in the device context. */
-  cublasHandle_t    cublas_handle   ();
+  cublasHandle_t    cublas_handle();
 
   /*! \brief  Return cudnn  handle in the device context. */
-  cudnnHandle_t     cudnn_handle    ();
+  cudnnHandle_t     cudnn_handle();
 
   /*! \brief  Return curand handle in the device context. */
   curandGenerator_t curand_generator();
diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc
index 4e3dfdaefb2348346e8f917b1f6c758bf6d91a1a..9cd2a1f565526f8dc45932ba6168f4e25c6ad238 100644
--- a/paddle/platform/dynload/cublas.cc
+++ b/paddle/platform/dynload/cublas.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include <paddle/platform/dynload/cublas.h>
 
 namespace paddle {
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
index 8b5e15b5efcdae6a1eed09f002eb2f4f2163035f..d3e4cb567d71b987724366b6a0896f5df0eb6055 100644
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include <paddle/platform/dynload/cudnn.h>
 
 namespace paddle {
@@ -25,4 +39,4 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 
 }  // namespace dynload
 }  // namespace platform
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc
index 5c1fab992c98569d4a95b6e699d97d428511e48e..d05dd88126bfee7278e553710a717b8f2eb02ae0 100644
--- a/paddle/platform/dynload/curand.cc
+++ b/paddle/platform/dynload/curand.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include <paddle/platform/dynload/curand.h>
 
 namespace paddle {
@@ -10,6 +24,7 @@ void *curand_dso_handle;
 #define DEFINE_WRAP(__name) DynLoad__##__name __name
 
 CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
-}
-}
-}
\ No newline at end of file
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 7cead183884bc9379355cd931921b40d6c11ce90..a37ad38a8fb030192fa4c871106c6eb54816768a 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -32,7 +32,7 @@ struct CPUPlace {
 
 struct GPUPlace {
   GPUPlace() : GPUPlace(0) {}
-  GPUPlace(int d) : device(d) {}
+  GPUPlace(int d) : device(d) {}  // NOLINT
 
   // needed for variant equality comparison
   inline bool operator==(const GPUPlace &o) const { return device == o.device; }
diff --git a/paddle/string/piece.h b/paddle/string/piece.h
index 0272529d1c9b2cb6000a26f1d4d80276d06bf27b..3b887490b5c6c016bc30d8db060c5c1c01b8bf54 100644
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -39,8 +39,8 @@ public:
   // size_ is 0.
   Piece();
   Piece(const char* d, size_t n);
-  Piece(const char* d);
-  Piece(const std::string& s);
+  Piece(const char* d);         // NOLINT
+  Piece(const std::string& s);  // NOLINT
 
   const char* data() const { return data_; }
   size_t len() const { return size_; }
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 4619b0edc3dd7e253e01f7fee5e6a8641340d291..e66197030e2dd9e113e4564aaacb1c5dab25771b 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -13,4 +13,5 @@ add_python_test(test_framework
     test_sigmoid_op.py
     test_softmax_op.py
     test_rowwise_add_op.py
-    test_network.py)
+    test_network.py
+    gradient_checker.py)
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..4022de1c40e41aa77a7f31d82b55b63585cbd5f5
--- /dev/null
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -0,0 +1,90 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.create_op_creation_methods import op_creations
+import numpy
+import unittest
+
+__all__ = ['get_numeric_gradient']
+
+
+def get_numeric_gradient(op,
+                         input_values,
+                         output_name,
+                         input_to_check,
+                         delta=1e-2,
+                         local_scope=None):
+    """
+    Compute the numeric gradient of sum(output_name) w.r.t. one input of op.
+
+    :param op: C++ operator instance; it could also be a network.
+    :param input_values: The input variables, given as a dictionary that maps
+    each variable name to a numpy array.
+    :param output_name: The name of the final output variable.
+    :param input_to_check: The name of the input variable to differentiate.
+    :param delta: The perturbation used by the central difference. A smaller
+    delta gives a more accurate result, but if delta is too small the result
+    can suffer from numerical instability.
+    :param local_scope: The scope to run in; a new one is created when None.
+    :return: The gradient as a numpy array shaped like input_to_check.
+    """
+    if local_scope is None:
+        local_scope = core.Scope()
+
+    # Create every input variable in local_scope and copy its value in
+    for var_name in input_values:
+        var = local_scope.new_var(var_name)
+        tensor = var.get_tensor()
+        tensor.set_dims(input_values[var_name].shape)
+        tensor.alloc_float(core.CPUPlace())
+        tensor.set(input_values[var_name], core.CPUPlace())
+
+    # Create any output variable that is not in local_scope yet
+    for output in op.outputs():
+        if local_scope.find_var(output) is None:
+            local_scope.new_var(output).get_tensor()
+
+    op.infer_shape(local_scope)
+
+    # Allocate output memory (done after infer_shape has run)
+    for output in op.outputs():
+        local_scope.find_var(output).get_tensor().alloc_float(core.CPUPlace())
+
+    # TODO(yuyang18): Only CPU is supported now.
+    cpu_ctx = core.DeviceContext.create(core.CPUPlace())
+
+    def get_output():
+        op.run(local_scope, cpu_ctx)
+        return numpy.array(local_scope.find_var(output_name).get_tensor()).sum()
+
+    def product(dim):
+        return reduce(lambda a, b: a * b, dim, 1)
+
+    tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
+    tensor_size = product(tensor_to_check.get_dims())
+    gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32')
+    for i in xrange(tensor_size):
+        origin = tensor_to_check.get_float_element(i)
+        x_pos = origin + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+
+        x_neg = origin - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+
+        tensor_to_check.set_float_element(i, origin)  # restore the old value
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+
+
+if __name__ == '__main__':
+
+    class GetNumericGradientTest(unittest.TestCase):
+        def test_add_op(self):
+            adder = op_creations.add_two(X="X", Y="Y", Out="Z")
+            lhs = numpy.random.random((10, 1)).astype("float32")
+            rhs = numpy.random.random((10, 1)).astype("float32")
+            # The mean numeric gradient w.r.t. X must be 1.0 within 1e-2.
+            grad = get_numeric_gradient(adder, {'X': lhs, "Y": rhs}, 'Z', 'X')
+            self.assertAlmostEqual(grad.mean(), 1.0, delta=1e-2)
+
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py
index 191b698c1cdec9b86b4ded6b1f743586867ca62f..c80888128781d98e4ed30d845a30b39121f66459 100644
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
@@ -1,6 +1,10 @@
 import unittest
-from op_test_util import OpTestMeta
+
 import numpy as np
+import paddle.v2.framework.core as core
+import paddle.v2.framework.create_op_creation_methods as creation
+
+from op_test_util import OpTestMeta
 
 
 def stable_softmax(x):
@@ -19,5 +23,63 @@ class TestSoftmaxOp(unittest.TestCase):
         self.Y = np.apply_along_axis(stable_softmax, 1, self.X)
 
 
+class TestSoftmaxGradOp(unittest.TestCase):
+    def test_softmax_grad(self):
+        op = creation.op_creations.softmax(X="X", Y="Y")
+        backward_op = core.Operator.backward(op, set())
+        self.assertEqual(backward_op.type(), "softmax_grad")
+        expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).'''
+        self.assertEqual(expected, str(backward_op))
+
+        batch_size = 3
+        class_num = 5
+        # Initialize a random Y, shifted by 1e-2 to keep entries away from zero
+        Y = np.random.rand(batch_size, class_num).astype(np.float32)
+        Y = Y + 1e-2
+        dY = np.random.rand(batch_size, class_num).astype(np.float32)
+
+        # Reference implementation of the softmax gradient, computed per row
+        def label_softmax_grad(Y, dY):
+            dX = Y * 0.0
+            for i in range(batch_size):
+                d = np.dot(Y[i, :], dY[i, :])
+                dX[i, :] = Y[i, :] * (dY[i, :] - d)
+            return dX
+
+        expected = label_softmax_grad(Y, dY)
+
+        scope = core.Scope()
+        places = []
+        places.append(core.CPUPlace())
+        if core.is_compile_gpu():
+            places.append(core.GPUPlace(0))
+
+        for place in places:
+            y = scope.new_var("Y")
+            y_tensor = y.get_tensor()
+            y_tensor.set_dims([batch_size, class_num])
+            y_tensor.alloc_float(place)
+            y_tensor.set(Y, place)
+
+            dy = scope.new_var("Y@GRAD")
+            dy_tensor = dy.get_tensor()
+            dy_tensor.set_dims([batch_size, class_num])
+            dy_tensor.alloc_float(place)
+            dy_tensor.set(dY, place)
+
+            x = scope.new_var("X")
+            dx = scope.new_var("X@GRAD")
+
+            tensor = scope.find_var("X@GRAD").get_tensor()
+            backward_op.infer_shape(scope)
+            self.assertEqual([batch_size, class_num], tensor.shape())
+
+            ctx = core.DeviceContext.create(place)
+            backward_op.run(scope, ctx)
+            actual = np.array(tensor)
+
+            np.testing.assert_almost_equal(actual, expected, decimal=3)
+
+
 if __name__ == '__main__':
     unittest.main()