diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 5e9901bb87c9a454a393a913b6da6e82266ee719..170e0f839719c71d56008abefb79c7814d0f3e76 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -350,6 +350,22 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.contrib.load_persistables_for_increment ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.load_persistables_for_inference ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.convert_dist_to_sparse_program ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.HDFSClient.__init__ ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.HDFSClient.delete ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.HDFSClient.download ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False))
+paddle.fluid.contrib.HDFSClient.is_dir ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.HDFSClient.is_exist ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.HDFSClient.ls ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.HDFSClient.lsr ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True))
+paddle.fluid.contrib.HDFSClient.make_local_dirs ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.HDFSClient.makedirs ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.HDFSClient.rename ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.contrib.HDFSClient.upload ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5))
+paddle.fluid.contrib.multi_download ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,))
+paddle.fluid.contrib.multi_upload ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 779a9ed52365e66d8141f7e3a1183ef6d7832e4b..389366a8a98c5753268718c49c62c2dffe99c32f 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -131,9 +131,7 @@ std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
 
 std::unique_ptr<ir::Graph> BuildStrategy::Apply(
     const ProgramDesc &main_program, const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &param_names,
-    const std::vector<Scope *> &local_scopes,
+    const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
 #else
@@ -149,9 +147,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
       pass->Erase("loss_var_name");
       pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
-      pass->Erase("params");
-      pass->SetNotOwned<const std::unordered_set<std::string>>("params",
-                                                               &param_names);
       pass->Erase("local_scopes");
       pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
                                                     &local_scopes);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 29396501dc0efedd31a42b77f915fd66c9943985..11db184cb4efe349a340aceb4b7e1e3f4d4b24a5 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -106,16 +106,15 @@ struct BuildStrategy {
 
   // Apply the passes built by the pass_builder_. The passes will be
   // applied to the Program and output an ir::Graph.
-  std::unique_ptr<ir::Graph> Apply(
-      const ProgramDesc &main_program,
-      const std::vector<platform::Place> &places,
-      const std::string &loss_var_name,
-      const std::unordered_set<std::string> &param_names,
-      const std::vector<Scope *> &local_scopes,
+  std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
+                                   const std::vector<platform::Place> &places,
+                                   const std::string &loss_var_name,
+                                   const std::vector<Scope *> &local_scopes,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
+                                   const bool use_cuda,
+                                   platform::NCCLContextMap *nccl_ctxs) const;
 #else
-      const bool use_cuda) const;
+                                   const bool use_cuda) const;
 #endif
 
  private:
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 8af1d62dea89343ff2d41dd7c6ac837459df7685..7e320a08942e4a9a27e6b5c892a993b3a90c43a4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -130,7 +130,6 @@ void AddOutputToLeafOps(ir::Graph *graph) {
 
 static const char kLossVarName[] = "loss_var_name";
 static const char kPlaces[] = "places";
-static const char kParams[] = "params";
 static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";
 static const char kNumTrainers[] = "num_trainers";
@@ -147,9 +146,6 @@ void MultiDevSSAGraphBuilder::Init() const {
   nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
 
-  for (auto &p : Get<const std::unordered_set<std::string>>(kParams)) {
-    grad_names_.insert(GradVarName(p));
-  }
   balance_vars_.resize(places_.size(), 0);
   if (strategy_.enable_data_balance_ && places_.size() == 1) {
     LOG(WARNING) << "It is no need to enable data balance when there is only "
@@ -359,7 +355,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
           BuildStrategy::GradientScaleStrategy::kCustomized) {
         // TODO(paddle-dev): Why is there no input for this op_handle?
         auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
-        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
+        auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType();
+        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0],
+                              out_dtype);
       }
       // This assumes the backward generating code will ensure IsScaleLossOp
       // is true only for the op that scale the final scalar loss.
@@ -662,13 +660,13 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
 
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
     ir::Graph *result, const std::string &loss_grad_name,
-    ir::Node *out_var_node) const {
+    ir::Node *out_var_node, proto::VarType::Type dtype) const {
   for (size_t i = 0; i < places_.size(); ++i) {
     // Insert ScaleCost OpHandle
     auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
     auto *op_handle = new ScaleLossGradOpHandle(
         result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
-        local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx);
+        local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype);
     result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
     // FIXME: Currently ScaleLossGradOp only use device_count as scale
@@ -896,7 +894,6 @@ REGISTER_PASS(multi_devices_pass,
               paddle::framework::details::MultiDevSSAGraphBuilder)
     .RequirePassAttr(paddle::framework::details::kLossVarName)
     .RequirePassAttr(paddle::framework::details::kPlaces)
-    .RequirePassAttr(paddle::framework::details::kParams)
     .RequirePassAttr(paddle::framework::details::kLocalScopes)
     .RequirePassAttr(paddle::framework::details::kStrategy)
     .RequirePassAttr(paddle::framework::details::kNumTrainers);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 8e462aec7dc7ce45cad592b89de0b6edde8c9146..5736102ddc13418446013307cf8204b677f960dc 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -68,7 +68,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 
   void CreateScaleLossGradOp(ir::Graph *result,
                              const std::string &loss_grad_name,
-                             ir::Node *out_var_node) const;
+                             ir::Node *out_var_node,
+                             proto::VarType::Type dtype) const;
 
   VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;
@@ -102,7 +103,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   mutable std::string loss_var_name_;
   mutable std::vector<platform::Place> places_;
   mutable std::vector<Scope *> local_scopes_;
-  mutable std::unordered_set<std::string> grad_names_;
 
   mutable BuildStrategy strategy_;
   mutable std::unordered_map<std::string, VarDesc *> all_vars_;
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index ef1626599795a553e654fe5d3ed74ef3a3a67d78..e1b8e8fe05f0615d689e78d9c405cc5d76d2abb1 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -22,39 +22,66 @@ namespace details {
 ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
                                              Scope *scope,
                                              platform::Place place,
-                                             platform::DeviceContext *dev_ctx)
+                                             platform::DeviceContext *dev_ctx,
+                                             proto::VarType::Type dtype)
     : OpHandleBase(node),
       coeff_(static_cast<float>(1.0 / num_dev)),
       scope_(scope),
-      place_(place) {
+      place_(place),
+      out_dtype_(dtype) {
   this->SetDeviceContext(place_, dev_ctx);
 }
 
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
 
+struct ScaleLossGradFunctor {
+  float coeff_;
+  Tensor *out_;
+  platform::Place place_;
+  OpHandleBase *op_handle_;
+  proto::VarType::Type out_dtype_;
+  platform::DeviceContext *ctx_;
+
+  ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
+                       OpHandleBase *op_handle, proto::VarType::Type dtype,
+                       platform::DeviceContext *ctx)
+      : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
+
+  template <typename OutT>
+  void apply() const {
+    auto *out_data = out_->mutable_data<OutT>(place_);
+    if (platform::is_cpu_place(place_)) {
+      *out_data = static_cast<OutT>(coeff_);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      OutT cast_coeff = static_cast<OutT>(coeff_);
+      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), out_data,
+                   platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
+                   stream);
+      VLOG(10) << place_ << "RUN Scale loss grad op";
+
+#endif
+    }
+  }
+};
+
 void ScaleLossGradOpHandle::RunImpl() {
   // Doesn't wait any event
   std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
   auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
 
-  float *tmp = local_scope.FindVar(var_name)
-                   ->GetMutable<LoDTensor>()
-                   ->mutable_data<float>(make_ddim({1}), place_);
+  auto *tensor = local_scope.FindVar(var_name)->GetMutable<LoDTensor>();
+  tensor->Resize(make_ddim({1}));
 
-  if (platform::is_cpu_place(place_)) {
-    *tmp = coeff_;
-  } else {
 #ifdef PADDLE_WITH_CUDA
-    this->RunAndRecordEvent([&] {
-      auto stream = static_cast<platform::CUDADeviceContext *>(
-                        this->dev_ctxes_.at(place_))
-                        ->stream();
-      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
-                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
-      VLOG(10) << place_ << "RUN Scale loss grad op";
-    });
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
+                            this->dev_ctxes_.at(place_));
+  this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
+#else
+  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
+  framework::VisitDataType(out_dtype_, func);
 #endif
-  }
 }
 
 std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
index 523b55724c82d4e2bef0520c10e5708c952a3ecc..8bedd1643eb9c5e591fa3c40995fcba08980b9fa 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -26,8 +26,8 @@ namespace details {
 
 struct ScaleLossGradOpHandle : public OpHandleBase {
   ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
-                        platform::Place place,
-                        platform::DeviceContext *context);
+                        platform::Place place, platform::DeviceContext *context,
+                        proto::VarType::Type dtype);
 
   ~ScaleLossGradOpHandle() final;
 
@@ -40,6 +40,7 @@ struct ScaleLossGradOpHandle : public OpHandleBase {
   float coeff_;
   Scope *scope_;
   platform::Place place_;
+  proto::VarType::Type out_dtype_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
index 5376fc163e259e5049955052baf02fd614aa511e..a8029e67e659a269f8492cf6e2f1f09040144283 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -24,35 +24,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-// The function keeps the graph consistent by replacing
-// a node 'from' in the set of inputs nodes
-// of the visited node by a node 'to'.
-void CorrectGraphEdges(Graph* graph, Node* from, Node* to) {
-  for (auto& node : GraphTraits::DFS(*graph)) {
-    auto from_in_inputs =
-        std::find(std::begin(node.inputs), std::end(node.inputs), from);
-
-    if (from_in_inputs != std::end(node.inputs)) {
-      IR_NODE_LINK_TO(to, (&node));
-
-      auto inputs = node.Op()->Inputs();
-
-      using input_type = VariableNameMap::value_type;
-
-      std::for_each(std::begin(inputs), std::end(inputs),
-                    [from, to, &node](const input_type& i) -> void {
-                      auto param_names = i.second;
-                      auto pi = std::find(std::begin(param_names),
-                                          std::end(param_names), from->Name());
-
-                      if (pi != std::end(param_names)) {
-                        node.Op()->SetInput(i.first, {to->Name()});
-                      }
-                    });
-    }
-  }
-}
-
 bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
   auto find_node = [](ir::Graph* graph, const Node* node) -> Node* {
     for (auto n : graph->Nodes()) {
@@ -99,25 +70,12 @@ bool IsReachable(ir::Graph* graph, Node* from, Node* to) {
   return false;
 }
 
-boost::optional<Node*> HasBias(const Node& op, const std::string& bias_name) {
-  auto bias_input_names = op.Op()->Inputs();
-  auto bias_it = bias_input_names.find(bias_name);
-
-  if (bias_it != std::end(bias_input_names)) {
-    bool has_bias = !bias_it->second.empty();
-
-    if (has_bias) {
-      auto bias_names = bias_it->second;
-      auto bias_names_it =
-          std::find_if(std::begin(op.inputs), std::end(op.inputs),
-                       [&bias_names](Node* n) -> bool {
-                         return n->Name() == bias_names[0];
-                       });
-      return *bias_names_it;
-    }
-  }
-
-  return boost::none;
+template <typename T>
+boost::optional<T> HasAttribute(const Node& op, const std::string& attr) {
+  if (op.Op()->HasAttr(attr))
+    return boost::get<T>(op.Op()->GetAttr(attr));
+  else
+    return boost::none;
 }
 
 ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle(
@@ -151,40 +109,18 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()(
 
   if (!IsReachable(graph, elementwise_add_identity, conv_output)) return;
 
-  OpDesc op_desc;
-  op_desc.SetType("conv2d");
-
-  op_desc.SetInput("Input", {conv_input->Name()});
-  op_desc.SetInput("Filter", {conv_filter->Name()});
-  op_desc.SetInput("ResidualData", {elementwise_add_identity->Name()});
-  op_desc.SetOutput("Output", {conv_output->Name()});
+  auto fuse_relu = HasAttribute<bool>(*conv_op, "fuse_relu");
+  if (fuse_relu && *fuse_relu) return;
 
-  auto conv_bias = HasBias(*conv_op, "Bias");
+  conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()});
+  conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
+  conv_op->Op()->SetAttr("fuse_residual_connection", true);
 
-  if (conv_bias) {
-    op_desc.SetInput("Bias", {(*conv_bias)->Name()});
-  }
-
-  for (const auto& attr : conv_op->Op()->GetAttrMap()) {
-    op_desc.SetAttr(attr.first, attr.second);
-  }
-
-  op_desc.SetAttr("fuse_residual_connection", true);
+  GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op});
 
-  auto fused_conv_op = graph->CreateOpNode(&op_desc);
-
-  IR_NODE_LINK_TO(conv_input, fused_conv_op);
-  IR_NODE_LINK_TO(conv_filter, fused_conv_op);
-  IR_NODE_LINK_TO(elementwise_add_identity, fused_conv_op);
-  IR_NODE_LINK_TO(fused_conv_op, conv_output);
-
-  if (conv_bias) {
-    IR_NODE_LINK_TO((*conv_bias), fused_conv_op);
-  }
+  IR_NODE_LINK_TO(elementwise_add_identity, conv_op);
+  IR_NODE_LINK_TO(conv_op, elementwise_add_out);
 
-  CorrectGraphEdges(graph, elementwise_add_out, conv_output);
-  GraphSafeRemoveNodes(graph,
-                       {elementwise_add_out, conv_op, elementwise_add_op});
   (*fusion_stats)++;
 }
 
@@ -229,60 +165,33 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()(
 
   Node* projection_node;
   Node* residual_conv_op;
-  Node* residual_conv_input;
-  Node* residual_conv_filter;
   Node* residual_conv_output;
 
   if (IsReachable(graph, conv_x_input, conv_y_output)) {
     projection_node = conv_x_output;
     residual_conv_op = conv_y_op;
-    residual_conv_input = conv_y_input;
-    residual_conv_filter = conv_y_filter;
     residual_conv_output = conv_y_output;
   } else if (IsReachable(graph, conv_y_input, conv_x_output)) {
     projection_node = conv_y_output;
     residual_conv_op = conv_x_op;
-    residual_conv_input = conv_x_input;
-    residual_conv_filter = conv_x_filter;
     residual_conv_output = conv_x_output;
   } else {
     return;
   }
 
-  OpDesc op_desc;
-  op_desc.SetType("conv2d");
+  auto fuse_relu = HasAttribute<bool>(*residual_conv_op, "fuse_relu");
+  if (fuse_relu && *fuse_relu) return;
 
-  op_desc.SetInput("Input", {residual_conv_input->Name()});
-  op_desc.SetInput("Filter", {residual_conv_filter->Name()});
-  op_desc.SetInput("ResidualData", {projection_node->Name()});
-  op_desc.SetOutput("Output", {residual_conv_output->Name()});
+  residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()});
+  residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()});
 
-  auto residual_conv_bias = HasBias(*residual_conv_op, "Bias");
+  residual_conv_op->Op()->SetAttr("fuse_residual_connection", true);
 
-  if (residual_conv_bias) {
-    op_desc.SetInput("Bias", {(*residual_conv_bias)->Name()});
-  }
-
-  for (const auto& attr : residual_conv_op->Op()->GetAttrMap()) {
-    op_desc.SetAttr(attr.first, attr.second);
-  }
-
-  op_desc.SetAttr("fuse_residual_connection", true);
+  GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op});
 
-  auto fused_conv_op = graph->CreateOpNode(&op_desc);
-
-  IR_NODE_LINK_TO(residual_conv_input, fused_conv_op);
-  IR_NODE_LINK_TO(residual_conv_filter, fused_conv_op);
-  IR_NODE_LINK_TO(projection_node, fused_conv_op);
-  IR_NODE_LINK_TO(fused_conv_op, residual_conv_output);
-
-  if (residual_conv_bias) {
-    IR_NODE_LINK_TO((*residual_conv_bias), fused_conv_op);
-  }
+  IR_NODE_LINK_TO(projection_node, residual_conv_op);
+  IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out);
 
-  CorrectGraphEdges(graph, elementwise_add_out, residual_conv_output);
-  GraphSafeRemoveNodes(
-      graph, {elementwise_add_out, residual_conv_op, elementwise_add_op});
   (*fusion_stats)++;
 }
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index dde642764fa5dfce11edcef51ad1be11be331fbc..2fe1c94ec02e8ff0a4acb81868ba2124ea89e506 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -110,22 +110,125 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     }
   }
 
+  std::vector<InferShapeVarPtr> GetInputVarPtrs(
+      const std::string &name) override {
+    const std::vector<std::string> arg_names = Inputs(name);
+    std::vector<InferShapeVarPtr> res;
+    res.reserve(arg_names.size());
+    std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
+                   [this](const std::string &name) {
+                     return block_.FindVarRecursive(name);
+                   });
+    return res;
+  }
+
+  std::vector<InferShapeVarPtr> GetOutputVarPtrs(
+      const std::string &name) override {
+    const std::vector<std::string> arg_names = Outputs(name);
+    std::vector<InferShapeVarPtr> res;
+    res.reserve(arg_names.size());
+    std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res),
+                   [this](const std::string &name) {
+                     return block_.FindVarRecursive(name);
+                   });
+    return res;
+  }
+
+  DDim GetInputDim(const std::string &name) const override {
+    const std::vector<std::string> &arg_names = Inputs(name);
+    PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                      "Input(%s) should hold one element, but now it holds %d",
+                      name, arg_names.size());
+    return this->GetDim(arg_names[0]);
+  }
+
+  std::vector<DDim> GetInputsDim(const std::string &name) const override {
+    const std::vector<std::string> &arg_names = Inputs(name);
+    return GetDims(arg_names);
+  }
+
   bool IsRuntime() const override;
 
+  std::vector<proto::VarType::Type> GetInputsVarType(
+      const std::string &name) const override {
+    return GetVarTypes(Inputs(name));
+  }
+
+  std::vector<proto::VarType::Type> GetOutputsVarType(
+      const std::string &name) const override {
+    return GetVarTypes(Outputs(name));
+  }
+
+  void SetOutputDim(const std::string &name, const DDim &dim) override {
+    auto &arg_names = Outputs(name);
+    PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                      "Output(%s) should hold one element, but now it holds %d",
+                      name, arg_names.size());
+    SetDim(arg_names[0], dim);
+  }
+
+  void SetOutputsDim(const std::string &name,
+                     const std::vector<DDim> &dims) override {
+    auto &names = Outputs(name);
+    SetDims(names, dims);
+  }
+
  protected:
-  proto::VarType::Type GetVarType(const std::string &name) const override;
+  std::vector<proto::VarType::Type> GetVarTypes(
+      const std::vector<std::string> &names) const {
+    std::vector<proto::VarType::Type> retv;
+    retv.resize(names.size());
+    std::transform(
+        names.begin(), names.end(), retv.begin(),
+        std::bind(std::mem_fn(&CompileTimeInferShapeContext::GetVarType), this,
+                  std::placeholders::_1));
+    return retv;
+  }
+
+  proto::VarType::Type GetVarType(const std::string &name) const;
+
+  DDim GetDim(const std::string &name) const {
+    auto var = block_.FindVarRecursive(name);
+    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+    DDim res;
+    try {
+      auto shape = var->GetShape();
+      res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
+    } catch (...) {
+      VLOG(5) << "GetDim of variable " << name << " error";
+      std::rethrow_exception(std::current_exception());
+    }
+    return res;
+  }
 
-  DDim GetDim(const std::string &name) const override;
+  std::vector<DDim> GetDims(const std::vector<std::string> &names) const {
+    std::vector<DDim> ret;
+    ret.reserve(names.size());
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(ret),
+        [this](const std::string &name) { return this->GetDim(name); });
+    return ret;
+  }
+
+  void SetDim(const std::string &name, const DDim &dim);
 
-  void SetDim(const std::string &name, const DDim &dim) override;
+  void SetDims(const std::vector<std::string> &names,
+               const std::vector<DDim> &dims) {
+    size_t length = names.size();
+    PADDLE_ENFORCE_EQ(length, dims.size());
+    for (size_t i = 0; i < length; ++i) {
+      if (names[i] == framework::kEmptyVarName) {
+        continue;
+      }
+      SetDim(names[i], dims[i]);
+    }
+  }
 
   std::vector<DDim> GetRepeatedDims(const std::string &name) const override;
 
   void SetRepeatedDims(const std::string &name,
                        const std::vector<DDim> &dims) override;
 
-  InferShapeVarPtr GetVarPtr(const std::string &name) override;
-
   const OpDesc &op_;
   const BlockDesc &block_;
 };
@@ -644,20 +747,6 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
   return op_.Output(name);
 }
 
-DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
-  auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-  DDim res;
-  try {
-    auto shape = var->GetShape();
-    res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
-  } catch (...) {
-    VLOG(5) << "GetDim of variable " << name << " error";
-    std::rethrow_exception(std::current_exception());
-  }
-  return res;
-}
-
 std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
     const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
@@ -696,10 +785,5 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType(
   return block_.FindVarRecursive(name)->GetType();
 }
 
-InferShapeVarPtr CompileTimeInferShapeContext::GetVarPtr(
-    const std::string &name) {
-  return block_.FindVarRecursive(name);
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index abc0c22b6e090725ac25fa51fb4c523341ec9716..b2d4de5916ba89e6692e10ac7ebef53484b576ca 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -142,12 +142,14 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames,
                                const Scope& scope) {
   for (auto& var_name_item : innames) {
     std::vector<Variable*>& input_vars = inputs[var_name_item.first];
+    input_vars.reserve(var_name_item.second.size());
     for (auto& var_name : var_name_item.second) {
       input_vars.push_back(scope.FindVar(var_name));
     }
   }
   for (auto& var_name_item : outnames) {
     std::vector<Variable*>& output_vars = outputs[var_name_item.first];
+    output_vars.reserve(var_name_item.second.size());
     for (auto& var_name : var_name_item.second) {
       output_vars.push_back(scope.FindVar(var_name));
     }
@@ -556,30 +558,28 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   bool HasOutput(const std::string& name) const override {
     // has only one output
-    const auto& outs = op_.Outputs();
+    const auto& outs = ctx_.outputs;
     auto it = outs.find(name);
     if (it == outs.end()) {
       return false;
     }
     const auto& out = it->second;
-    if (out.size() == 0 || out[0] == kEmptyVarName) {
+    if (out.size() == 0) {
       return false;
     }
     PADDLE_ENFORCE_EQ(out.size(), 1UL,
                       "Output %s should not have more than one outputs", name);
-    return scope_.FindVar(out[0]) != nullptr;
+    return out[0] != nullptr;
   }
 
   bool HasInputs(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
-      return false;
-    }
-    auto inputs = op_.Inputs(name);
-    if (inputs.empty()) {
+    const auto& ins = ctx_.inputs;
+    auto it = ins.find(name);
+    if (it == ins.end() || it->second.empty()) {
       return false;
     }
-    for (auto& input : inputs) {
-      if (scope_.FindVar(input) == nullptr) {
+    for (auto& input : it->second) {
+      if (input == nullptr) {
         return false;
       }
     }
@@ -587,15 +587,13 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   bool HasOutputs(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
-      return false;
-    }
-    auto outputs = op_.Outputs(name);
-    if (outputs.empty()) {
+    const auto& outs = ctx_.outputs;
+    auto it = outs.find(name);
+    if (it == outs.end() || it->second.empty()) {
       return false;
     }
-    for (auto& output : outputs) {
-      if (scope_.FindVar(output) == nullptr) {
+    for (auto& output : it->second) {
+      if (output == nullptr) {
         return false;
       }
     }
@@ -616,16 +614,18 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    const std::string& input_n = Inputs(in)[i];
-    const std::string& output_n = Outputs(out)[j];
+    auto in_it = ctx_.inputs.find(in);
+    auto out_it = ctx_.outputs.find(out);
+    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
+                   "Inputs %s should have %llu argument", in, i);
+    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
+                   "Outputs %s should have %llu argument", out, j);
+
+    Variable* in_var = in_it->second[i];
+    Variable* out_var = out_it->second[j];
 
-    Variable* in_var = scope_.FindVar(input_n);
-    Variable* out_var = scope_.FindVar(output_n);
     PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
-                   "The type of %s and %s is not the same.", output_n,
-                   GetDim(input_n));
+                   "The type of %s and %s is not the same.", in, out);
 
     if (in_var->IsType<framework::SelectedRows>()) {
       auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
@@ -646,13 +646,16 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) const override {
-    const std::vector<std::string>& inputs = Inputs(in);
-    const std::vector<std::string>& outputs = Outputs(out);
-    PADDLE_ENFORCE_LT(i, inputs.size());
-    PADDLE_ENFORCE_LT(j, outputs.size());
-    Variable* in_var = scope_.FindVar(inputs.at(i));
+    auto in_it = ctx_.inputs.find(in);
+    auto out_it = ctx_.outputs.find(out);
+    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
+                   "Inputs %s should have %llu argument", in, i);
+    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
+                   "Outputs %s should have %llu argument", out, j);
+
+    Variable* in_var = in_it->second.at(i);
     if (!in_var->IsType<LoDTensor>()) return;
-    Variable* out_var = scope_.FindVar(outputs.at(j));
+    Variable* out_var = out_it->second.at(j);
     PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
                    "The %d-th output of Output(%s) must be LoDTensor.", j, out);
     auto in_tensor = in_var->Get<LoDTensor>();
@@ -687,9 +690,64 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   bool IsRuntime() const override { return true; }
 
+  // TODO(paddle-dev): Can this be template?
+  std::vector<InferShapeVarPtr> GetInputVarPtrs(
+      const std::string& name) override {
+    const std::vector<Variable*>& vars = InputVars(name);
+    std::vector<InferShapeVarPtr> res;
+    res.reserve(vars.size());
+    res.insert(res.begin(), vars.begin(), vars.end());
+    return res;
+  }
+
+  std::vector<InferShapeVarPtr> GetOutputVarPtrs(
+      const std::string& name) override {
+    const std::vector<Variable*>& vars = OutputVars(name);
+    std::vector<InferShapeVarPtr> res;
+    res.reserve(vars.size());
+    res.insert(res.begin(), vars.begin(), vars.end());
+    return res;
+  }
+
+  DDim GetInputDim(const std::string& name) const override {
+    const std::vector<Variable*>& vars = InputVars(name);
+    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
+                      "Input(%s) should hold one element, but now it holds %d",
+                      name, vars.size());
+    return this->GetDim(vars[0]);
+  }
+
+  std::vector<DDim> GetInputsDim(const std::string& name) const override {
+    const std::vector<Variable*>& vars = InputVars(name);
+    return GetDims(vars);
+  }
+
+  std::vector<proto::VarType::Type> GetInputsVarType(
+      const std::string& name) const override {
+    return GetVarTypes(InputVars(name));
+  }
+
+  std::vector<proto::VarType::Type> GetOutputsVarType(
+      const std::string& name) const override {
+    return GetVarTypes(OutputVars(name));
+  }
+
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
+    auto& vars = OutputVars(name);
+    PADDLE_ENFORCE_EQ(vars.size(), 1UL,
+                      "Output(%s) should hold one element, but now it holds %d",
+                      name, vars.size());
+    SetDim(vars[0], dim);
+  }
+
+  void SetOutputsDim(const std::string& name,
+                     const std::vector<DDim>& dims) override {
+    auto& vars = OutputVars(name);
+    SetDims(vars, dims);
+  }
+
  protected:
-  DDim GetDim(const std::string& name) const override {
-    Variable* var = scope_.FindVar(name);
+  DDim GetDim(Variable* var) const {
     PADDLE_ENFORCE_NOT_NULL(var);
     if (var->IsType<LoDTensor>()) {
       return var->Get<LoDTensor>().dims();
@@ -697,25 +755,44 @@ class RuntimeInferShapeContext : public InferShapeContext {
       return var->Get<SelectedRows>().GetCompleteDims();
     } else {
       PADDLE_THROW(
-          "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's "
+          "Only LoDTensor/SelectedRows support 'GetDim', but Variables "
           "type_id is %s.",
-          name, ToTypeName(var->Type()));
+          ToTypeName(var->Type()));
     }
   }
 
+  std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
+    std::vector<DDim> ret;
+    ret.reserve(vars.size());
+    std::transform(vars.begin(), vars.end(), std::back_inserter(ret),
+                   [this](Variable* var) { return this->GetDim(var); });
+    return ret;
+  }
+
   std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
     PADDLE_THROW("Only compile time support this method");
   }
 
-  void SetDim(const std::string& name, const DDim& dim) override {
-    Variable* var = scope_.FindVar(name);
+  void SetDim(Variable* var, const DDim& dim) {
     if (var->IsType<LoDTensor>()) {
       var->GetMutable<LoDTensor>()->Resize(dim);
     } else if (var->IsType<SelectedRows>()) {
       var->GetMutable<SelectedRows>()->set_height(dim[0]);
     } else {
-      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
-                   name, ToTypeName(var->Type()));
+      PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
+                   ToTypeName(var->Type()));
+    }
+  }
+
+  void SetDims(const std::vector<Variable*>& vars,
+               const std::vector<DDim>& dims) {
+    size_t length = vars.size();
+    PADDLE_ENFORCE_EQ(length, dims.size());
+    for (size_t i = 0; i < length; ++i) {
+      if (vars[i] == nullptr) {
+        continue;
+      }
+      SetDim(vars[i], dims[i]);
     }
   }
 
@@ -724,16 +801,36 @@ class RuntimeInferShapeContext : public InferShapeContext {
     PADDLE_THROW("Only compile time support this method");
   }
 
-  proto::VarType::Type GetVarType(const std::string& name) const override {
-    auto* var = scope_.FindVar(name);
-    return ToVarType(var->Type());
+  std::vector<proto::VarType::Type> GetVarTypes(
+      const std::vector<Variable*>& vars) const {
+    std::vector<proto::VarType::Type> retv;
+    retv.resize(vars.size());
+    std::transform(vars.begin(), vars.end(), retv.begin(),
+                   std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType),
+                             this, std::placeholders::_1));
+    return retv;
   }
 
-  InferShapeVarPtr GetVarPtr(const std::string& name) override {
-    return scope_.FindVar(name);
+  proto::VarType::Type GetVarType(Variable* var) const {
+    return ToVarType(var->Type());
   }
 
  private:
+  const std::vector<Variable*>& InputVars(const std::string& name) const {
+    auto it = ctx_.inputs.find(name);
+    PADDLE_ENFORCE(it != ctx_.inputs.end(),
+                   "Operator %s does not have the input %s.", op_.Type(), name);
+    return it->second;
+  }
+
+  const std::vector<Variable*>& OutputVars(const std::string& name) const {
+    auto it = ctx_.outputs.find(name);
+    PADDLE_ENFORCE(it != ctx_.outputs.end(),
+                   "Operator %s does not have the outputs %s.", op_.Type(),
+                   name);
+    return it->second;
+  }
+
   const OperatorBase& op_;
   const Scope& scope_;
   const RuntimeContext& ctx_;
@@ -864,8 +961,7 @@ Scope* OperatorWithKernel::PrepareData(
 
     for (size_t i = 0; i < var_name_item.second.size(); ++i) {
       auto& var_name = var_name_item.second[i];
-      auto* var = scope.FindVar(var_name);
-      input_vars[i] = var;
+      auto* var = input_vars[i];
 
       // Only tensor can be tranfer to another device.
       if (var == nullptr || !VarIsTensor(*var)) {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 7e3fe02eaf5560ef03e42c6b82ed338edc30b0ab..a921f469f5e0276884fe194c99b15100a11113dc 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -190,7 +190,6 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
 
 ParallelExecutor::ParallelExecutor(
     const std::vector<platform::Place> &places,
-    const std::unordered_set<std::string> &params,
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes,
@@ -209,7 +208,7 @@ ParallelExecutor::ParallelExecutor(
                    "the number of places must be greater than 1.");
   }
 
-  // Step 1. Bcast the params to devs.
+  // Step 1. Bcast the bcast_vars to devs.
   // Create local scopes
   if (local_scopes.empty()) {
     member_->own_local_scope_ = true;
@@ -249,12 +248,12 @@ ParallelExecutor::ParallelExecutor(
 // ncclOp
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
-      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
+      main_program, member_->places_, loss_var_name, member_->local_scopes_,
+      member_->use_cuda_, member_->nccl_ctxs_.get());
 #else
   std::unique_ptr<ir::Graph> graph =
       build_strategy.Apply(main_program, member_->places_, loss_var_name,
-                           params, member_->local_scopes_, member_->use_cuda_);
+                           member_->local_scopes_, member_->use_cuda_);
 #endif
   auto max_memory_size = GetEagerDeletionThreshold();
   if (max_memory_size >= 0) {
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 1fc17a0d64d50eb70ce66cacd4752a5b96d5e894..5f6c2159aa2d90378ac298a8e56b51a188225d45 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -41,7 +41,6 @@ class ParallelExecutor {
 
  public:
   explicit ParallelExecutor(const std::vector<platform::Place> &places,
-                            const std::unordered_set<std::string> &params,
                             const std::unordered_set<std::string> &bcast_vars,
                             const ProgramDesc &main_program,
                             const std::string &loss_var_name, Scope *scope,
diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
index ddff2c7c261746ac9986e79cff3da7e0a9654adc..4ac872ac3d3bf918678f5294a4c35097c3fb18ab 100644
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
@@ -22,20 +22,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-DDim InferShapeContext::GetInputDim(const std::string &name) const {
-  const std::vector<std::string> &arg_names = Inputs(name);
-  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                    "Input(%s) should hold one element, but now it holds %d",
-                    name, arg_names.size());
-  return this->GetDim(arg_names[0]);
-}
-
-std::vector<DDim> InferShapeContext::GetInputsDim(
-    const std::string &name) const {
-  const std::vector<std::string> &arg_names = Inputs(name);
-  return GetDims(arg_names);
-}
-
 std::vector<DDim> InferShapeContext::GetReaderDims(
     const std::string &name) const {
   const std::vector<std::string> &arg_names = Inputs(name);
@@ -46,26 +32,6 @@ std::vector<DDim> InferShapeContext::GetReaderDims(
   return this->GetRepeatedDims(arg_names[0]);
 }
 
-DDim InferShapeContext::GetInputsElementDim(const std::string &name,
-                                            int idx) const {
-  const std::vector<std::string> &names = Inputs(name);
-  return this->GetDim(names[idx]);
-}
-
-void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
-  auto &arg_names = Outputs(name);
-  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                    "Output(%s) should hold one element, but now it holds %d",
-                    name, arg_names.size());
-  SetDim(arg_names[0], dim);
-}
-
-void InferShapeContext::SetOutputsDim(const std::string &name,
-                                      const std::vector<DDim> &dims) {
-  auto &names = Outputs(name);
-  SetDims(names, dims);
-}
-
 void InferShapeContext::SetReaderDims(const std::string &name,
                                       const std::vector<DDim> &dims) {
   const std::vector<std::string> &arg_names = Outputs(name);
@@ -76,69 +42,5 @@ void InferShapeContext::SetReaderDims(const std::string &name,
   return this->SetRepeatedDims(arg_names[0], dims);
 }
 
-std::vector<InferShapeVarPtr> InferShapeContext::GetInputVarPtrs(
-    const std::string &name) {
-  const std::vector<std::string> arg_names = Inputs(name);
-  std::vector<InferShapeVarPtr> res;
-  res.reserve(arg_names.size());
-  std::transform(
-      arg_names.begin(), arg_names.end(), std::back_inserter(res),
-      [this](const std::string &name) { return this->GetVarPtr(name); });
-  return res;
-}
-
-std::vector<InferShapeVarPtr> InferShapeContext::GetOutputVarPtrs(
-    const std::string &name) {
-  const std::vector<std::string> arg_names = Outputs(name);
-  std::vector<InferShapeVarPtr> res;
-  res.reserve(arg_names.size());
-  std::transform(
-      arg_names.begin(), arg_names.end(), std::back_inserter(res),
-      [this](const std::string &name) { return this->GetVarPtr(name); });
-  return res;
-}
-
-std::vector<DDim> InferShapeContext::GetDims(
-    const std::vector<std::string> &names) const {
-  std::vector<DDim> ret;
-  ret.reserve(names.size());
-  std::transform(
-      names.begin(), names.end(), std::back_inserter(ret),
-      [this](const std::string &name) { return this->GetDim(name); });
-  return ret;
-}
-
-void InferShapeContext::SetDims(const std::vector<std::string> &names,
-                                const std::vector<DDim> &dims) {
-  size_t length = names.size();
-  PADDLE_ENFORCE_EQ(length, dims.size());
-  for (size_t i = 0; i < length; ++i) {
-    if (names[i] == framework::kEmptyVarName) {
-      continue;
-    }
-    SetDim(names[i], dims[i]);
-  }
-}
-
-std::vector<proto::VarType::Type> InferShapeContext::GetInputsVarType(
-    const std::string &name) const {
-  return GetVarTypes(Inputs(name));
-}
-
-std::vector<proto::VarType::Type> InferShapeContext::GetOutputsVarType(
-    const std::string &name) const {
-  return GetVarTypes(Outputs(name));
-}
-
-std::vector<proto::VarType::Type> InferShapeContext::GetVarTypes(
-    const std::vector<std::string> &names) const {
-  std::vector<proto::VarType::Type> retv;
-  retv.resize(names.size());
-  std::transform(names.begin(), names.end(), retv.begin(),
-                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
-                           std::placeholders::_1));
-  return retv;
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index d73cca121e41e68f9fb6548117ed91c5cc1415ca..824f75b3d3cfa03020182d2ea0b2970bdd6aeeca 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -33,22 +33,23 @@ class InferShapeContext {
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
 
-  std::vector<proto::VarType::Type> GetInputsVarType(
-      const std::string &name) const;
-  std::vector<proto::VarType::Type> GetOutputsVarType(
-      const std::string &name) const;
+  virtual std::vector<proto::VarType::Type> GetInputsVarType(
+      const std::string &name) const = 0;
+  virtual std::vector<proto::VarType::Type> GetOutputsVarType(
+      const std::string &name) const = 0;
 
   virtual bool HasInputs(const std::string &name) const = 0;
   virtual bool HasOutputs(const std::string &name) const = 0;
 
-  DDim GetInputDim(const std::string &name) const;
-  std::vector<DDim> GetInputsDim(const std::string &name) const;
-  std::vector<DDim> GetReaderDims(const std::string &name) const;
-  DDim GetInputsElementDim(const std::string &name, int idx) const;
+  virtual DDim GetInputDim(const std::string &name) const = 0;
+  virtual std::vector<DDim> GetInputsDim(const std::string &name) const = 0;
+  virtual std::vector<DDim> GetReaderDims(const std::string &name) const;
 
-  void SetOutputDim(const std::string &name, const DDim &dim);
-  void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
-  void SetReaderDims(const std::string &name, const std::vector<DDim> &dims);
+  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
+  virtual void SetOutputsDim(const std::string &name,
+                             const std::vector<DDim> &dims) = 0;
+  virtual void SetReaderDims(const std::string &name,
+                             const std::vector<DDim> &dims);
 
   virtual AttrReader Attrs() const = 0;
   virtual const std::vector<std::string> &Inputs(
@@ -67,27 +68,15 @@ class InferShapeContext {
 
   virtual bool IsRuntime() const = 0;
 
-  std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
-  std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
-  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
-
-  // Note: In while op, we need this to be public
-  void SetDims(const std::vector<std::string> &names,
-               const std::vector<DDim> &dims);
+  virtual std::vector<InferShapeVarPtr> GetInputVarPtrs(
+      const std::string &name) = 0;
+  virtual std::vector<InferShapeVarPtr> GetOutputVarPtrs(
+      const std::string &name) = 0;
 
  protected:
-  virtual DDim GetDim(const std::string &name) const = 0;
-  virtual void SetDim(const std::string &name, const DDim &dim) = 0;
   virtual std::vector<DDim> GetRepeatedDims(const std::string &name) const = 0;
   virtual void SetRepeatedDims(const std::string &name,
                                const std::vector<DDim> &dims) = 0;
-
-  std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
-
-  std::vector<proto::VarType::Type> GetVarTypes(
-      const std::vector<std::string> &names) const;
-
-  virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 612503768079472ba233ee3fcd43a47fdba9a0cc..342cb68ab2bf8ceb543317ed8d8f2356ef6b2cde 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -188,11 +188,13 @@ std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
   std::vector<Variable*> ret;
   for (size_t i = 0; i < input_vars_->size(); ++i) {
     bool found = false;
+    VarBase* origin_var = (*input_vars_)[i];
     for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
       Variable* var = scope->FindVar(outvar);
-      VarBase* origin_var = (*input_vars_)[i];
       std::string orig_var = grad_to_var_->at(outvar);
-      PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
+      if (origin_var->var_desc_->Name() != orig_var) {
+        continue;
+      }
       VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
       origin_var->ApplyGrad(scope, var);
       found = true;
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 433d07c0e5aa0986ab1e9fe349ef865d2851c0c0..97772dc110135d9d2533e1574933d49f7c8cd346 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc,
 
 class Tracer {
  public:
-  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
+  explicit Tracer(framework::BlockDesc* root_block,
+                  framework::BlockDesc* startup_block)
+      : root_block_(root_block), startup_block_(startup_block) {
     root_scope_ = new framework::Scope();
     scopes_[root_block_] = root_scope_;
+    scopes_[startup_block_] = root_scope_;
   }
 
   virtual ~Tracer() { delete root_scope_; }
@@ -80,6 +83,8 @@ class Tracer {
       } else {
         op->pre_ops_->push_back(nullptr);
       }
+      VLOG(3) << "input vname " << vname << " "
+              << var->Get<framework::LoDTensor>().dims().size();
     }
 
     *op->output_vars_ = outputs;
@@ -98,12 +103,19 @@ class Tracer {
       outputs[i]->pre_op_ = op;
       outputs[i]->pre_op_out_idx_ = i;
     }
+
+    VLOG(3) << "tracer running " << op_desc->Type();
     op_base->Run(*scope, platform::CPUPlace());
-    framework::OpDesc* grad_op_desc;
-    auto grad_to_var = new std::unordered_map<std::string, std::string>();
-    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
-    op->grad_op_desc_ = grad_op_desc;
-    op->grad_to_var_ = grad_to_var;
+    if (block == startup_block_) {
+      op->grad_op_desc_ = nullptr;
+      op->grad_to_var_ = nullptr;
+    } else {
+      framework::OpDesc* grad_op_desc;
+      auto grad_to_var = new std::unordered_map<std::string, std::string>();
+      CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
+      op->grad_op_desc_ = grad_op_desc;
+      op->grad_to_var_ = grad_to_var;
+    }
     op->block_ = block;
   }
 
@@ -121,6 +133,7 @@ class Tracer {
  private:
   std::map<framework::BlockDesc*, framework::Scope*> scopes_;
   framework::BlockDesc* root_block_;
+  framework::BlockDesc* startup_block_;
   framework::Scope* root_scope_;
 };
 
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 9b5eda17faecce63964954448211238dfccdc5e6..0360cf5273591946570cac47e2578e43f498b550 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -398,26 +398,41 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
     ctx->HasInputs(kOutputs);
     ctx->HasInputs(framework::GradVarName(kOutputs));
 
-    auto p_names = ctx->Inputs(kX);
     auto pg_ig_names = ctx->Outputs(kXGRAD);
-    auto var_types = ctx->GetInputsVarType(kX);
-    std::vector<std::string> names_to_set;
-    std::vector<framework::DDim> dims_to_set;
-    for (size_t i = 0; i < p_names.size(); ++i) {
+    std::vector<framework::InferShapeVarPtr> in_var_ptrs =
+        ctx->GetInputVarPtrs(kX);
+    std::vector<framework::InferShapeVarPtr> out_var_ptrs =
+        ctx->GetOutputVarPtrs(kXGRAD);
+    PADDLE_ENFORCE(in_var_ptrs.size() == out_var_ptrs.size());
+
+    for (size_t i = 0; i < in_var_ptrs.size(); ++i) {
       if (pg_ig_names[i] == framework::kEmptyVarName) {
         continue;
       }
-      auto dims = ctx->GetInputsElementDim(kX, i);
-      if (var_types[i] == framework::proto::VarType::LOD_TENSOR) {
-        names_to_set.push_back(pg_ig_names[i]);
-        dims_to_set.push_back(dims);
-      } else if (var_types[i] == framework::proto::VarType::LOD_TENSOR_ARRAY) {
-        // not sure how to set the dim of LOD_TENSOR_ARRAY
-        names_to_set.push_back(pg_ig_names[i]);
-        dims_to_set.push_back(dims);
+      if (ctx->IsRuntime()) {
+        framework::Variable *in_var =
+            boost::get<framework::Variable *>(in_var_ptrs[i]);
+        framework::Variable *out_var =
+            boost::get<framework::Variable *>(out_var_ptrs[i]);
+
+        auto type = framework::ToVarType(in_var->Type());
+        if (type == framework::proto::VarType::LOD_TENSOR) {
+          out_var->GetMutable<LoDTensor>()->Resize(
+              in_var->Get<framework::LoDTensor>().dims());
+        } else if (type == framework::proto::VarType::SELECTED_ROWS) {
+          out_var->GetMutable<framework::SelectedRows>()->set_height(
+              in_var->Get<framework::SelectedRows>().GetCompleteDims()[0]);
+        } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
+          PADDLE_THROW("WhileGradOp doesn't support type %d",
+                       static_cast<int>(type));
+        }
+      } else {
+        framework::VarDesc *in_var =
+            boost::get<framework::VarDesc *>(in_var_ptrs[i]);
+        boost::get<framework::VarDesc *>(out_var_ptrs[i])
+            ->SetShape(in_var->GetShape());
       }
     }
-    ctx->SetDims(names_to_set, dims_to_set);
   }
 };
 
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 154ff2bb209bb8f932c06caa319223ccf3314767..8c116c4abfe42296b616dc536821e9be55a8be84 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -155,11 +155,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    if (is_conv3d) {
-      chosen_memory_format =
-          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    weights_format = mkldnn::memory::format::any;
+    // Check the format for user's special output
+    if (chosen_memory_format != mkldnn::memory::format::any) {
+      if (is_conv3d) {
+        chosen_memory_format =
+            platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+      }
     }
-    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
 
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
@@ -435,11 +438,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    if (is_conv3d) {
-      chosen_memory_format =
-          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    weights_format = mkldnn::memory::format::any;
+    // Check the format for user's special output
+    if (chosen_memory_format != mkldnn::memory::format::any) {
+      if (is_conv3d) {
+        chosen_memory_format =
+            platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+      }
     }
-    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
 
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 78956c9ea4942098002dd30e6f2f471ae49ab8d1..8c54159a41e3361322d0fa7ce36534447680207d 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <stdlib.h>
 #include <limits>
 
 #include "glog/logging.h"  // For VLOG
@@ -420,7 +421,15 @@ void GRPCClient::Proceed() {
       sync_cond_.notify_all();
     }
   }
-  VLOG(3) << "GRPCClient Proceed end";
+
+  // Last log message
+  // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a
+  // static Mutex log_mutex is used for synchronization, which might have been
+  // destructed at this moment.
+  if (FLAGS_v >= 3) {
+    std::string msg("GRPCClient Proceed end");
+    fwrite(msg.c_str(), msg.length(), 1, stdout);
+  }
 }
 
 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
index 1a149298fd33f132a90ff5de3b35dd5894a4ae68..ae669f5525443abe424109b6a6869e2ddaf52ba0 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
+                              paddle::platform::float16>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div_grad,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index 833c4072826c58277bc23e03b787fafbbaa73d03..50b2322b17bdba44f8c5c1dd4a9f0b2160f6a7d8 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
-    elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    elementwise_mul, ops::ElementwiseMulKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseMulKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseMulGradKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
index 95381774606b2d8e74519befc9a6f7a3ac20aa45..e80a703c30c0335124c089ea82ba4f6fe055acde 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fill_zeros_like_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
@@ -22,4 +23,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext,
+                             paddle::platform::float16>,
     ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu
index b255d2a7c413b4f965f6b874d342dcb93c7b5e66..4682940f7e15bc8af5dcda24ea058ac7351887c6 100644
--- a/paddle/fluid/operators/metrics/accuracy_op.cu
+++ b/paddle/fluid/operators/metrics/accuracy_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <thrust/reduce.h>
 #include "paddle/fluid/operators/metrics/accuracy_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
@@ -94,6 +95,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
 
 // FIXME(typhoonzero): types of T is for inference data.
 // label data is always int64
-REGISTER_OP_CUDA_KERNEL(accuracy,
-                        paddle::operators::AccuracyOpCUDAKernel<float>,
-                        paddle::operators::AccuracyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
+    paddle::operators::AccuracyOpCUDAKernel<double>,
+    paddle::operators::AccuracyOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 8a111e6065b102fd177b9e313cd87dcf8c22b669..271428408cb26296ff318bb39414ad0e8ecc0ac8 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -49,7 +49,8 @@ class MulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GT(
         y_dims.size(), y_num_col_dims,
         "The input tensor Y's rank of MulOp should be larger than "
-        "y_num_col_dims.");
+        "y_num_col_dims: %ld vs %ld",
+        y_dims.size(), y_num_col_dims);
 
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
diff --git a/paddle/fluid/operators/optimizers/momentum_op.cu b/paddle/fluid/operators/optimizers/momentum_op.cu
index 8ce739de8dfd74cb43f9521bf39e3127a8a21925..7f9e7246401bc3c765e539ac4395c4feef3c9508 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/momentum_op.cu
@@ -14,8 +14,11 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/optimizers/momentum_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     momentum, ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MomentumOpKernel<paddle::platform::CUDADeviceContext,
+                          paddle::platform::float16>);
diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h
index 84955d3f04308538b9a7b09beff35bc06c34bdc3..3ed1bff5ff4993e9c858dea8d56a8cb6124aca89 100644
--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -237,7 +237,8 @@ class SparseMomentumFunctor<T, UseNesterov> {
   inline HOSTDEVICE void operator()(size_t i) {
     auto row_idx =
         math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
-    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
+    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
+                       : static_cast<T>(0);
     // put memory access in register
     const T p = p_[i];
     const T lr = lr_[0];
@@ -282,7 +283,8 @@ class SparseMomentumFunctor<T, NoNesterov> {
   inline HOSTDEVICE void operator()(size_t i) {
     auto row_idx =
         math::BinarySearch<int64_t>(rows_, row_height_, i / row_numel_);
-    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0;
+    T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_]
+                       : static_cast<T>(0);
     // put memory access in register
     const T p = p_[i];
     const T lr = lr_[0];
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 0cad224ca8860b0e4bc2e3f2bc1659235aadfe2d..99a4b1b7b0b33aebd9a1a49b0b771fe6fd134bb3 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -150,7 +151,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - (*beam)) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].set(-INFINITY, -1);
+          topk[k].set(-static_cast<T>(INFINITY), -1);
         }
       }
       if (!(*is_empty)) {
@@ -160,7 +161,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
     }
 
     *max = topk[MaxLength - 1];
-    if ((*max).v == -1) *is_empty = true;
+    if ((*max).v == -static_cast<T>(1)) *is_empty = true;
     *beam = 0;
   }
 }
@@ -181,7 +182,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - *beam) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].set(-INFINITY, -1);
+          topk[k].set(-static_cast<T>(INFINITY), -1);
         }
       }
       if (!(*is_empty)) {
@@ -278,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
     bool firststep = true;
 
     for (int j = 0; j < MaxLength; j++) {
-      topk[j].set(-INFINITY, -1);
+      topk[j].set(-static_cast<T>(INFINITY), -1);
     }
     while (top_num) {
       ThreadGetTopK<T, MaxLength, BlockSize>(
@@ -362,5 +363,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
-                        paddle::operators::TopkOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    top_k, paddle::operators::TopkOpCUDAKernel<float>,
+    paddle::operators::TopkOpCUDAKernel<double>,
+    paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index cbb090adefda03717a634dab24132d36d1cfc648..6ce4bf8f13922e2756c3ee8f189bd36123d6964c 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 
 #define NCCL_ID_VARNAME "NCCLID"
 
@@ -38,6 +39,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
     return ncclInt;
   } else if (type == framework::proto::VarType::INT64) {
     return ncclInt64;
+  } else if (type == framework::proto::VarType::FP16) {
+    return ncclFloat16;
   } else {
     PADDLE_THROW("Not supported");
   }
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 34e9c897d9e95feb185083b7c0a6a824d8dc809c..be63fb877869b64035207342e5d4398e481dbc99 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -24,8 +24,9 @@ namespace pybind {
 void BindTracer(pybind11::module *m) {
   pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
       .def("__init__",
-           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
-             new (&self) imperative::Tracer(root_block);
+           [](imperative::Tracer &self, framework::BlockDesc *root_block,
+              framework::BlockDesc *startup_block) {
+             new (&self) imperative::Tracer(root_block, startup_block);
            })
       .def("trace", &imperative::Tracer::Trace)
       .def("get_scope", &imperative::Tracer::GetScope,
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 737ae2dd9c3451a0c9aabd31a5ec05b908356c98..f8a5c9deb066048ff645191c30b8c3dbf1d7eef5 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -977,7 +977,6 @@ All parameter, weight, gradient are variables in Paddle.
                 cannot be updated after being finalized.)DOC");
 
   pe.def(py::init<const std::vector<platform::Place> &,
-                  const std::unordered_set<std::string> &,
                   const std::unordered_set<std::string> &, const ProgramDesc &,
                   const std::string &, Scope *, std::vector<Scope *> &,
                   const ExecutionStrategy &, const BuildStrategy &, size_t,
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index b2c3e7c989c6e7d947055a1f907ddb439445eac5..6303be003a701e57a8aa1e2f925459f416cdb543 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -489,8 +489,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     grad_to_var = dict()
 
     op_desc = _create_op_desc_(
-        "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
-            "shape": [1],
+        "fill_constant",
+        {},
+        {"Out": [_append_grad_suffix_(loss.name)]},
+        {
+            "shape": [1],  # TODO(panyx0718): This can be loss.shape.
             "value": 1.0,
             "dtype": loss.dtype,
             "force_cpu": False,
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 3bf2fe5db0cb2126295ebfda822eeac8762dbdb7..ece97b661fd7d60f8822439a84ee4403b9e3d81c 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -22,9 +22,12 @@ from . import op_frequence
 from .op_frequence import *
 from . import quantize
 from .quantize import *
+from . import utils
+from .utils import *
 
 __all__ = []
 __all__ += decoder.__all__
 __all__ += memory_usage_calc.__all__
 __all__ += op_frequence.__all__
 __all__ += quantize.__all__
+__all__ += utils.__all__
diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py
index 20b2cc381aaa1b837ce106410246bc8cedb2fc88..1c1c2fb22709189ca03dc543ca551257c8031c1a 100644
--- a/python/paddle/fluid/contrib/utils/__init__.py
+++ b/python/paddle/fluid/contrib/utils/__init__.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 
 from __future__ import print_function
-#from . import lookup_table_utils
-#from .lookup_table_utils import *
+from . import lookup_table_utils
+from .lookup_table_utils import *
 from . import hdfs_utils
 from .hdfs_utils import *
 
-#__all__ = lookup_table_utils.__all__
-__all__ = hdfs_utils.__all__
+__all__ = []
+__all__ += lookup_table_utils.__all__
+__all__ += hdfs_utils.__all__
diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py
index baea57ccce0e9ca3a8fab244e43a107a89cfe67d..35ddf97ff2361d8abd34b16761be99990fc3880d 100644
--- a/python/paddle/fluid/contrib/utils/hdfs_utils.py
+++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py
@@ -14,6 +14,7 @@
 """HDFS Utils"""
 
 import os
+import sys
 import subprocess
 import multiprocessing
 from datetime import datetime
@@ -24,7 +25,7 @@ import errno
 
 import logging
 
-__all__ = ["HDFSClient", "multi_download"]
+__all__ = ["HDFSClient", "multi_download", "multi_upload"]
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
 _logger = logging.getLogger("hdfs_utils")
@@ -93,13 +94,15 @@ class HDFSClient(object):
 
     def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5):
         """
-            upload the local file to hdfs
-            Args:
-                hdfs_path: hdfs path, target path 
-                local_path: local file path, source path
-                overwrite: will overwrite the original file
-                retry_times: max times retry to upload
-            Returns:
+        upload the local file to hdfs
+
+        Args:
+            hdfs_path(str): the hdfs file path
+            local_path(str): the local file path
+            overwrite(bool|None): will overwrite the file on HDFS or not
+            retry_times(int|5): retry times
+
+        Returns:
                 True or False
         """
         assert hdfs_path is not None
@@ -109,7 +112,7 @@ class HDFSClient(object):
             _logger.warn(
                 "The Local path: {} is dir and I will support it later, return".
                 format(local_path))
-            return
+            return False
 
         base = os.path.basename(local_path)
         if not self.is_exist(hdfs_path):
@@ -141,14 +144,16 @@ class HDFSClient(object):
 
     def download(self, hdfs_path, local_path, overwrite=False, unzip=False):
         """
-            download from hdfs
-            Args:
-                hdfs_path: hdfs path, target path 
-                local_path: local file path, source path
-                overwrite: will remove original file and overwrite it.
-                unzip: ignore this param
-            Returns
-                True or False
+        download file from HDFS
+
+        Args:
+            hdfs_path(str): the hdfs file path
+            local_path(str): the local file path
+            overwrite(bool|None): will overwrite the file on HDFS or not
+            unzip(bool|False): if the download file is compressed by zip, unzip it or not.
+
+        Returns:
+            True or False
         """
         _logger.info('Downloading %r to %r.', hdfs_path, local_path)
         _logger.info('Download of %s to %r complete.', hdfs_path, local_path)
@@ -188,13 +193,13 @@ class HDFSClient(object):
 
     def is_exist(self, hdfs_path=None):
         """
-            whether the remote hdfs path exists?
-            Args:
-                hdfs_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
-                fs_name: The default values are the same as in the job configuration
-                fs_ugi: The default values are the same as in the job configuration
-            Returns:
-                True or False
+        whether the remote HDFS path exists
+
+        Args:
+            hdfs_path(str): the hdfs file path
+
+        Returns:
+            True or False
         """
         exist_cmd = ['-test', '-e', hdfs_path]
         returncode, output, errors = self.__run_hdfs_cmd(
@@ -211,13 +216,13 @@ class HDFSClient(object):
 
     def is_dir(self, hdfs_path=None):
         """
-            whether the remote hdfs path exists?
-            Args:
-                remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp)
-                fs_name: The default values are the same as in the job configuration
-                fs_ugi: The default values are the same as in the job configuration
-            Returns:
-                True or False
+        whether the remote HDFS path is directory
+
+        Args:
+            hdfs_path(str): the hdfs file path
+
+        Returns:
+            True or False
         """
 
         if not self.is_exist(hdfs_path):
@@ -237,17 +242,17 @@ class HDFSClient(object):
 
     def delete(self, hdfs_path):
         """
-            Remove a file or directory from HDFS.
+        Remove a file or directory from HDFS.
+
+        whether the remote HDFS path exists
 
         Args:
-            param hdfs_path: HDFS path.
-            param recursive: Recursively delete files and directories. By default,
-            this method will raise an :class:`HdfsError` if trying to delete a
-            non-empty directory.
+        hdfs_path: HDFS path.
+
         Returns:
+            True or False
             This function returns `True` if the deletion was successful and `False` if
             no file or directory previously existed at `hdfs_path`.
-
         """
         _logger.info('Deleting %r.', hdfs_path)
 
@@ -273,16 +278,14 @@ class HDFSClient(object):
 
     def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
         """
-        Rename a file or folder.
-        Args:    
-            :param hdfs_src_path: Source path.
-            :param hdfs_dst_path: Destination path. If the path already exists and is
-              a directory, the source will be moved into it. If the path exists and is
-              a file, or if a parent destination directory is missing, this method will
-              raise an :class:`HdfsError`.
+        Move a file or folder on HDFS.
+
+        Args:
+        hdfs_path(str): HDFS path.
+        overwrite(bool|False): If the path already exists and overwrite is False, will return False.
+
         Returns:
-             This function returns `True` if the rename was successful and `False` if
-             rename was faild.       
+            True or False
         """
         assert hdfs_src_path is not None
         assert hdfs_dst_path is not None
@@ -320,17 +323,20 @@ class HDFSClient(object):
                 raise
 
     def makedirs(self, hdfs_path):
-        """Create a remote directory, recursively if necessary.
+        """
+        Create a remote directory, recursively if necessary.
+
         Args:
-            :param hdfs_path: Remote path. Intermediate directories will be created
-              appropriately.
+        hdfs_path(str): Remote path. Intermediate directories will be created appropriately.
+
         Returns:
-            True if make a directories was successful, False when make a directiries was failed. 
+            True or False
         """
         _logger.info('Creating directories to %r.', hdfs_path)
         assert hdfs_path is not None
 
         if self.is_exist(hdfs_path):
+            _logger.error("HDFS path is exist: {}".format(hdfs_path))
             return
 
         mkdirs_commands = ['-mkdir', hdfs_path]
@@ -346,11 +352,13 @@ class HDFSClient(object):
 
     def ls(self, hdfs_path):
         """
-        ls a hdfs_path.
-        Args:    
-            :param hdfs_path: hdfs_path will be ls.
+        ls directory contents about HDFS hdfs_path
+
+        Args:
+        hdfs_path(str): Remote HDFS path will be ls.
+
         Returns:
-             This function returns a `list` that contaion all files in the hdfs_path.        
+            List: a contents list about hdfs_path.
         """
         assert hdfs_path is not None
 
@@ -378,11 +386,15 @@ class HDFSClient(object):
 
     def lsr(self, hdfs_path, only_file=True, sort=True):
         """
-        ls a hdfs_path sort by time.
-        Args:    
-            :param hdfs_path: hdfs_path will be ls.
+        list directory contents about HDFS hdfs_path recursively
+
+        Args:
+        hdfs_path(str): Remote HDFS path.
+        only_file(bool|True): will discard folders.
+        sort(bool|True): will be sorted by create time.
+
         Returns:
-             This function returns a `list` that contaion all files sorted by time in the hdfs_path.        
+            List: a contents list about hdfs_path.
         """
 
         def sort_by_time(v1, v2):
@@ -422,21 +434,106 @@ class HDFSClient(object):
             return ret_lines
 
 
+def multi_download(client,
+                   hdfs_path,
+                   local_path,
+                   trainer_id,
+                   trainers,
+                   multi_processes=5):
+    """
+    Download files from HDFS using multi process.
+
+    Args:
+        client(HDFSClient): instance of HDFSClient
+        hdfs_path(str): path on hdfs
+        local_path(str): path on local
+        trainer_id(int): current trainer id
+        trainers(int): all trainers number
+        multi_processes(int|5): the download data process at the same time, default=5
+
+    Returns:
+        List:
+        Download files in local folder.
+    """
+
+    def __subprocess_download(datas):
+        for data in datas:
+            re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
+            if re_path == os.curdir:
+                sub_local_re_path = local_path
+            else:
+                sub_local_re_path = os.path.join(local_path, re_path)
+            client.download(data, sub_local_re_path)
+
+    assert isinstance(client, HDFSClient)
+
+    client.make_local_dirs(local_path)
+    _logger.info("Make local dir {} successfully".format(local_path))
+
+    all_need_download = client.lsr(hdfs_path, sort=True)
+    need_download = all_need_download[trainer_id::trainers]
+    _logger.info("Get {} files From all {} files need to be download from {}".
+                 format(len(need_download), len(all_need_download), hdfs_path))
+
+    _logger.info("Start {} multi process to download datas".format(
+        multi_processes))
+    procs = []
+    for i in range(multi_processes):
+        process_datas = need_download[i::multi_processes]
+        p = multiprocessing.Process(
+            target=__subprocess_download, args=(process_datas, ))
+        procs.append(p)
+        p.start()
+
+    # complete the processes
+    for proc in procs:
+        proc.join()
+
+    _logger.info("Finish {} multi process to download datas".format(
+        multi_processes))
+
+    local_downloads = []
+    for data in need_download:
+        data_name = os.path.basename(data)
+        re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
+        if re_path == os.curdir:
+            local_re_path = os.path.join(local_path, data_name)
+        else:
+            local_re_path = os.path.join(local_path, re_path, data_name)
+        local_downloads.append(local_re_path)
+
+    return local_downloads
+
+
+def getfilelist(path):
+    rlist = []
+    for dir, folder, file in os.walk(path):
+        for i in file:
+            t = os.path.join(dir, i)
+            rlist.append(t)
+    for r in rlist:
+        print(r)
+
+
 def multi_upload(client,
                  hdfs_path,
                  local_path,
                  multi_processes=5,
-                 overwrite=False):
+                 overwrite=False,
+                 sync=True):
     """
-    Upload file to hdfs.
+    Upload files to HDFS using multi process.
+
     Args:
-        :param overwrite: will overwrite hdfs file or not
-        :param multi_processes: the upload data process at the same time, default=5
-        :param client: instance of HDFSClient
-        :param hdfs_path: path on hdfs
-        :param local_path: path on local
+        client(HDFSClient): instance of HDFSClient
+        hdfs_path(str): path on hdfs
+        local_path(str): path on local
+        multi_processes(int|5): the upload data process at the same time, default=5
+        overwrite(bool|False): will overwrite file on HDFS or not
+        sync(bool|True): upload files sync or not.
+
     Returns:
-        
+        None
     """
 
     def __subprocess_upload(datas):
@@ -446,13 +543,6 @@ def multi_upload(client,
             client.upload(hdfs_re_path, data, overwrite, retry_times=5)
 
     def get_local_files(path):
-        """
-            Get all local files
-        Args:
-            path: local file path
-        Returns:
-            A list that contation all files in the path.
-        """
         rlist = []
 
         if not os.path.isdir(path):
@@ -488,71 +578,6 @@ def multi_upload(client,
         multi_processes))
 
 
-def multi_download(client,
-                   hdfs_path,
-                   local_path,
-                   trainer_id,
-                   trainers,
-                   file_cnt,
-                   multi_processes=5):
-    """
-    multi_download
-    Args:
-        :param client: instance of HDFSClient
-        :param hdfs_path: path on hdfs
-        :param local_path: path on local
-        :param trainer_id: current trainer id
-        :param trainers: all trainers number
-        :param file_cnt: all file number
-        :param multi_processes: the download data process at the same time, default=5
-        :return: None
-    Returns:
-        A list that be downloaded. 
-    """
-
-    def __subprocess_download(datas):
-        for data in datas:
-            re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
-            local_re_path = os.path.join(local_path, re_path)
-            client.download(data, local_re_path)
-
-    assert isinstance(client, HDFSClient)
-
-    client.make_local_dirs(local_path)
-    _logger.info("Make local dir {} successfully".format(local_path))
-
-    all_need_download = client.lsr(hdfs_path, sort=True)[:file_cnt]
-    need_download = all_need_download[trainer_id::trainers]
-    _logger.info("Get {} files From all {} files need to be download from {}".
-                 format(len(need_download), len(all_need_download), hdfs_path))
-
-    _logger.info("Start {} multi process to download datas".format(
-        multi_processes))
-    procs = []
-    for i in range(multi_processes):
-        process_datas = need_download[i::multi_processes]
-        p = multiprocessing.Process(
-            target=__subprocess_download, args=(process_datas, ))
-        procs.append(p)
-        p.start()
-
-    # complete the processes
-    for proc in procs:
-        proc.join()
-
-    _logger.info("Finish {} multi process to download datas".format(
-        multi_processes))
-
-    local_downloads = []
-    for data in need_download:
-        data_name = os.path.basename(data)
-        re_path = os.path.relpath(os.path.dirname(data), hdfs_path)
-        local_re_path = os.path.join(local_path, re_path, data_name)
-        local_downloads.append(local_re_path)
-
-    return local_downloads
-
-
 if __name__ == "__main__":
     hadoop_home = "/home/client/hadoop-client/hadoop/"
 
diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
index cc2418238f98d8e2b9af0cf4290f6088c11e1b92..20e6328d81cc727340ea4a16012f6ee9967ea1e6 100644
--- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py
+++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
@@ -18,14 +18,12 @@ import os
 import time
 import logging
 
-import paddle
-import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid import io
 from paddle.fluid import Program
 
 __all__ = [
-    "load_inference_model", "load_persistable_vars",
+    "load_persistables_for_increment", "load_persistables_for_inference",
     "convert_dist_to_sparse_program"
 ]
 
@@ -80,19 +78,28 @@ def __get_prefetch_op_tuples(main_program):
     return prefetch_op_tuples
 
 
-def convert_dist_to_sparse_program(main_program):
-    if not main_program._distributed_lookup_table:
+def convert_dist_to_sparse_program(program):
+    """
+    WARNING: this function will only be used for distributed training with distributed lookup table.
+    when we train model with distributed lookup table but want to do the local inference, we can use
+    this function to convert the train program with distributed lookup table to sparse lookup table.
+
+    :param program(Program): the program must be the trainer program, which will be get by the distribute transpiler.
+    :return:
+        program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table.
+    """
+    if not program._distributed_lookup_table:
         _logger.warn(
             "There are no distributed lookup tables need to be converted")
         return
 
     # create table param and grad var in pserver program
-    origin_emb_var = "{}.origin".format(main_program._distributed_lookup_table)
-    emb_var = main_program._distributed_lookup_table
-    main_program.global_block()._rename_var(emb_var, origin_emb_var)
-    origin_param_var = main_program.global_block().vars[origin_emb_var]
+    origin_emb_var = "{}.origin".format(program._distributed_lookup_table)
+    emb_var = program._distributed_lookup_table
+    program.global_block()._rename_var(emb_var, origin_emb_var)
+    origin_param_var = program.global_block().vars[origin_emb_var]
 
-    param_var = main_program.global_block().create_var(
+    param_var = program.global_block().create_var(
         name=emb_var,
         shape=origin_param_var.shape,
         dtype=origin_param_var.dtype,
@@ -100,28 +107,28 @@ def convert_dist_to_sparse_program(main_program):
         persistable=True)
     # parameter must be selected rows
     param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
-    main_program._sync_with_cpp()
+    program._sync_with_cpp()
 
-    prefetch_op_tuples = __get_prefetch_op_tuples(main_program)
+    prefetch_op_tuples = __get_prefetch_op_tuples(program)
 
     split_ids_id = prefetch_op_tuples[0]
 
     for idx in range(split_ids_id + 2, split_ids_id - 1, -1):
-        main_program.global_block()._remove_op(idx)
-    main_program.desc.flush()
+        program.global_block()._remove_op(idx)
+    program.desc.flush()
 
     in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2])
 
     for in_out_pair in in_out_pairs:
         idx = split_ids_id
-        ids = main_program.global_block().vars[in_out_pair[0]]
-        out = main_program.global_block().vars[in_out_pair[1]]
-        __insert_lookup_sparse_table_op(main_program, idx, ids, param_var, out)
-        main_program.desc.flush()
-    return main_program
+        ids = program.global_block().vars[in_out_pair[0]]
+        out = program.global_block().vars[in_out_pair[1]]
+        __insert_lookup_sparse_table_op(program, idx, ids, param_var, out)
+        program.desc.flush()
+    return program
 
 
-def load_persistable_vars(executor, dirname, program, lookup_table_var):
+def _load_persistable_vars(executor, dirname, program, lookup_table_vars):
     def _is_checkpoint_var(exclude_fluid_vars=None):
         """
         the checkpoint will not save or load all the variables.
@@ -159,8 +166,82 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var):
 
         return is_valid
 
-    def _load_lookup_table_vars(executor, dirname, main_program,
-                                lookup_table_vars):
+    io.load_vars(
+        executor,
+        dirname=dirname,
+        main_program=program,
+        predicate=_is_checkpoint_var(lookup_table_vars),
+        filename=None)
+
+
+def load_persistables_for_increment(dirname, executor, program,
+                                    lookup_table_var, lookup_table_var_path):
+    """
+    WARNING: this function will only be used for distributed training with distributed lookup table.
+    for increment trainning, the pserver will not only load dense variables,
+    but also load the suitable lookup table var. Because of slice lookup table
+    var with HASH, we must load the correct slice var.
+
+
+    :param dirname(str): The directory path
+    :param executor(Executor): The executor to run for loading inference model.
+    :param program(Program): The parameter server program, which will run on Pserver.
+    :param lookup_table_var: the distributed lookup tables var name.
+    :param lookup_table_var_path: the the distributed lookup tables var location.
+    :return: None
+    """
+
+    def __load_lookup_table_vars(executor, main_program, lookup_table_var,
+                                 lookup_table_var_path):
+        emb_var = main_program.global_block().var(lookup_table_var)
+
+        load_program = Program()
+        load_block = load_program.global_block()
+        load_block.append_op(
+            type='load',
+            inputs={},
+            outputs={'Out': [emb_var]},
+            attrs={'file_path': lookup_table_var_path})
+        executor.run(load_program)
+
+    if not os.path.isdir(dirname):
+        raise ValueError("There is no directory named '%s'", dirname)
+
+    if not os.path.exists(lookup_table_var_path):
+        raise ValueError("There is no file named '%s'", lookup_table_var_path)
+
+    if not isinstance(program, Program):
+        raise ValueError("program must be an instance of fluid.Program")
+
+    _logger.info("Start Load Sparse Program With "
+                 "Distributed Lookup Table Vars from {}, time = {}".format(
+                     dirname, time.ctime()))
+
+    _load_persistable_vars(executor, dirname, program, [lookup_table_var])
+    __load_lookup_table_vars(executor, program, lookup_table_var,
+                             lookup_table_var_path)
+
+    _logger.info("Finish Load Sparse Program With "
+                 "Distributed Lookup Table Vars from {}, time = {}".format(
+                     dirname, time.ctime()))
+
+
+def load_persistables_for_inference(dirname, executor, program,
+                                    lookup_table_var_name):
+    """
+    WARNING: this function will only be used for inference with distributed lookup table.
+    Inference with distributed lookup table is a little funky, this function will load distributed
+    lookup table vars into sparse var, can be used in local inference mode.
+
+    :param dirname(str): The directory path
+    :param executor(Executor): The executor to run for loading inference model.
+    :param program(Program): The parameter server program, which will run on Pserver.
+    :param lookup_table_var_name: the distributed lookup tables var name.
+    :return: None
+    """
+
+    def __load_lookup_table_vars(executor, dirname, main_program,
+                                 lookup_table_vars):
         if not os.path.isdir(dirname):
             raise ValueError("There is no directory named '%s'", dirname)
 
@@ -209,48 +290,34 @@ def load_persistable_vars(executor, dirname, program, lookup_table_var):
         global_block.append_op(type='delete_var', inputs={'X': sums})
         executor.run(convert_program)
 
-    _logger.info("Start Load Sparse Program With "
-                 "Distributed Lookup Table Vars from {}, time = {}".format(
-                     dirname, time.ctime()))
-
-    lookup_table_vars = [lookup_table_var]
-
-    io.load_vars(
-        executor,
-        dirname=dirname,
-        main_program=program,
-        predicate=_is_checkpoint_var(lookup_table_vars),
-        filename=None)
-
-    _load_lookup_table_vars(executor, dirname, program, lookup_table_vars)
-
-    _logger.info("Finish Load Sparse Program With "
-                 "Distributed Lookup Table Vars from {}, time = {}".format(
-                     dirname, time.ctime()))
-
-
-def load_inference_model(dirname, executor, lookup_table_var_name):
     if not os.path.isdir(dirname):
         raise ValueError("There is no directory named '%s'", dirname)
 
-    local_model = os.path.join(dirname, model_filename)
+    if program:
+        if not isinstance(program, Program):
+            raise ValueError("program must be an instance of fluid.Program")
+    else:
+        local_model = os.path.join(dirname, model_filename)
 
-    with open(local_model, "rb") as f:
-        program_desc_str = f.read()
+        with open(local_model, "rb") as f:
+            program_desc_str = f.read()
 
-    program = Program.parse_from_string(program_desc_str)
+        program = Program.parse_from_string(program_desc_str)
 
-    if not core._is_program_version_supported(program._version()):
-        raise ValueError("Unsupported program version: %d\n" %
-                         program._version())
+        if not core._is_program_version_supported(program._version()):
+            raise ValueError("Unsupported program version: %d\n" %
+                             program._version())
 
-    # Binary data also need version.
-    load_persistable_vars(executor, dirname, program, lookup_table_var_name)
+    _logger.info("Start Load Sparse Program With "
+                 "Distributed Lookup Table Vars from {}, time = {}".format(
+                     dirname, time.ctime()))
+
+    _load_persistable_vars(executor, dirname, program, [lookup_table_var_name])
+    __load_lookup_table_vars(executor, dirname, program,
+                             [lookup_table_var_name])
 
-    feed_target_names = program.desc.get_feed_target_names()
-    fetch_target_names = program.desc.get_fetch_target_names()
-    fetch_targets = [
-        program.global_block().var(name) for name in fetch_target_names
-    ]
+    _logger.info("Finish Load Sparse Program With "
+                 "Distributed Lookup Table Vars from {}, time = {}".format(
+                     dirname, time.ctime()))
 
-    return [program, feed_target_names, fetch_targets]
+    return program
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 13d2893fd146b5a3d9100ee1ba6c2243cb9c411b..af02721eb72c1d0f8aa3d7ab8db504c4c33b64d5 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -44,6 +44,8 @@ class DataToLoDTensorConverter(object):
             self.dtype = 'int64'
         elif dtype == core.VarDesc.VarType.FP64:
             self.dtype = 'float64'
+        elif dtype == core.VarDesc.VarType.FP16:
+            self.dtype = 'float16'
         elif dtype == core.VarDesc.VarType.INT32:
             self.dtype = 'int32'
         elif dtype == core.VarDesc.VarType.UINT8:
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index b5d603d4781a7fa0b964b635e8f94f2557aeccd8..de30ed2fc5858187d2ecede299832701304e4198 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1324,6 +1324,9 @@ class Block(object):
     def _prepend_op(self, *args, **kwargs):
         op_desc = self.desc._prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
+        if _in_imperative_mode():
+            _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
+                                       [v._ivar for v in op.outputs], self.desc)
         self.ops.insert(0, op)
         return op
 
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index 15d38ddb56c71ef7de67f79cf52cd26070f470cb..aa48ef71aa61086764019ac29abd9cb4c53325fa 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -28,7 +28,8 @@ def enabled():
 def guard():
     train = framework.Program()
     startup = framework.Program()
-    tracer = core.Tracer(train.current_block().desc)
+    tracer = core.Tracer(train.current_block().desc,
+                         startup.current_block().desc)
     with framework.program_guard(train, startup):
         with framework.unique_name.guard():
             with framework._imperative_guard(tracer):
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 1a28f7f4ae35295394b560d79e3dc0cdd5f2beab..044717c31975d671818cae17cd989774c96ed9fa 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -25,11 +25,9 @@ __all__ = ['PyLayer']
 
 class PyLayer(core.Layer):
     def __init__(self):
-        pass
+        self._built = False
 
     def __call__(self, inputs):
-        # TODO(panyx0718): Support declarative mode as well.
-        assert base.enabled()
         if not isinstance(inputs, list) and not isinstance(inputs, tuple):
             inputs = [inputs]
 
@@ -37,8 +35,15 @@ class PyLayer(core.Layer):
         for x in inputs:
             py_var = base.to_variable(x)
             var_inputs.append(py_var)
+        if not self._built:
+            self._build_once(inputs)
+            self._built = True
+
         outputs = self.forward(var_inputs)
         return outputs
 
+    def _build_once(self, inputs):
+        pass
+
     def forward(self, inputs):
         return []
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index b37ebbe5179ba6e36be70ff936cb8a3ca0d89d13..26d1f8f4d2bd67a35c4ec96a025ee273cec4dbd1 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -18,6 +18,7 @@ from . import framework
 import numpy as np
 import contextlib
 from .core import VarDesc
+from . import unique_name
 
 __all__ = [
     'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
@@ -207,16 +208,39 @@ class UniformInitializer(Initializer):
         # Initialization Ops should be prepended and not appended
         if self._seed == 0:
             self._seed = block.program.random_seed
+
+        # to be compatible of fp16 initalizers
+        if var.dtype == VarDesc.VarType.FP16:
+            out_dtype = VarDesc.VarType.FP32
+            out_var = block.create_var(
+                name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
+                shape=var.shape,
+                dtype=out_dtype,
+                type=VarDesc.VarType.LOD_TENSOR,
+                persistable=False)
+        else:
+            out_dtype = var.dtype
+            out_var = var
+
         op = block._prepend_op(
             type="uniform_random",
-            outputs={"Out": var},
+            outputs={"Out": out_var},
             attrs={
                 "shape": var.shape,
-                "dtype": int(var.dtype),
+                "dtype": out_dtype,
                 "min": self._low,
                 "max": self._high,
                 "seed": self._seed
             })
+
+        if var.dtype == VarDesc.VarType.FP16:
+            block.append_op(
+                type="cast",
+                inputs={"X": out_var},
+                outputs={"Out": var},
+                attrs={"in_dtype": out_var.dtype,
+                       "out_dtype": var.dtype})
+
         var.op = op
         return op
 
@@ -261,17 +285,39 @@ class NormalInitializer(Initializer):
         # Initialization Ops should be prepended and not appended
         if self._seed == 0:
             self._seed = block.program.random_seed
+
+        # to be compatible of fp16 initalizers
+        if var.dtype == VarDesc.VarType.FP16:
+            out_dtype = VarDesc.VarType.FP32
+            out_var = block.create_var(
+                name=unique_name.generate(".".join(['gaussian_random', 'tmp'])),
+                shape=var.shape,
+                dtype=out_dtype,
+                type=VarDesc.VarType.LOD_TENSOR,
+                persistable=False)
+        else:
+            out_dtype = var.dtype
+            out_var = var
+
         op = block._prepend_op(
             type="gaussian_random",
-            outputs={"Out": var},
+            outputs={"Out": out_var},
             attrs={
                 "shape": var.shape,
-                "dtype": int(var.dtype),
+                "dtype": out_dtype,
                 "mean": self._mean,
                 "std": self._std_dev,
                 "seed": self._seed,
                 "use_mkldnn": False
             })
+
+        if var.dtype == VarDesc.VarType.FP16:
+            block.append_op(
+                type="cast",
+                inputs={"X": out_var},
+                outputs={"Out": var},
+                attrs={"in_dtype": out_var.dtype,
+                       "out_dtype": var.dtype})
         var.op = op
         return op
 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index dde05189722fef77e03a1c2d8f3cbae44a3e8245..06039b206b4ddb02e38035134e50b353b987074e 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -63,14 +63,18 @@ def noam_decay(d_model, warmup_steps):
     Returns:
         The decayed learning rate.
     """
-    with default_main_program()._lr_schedule_guard():
-        global_step = _decay_step_counter(1)
 
-        a = global_step**-0.5
-        b = (warmup_steps**-1.5) * global_step
-        lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
+    def _lr_schedule(dtype):
+        with default_main_program()._lr_schedule_guard():
+            global_step = _decay_step_counter(1)
 
-    return lr_value
+            a = global_step**-0.5
+            b = (warmup_steps**-1.5) * global_step
+            lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
+
+        return lr_value
+
+    return _lr_schedule
 
 
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -109,15 +113,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
           sgd_optimizer.minimize(avg_cost)
 
     """
-    with default_main_program()._lr_schedule_guard():
-        global_step = _decay_step_counter()
 
-        div_res = global_step / decay_steps
-        if staircase:
-            div_res = ops.floor(div_res)
-        decayed_lr = learning_rate * (decay_rate**div_res)
+    def _lr_schedule(dtype):
+        with default_main_program()._lr_schedule_guard():
+            global_step = _decay_step_counter()
+
+            div_res = global_step / decay_steps
+            if staircase:
+                div_res = ops.floor(div_res)
+            decayed_lr = learning_rate * (decay_rate**div_res)
 
-        return decayed_lr
+            return decayed_lr
+
+    return _lr_schedule
 
 
 def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -138,15 +146,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     Returns:
         The decayed learning rate
     """
-    with default_main_program()._lr_schedule_guard():
-        global_step = _decay_step_counter()
 
-        div_res = global_step / decay_steps
-        if staircase:
-            div_res = ops.floor(div_res)
-        decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
+    def _lr_schedule(dtype):
+        with default_main_program()._lr_schedule_guard():
+            global_step = _decay_step_counter()
+
+            div_res = global_step / decay_steps
+            if staircase:
+                div_res = ops.floor(div_res)
+            decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
+
+            return decayed_lr
 
-        return decayed_lr
+    return _lr_schedule
 
 
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
@@ -184,16 +196,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
                     staircase=True))
           sgd_optimizer.minimize(avg_cost)
     """
-    with default_main_program()._lr_schedule_guard():
-        global_step = _decay_step_counter()
 
-        div_res = global_step / decay_steps
-        if staircase:
-            div_res = ops.floor(div_res)
+    def _lr_schedule(dtype):
+        with default_main_program()._lr_schedule_guard():
+            global_step = _decay_step_counter()
 
-        decayed_lr = learning_rate / (1 + decay_rate * div_res)
+            div_res = global_step / decay_steps
+            if staircase:
+                div_res = ops.floor(div_res)
 
-        return decayed_lr
+            decayed_lr = learning_rate / (1 + decay_rate * div_res)
+
+            return decayed_lr
+
+    return _lr_schedule
 
 
 def polynomial_decay(learning_rate,
@@ -224,28 +240,33 @@ def polynomial_decay(learning_rate,
     Returns:
         Variable: The decayed learning rate
     """
-    with default_main_program()._lr_schedule_guard():
-        global_step = _decay_step_counter()
 
-        if cycle:
-            div_res = ops.ceil(global_step / decay_steps)
-            zero_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=0.0)
-            one_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=1.0)
+    def _lr_schedule(dtype, decay_steps=decay_steps):
+        with default_main_program()._lr_schedule_guard():
+            global_step = _decay_step_counter()
+
+            if cycle:
+                div_res = ops.ceil(global_step / decay_steps)
+                zero_var = tensor.fill_constant(
+                    shape=[1], dtype=dtype, value=0.0)
+                one_var = tensor.fill_constant(
+                    shape=[1], dtype=dtype, value=1.0)
+
+                with control_flow.Switch() as switch:
+                    with switch.case(global_step == zero_var):
+                        tensor.assign(input=one_var, output=div_res)
+                decay_steps = decay_steps * div_res
+            else:
+                decay_steps_var = tensor.fill_constant(
+                    shape=[1], dtype=dtype, value=float(decay_steps))
+                global_step = nn.elementwise_min(
+                    x=global_step, y=decay_steps_var)
 
-            with control_flow.Switch() as switch:
-                with switch.case(global_step == zero_var):
-                    tensor.assign(input=one_var, output=div_res)
-            decay_steps = decay_steps * div_res
-        else:
-            decay_steps_var = tensor.fill_constant(
-                shape=[1], dtype='float32', value=float(decay_steps))
-            global_step = nn.elementwise_min(x=global_step, y=decay_steps_var)
+            decayed_lr = (learning_rate - end_learning_rate) * \
+                ((1 - global_step / decay_steps) ** power) + end_learning_rate
+            return decayed_lr
 
-        decayed_lr = (learning_rate - end_learning_rate) * \
-            ((1 - global_step / decay_steps) ** power) + end_learning_rate
-        return decayed_lr
+    return _lr_schedule
 
 
 def piecewise_decay(boundaries, values):
@@ -273,38 +294,42 @@ def piecewise_decay(boundaries, values):
 
 
     """
-    with default_main_program()._lr_schedule_guard():
-        if len(values) - len(boundaries) != 1:
-            raise ValueError("len(values) - len(boundaries) should be 1")
-
-        global_step = _decay_step_counter()
-
-        lr = tensor.create_global_var(
-            shape=[1],
-            value=0.0,
-            dtype='float32',
-            persistable=True,
-            name="learning_rate")
-
-        with control_flow.Switch() as switch:
-            for i in range(len(boundaries)):
-                boundary_val = tensor.fill_constant(
-                    shape=[1],
-                    dtype='float32',
-                    value=float(boundaries[i]),
-                    force_cpu=True)
-                value_var = tensor.fill_constant(
-                    shape=[1], dtype='float32', value=float(values[i]))
-                with switch.case(global_step < boundary_val):
-                    tensor.assign(value_var, lr)
-            last_value_var = tensor.fill_constant(
+
+    def _lr_schedule(dtype):
+        with default_main_program()._lr_schedule_guard():
+            if len(values) - len(boundaries) != 1:
+                raise ValueError("len(values) - len(boundaries) should be 1")
+
+            global_step = _decay_step_counter()
+
+            lr = tensor.create_global_var(
                 shape=[1],
+                value=0.0,
                 dtype='float32',
-                value=float(values[len(values) - 1]))
-            with switch.default():
-                tensor.assign(last_value_var, lr)
+                persistable=True,
+                name="learning_rate")
+
+            with control_flow.Switch() as switch:
+                for i in range(len(boundaries)):
+                    boundary_val = tensor.fill_constant(
+                        shape=[1],
+                        dtype='float32',
+                        value=float(boundaries[i]),
+                        force_cpu=True)
+                    value_var = tensor.fill_constant(
+                        shape=[1], dtype='float32', value=float(values[i]))
+                    with switch.case(global_step < boundary_val):
+                        tensor.assign(value_var, lr)
+                last_value_var = tensor.fill_constant(
+                    shape=[1],
+                    dtype='float32',
+                    value=float(values[len(values) - 1]))
+                with switch.default():
+                    tensor.assign(last_value_var, lr)
+
+        return lr
 
-    return lr
+    return _lr_schedule
 
 
 def append_LARS(params_grads, learning_rate, weight_decay):
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 4d8311a0d3ada78e4f6cc54f8990e2a2e2cadc4d..4d44ce50a310cc6c95318a159b15544d8628e0bf 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -29,6 +29,7 @@ from . import utils
 from .. import unique_name
 from functools import reduce
 from .. import core
+from ..imperative import layers
 
 __all__ = [
     'fc',
@@ -2797,6 +2798,10 @@ def batch_norm(input,
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
 
+    # use fp32 for bn parameter
+    if dtype == core.VarDesc.VarType.FP16:
+        dtype = core.VarDesc.VarType.FP32
+
     input_shape = input.shape
     if data_layout == 'NCHW':
         channel_num = input_shape[1]
@@ -2831,7 +2836,7 @@ def batch_norm(input,
             trainable=False,
             do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
-        dtype=input.dtype)
+        dtype=dtype)
     mean.stop_gradient = True
 
     variance = helper.create_parameter(
@@ -2841,7 +2846,7 @@ def batch_norm(input,
             trainable=False,
             do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
-        dtype=input.dtype)
+        dtype=dtype)
     variance.stop_gradient = True
 
     # create output
@@ -9426,3 +9431,47 @@ def huber_loss(input, label, delta):
                  'Residual': residual},
         attrs={'delta': delta})
     return out
+
+
+class FC(layers.PyLayer):
+    def __init__(self,
+                 size,
+                 param_attr=None,
+                 num_flatten_dims=1,
+                 dtype=core.VarDesc.VarType.FP32):
+        super(FC, self).__init__()
+        self._size = size
+        self._num_flatten_dims = num_flatten_dims
+        self._dtype = dtype
+        self._helper = LayerHelper('FC', param_attr=param_attr)
+
+    def _build_once(self, inputs):
+        input_shape = inputs[0].shape
+        param_shape = [
+            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
+        ] + [self._size]
+        self._w = self._helper.create_parameter(
+            attr=self._helper.param_attr,
+            shape=param_shape,
+            dtype=self._dtype,
+            is_bias=False)
+
+    def forward(self, inputs):
+        tmp = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="mul",
+            inputs={"X": inputs[0],
+                    "Y": self._w},
+            outputs={"Out": tmp},
+            attrs={
+                "x_num_col_dims": self._num_flatten_dims,
+                "y_num_col_dims": 1
+            })
+
+        out = self._helper.create_variable_for_type_inference(self._dtype)
+        self._helper.append_op(
+            type="sum",
+            inputs={"X": [tmp]},
+            outputs={"Out": out},
+            attrs={"use_mkldnn": False})
+        return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 59c22d4e498814d468c78b10265b7afe35461dfb..58cfc498c9edd77163b2bd4cad2cb991b6f2b20c 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -50,17 +50,21 @@ class Optimizer(object):
 
     def __init__(self, learning_rate, regularization=None, name=None):
         if not isinstance(learning_rate, float) and \
-                not isinstance(learning_rate, framework.Variable):
-            raise TypeError("learning rate should be float or Variable")
+                not isinstance(learning_rate, framework.Variable) and \
+                not callable(learning_rate):
+            raise TypeError(
+                "learning rate should be float or Variable or callable(dtype)")
         self._name = name
         self.regularization = regularization
         self._learning_rate = learning_rate
         # the learning rate type should be inferenced from loss
         self._dtype = None
         # each program should have a independent learning rate
-        # program -> Variable(learning_rate)
+        # program -> Variable(learning_rate) or:
+        # program -> callable(return learning_rate Variable)
         self._learning_rate_map = dict()
-        if isinstance(self._learning_rate, framework.Variable):
+        if isinstance(self._learning_rate, framework.Variable) or \
+            callable(self._learning_rate):
             self._learning_rate_map[framework.default_main_program(
             )] = self._learning_rate
         # Dictionary of accumulators. Some optimizer subclasses need to
@@ -75,6 +79,11 @@ class Optimizer(object):
 
         if isinstance(lr, framework.Variable):
             return
+        elif callable(lr):
+            dtype = 'float32' if self._dtype is None else self._dtype
+            self._learning_rate_map[framework.default_main_program()] = lr(
+                dtype)
+            return
         else:
             if not isinstance(self._learning_rate, float):
                 raise TypeError(
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index c54c3963a152851f5396c2ba71c28cc09c1cd523..74cf76da951a4cea884c4fdb8591b3d4fb010300 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -92,35 +92,27 @@ class ParallelExecutor(object):
                  num_trainers=1,
                  trainer_id=0,
                  scope=None):
+        # step1: get places, the places are used in run too.
         self._places = []
-        self._act_places = []
         if use_cuda:
-            gpus = []
             gpus_env = os.getenv("FLAGS_selected_gpus")
             if gpus_env:
                 gpus = [int(s) for s in gpus_env.split(",")]
             else:
-                for i in six.moves.range(core.get_cuda_device_count()):
-                    gpus.append(i)
-            for i in gpus:
-                p = core.Place()
-                self._act_places.append(core.CUDAPlace(i))
-                p.set_place(self._act_places[-1])
-                self._places.append(p)
+                gpus = [
+                    i for i in six.moves.range(core.get_cuda_device_count())
+                ]
+            self._places = [core.CUDAPlace(i) for i in gpus]
         else:
             cpu_num = int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            for i in six.moves.range(cpu_num):
-                p = core.Place()
-                self._act_places.append(core.CPUPlace())
-                p.set_place(self._act_places[-1])
-                self._places.append(p)
+            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
         assert self._places, "no place for execution"
 
+        # step2: init exec_strategy
         if exec_strategy is None:
             exec_strategy = ExecutionStrategy()
         exec_strategy.use_cuda = use_cuda
-
         if exec_strategy.num_threads == 0:
             if use_cuda:
                 # Experiments on se-resnext shows that too many threads hurt
@@ -131,49 +123,54 @@ class ParallelExecutor(object):
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
                 exec_strategy.num_threads = cpu_num * 2
 
+        # step3: init build_strategy
         if build_strategy is None:
             build_strategy = BuildStrategy()
-
         build_strategy.num_trainers = num_trainers
         build_strategy.trainer_id = trainer_id
 
-        main = main_program
-        main = main if main else framework.default_main_program()
+        # step4: get main_program, scope, local_scopes
+        main = main_program if main_program \
+            else framework.default_main_program()
+        scope = scope if scope is not None else executor.global_scope()
+
+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+
+        local_scopes = share_vars_from.executor.local_scopes()\
+            if share_vars_from else []
 
+        # step5: check trainers_endpoints, it is used for distribution.
         trainers_endpoints = main._trainers_endpoints
         if num_trainers > 1 and trainers_endpoints:
             assert num_trainers == len(
                 trainers_endpoints), "num_trainers == len(end_points)"
             build_strategy.trainers_endpoints = trainers_endpoints
 
-        if scope == None:
-            scope = executor.global_scope()
-
-        if share_vars_from and not isinstance(share_vars_from,
-                                              ParallelExecutor):
-            raise TypeError("share_vars_from must be ParallelExecutor.")
-
-        local_scopes = share_vars_from.executor.local_scopes(
-        ) if share_vars_from else []
-
-        self.persistable_vars = [
-            v.name for v in [
+        # step5: get persistable_vars, parameter_vars, places. persistable_vars
+        # need be broadcast to other local_scope.
+        persistable_vars = set([
+            cpt.to_text(v.name) for v in [
                 var for var in main.list_vars()
                 if var.persistable and var.type != core.VarDesc.VarType.RAW
             ]
-        ]
+        ])
+
+        def place_obj(place):
+            p = core.Place()
+            p.set_place(place)
+            return p
 
+        places = list(map(place_obj, self._places))
+
+        # step6: init ParallelExecutor
         self.executor = core.ParallelExecutor(
-            self._places,
-            set([
-                cpt.to_text(p.name)
-                for p in main.global_block().iter_parameters()
-                if not p.stop_gradient
-            ]),
-            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+            places, persistable_vars, main.desc,
             cpt.to_text(loss_name)
             if loss_name else six.u(''), scope, local_scopes, exec_strategy,
             build_strategy, num_trainers, trainer_id)
+
         self.scope = scope
 
     def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
@@ -261,7 +258,7 @@ class ParallelExecutor(object):
             self.executor.feed_and_split_tensor_into_local_scopes(
                 feed_tensor_dict)
         elif isinstance(feed, list) or isinstance(feed, tuple):
-            if len(feed) != len(self._act_places):
+            if len(feed) != len(self._places):
                 raise ValueError(
                     "Feed a list of tensor, the list should be the same size as places"
                 )
@@ -277,7 +274,7 @@ class ParallelExecutor(object):
                     tensor = each[feed_name]
                     if not isinstance(tensor, core.LoDTensor):
                         tmp = core.LoDTensor()
-                        tmp.set(tensor, self._act_places[i])
+                        tmp.set(tensor, self._places[i])
                         tensor = tmp
                     res_dict[feed_name] = tensor
                 res.append(res_dict)
@@ -294,4 +291,4 @@ class ParallelExecutor(object):
 
     @property
     def device_count(self):
-        return len(self._act_places)
+        return len(self._places)
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 76a707efdc0804be0316ab12c347ffed6199529a..0fe836683b029698b670bbb9f9bb258c2f3b68a0 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -368,6 +368,8 @@ class OpTest(unittest.TestCase):
                 place = core.CUDAPlace(0)
                 if core.is_float16_supported(place):
                     return [place]
+                else:
+                    return []
             else:
                 return []
         places = [fluid.CPUPlace()]
diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
index 1b2b53f2d4ce91ae7b5b191ed770b5338f0948c8..5257b0be6f61bc90a6492c44044c122485f4742c 100644
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
@@ -22,8 +22,10 @@ from op_test import OpTest
 class TestAccuracyOp(OpTest):
     def setUp(self):
         self.op_type = "accuracy"
+        self.dtype = np.float32
+        self.init_dtype()
         n = 8192
-        infer = np.random.random((n, 1)).astype("float32")
+        infer = np.random.random((n, 1)).astype(self.dtype)
         indices = np.random.randint(0, 2, (n, 1))
         label = np.random.randint(0, 2, (n, 1))
         self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
@@ -34,14 +36,25 @@ class TestAccuracyOp(OpTest):
                     num_correct += 1
                     break
         self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype("float32"),
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
             'Correct': np.array([num_correct]).astype("int32"),
             'Total': np.array([n]).astype("int32")
         }
 
+    def init_dtype(self):
+        pass
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestAccuracyOpFp16(TestAccuracyOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
index 1902a9869807ba7ce3f9828c124256cc6752857e..438d45b84033b697c3210acc44392b93bf436df0 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import unittest
 
-from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride
+from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1
 
 
 class TestMKLDNN(TestConv2dOp):
@@ -37,5 +37,23 @@ class TestMKLDNNWithStride(TestWithStride):
         self.data_format = "NCHW"
 
 
+class TestMKLDNNWithGroup(TestWithGroup):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNWith1x1(TestWith1x1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
index cadaf1df53af0af56afa8c3631b0f5ce390f318c..15d4db590edc9012604361751e9860ba63239bba 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
@@ -21,14 +21,16 @@ from op_test import OpTest
 class ElementwiseDivOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_div"
+        self.dtype = np.float32
+        self.init_dtype()
         """ Warning
         CPU gradient check error!
         'X': np.random.random((32,84)).astype("float32"),
         'Y': np.random.random((32,84)).astype("float32")
         """
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
         }
         self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
 
@@ -46,6 +48,9 @@ class ElementwiseDivOp(OpTest):
         self.check_grad(
             ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
 
+    def init_dtype(self):
+        pass
+
 
 class TestElementwiseDivOp_scalar(ElementwiseDivOp):
     def setUp(self):
@@ -126,5 +131,21 @@ class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
         }
 
 
+class TestElementwiseDivOpFp16(ElementwiseDivOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=1, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=1, no_grad_set=set('Y'))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 57ba34f833f824d13e0b82caea789f7f57622bc9..04840991883229614c1ca4890e5cec2e7ae21084 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -135,5 +135,10 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
         }
 
 
+class TestElementwiseMulOpFp16(ElementwiseMulOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
index eec73d0beb39c49f535a03532e536092001c8445..20f1a110c35d689064c49efba246f078c3badd33 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
@@ -22,12 +22,22 @@ from op_test import OpTest
 class TestFillZerosLikeOp(OpTest):
     def setUp(self):
         self.op_type = "fill_zeros_like"
-        self.inputs = {'X': np.random.random((219, 232)).astype("float32")}
+        self.dtype = np.float32
+        self.init_dtype()
+        self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
         self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
 
+    def init_dtype(self):
+        pass
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestFillZerosLikeOpFp16(TestFillZerosLikeOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
index b5b6305155d1ef3dcf6ce590c221664754c5bdc8..0fe69d1bd4b1b10c09879871c8cf1fc197d1106b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -12,12 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
 import unittest
-import sys
 import numpy as np
 
 import paddle.fluid as fluid
 from paddle.fluid import core
+from paddle.fluid.layers.nn import FC
+
+
+@contextlib.contextmanager
+def new_program_scope():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
 
 
 class MyLayer(fluid.imperative.PyLayer):
@@ -30,6 +41,23 @@ class MyLayer(fluid.imperative.PyLayer):
         return [fluid.layers.elementwise_mul(x, x)]
 
 
+class MLP(fluid.imperative.PyLayer):
+    def __init__(self):
+        super(MLP, self).__init__()
+        self._fc1 = FC(3,
+                       fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)))
+        self._fc2 = FC(4,
+                       fluid.ParamAttr(
+                           initializer=fluid.initializer.Constant(value=0.1)))
+
+    def forward(self, inputs):
+        x = self._fc1(inputs[0])
+        x = self._fc2(x)
+        x = fluid.layers.reduce_sum(x)
+        return x
+
+
 class TestImperative(unittest.TestCase):
     def test_layer(self):
         with fluid.imperative.guard():
@@ -39,13 +67,56 @@ class TestImperative(unittest.TestCase):
             l.forward([])
 
     def test_layer_in_out(self):
+        np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.imperative.guard():
             l = MyLayer()
-            x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
+            x = l(np_inp)[0]
             self.assertIsNotNone(x)
-            sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
+            dy_out = x._numpy()
             x._backward()
-            sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
+            dy_grad = l._x_for_debug._gradient()
+
+        with new_program_scope():
+            inp = fluid.layers.data(
+                name="inp", shape=[3], append_batch_size=False)
+            l = MyLayer()
+            x = l(inp)[0]
+            param_grads = fluid.backward.append_backward(
+                x, parameter_list=[l._x_for_debug.name])[0]
+            exe = fluid.Executor(fluid.CPUPlace())
+
+            static_out, static_grad = exe.run(
+                feed={inp.name: np_inp},
+                fetch_list=[x.name, param_grads[1].name])
+
+        self.assertTrue(np.allclose(dy_out, static_out))
+        self.assertTrue(np.allclose(dy_grad, static_grad))
+
+    def test_mlp(self):
+        np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        with fluid.imperative.guard():
+            mlp = MLP()
+            out = mlp(np_inp)
+            dy_out = out._numpy()
+            out._backward()
+            dy_grad = mlp._fc1._w._gradient()
+
+        with new_program_scope():
+            inp = fluid.layers.data(
+                name="inp", shape=[2, 2], append_batch_size=False)
+            mlp = MLP()
+            out = mlp(inp)
+            param_grads = fluid.backward.append_backward(
+                out, parameter_list=[mlp._fc1._w.name])[0]
+            exe = fluid.Executor(fluid.CPUPlace())
+            exe.run(fluid.default_startup_program())
+
+            static_out, static_grad = exe.run(
+                feed={inp.name: np_inp},
+                fetch_list=[out.name, param_grads[1].name])
+
+        self.assertTrue(np.allclose(dy_out, static_out))
+        self.assertTrue(np.allclose(dy_grad, static_grad))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 0d3e6d73e0149fe633b8f1de9041068c2e3bb293..e34a712d844c2d45f442d04f9350fbd7bc911a2a 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -97,7 +97,7 @@ class TestLearningRateDecay(unittest.TestCase):
         startup_prog = fluid.Program()
 
         with fluid.program_guard(main_prog, startup_prog):
-            decayed_lr = fluid_decay_fn(**kwargs)
+            decayed_lr = fluid_decay_fn(**kwargs)("float32")
 
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index cf4346cf2e7a099334ec273546901a91d0ad925d..77ec6f9b6bcda7568325698634fd4f86557cd1be 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -24,11 +24,13 @@ from op_test import OpTest
 class TestMomentumOp1(OpTest):
     def setUp(self):
         self.op_type = "momentum"
+        self.dtype = np.float32
+        self.init_dtype()
 
-        param = np.random.random((123, 321)).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        velocity = np.zeros((123, 321)).astype("float32")
-        learning_rate = np.array([0.001]).astype("float32")
+        param = np.random.random((123, 321)).astype(self.dtype)
+        grad = np.random.random((123, 321)).astype(self.dtype)
+        velocity = np.zeros((123, 321)).astype(self.dtype)
+        learning_rate = np.array([0.001]).astype(self.dtype)
         mu = 0.0001
         use_nesterov = False
 
@@ -50,10 +52,21 @@ class TestMomentumOp1(OpTest):
 
         self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
 
+    def init_dtype(self):
+        pass
+
     def test_check_output(self):
         self.check_output()
 
 
+class TestMomentumOpFp16(TestMomentumOp1):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
 class TestMomentumOp2(OpTest):
     '''Test Momentum with default values for attributes
     '''
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
index 69b29db83a43d18c0825b610642009a0377b9901..21b5a62baf96bfb2d76a8c59133e8f5d1cb35aea 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
@@ -23,8 +23,11 @@ class TestTopkOp(OpTest):
     def setUp(self):
         self.set_args()
         self.op_type = "top_k"
+        self.dtype = np.float32
+        self.init_dtype()
+
         k = self.top_k
-        input = np.random.random((self.row, k)).astype("float32")
+        input = np.random.random((self.row, k)).astype(self.dtype)
         output = np.ndarray((self.row, k))
         indices = np.ndarray((self.row, k)).astype("int64")
 
@@ -38,6 +41,9 @@ class TestTopkOp(OpTest):
 
         self.outputs = {'Out': output, 'Indices': indices}
 
+    def init_dtype(self):
+        pass
+
     def set_args(self):
         self.row = 32
         self.top_k = 1
@@ -46,6 +52,11 @@ class TestTopkOp(OpTest):
         self.check_output()
 
 
+class TestTopkOpFp16(TestTopkOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 class TestTopkOp3d(OpTest):
     def setUp(self):
         self.op_type = "top_k"
diff --git a/python/setup.py.in b/python/setup.py.in
index 22b9537a90e79c2571f61ec0dc156b602df784d6..5d5f2dd0f18cd3e707dca8b9f337f2f2a07d47aa 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -107,9 +107,9 @@ packages=['paddle',
           'paddle.fluid.distributed',
           'paddle.fluid.layers',
           'paddle.fluid.contrib',
-          'paddle.fluid.contrib.utils',
           'paddle.fluid.contrib.decoder',
           'paddle.fluid.contrib.quantize',
+          'paddle.fluid.contrib.utils',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']