diff --git a/CMakeLists.txt b/CMakeLists.txt
index f56c5d382af8cdfb5a941ee272a0f8d22ec04d67..920c20d6f813c14df8e02593b1c4a5a13cc11ef0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,12 +204,11 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 
+set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
 if(WITH_GPU)
     include(cuda)
     include(tensorrt)
     include(external/anakin)
-else()
-  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
 endif()
 
 include(cudnn)              # set cudnn libraries, must before configure
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
index e1eee3f818796e895362caab10846cf59b557162..3571f81326a9f9ae31a8327c3e288e601f248e4b 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@@ -4,7 +4,7 @@ Paddle 预测 API
 为了更简单方便的预测部署，Fluid 提供了一套高层 API
 用来隐藏底层不同的优化实现。
 
-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
 包括
 
 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
@@ -104,5 +104,5 @@ engine
 ------------
 
 -  `inference
-   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
--  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
+   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci>`__
+-  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/api_impl_tester.cc>`__
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 2ce73df0248bee244665b033edddcea70407546d..ea9105d79c18310b16bdc81a7ab0e643e72c3965 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N
 paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.Program.inference_optimize ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Program.inference_optimize ArgSpec(args=['self', 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
 paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
@@ -52,7 +55,7 @@ paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path
 paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
 paddle.fluid.InferenceTranspiler.__init__ 
@@ -74,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro
 paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
 paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
@@ -156,6 +159,8 @@ paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaul
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -324,7 +329,7 @@ paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
 paddle.fluid.transpiler.InferenceTranspiler.__init__ 
diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
index 6e302a29233b96451df14b4685911be1cd87c1ab..c97b364de1ecae21e97351196389615187932b5e 100644
--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -22,27 +23,24 @@ namespace details {
 
 class ExceptionHolder {
  public:
-  void Catch(const platform::EnforceNotMet& exp) {
-    std::lock_guard<std::mutex> lock(mu_);
-    exception_.reset(new platform::EnforceNotMet(exp));
-    type_ = kEnforceNotMet;
-  }
-
-  void Catch(const platform::EOFException& exp) {
-    std::lock_guard<std::mutex> lock(mu_);
-    // EOFException will not cover up existing EnforceNotMet.
-    if (exception_.get() == nullptr) {
-      exception_.reset(new platform::EOFException(exp));
-      type_ = kEOF;
+  void Catch(std::exception_ptr eptr) {
+    try {
+      std::rethrow_exception(eptr);
+    } catch (platform::EOFException exp) {
+      Catch(exp);
+    } catch (platform::EnforceNotMet exp) {
+      Catch(exp);
+    } catch (...) {
+      LOG(FATAL) << "Unknown exception caught";
     }
   }
 
-  bool ExceptionCatched() const {
+  bool IsCaught() const {
     std::lock_guard<std::mutex> lock(mu_);
     return exception_.get() != nullptr;
   }
 
-  void Throw() {
+  void ReThrow() {
     std::lock_guard<std::mutex> lock(mu_);
     switch (type_) {
       case kNone:
@@ -50,27 +48,41 @@ class ExceptionHolder {
       case kEnforceNotMet: {
         auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
         throw e;
-        break;
       }
       case kEOF: {
         auto e = *static_cast<platform::EOFException*>(exception_.get());
         throw e;
-        break;
       }
-      default:
-        LOG(FATAL) << "Unknown exception.";
     }
-    exception_.reset();
-    type_ = kNone;
+    ClearImpl();
   }
 
   void Clear() {
     std::lock_guard<std::mutex> lock(mu_);
+    ClearImpl();
+  }
+
+ private:
+  void ClearImpl() {
     exception_.reset();
     type_ = kNone;
   }
 
- private:
+  void Catch(const platform::EnforceNotMet& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset(new platform::EnforceNotMet(exp));
+    type_ = kEnforceNotMet;
+  }
+
+  void Catch(const platform::EOFException& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    // EOFException will not cover up existing EnforceNotMet.
+    if (exception_.get() == nullptr) {
+      exception_.reset(new platform::EOFException(exp));
+      type_ = kEOF;
+    }
+  }
+
   enum ExceptionType { kNone, kEnforceNotMet, kEOF };
   ExceptionType type_{kNone};
 
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 994bb6492f685138d02971a6caf12572aecd6d6f..c9e331ef359f853263f8dad38dd0a2be4d9618ad 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -107,11 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
-      if (exception_holder_.ExceptionCatched()) {
+      if (exception_holder_.IsCaught()) {
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
-        exception_holder_.Throw();
+        exception_holder_.ReThrow();
       } else {
         continue;
       }
@@ -220,12 +220,8 @@ void ThreadedSSAGraphExecutor::RunOp(
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
-    } catch (platform::EOFException ex) {
-      exception_holder_.Catch(ex);
-    } catch (platform::EnforceNotMet ex) {
-      exception_holder_.Catch(ex);
     } catch (...) {
-      LOG(FATAL) << "Unknown exception catched";
+      exception_holder_.Catch(std::current_exception());
     }
   };
   if (pool_) {
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index bf7d76a8a6e173e648cea5aaba9b7202d787173b..923a7083d4f30b646bbab03d79992b275aa2b403 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -3,7 +3,10 @@ cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
+cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
+cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
 
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
+cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index c9d55fbf525a1a476ac469e8e57462169a7db2da..5736a5c4e232698085936303d1f23760649f8245 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -28,6 +28,38 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+/*
+ * The graph is a Directed Acyclic Single Static Assignment Graph.
+ *
+ * In more detail, the following properties must hold:
+ *
+ *   The graph shouldn't contain cycle. Each node is a black-box to the graph
+ *   so the node itself could be a loop operator.
+ *
+ *   Each Variable-type node has only one input (thus single static assignment).
+ *
+ *   The output/input of operator is variable and the output/input of variable
+ *   is operator.
+ *
+ * The following data harzards in Program are addressed in the Graph:
+ *
+ *   Write-After-Read
+ *     a = op1(x)
+ *     x = op2(b)
+ *     A control-dependency connection is created bettwen op1 and op2 such that
+ *     op1->op2, so as to ensure correct order.
+ *
+ *   Write-After-Write
+ *     x = op1(a)
+ *     x = op2(b)
+ *     A control-dependency connection is created between op1 and op2 such that
+ *     op1->op2, so as to ensure correct order.
+ *
+ * Other properties currently hold, but is not enforced yet:
+ *
+ *   Variable-type node (not control dep) with the same variable name share
+ *   the same underlying VarDesc.
+ */
 class Graph {
  public:
   explicit Graph(const ProgramDesc &program);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.cc b/paddle/fluid/framework/ir/graph_pattern_detecter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f27d9b0509aa4561cfd1e5da3b46a3a085cc888c
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.cc
@@ -0,0 +1,186 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
+  nodes_.emplace_back(new PDNode(std::move(teller), name));
+  auto* cur = nodes_.back().get();
+  return cur;
+}
+
+void PDPattern::AddEdge(PDNode* a, PDNode* b) {
+  PADDLE_ENFORCE(a);
+  PADDLE_ENFORCE(b);
+  PADDLE_ENFORCE(a != b, "can't connect to the same nodes.");
+  edges_.emplace_back(a, b);
+}
+
+void GraphPatternDetecter::operator()(Graph* graph,
+                                      GraphPatternDetecter::handle_t handler) {
+  if (!MarkPDNodesInGraph(*graph)) return;
+  auto subgraphs = DetectPatterns();
+  UniquePatterns(&subgraphs);
+  RemoveOverlappedMatch(&subgraphs);
+
+  for (auto& g : subgraphs) {
+    handler(g, graph);
+  }
+}
+
+bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
+  if (graph.Nodes().empty()) return false;
+
+  for (auto& node : GraphTraits::DFS(graph)) {
+    for (const auto& pdnode : pattern_.nodes()) {
+      if (pdnode->Tell(&node)) {
+        pdnodes2nodes_[pdnode.get()].insert(&node);
+      }
+    }
+  }
+  return !pdnodes2nodes_.empty();
+}
+
+struct HitGroup {
+  std::unordered_map<PDNode*, Node*> roles;
+
+  bool Match(Node* node, PDNode* pat) {
+    return !roles.count(pat) || roles.at(pat) == node;
+  }
+
+  void Register(Node* node, PDNode* pat) { roles[pat] = node; }
+};
+
+// Tell whether Node a links to b.
+bool IsNodesLink(Node* a, Node* b) {
+  for (auto* node : a->outputs) {
+    if (b == node) {
+      return true;
+    }
+  }
+  return false;
+}
+
+std::vector<GraphPatternDetecter::subgraph_t>
+GraphPatternDetecter::DetectPatterns() {
+  // Init empty subgraphs.
+  std::vector<GraphPatternDetecter::subgraph_t> result;
+  std::vector<HitGroup> init_groups;
+  PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
+  auto* first_pnode = pattern_.edges().front().first;
+  if (!pdnodes2nodes_.count(first_pnode)) return result;
+  for (auto* node : pdnodes2nodes_[first_pnode]) {
+    HitGroup group;
+    group.roles[first_pnode] = node;
+    init_groups.emplace_back(group);
+  }
+
+  int step = 0;
+  std::array<std::vector<HitGroup>, 2> bi_records;
+  bi_records[0] = std::move(init_groups);
+
+  // Extend a PDNode to subgraphs by deducing the connection relations defined
+  // in edges of PDNodes.
+  for (const auto& edge : pattern_.edges()) {
+    // Each role has two PDNodes, which indicates two roles.
+    // Detect two Nodes that can match these two roles and they are connected.
+    auto& pre_groups = bi_records[step % 2];
+    auto& cur_groups = bi_records[1 - (step++ % 2)];
+    cur_groups.clear();
+    // source -> target
+    for (Node* source : pdnodes2nodes_[edge.first]) {
+      for (Node* target : pdnodes2nodes_[edge.second]) {
+        // TODO(Superjomn) add some prune strategies.
+        for (const auto& group : pre_groups) {
+          HitGroup new_group = group;
+          if (IsNodesLink(source, target) &&
+              new_group.Match(source, edge.first)) {
+            new_group.Register(source, edge.first);
+            if (new_group.Match(target, edge.second)) {
+              new_group.Register(target, edge.second);
+              cur_groups.push_back(new_group);
+              // TODO(Superjomn) need to unique
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for (auto& group : bi_records[step % 2]) {
+    GraphPatternDetecter::subgraph_t subgraph;
+    for (auto& role : group.roles) {
+      subgraph.emplace(role.first, role.second);
+    }
+    result.emplace_back(subgraph);
+  }
+  return result;
+}
+
+void GraphPatternDetecter::UniquePatterns(
+    std::vector<GraphPatternDetecter::subgraph_t>* subgraphs) {
+  if (subgraphs->empty()) return;
+  std::vector<GraphPatternDetecter::subgraph_t> result;
+
+  std::unordered_set<size_t> set;
+  for (auto& g : *subgraphs) {
+    size_t key = 0;
+    for (auto& item : g) {
+      key ^= std::hash<void*>{}(item.first);
+      key ^= std::hash<void*>{}(item.second);
+    }
+    if (!set.count(key)) {
+      result.emplace_back(g);
+      set.insert(key);
+    }
+  }
+  *subgraphs = result;
+}
+
+void GraphPatternDetecter::RemoveOverlappedMatch(
+    std::vector<subgraph_t>* subgraphs) {
+  std::vector<subgraph_t> result;
+  std::unordered_set<Node*> node_set;
+
+  for (const auto& subgraph : *subgraphs) {
+    bool valid = true;
+    for (auto& item : subgraph) {
+      if (node_set.count(item.second)) {
+        valid = false;
+        break;
+      }
+    }
+    if (valid) {
+      for (auto& item : subgraph) {
+        node_set.insert(item.second);
+      }
+      result.push_back(subgraph);
+    }
+  }
+  *subgraphs = result;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.h b/paddle/fluid/framework/ir/graph_pattern_detecter.h
new file mode 100644
index 0000000000000000000000000000000000000000..1778bf00000f60e5cf8b2a585bf7e5dae0a582eb
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.h
@@ -0,0 +1,181 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_TESTING
+#include <gtest/gtest_prod.h>
+#endif
+
+#include <numeric>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Some basic torminolygies:
+//   - PDPattern: a pattern defined as a data flow graph.
+//   - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
+//     that meets some conditions defined in `PDNode.teller`.
+//   - A pattern is defined with PDNodes with edges.
+
+// Pattern detector node. This node helps to build a pattern.
+struct PDNode {
+  // tell whether an ir::Node* is a candidation for a PDNode.
+  using teller_t = std::function<bool(Node*)>;
+
+  PDNode(teller_t&& teller, const std::string& name = "")
+      : teller_(teller), name_(name) {
+    PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set.");
+  }
+
+  PDNode(PDNode&& other) = default;
+
+  std::vector<PDNode*> inlinks;
+  std::vector<PDNode*> outlinks;
+
+  bool Tell(Node* node) const {
+    PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode");
+    return teller_(node);
+  }
+
+  const std::string& name() const { return name_; }
+
+  PDNode(const PDNode&) = delete;
+  PDNode& operator=(const PDNode&) = delete;
+
+ private:
+  teller_t teller_;
+  std::string name_;
+};
+
+/*
+ * A pattern in a graph, which defined with PDNode and edges. Most graph
+ * patterns can be divided into PDNodes and link relations between them.
+ *
+ * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD
+ * operators from the computation graph, the MUL's output should have only one
+ * consumer which is the ELEMENTWISE_ADD.
+ * This pattern can be defined as with the following pseudo codes
+ *
+ *     // Create two operator PDNodes.
+ *     MUL = PDPattern.NewNode()
+ *     ELE = PDPattern.NewNode()
+ *     // Create the variable PDNodes.
+ *     MUL_out = PDPattern.NewNode()
+ *     // Add teller to define some rules that help to filter the target Nodes.
+ *     MUL.teller = lambda(node): node->IsOp() && node->Op()->Type == "mul";
+ *     ELE.teller = lambda(node): \
+ *                        node->IsOp() && node->Op()->Type == "elementwise_add";
+ *     MUL_out.teller = lambda(node): node->IsVar() && (MUL in node->inputs)
+ *                                                  && (ELE in node->outputs)
+ *
+ * One can add more specific tellers for PDNodes or edges, both the Operator
+ * and Variable Nodes can be ruled in PDNode.teller.
+ *
+ * PDPattern can record the general patterns, such as the pattern represents
+ *   - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place.
+ *   - Ops whose inputs and outputs share the same variables
+ */
+class PDPattern {
+ public:
+  using edge_t = std::pair<PDNode*, PDNode*>;
+
+  void AddEdge(PDNode* a, PDNode* b);
+
+  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
+
+  const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
+  const std::vector<edge_t>& edges() const { return edges_; }
+
+ private:
+#ifdef PADDLE_WITH_TESTING
+  FRIEND_TEST(PDPattern, AddEdge);
+  FRIEND_TEST(PDPattern, NewNode);
+#endif
+
+  std::vector<std::unique_ptr<PDNode>> nodes_;
+  std::vector<edge_t> edges_;
+};
+
+/*
+ * GraphPatternDetecter helps to detect the specific patterns in the graph.
+ * Input a pattern, output a list of the matched subgraphs/nodes.
+ * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
+ *
+ * The algorithm has three phases:
+ *   1. Mark the nodes that match the defined PDNodes in a PDPattern,
+ *   2. Extend a PDNode to subgraphs by deducing the connection relation defined
+ *      in PAPattern(the edges),
+ *   3. Get the filtered subgraphs and treat them with a pre-defined handler.
+ *
+ * Usage:
+ *    // Create a detector
+ *    GraphPatternDetecter detector;
+ *    // Define the detector's pattern, by adding PDNode and define the edges.
+ *    auto* node0 = detector.mutable_pattern().AddNode(...)
+ *    auto* node1 = detector.mutable_pattern().AddNode(...)
+ *    node0->teller = some lambda.
+ *    node1->teller = some lambda.
+ *    detector.mutable_pattern().AddEdge(node0, node1);
+ *    // Create an handler, to define the behavior of treating the filtered
+ *    // subgraphs that comply with the patterns.
+ *    GraphPatternDetecter::handle_t handler = some labmda
+ *    // Execute the detector.
+ *    detector(&graph, handler);
+ */
+class GraphPatternDetecter {
+ public:
+  using subgraph_t = std::unordered_map<PDNode*, Node*>;
+
+  // Operate on the detected pattern.
+  using handle_t =
+      std::function<void(const subgraph_t& /*hitted pattern*/, Graph*)>;
+
+  void operator()(Graph* graph, handle_t handler);
+
+  const PDPattern& pattern() const { return pattern_; }
+  PDPattern* mutable_pattern() { return &pattern_; }
+
+ private:
+  // Mark the nodes that fits the pattern.
+  bool MarkPDNodesInGraph(const ir::Graph& graph);
+
+  // Detect all the pattern and output the hit records.
+  std::vector<subgraph_t> DetectPatterns();
+
+  // Remove duplicate patterns.
+  void UniquePatterns(std::vector<subgraph_t>* subgraphs);
+
+  // Remove overlapped match subgraphs, when overlapped, keep the previous one.
+  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
+
+#ifdef PADDLE_WITH_TESTING
+  FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph);
+  FRIEND_TEST(GraphPatternDetecter, DetectPatterns);
+#endif
+
+ private:
+  using hit_rcd_t =
+      std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
+  PDPattern pattern_;
+  std::vector<hit_rcd_t> marked_records_;
+  std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..993c885a810fe80a170ed190b892b148d85e8b5f
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc
@@ -0,0 +1,172 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void BuildGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3-v4->o5
+  o3->outputs.push_back(v4);
+  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  v4->outputs.push_back(o5);
+}
+
+TEST(PDPattern, NewNode) {
+  PDPattern x;
+  auto* n = x.NewNode([](Node* x) { return true; });
+  ASSERT_TRUE(n);
+  ASSERT_EQ(x.nodes_.size(), 1UL);
+}
+
+TEST(PDPattern, AddEdge) {
+  PDPattern x;
+  auto* a = x.NewNode([](Node* x) { return true; });
+  auto* b = x.NewNode([](Node* x) { return true; });
+  ASSERT_TRUE(a);
+  ASSERT_TRUE(b);
+  x.AddEdge(a, b);
+  ASSERT_EQ(x.nodes_.size(), 2UL);
+  ASSERT_EQ(x.edges_.size(), 1UL);
+  ASSERT_EQ(x.edges_.front().first, a);
+  ASSERT_EQ(x.edges_.front().second, b);
+
+  ASSERT_EQ(x.nodes().size(), 2UL);
+  ASSERT_EQ(x.edges().size(), 1UL);
+  ASSERT_EQ(x.edges().front().first, a);
+  ASSERT_EQ(x.edges().front().second, b);
+}
+
+TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
+  GraphPatternDetecter x;
+  // mark o2, o3, v2
+
+  // The pattern is a graph:
+  //   o2(a node named o2) -> v2(a node named v2)
+  //   v2 -> o3(a node named o3)
+  auto* o2 = x.pattern_.NewNode([](Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->Name() == "op2" && node->IsOp();
+  });
+  auto* o3 = x.pattern_.NewNode([](Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->Name() == "op3" && node->IsOp();
+  });
+  auto* v2 = x.pattern_.NewNode([](Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->Name() == "var2" && node->IsVar();
+  });
+
+  ASSERT_FALSE(o2->Tell(nullptr));
+  ASSERT_FALSE(o3->Tell(nullptr));
+  ASSERT_FALSE(v2->Tell(nullptr));
+
+  x.pattern_.AddEdge(o2, v2);
+  x.pattern_.AddEdge(v2, o3);
+
+  ASSERT_EQ(x.pattern_.edges().size(), 2UL);
+  ASSERT_EQ(x.pattern_.edges()[0].first, o2);
+  ASSERT_EQ(x.pattern_.edges()[0].second, v2);
+  ASSERT_EQ(x.pattern_.edges()[1].first, v2);
+  ASSERT_EQ(x.pattern_.edges()[1].second, o3);
+
+  ProgramDesc program;
+  Graph graph(program);
+  BuildGraph(&graph);
+
+  x.MarkPDNodesInGraph(graph);
+
+  ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL);
+
+  auto subgraphs = x.DetectPatterns();
+  ASSERT_EQ(subgraphs.size(), 1UL);
+}
+
+TEST(GraphPatternDetecter, MultiSubgraph) {
+  ProgramDesc program;
+  Graph graph(program);
+  BuildGraph(&graph);
+
+  GraphPatternDetecter x;
+
+  // The pattern is a graph:
+  //   op -> var
+  auto* any_op = x.mutable_pattern()->NewNode(
+      [](Node* node) {
+        return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
+      },
+      "OP0");
+  auto* any_var = x.mutable_pattern()->NewNode(
+      [](Node* node) { return node->IsVar(); }, "VAR");
+  auto* any_op1 = x.mutable_pattern()->NewNode(
+      [](Node* node) { return node->IsOp(); }, "OP1");
+
+  x.mutable_pattern()->AddEdge(any_op, any_var);
+  x.mutable_pattern()->AddEdge(any_var, any_op1);
+
+  int count = 0;
+  GraphPatternDetecter::handle_t handle = [&](
+      const GraphPatternDetecter::subgraph_t& s, Graph* g) {
+    LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
+              << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
+    count++;
+  };
+
+  x(&graph, handle);
+
+  // 1. Detect op3 -> var4 -> op5
+  // 2. Detect op2 -> var2 -> op3
+  // 3. Detect op2 -> var2 -> op4
+  // 4. Detect op2 -> var3 -> op5
+  // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
+  ASSERT_GE(count, 1UL);
+  ASSERT_LE(count, 2UL);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index f9e6bdf3625bdced9d1a9195a979b0f46016d8bf..b1b8d1c586c98a327a8e5b4890ced00022155e6b 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -36,7 +36,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
  public:
   void Make() {
     AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "");
+    AddOutput("Out", "").AsDuplicable();
     AddComment("");
   }
 };
@@ -59,11 +59,27 @@ class SumOpVarTypeInference : public VarTypeInference {
     block->Var(out_var_name)->SetType(default_var_type);
   }
 };
+
+class DummyOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "").AsDuplicable();
+    AddComment("");
+  }
+};
+
+class DummyOpVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
+};
 }  // namespace framework
 }  // namespace paddle
 
 REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
                   paddle::framework::SumOpVarTypeInference);
+REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker,
+                  paddle::framework::SumOpVarTypeInference);
 REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
                   paddle::framework::SumOpMaker);
 
@@ -110,5 +126,83 @@ TEST(GraphTest, Basic) {
   }
   ASSERT_EQ(nodes.size(), 5);
 }
+
+TEST(GraphTest, WriteAfterRead) {
+  // void Test() {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"a"});
+  op->SetOutput("Out", {"b"});
+  op->SetAttr("op_role", 1);
+
+  op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("dummy");
+  op->SetInput("X", {"c"});
+  op->SetOutput("Out", {"a"});
+  op->SetAttr("op_role", 1);
+
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  ir::Node *control_dep1 = nullptr;
+  ir::Node *control_dep2 = nullptr;
+  for (ir::Node *n : g->Nodes()) {
+    if (n->Name() == "sum") {
+      ASSERT_EQ(n->outputs[0]->Name(), "b");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
+      control_dep1 = n->outputs[1];
+      ASSERT_EQ(n->outputs.size(), 2);
+    }
+    if (n->Name() == "dummy") {
+      ASSERT_EQ(n->inputs[0]->Name(), "c");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
+      control_dep2 = n->inputs[1];
+      ASSERT_EQ(n->inputs.size(), 2);
+    }
+  }
+  ASSERT_EQ(control_dep1, control_dep2);
+}
+
+TEST(GraphTest, WriteAfterWrite) {
+  // void Test() {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"a"});
+  op->SetOutput("Out", {"b"});
+  op->SetAttr("op_role", 1);
+
+  op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("dummy");
+  op->SetInput("X", {"c"});
+  op->SetOutput("Out", {"b"});
+  op->SetAttr("op_role", 1);
+
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  ir::Node *control_dep1 = nullptr;
+  ir::Node *control_dep2 = nullptr;
+  for (ir::Node *n : g->Nodes()) {
+    if (n->Name() == "sum") {
+      ASSERT_EQ(n->outputs[0]->Name(), "b");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
+      ASSERT_EQ(n->outputs.size(), 2);
+      control_dep1 = n->outputs[1];
+    }
+    if (n->Name() == "dummy") {
+      ASSERT_EQ(n->inputs[0]->Name(), "c");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
+      control_dep2 = n->inputs[1];
+      ASSERT_EQ(n->inputs.size(), 2);
+      ASSERT_EQ(control_dep1, control_dep2);
+    }
+  }
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f548913e4e1d9d5bc5bdace8b92db9065cf3b5e
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/graph_traits.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+//
+// NodesDFSIterator
+//
+NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
+  for (auto *x : source) stack_.push(x);
+}
+
+NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
+    : stack_(std::move(other.stack_)),
+      visited_(std::move(other.visited_)) {}
+
+NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
+    : stack_(other.stack_), visited_(other.visited_) {}
+
+Node &NodesDFSIterator::operator*() {
+  PADDLE_ENFORCE(!stack_.empty());
+  return *stack_.top();
+}
+
+NodesDFSIterator &NodesDFSIterator::operator++() {
+  PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
+  visited_.insert(stack_.top());
+  auto *cur = stack_.top();
+  stack_.pop();
+  for (auto *x : cur->outputs) {
+    if (!visited_.count(x)) {
+      stack_.push(x);
+    }
+  }
+  return *this;
+}
+bool NodesDFSIterator::operator==(const NodesDFSIterator &other) {
+  if (stack_.empty()) return other.stack_.empty();
+  if ((!stack_.empty()) && (!other.stack_.empty())) {
+    return stack_.top() == other.stack_.top();
+  }
+  return false;
+}
+
+NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
+  stack_ = other.stack_;
+  visited_ = other.visited_;
+  return *this;
+}
+Node *NodesDFSIterator::operator->() { return stack_.top(); }
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..edbe45acb98326ee3bf1d86495832ec8469b634e
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_traits.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stack>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+template <typename IteratorT>
+class iterator_range {
+  IteratorT begin_, end_;
+
+ public:
+  template <typename Container>
+  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
+
+  iterator_range(const IteratorT &begin, const IteratorT &end)
+      : begin_(begin), end_(end) {}
+
+  const IteratorT &begin() const { return begin_; }
+  const IteratorT &end() const { return end_; }
+};
+
+// DFS iterator on nodes.
+struct NodesDFSIterator
+    : public std::iterator<std::forward_iterator_tag, Node *> {
+  NodesDFSIterator() = default;
+  explicit NodesDFSIterator(const std::vector<Node *> &source);
+  NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+  NodesDFSIterator(const NodesDFSIterator &other);
+
+  Node &operator*();
+  NodesDFSIterator &operator++();
+  // TODO(Superjomn) current implementation just compare the first
+  // element, need to compare the graph and all the elements in the queue and
+  // set.
+  NodesDFSIterator &operator=(const NodesDFSIterator &other);
+  bool operator==(const NodesDFSIterator &other);
+  bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
+  Node *operator->();
+
+ private:
+  std::stack<Node *> stack_;
+  std::unordered_set<Node *> visited_;
+};
+
+/*
+ * GraphTraits contains some graph traversal algorithms.
+ *
+ * Usage:
+ *
+ */
+struct GraphTraits {
+  static iterator_range<NodesDFSIterator> DFS(const Graph &g) {
+    auto start_points = ExtractStartPoints(g);
+    NodesDFSIterator x(start_points);
+    return iterator_range<NodesDFSIterator>(NodesDFSIterator(start_points),
+                                            NodesDFSIterator());
+  }
+
+ private:
+  // The nodes those have no input will be treated as start points.
+  static std::vector<Node *> ExtractStartPoints(const Graph &g) {
+    std::vector<Node *> result;
+    for (auto *node : g.Nodes()) {
+      if (node->inputs.empty()) {
+        result.push_back(node);
+      }
+    }
+    return result;
+  }
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index b3138fccee86fb274abe72007961fc1c982b1e96..9c0765ab8ce16733ac021aefc8c7b2bb779319f3 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -58,6 +58,9 @@ class Node {
     return op_desc_;
   }
 
+  bool IsOp() const { return type_ == Type::kOperation; }
+  bool IsVar() const { return type_ == Type::kVariable; }
+
   std::vector<Node*> inputs;
   std::vector<Node*> outputs;
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index a190199f1cb1361f67f20c755b8e7ef52c284adc..03f7e71c03b8dd75d2a47cb4c6d1ef1a71792cf3 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
   }
 }
 
-int OpDesc::GetBlockAttr(const std::string &name) const {
+std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
+  auto it = attrs_.find(name);
+  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  auto blocks = boost::get<std::vector<BlockDesc *>>(it->second);
+
+  std::vector<int> ids;
+  for (auto n : blocks) {
+    ids.push_back(n->ID());
+  }
+
+  return ids;
+}
+
+int OpDesc::GetBlockAttrId(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
   return boost::get<BlockDesc *>(it->second)->ID();
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 74dd8ec002005dd080424b48b5db1a2574a6974f..b77d84125a23b81c3de4123bea6f0e09cd6d1e90 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -83,7 +83,9 @@ class OpDesc {
 
   Attribute GetNullableAttr(const std::string &name) const;
 
-  int GetBlockAttr(const std::string &name) const;
+  int GetBlockAttrId(const std::string &name) const;
+
+  std::vector<int> GetBlocksAttrIds(const std::string &name) const;
 
   void Rename(const std::string &old_name, const std::string &new_name);
 
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 1e01a6e900404990e16674755367d2fc6d832725..20bdc7830f32564448a69e9cd76c02585b7a1aca 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
       for (const std::string &attr_name : op->AttrNames()) {
         if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
           int sub_block_id =
-              o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
+              o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
           op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
         }
       }
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index c7286dacf01659f3af0927a71856e5a6496cb877..56bb9142dabe0d5546e321e675a5acba7bf4d306 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -112,5 +112,6 @@ Tensor& Tensor::Resize(const DDim& dims) {
 const DDim& Tensor::dims() const { return dims_; }
 
 int64_t Tensor::numel() const { return product(dims_); }
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 7f678f869aac4616c8bca440d0431f765da41dd6..b7b62eef23ec351686378c913d18fc72308fd7b2 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -59,6 +59,14 @@ inline T* Tensor::mutable_data(platform::Place place) {
 }
 
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
+  int rank = src.dims().size();
+  PADDLE_ENFORCE_GE(
+      rank, 2,
+      "'ReshapeToMatrix()' is only used for flatten high rank "
+      "tensors to matrixs. Can not be used in reshaping vectors.");
+  if (rank == 2) {
+    return src;
+  }
   Tensor res;
   res.ShareDataWith(src);
   res.Resize(flatten_to_2d(src.dims(), num_col_dims));
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index f26f212d4d5793b88fd1e6d782cdf983bf341879..18cdca3a658a6a89e6ab637a7f38825756acfea8 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -20,6 +20,9 @@
 DEFINE_int32(io_threadpool_size, 100,
              "number of threads used for doing IO, default 100");
 
+DEFINE_int32(dist_threadpool_size, 0,
+             "number of threads used for distributed executed.");
+
 namespace paddle {
 namespace framework {
 
@@ -35,6 +38,10 @@ void ThreadPool::Init() {
   if (threadpool_.get() == nullptr) {
     // TODO(Yancey1989): specify the max threads number
     int num_threads = std::thread::hardware_concurrency();
+    if (FLAGS_dist_threadpool_size > 0) {
+      num_threads = FLAGS_dist_threadpool_size;
+      VLOG(1) << "set dist_threadpool_size to " << num_threads;
+    }
     PADDLE_ENFORCE_GT(num_threads, 0);
     threadpool_.reset(new ThreadPool(num_threads));
   }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 08d7af6d3af7054061b15b904c69b2862c629562..e31c637e969f7a86f4f185abb4f0f01d3303db75 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -22,6 +22,9 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_bool(profile, false, "Turn on profiler for fluid");
 
 namespace paddle {
 namespace {
@@ -58,6 +61,15 @@ bool NativePaddlePredictor::Init(
     std::shared_ptr<framework::Scope> parent_scope) {
   VLOG(3) << "Predictor::init()";
 
+  if (FLAGS_profile) {
+    LOG(WARNING) << "Profiler is actived, might affect the performance";
+    LOG(INFO) << "You can turn off by set gflags '-profile false'";
+
+    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
+                                           : platform::ProfilerState::kCPU;
+    platform::EnableProfiler(tracking_device);
+  }
+
   if (config_.use_gpu) {
     place_ = paddle::platform::CUDAPlace(config_.device);
   } else {
@@ -102,6 +114,10 @@ bool NativePaddlePredictor::Init(
 }
 
 NativePaddlePredictor::~NativePaddlePredictor() {
+  if (FLAGS_profile) {
+    platform::DisableProfiler(platform::EventSortingKey::kTotal,
+                              "./profile.log");
+  }
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
diff --git a/paddle/fluid/operators/.flatten_op.cc.swp b/paddle/fluid/operators/.flatten_op.cc.swp
deleted file mode 100644
index 3395b6074b6a4c684a97674af702ca8b91dc85e9..0000000000000000000000000000000000000000
Binary files a/paddle/fluid/operators/.flatten_op.cc.swp and /dev/null differ
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index ff0e989464e76b0f7cb163bd95997b82303036d6..e8b5dec9d49f5613cec92441d19ab7dc1a1ad90c 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -170,6 +170,9 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
       elseif(${TARGET} STREQUAL "tensorrt_engine_op")
           message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
+      elseif(${TARGET} STREQUAL "fc")
+        # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
+        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
       else()
         file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
       endif()
@@ -300,12 +303,6 @@ op_library(channel_recv_op DEPS concurrency)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 
-# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
-# Because the fully connected layer has only one MKLDNN's operator
-if(NOT WITH_MKLDNN)
-    list(REMOVE_ITEM GENERAL_OPS fc_op)
-endif(NOT WITH_MKLDNN)
-
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index a3bec3da45136bca5cb2763e7ffd6b67703a1813..578ab63bc380ee62d76e34b7cf3cbd590bfa2eda 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -28,23 +28,26 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
-                      "Input(Label)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
-                      "The 1st dimension of Input(X) and Input(Label) should "
-                      "be equal.");
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(rank, label_dims.size(),
+                      "Input(X) and Input(Label) shall have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_dims, 0, rank - 1),
+                      "Input(X) and Input(Label) shall have the same shape "
+                      "except the last dimension.");
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
-                        "If Attr(soft_label) == true, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
+                        "If Attr(soft_label) == true, the last dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
-      PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
-                        "If Attr(softLabel) == false, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
+                        "If Attr(softLabel) == false, the last dimension of "
                         "Input(Label) should be 1.");
     }
 
-    ctx->SetOutputDim("Y", {x_dims[0], 1});
+    auto y_dims = x_dims;
+    y_dims[rank - 1] = 1;
+    ctx->SetOutputDim("Y", y_dims);
     ctx->ShareLoD("X", /*->*/ "Y");
   }
 
@@ -74,24 +77,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto label_dims = ctx->GetInputDim("Label");
     auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
-                      "The 1st dimension of Input(X) and Input(Label) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
-                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
-                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
+                      "Input(Y@Grad) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), rank,
+                      "Input(Label) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_dims, 0, rank - 1),
+                      "The Input(X) and Input(Label) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(dy_dims, 0, rank - 1),
+                      "The Input(X) and Input(Y@Grad) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
+                      "The last dimension of Input(Y@Grad) should be 1.");
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
-                        "When Attr(soft_label) == true, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
+                        "When Attr(soft_label) == true, the last dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
-      PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                        "When Attr(soft_label) == false, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
+                        "When Attr(soft_label) == false, the last dimension of "
                         "Input(Label) should be 1.");
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
@@ -113,18 +120,20 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
-             " where N is the batch size and D is the number of classes. "
-             "This input is a probability computed by the previous operator, "
-             "which is almost always the result of a softmax operator.");
-    AddInput("Label",
-             "(Tensor), the ground truth which is a 2-D tensor. When "
-             "soft_label is set to false, Label is a Tensor<int64> with shape "
-             "[N x 1]. When soft_label is set to true, Label is a "
-             "Tensor<float/double> with shape [N x D].");
+             "(Tensor, default Tensor<float>), a tensor whose last dimension "
+             "size is equal to the number of classes. This input is a "
+             "probability computed by the previous operator, which is almost "
+             "always the result of a softmax operator.");
+    AddInput(
+        "Label",
+        "(Tensor), the tensor which represents the ground truth. It has the "
+        "same shape with 'X' except the last dimension. When soft_label is set "
+        "to false, the last dimension size is 1; when soft_label is set to "
+        "true, the last dimension size is equal to the number of classes.");
     AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
-              "[N x 1]. The cross entropy loss.");
+              "(Tensor, default Tensor<float>), a tensor whose shape is same "
+              "with 'X' except that the last dimension size is 1. It "
+              "represents the cross entropy loss.");
     AddAttr<bool>("soft_label",
                   "(bool, default false), a flag indicating whether to "
                   "interpretate the given labels as soft labels.")
@@ -132,6 +141,12 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 CrossEntropy Operator.
 
+The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. 
+The matrix's second dimension(row length) is as same as the original last 
+dimension, and the first dimension(column length) is the product of all other 
+original dimensions. Then the softmax computation will take palce on each raw 
+of flattened matrixs.
+
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index 19a2aec92b267ece94685ce34604b7d1cfa5d209..36b58d80144d242277f6fc970a3a61a6721d4b50 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -33,8 +33,13 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
     auto* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
 
+    int rank = x->dims().size();
+    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
+    Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+    Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
+
     math::CrossEntropyFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), y, x, labels,
+        ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
         ctx.Attr<bool>("soft_label"));
   }
 };
@@ -98,9 +103,12 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
     auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
     auto* label = ctx.Input<Tensor>("Label");
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
 
-    int64_t class_num = x->dims()[1];
+    // Following computation only depends on the last dimension size. So it's
+    // unnecessary to convert tensors to 2-D views.
+    int rank = x->dims().size();
+    int64_t class_num = x->dims()[rank - 1];
     if (ctx.Attr<bool>("soft_label")) {
       XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
                                         label->data<T>(),
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 466bce18af7cf97014a7b1ba64df68eab193c7c8..8e38b3713f28b045e9214db68aec50f0ba6c06f6 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
 #endif
   }
 
+  VLOG(7) << "ProcSerializedField:" << meta_.varname()
+          << ", type:" << meta_.type() << std::endl;
   framework::DDim dims = GetDims(meta_.dims());
   if (meta_.type() == sendrecv::LOD_TENSOR) {
     PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
     if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
       return false;
     }
+
     return true;
   }
 
@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
     return true;
   }
 
-  return true;
+  PADDLE_ENFORCE("not supported var types:", meta_.varname(), meta_.type());
+
+  return false;
 }
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc
index 99fa659a351249a4a93f71700e1c646465861aba..e595f1a627cfefbb91b070b898046cf135dc4988 100644
--- a/paddle/fluid/operators/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/fc_mkldnn_op.cc
@@ -125,13 +125,16 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     auto input = ctx.Input<Tensor>("Input");
     auto w = ctx.Input<Tensor>("W");
+    auto bias = ctx.Input<Tensor>("Bias");
 
     PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4,
                    "Input must be with 2 or 4 dimensions, i.e. NCHW");
+    // TODO(intel friends): the native weight format is io,
+    // but the mkldnn weight format is oihw, which may need be transposed.
     PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4,
                    "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");
 
-    bool with_bias = ctx.Attr<bool>("bias_attr");
+    bool with_bias = bias != nullptr;
     MKLDNNMD<Tensor> md(input, w, with_bias);
 
     std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> pd =
@@ -154,6 +157,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto dst_memory = mem.dst(output_data);
     auto src_memory = mem.src(input_data);
     auto weights_memory = mem.weights(w_data);
+    // TODO(intel friends): bias memory should also be obtain from bias->data()
     auto bias_memory = mem.bias();
 
     auto forward = with_bias ? mkldnn::inner_product_forward(
@@ -216,7 +220,8 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     const T* out_grad_data = out_grad->data<T>();
 
-    bool with_bias = ctx.Attr<bool>("bias_attr");
+    auto bias = ctx.Input<Tensor>("Bias");
+    bool with_bias = bias != nullptr;
 
     MKLDNNMD<Tensor> md(input, w, with_bias);
     MKLDNNMemory mem(&md, mkldnn_engine);
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed..099ca52c8e945a0e93c2f13adb612158c67397cf 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/fc_op.h"
 #include <vector>
+#include "paddle/fluid/operators/math/blas.h"
+
+DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
 namespace operators {
@@ -25,16 +28,24 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Out(Output) of Fully Connected should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("W"),
                  "W(Input) of Fully Connected should not be null.");
-
+  // NCHW
   auto in_dims = ctx->GetInputDim("Input");
+  // IO, I=C*H*W
   auto w_dims = ctx->GetInputDim("W");
   std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});
 
+  if (ctx->HasInput("Bias")) {
+    auto bias_dims = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim].");
+    PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
+                      "The shape of Bias must be [1, dim].");
+  }
   PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
                  "Fully Connected input should be 2-D or 4-D tensor.");
-
-  PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4,
-                 "Fully Connected input should be 2-D or 4-D tensor.");
+  PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
+                    "Fully Connected input should be 2-D tensor.");
+  PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0],
+                    "Fully Connected input and weigth size do not match.");
 
   ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
   ctx->ShareLoD("Input", "Out");
@@ -42,9 +53,12 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType FCOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
-
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  if (ctx.Attr<bool>("use_mkldnn")) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout, library);
@@ -60,27 +74,39 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
   if (ctx->HasOutput(framework::GradVarName("W"))) {
     ctx->SetOutputDim(framework::GradVarName("W"), w_dims);
   }
+
+  if (ctx->HasInput("Bias")) {
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                   "Should have bias grad");
+    auto bias_dims = ctx->GetInputDim("Bias");
+    ctx->SetOutputDim(framework::GradVarName("Bias"), bias_dims);
+  }
 }
 
 framework::OpKernelType FCOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
-
+  framework::LibraryType library = framework::LibraryType::kPlain;
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  if (ctx.Attr<bool>("use_mkldnn")) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout, library);
 }
 
 void FCOpMaker::Make() {
-  AddInput("Input", "(Tensor) The input tensor of fully connected operator. ");
-  AddInput("W", "(Tensor), The second input tensor of fc op.");
+  AddInput("Input",
+           "(Tensor), The input tensor of fully connected operator with format "
+           "(NCHW). ");
+  AddInput("W", "(Tensor), The weight fc op with shape (I, O).");
+  AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O")
+      .AsDispensable();
   AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
-  AddAttr<bool>("bias_attr", "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
   AddComment(R"DOC(
   Fully Connected Operator.
 
@@ -94,9 +120,47 @@ void FCOpMaker::Make() {
 )DOC");
 }
 
+template <typename T>
+class FCOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto input = ctx.Input<Tensor>("Input");
+    auto w = ctx.Input<Tensor>("W");
+    auto bias = ctx.Input<Tensor>("Bias");
+    auto output = ctx.Output<Tensor>("Out");
+    auto in_dims = input->dims();
+    auto w_dims = w->dims();
+
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
+    const T* input_data = input->data<T>();
+    const T* w_data = w->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0],
+              static_cast<T>(1), input_data, w_data, static_cast<T>(0),
+              output_data);
+
+    if (bias) {
+      const T* bias_data = bias->data<T>();
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#endif
+      for (int bs = 0; bs < in_dims[0]; bs++) {
+        blas.AXPY(w_dims[1], static_cast<T>(1), bias_data,
+                  output_data + bs * w_dims[1]);
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fc, ops::FCOp, ops::FCOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad);
+REGISTER_OPERATOR(fc_grad, ops::FCOpGrad);
+REGISTER_OP_CPU_KERNEL(fc, ops::FCOpKernel<float>, ops::FCOpKernel<double>);
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index b1948076969e7a651179b97b107b85beea175280..f196e18fe122af9536230752096a2d90de8ab527 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
       optimize_prepared.begin(),
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
+  // Trainers will get all parameters from pserver in the
+  // startup program, so we will wait RequestGet first
+  rpc_service_->SetCond(distributed::kRequestGet);
+  rpc_service_->WaitBarrier(distributed::kRequestGet);
   rpc_service_->ResetBarrierCounter();
-
   while (true) {
     rpc_service_->Profiler().OneStep();
     // Get from multiple trainers, we don't care about the order in which
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index db040509bc08c3f6ad031c5b97c93574e31337e0..23d9ea88f6701f9f9e5e02948e996878a849ddd6 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,14 +23,40 @@ class PReluOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
+    std::string mode = ctx->Attrs().Get<std::string>("mode");
+
+    auto x_dim = ctx->GetInputDim("X");
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
-    PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
-                   "Size of weight Alpha must be one.");
+
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (mode == "all") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
+                     "For mode 'all', size of weight Alpha must be one.");
+    } else if (mode == "channel") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == x_dim[1],
+                     "For channel-wise mode, size of weight Alpha must be "
+                     "equal to the number of channels, should be %d",
+                     x_dim[1]);
+    } else if (mode == "element") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == product(x_dim),
+                     "For element-wise mode, size of weight Alpha must be "
+                     "equal to the number of input, should be %d",
+                     product(x_dim));
+    } else {
+      PADDLE_THROW("Unkown mode %s", mode);
+    }
+    ctx->SetOutputDim("Out", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
 };
 
 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -44,9 +67,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "The output tensor of prelu operator.");
     AddComment(R"DOC(
 PRelu Operator.
-
 The equation is:
-
 $$
 f(x) =
 \begin{cases}
@@ -54,11 +75,15 @@ f(x) =
 x,         \qquad  \text{if} \ x >= 0
 \end{cases}
 $$
-
 The input `X` can carry the LoD (Level of Details) information,
 or not. And the output shares the LoD information with input `X`.
-
+There are modes: 
+  all: all elements share same weight
+  channel: elements in a channel share same weight
+  element: each element has a weight 
 )DOC");
+    AddAttr<std::string>("mode", "The mode for inputs to share weights.")
+        .SetDefault("all");
   }
 };
 
@@ -71,9 +96,23 @@ class PReluGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-    ctx->SetOutputDim(framework::GradVarName("Alpha"),
-                      ctx->GetInputDim("Alpha"));
+    auto x_grad_name = framework::GradVarName("X");
+    auto alpha_grad_name = framework::GradVarName("Alpha");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(alpha_grad_name)) {
+      ctx->SetOutputDim(alpha_grad_name, ctx->GetInputDim("Alpha"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu
deleted file mode 100644
index 37d934a29046be04a1721b7330c813f663f61aed..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prelu_op.cu
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/prelu_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    prelu,
-    paddle::operators::PReluKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(prelu_grad,
-                        paddle::operators::PReluGradKernel<
-                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h
index a6197d354833a2f4173003ad2a970c487ad9a65b..f9076cbc678534fd5490fa0d7adeac0e50909a39 100644
--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,32 +10,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
-
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 using platform::Transform;
 
-template <typename T>
-class PReluFunctor {
- public:
-  explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
-
-  HOSTDEVICE T operator()(const T& x) const {
-    if (x > 0)
-      return x;
-    else
-      return x * (*alpha_);
-  }
-
- private:
-  const T* alpha_;
-};
-
 template <typename DeviceContext, typename T>
 class PReluKernel : public framework::OpKernel<T> {
  public:
@@ -50,53 +31,93 @@ class PReluKernel : public framework::OpKernel<T> {
     const T* x_ptr = x->data<T>();
     T* o_ptr = out->mutable_data<T>(context.GetPlace());
 
-    auto* alpha_ptr = alpha->data<T>();
+    const T* alpha_ptr = alpha->data<T>();
+    std::string mode = context.Attr<std::string>("mode");
 
     int numel = x->numel();
-
-    Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x_ptr,
-          x_ptr + numel, o_ptr, PReluFunctor<T>(alpha_ptr));
-  }
-};
-
-template <typename T>
-class PReluGradFunctor {
- public:
-  explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {}
-
-  HOSTDEVICE T operator()(const T& out, const T& dout) const {
-    if (out > 0)
-      return dout;
-    else
-      return dout * (*alpha_);
+    auto dim = x->dims();
+    int index = 0;
+    int i = 0;
+    int temp = 0;
+    if (mode == "channel") {
+      for (i = 0; i < numel; i++) {
+        temp = numel / (dim[0] * dim[1]);
+        index = (i / temp) % dim[1];
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+      }
+    } else if (mode == "element") {
+      for (i = 0; i < numel; i++) {
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i];
+      }
+    } else {
+      for (i = 0; i < numel; i++) {
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i];
+      }
+    }
   }
-
- private:
-  const T* alpha_;
 };
 
 template <typename DeviceContext, typename T>
 class PReluGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
     auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
     auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-
+    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
     auto* out = context.Input<Tensor>("Out");
     auto* alpha = context.Input<Tensor>("Alpha");
-    auto* alpha_ptr = alpha->data<T>();
-
-    T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
+    const T* alpha_ptr = alpha->data<T>();
+    const T* x_ptr = x->data<T>();
     const T* dout_ptr = dout->data<T>();
     const T* out_ptr = out->data<T>();
-    int numel = dx->numel();
-
-    Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), out_ptr,
-          out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor<T>(alpha_ptr));
-
-    // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
+    std::string mode = context.Attr<std::string>("mode");
+    int numel = x->numel();
+    auto dim = x->dims();
+    int index = 0;
+    int i = 0;
+    int temp = 0;
+    if (dx) {
+      T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
+      if (mode == "channel") {
+        for (i = 0; i < numel; i++) {
+          temp = numel / (dim[0] * dim[1]);
+          index = (i / temp) % dim[1];
+          dx_ptr[i] =
+              out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
+        }
+      } else if (mode == "element") {
+        for (i = 0; i < numel; i++) {
+          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[i] * dout_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i];
+        }
+      }
+    }
+
+    index = 0;
+    if (dalpha) {
+      T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
+      if (mode == "channel") {
+        for (i = 0; i < numel; i++) {
+          temp = numel / (dim[0] * dim[1]);
+          index = (i / temp) % dim[1];
+          dalpha_ptr[index] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      } else if (mode == "element") {
+        for (i = 0; i < numel; i++) {
+          dalpha_ptr[i] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          dalpha_ptr[0] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      }
+    }
+
+    // TODO(Guanzhong): add GPU kernels
   }
 };
 
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index b44d5f898013a5d27467bd80118c29a886d5e8b3..1be9fe47af71d31ce2e0eba807ea4a43601f8aca 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -38,7 +38,7 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Input", "(Tensor), The input tensor.");
     AddOutput("Out",
               "(Tensor), The shape of input tensor, the data type of the shape"
-              " is int64_t, will be on the same device with the input Tensor.");
+              " is int32_t, will be on the same device with the input Tensor.");
     AddComment(R"DOC(
 Shape Operator
 
@@ -53,5 +53,5 @@ Get the shape of input tensor. Only support CPU input Tensor now.
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>,
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>,
                        ops::ShapeKernel<float>, ops::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
index 7736a2a1e13cfa5d445411b3efac7669a7bf23a2..d8fa9515abf807ab4ae3c47e8e1b1cf7f30440a8 100644
--- a/paddle/fluid/operators/shape_op.cu
+++ b/paddle/fluid/operators/shape_op.cu
@@ -15,6 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/shape_op.h"
 
 REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
-                        paddle::operators::ShapeKernel<int64_t>,
+                        paddle::operators::ShapeKernel<int32_t>,
                         paddle::operators::ShapeKernel<float>,
                         paddle::operators::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h
index 3be86b66a538e7b38a5d59095fee7e7636364bce..0d510a505583c55e26a26bfc6e5d6192899b3d9e 100644
--- a/paddle/fluid/operators/shape_op.h
+++ b/paddle/fluid/operators/shape_op.h
@@ -27,7 +27,7 @@ class ShapeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in_t = ctx.Input<Tensor>("Input");
     auto* out_t = ctx.Output<Tensor>("Out");
-    auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace());
+    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
     auto in_dims = in_t->dims();
     for (int i = 0; i < in_dims.size(); ++i) {
       out_data[i] = in_dims[i];
diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
index 1205bd0587f32caae04c27ecea581fc17988507f..cf1eeb017d666f605a431aa54637d8cbc99c7c46 100644
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -31,16 +31,12 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     Out->mutable_data<T>(context.GetPlace());
 
-    auto dims = X->dims();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
-    framework::LoDTensor flattened_x;
-    framework::LoDTensor flattened_out;
-    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+    int rank = X->dims().size();
+    Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
+    Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
 
     math::SoftmaxFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), &flattened_x,
-        &flattened_out);
+        context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
   }
 };
 
@@ -55,18 +51,14 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     dX->mutable_data<T>(context.GetPlace());
 
-    auto dims = Out->dims();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
-    framework::LoDTensor flattened_out;
-    framework::LoDTensor flattened_d_out;
-    framework::LoDTensor flattened_d_x;
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
-    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
-    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
+    int rank = Out->dims().size();
+    Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
+    Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1);
+    Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1);
 
     math::SoftmaxGradFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), &flattened_out,
-        &flattened_d_out, &flattened_d_x);
+        context.template device_context<DeviceContext>(), &Out_2d, &dOut_2d,
+        &dX_2d);
   }
 };
 
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index d0286719b9ea1aa671294f519051ac1e269c4e93..652a6ec7a4e2e823b28f39b449570cd375e88e18 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -270,12 +270,13 @@ struct EventItem {
   double min_time;
   double max_time;
   double ave_time;
+  float ratio;
 };
 
 // Print results
 void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
                    const std::string& sorted_domain, const size_t name_width,
-                   const size_t data_width) {
+                   const size_t data_width, double total) {
   // Output header information
   std::cout << "\n------------------------->"
             << "     Profiling Report     "
@@ -300,7 +301,8 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
             << "Calls" << std::setw(data_width) << "Total"
             << std::setw(data_width) << "Min." << std::setw(data_width)
-            << "Max." << std::setw(data_width) << "Ave." << std::endl;
+            << "Max." << std::setw(data_width) << "Ave."
+            << std::setw(data_width) << "Ratio." << std::endl;
   for (size_t i = 0; i < events_table.size(); ++i) {
     for (size_t j = 0; j < events_table[i].size(); ++j) {
       const EventItem& event_item = events_table[i][j];
@@ -309,7 +311,9 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
                 << std::setw(data_width) << event_item.total_time
                 << std::setw(data_width) << event_item.min_time
                 << std::setw(data_width) << event_item.max_time
-                << std::setw(data_width) << event_item.ave_time << std::endl;
+                << std::setw(data_width) << event_item.ave_time
+                << std::setw(data_width) << event_item.total_time / total
+                << std::endl;
     }
   }
   std::cout << std::endl;
@@ -359,6 +363,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
 
   std::vector<std::vector<EventItem>> events_table;
   size_t max_name_width = 0;
+  double total = 0.;  // the total time
   for (size_t i = 0; i < events.size(); i++) {
     std::list<Event> pushed_events;
     std::vector<EventItem> event_items;
@@ -379,6 +384,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
                                g_state == ProfilerState::kAll)
                                   ? rit->CudaElapsedMs(events[i][j])
                                   : rit->CpuElapsedMs(events[i][j]);
+          total += event_time;
 
           std::string event_name =
               "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
@@ -387,7 +393,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
           if (event_idx.find(event_name) == event_idx.end()) {
             event_idx[event_name] = event_items.size();
             EventItem event_item = {event_name, 1,          event_time,
-                                    event_time, event_time, event_time};
+                                    event_time, event_time, event_time,
+                                    0.};
             event_items.push_back(event_item);
           } else {
             int index = event_idx[event_name];
@@ -431,7 +438,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
   }
 
   // Print report
-  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
+  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total);
 }
 
 void DisableProfiler(EventSortingKey sorted_key,
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 2199f5311fd3728e624fc222a1b876eb947cc0aa..be623703c2480774bb04a6bc0c5b00b699d7bb16 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) {
              std::string ser(seriralized);
              self.SetAttr(name, ser);
            })
-      .def("block_attr", &pd::OpDesc::GetBlockAttr)
+      .def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
+      .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
       .def("check_attrs", &pd::OpDesc::CheckAttrs)
       .def("infer_shape", &pd::OpDesc::InferShape)
       .def("infer_var_type", &pd::OpDesc::InferVarType)
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 1ae05dec8de6b9cbc9568f0f4d437833be520f8d..9aac3c7fc16ae1ded2700662764895385b043130 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -122,7 +122,7 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'cpu_deterministic'
+        "dist_threadpool_size", 'cpu_deterministic'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 6b739745119836a80518d7bc6138b37101e74431..fd6a76dd0cfa347328d87093884e5cd324395497 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -344,7 +344,7 @@ def _append_backward_ops_(block,
         grad_sub_block_list = []
         # If the op has its own sub-block, deal with the sub-block first
         if op.has_attr("sub_block"):
-            sub_block = program.block(op.block_attr("sub_block"))
+            sub_block = program.block(op.block_attr_id("sub_block"))
             grad_sub_block = program.create_block()
             grad_sub_block._set_forward_block_idx(sub_block.idx)
             cb = _callback_lookup_(op)
@@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
     for op_idx in range(start_op_idx, block.desc.op_size()):
         op_desc = block.desc.op(op_idx)
         if op_desc.has_attr("sub_block"):
-            sub_block = block.program.block(op_desc.block_attr("sub_block"))
+            sub_block = block.program.block(op_desc.block_attr_id("sub_block"))
             _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
         new_vars = set()
         # create new gradient variables
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 3d7c29c6ea33ebc49b071681c5f9746b97445aa3..45b3abb88c9431f52705bb62df2c32779dd0cf9d 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -476,23 +476,25 @@ class Operator(object):
                  attrs=None):
         self.block = block
         self.desc = desc
-        self.attrs = attrs
-        if self.attrs is None:
-            self.attrs = dict()
+        # note: not add self.attrs here:
+        # https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
+        op_attrs = attrs
+        if op_attrs is None:
+            op_attrs = dict()
         del attrs
 
         op_maker = core.op_proto_and_checker_maker
 
-        if op_maker.kOpRoleAttrName() not in self.attrs:
-            self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
+        if op_maker.kOpRoleAttrName() not in op_attrs:
+            op_attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
 
         role_var_name = op_maker.kOpRoleVarAttrName()
         if len(self.block.program.
-               op_role_var) != 0 and role_var_name not in self.attrs:
-            self.attrs[role_var_name] = self.block.program.op_role_var
+               op_role_var) != 0 and role_var_name not in op_attrs:
+            op_attrs[role_var_name] = self.block.program.op_role_var
 
-        if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0:
-            del self.attrs[role_var_name]
+        if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
+            del op_attrs[role_var_name]
 
         if len(self.desc.type()) != 0:
             return
@@ -576,15 +578,14 @@ class Operator(object):
                     arg.op = self
                 self.desc.set_output(out_proto.name, out_arg_names)
 
-        if self.attrs is not None:
-            if not isinstance(self.attrs, dict):
+        if op_attrs is not None:
+            if not isinstance(op_attrs, dict):
                 raise TypeError("'attrs' should be a dict.")
             for attr in proto.attrs:
                 attr_name = attr.name
-                if (attr_name not in self.attrs) or (
-                        self.attrs[attr_name] is None):
+                if (attr_name not in op_attrs) or (op_attrs[attr_name] is None):
                     continue
-                attr_val = self.attrs[attr_name]
+                attr_val = op_attrs[attr_name]
                 self._update_desc_attr(attr_name, attr_val)
 
         self.desc.check_attrs()
@@ -732,7 +733,6 @@ class Operator(object):
         Raises:
             ValueError: If the type of value doesn't match with desc.attr_type(name).
         """
-        self.attrs[name] = val
         self._update_desc_attr(name, val)
 
     def _update_desc_attr(self, name, val):
@@ -774,9 +774,9 @@ class Operator(object):
         """
         return self.desc.attr(name)
 
-    def block_attr(self, name):
+    def block_attr_id(self, name):
         """
-        Get the block attribute by name.
+        Get the block attribute's id by name.
 
         Args:
             name(str): the attribute name.
@@ -784,22 +784,74 @@ class Operator(object):
         Returns:
             int: the block index.
         """
-        return self.desc.block_attr(name)
+        return self.desc.block_attr_id(name)
+
+    def block_attr(self, name):
+        """
+        Get the block attribute  by name.
+
+        Args:
+            name(str): the attribute name.
+
+        Returns:
+            block: the block attribute.
+        """
+
+        id = self.block_attr_id(name)
+        assert (id >= 0 and id < len(self.block.program.blocks))
+        return self.block.program.blocks[id]
+
+    def blocks_attr(self, name):
+        """
+        Get the blocks attribute  by name.
+
+        Args:
+            name(str): the attribute name.
+
+        Returns:
+            list: list of the blocks attribute.
+        """
+        attrs = []
+        for i in self.blocks_attr_ids(name):
+            assert (i >= 0 and i < len(self.block.program.blocks))
+            attrs.append(self.block.program.blocks[i])
+
+        return attrs
+
+    def blocks_attr_ids(self, name):
+        """
+        Get the blocks attribute's ids by name.
+
+        Args:
+            name(str): the attribute name.
+
+        Returns:
+            list: list of the blocks ids.
+        """
+
+        return self.desc.blocks_attr_ids(name)
 
     def all_attrs(self):
         """
         Get the attribute dict.
 
         Returns:
-            dict: The Operator's attribute dict.
+            dict: The Operator's attribute dict, name->attr.
         """
         attr_names = self.attr_names
         attr_map = {}
         for n in attr_names:
-            if n == 'sub_block':
+            attr_type = self.desc.attr_type(n)
+            if attr_type == core.AttrType.BLOCK:
                 attr_map[n] = self.block_attr(n)
-            else:
-                attr_map[n] = self.attr(n)
+                continue
+
+            if attr_type == core.AttrType.BLOCKS:
+                attr_map[n] = self.blocks_attr(n)
+                continue
+
+            attr_map[n] = self.attr(n)
+
         return attr_map
 
 
@@ -1518,11 +1570,17 @@ class Program(object):
             The two code snippets above will generate same programs.
         """
         if for_test:
-            p = self.inference_optimize()
+            p = self.inference_optimize(export_for_deployment=False)
         else:
             p = Program()
+            p.current_block_idx = self.current_block_idx
+            p._seed = self._seed
             p.desc = core.ProgramDesc(self.desc)
-            p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())]
+            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+
+            p._current_role = self._current_role
+            p._op_role_var = self._op_role_var
+
             p._sync_with_cpp()
 
         p._copy_param_info_from(self)
@@ -1578,7 +1636,7 @@ class Program(object):
         res._sync_with_cpp()
         return res
 
-    def inference_optimize(self):
+    def inference_optimize(self, export_for_deployment=True):
         """
         This method will create a new program and do following adjustments on it:
         1. Remove all reader variables and their creator ops if exist.
@@ -1589,6 +1647,10 @@ class Program(object):
         attribute of operators to :code:`True`. All the :code:`Parameter`
         information will be lost.
 
+        Args:
+            export_for_deployment(bool): remove the read ops that are added by py_reader
+                                        for cpp inference library
+
         Notes: This API is a very low level API. Use
         :code:`Program.clone(for_test=True)` instead.
 
@@ -1603,16 +1665,17 @@ class Program(object):
         # remove all readers and the read_op if exist
         read_op_idx = 0
         root_block = res.desc.block(0)
-        while True:
-            if read_op_idx >= root_block.op_size() or root_block.op(
-                    read_op_idx).type() == 'read':
-                break
-            read_op_idx += 1
-        if read_op_idx < root_block.op_size():
-            root_block._remove_op(0, read_op_idx + 1)
-        for var in root_block.all_vars():
-            if var.type() == core.VarDesc.VarType.READER:
-                root_block._remove_var(var.name())
+        if export_for_deployment:
+            while True:
+                if read_op_idx >= root_block.op_size() or root_block.op(
+                        read_op_idx).type() == 'read':
+                    break
+                read_op_idx += 1
+            if read_op_idx < root_block.op_size():
+                root_block._remove_op(0, read_op_idx + 1)
+            for var in root_block.all_vars():
+                if var.type() == core.VarDesc.VarType.READER:
+                    root_block._remove_var(var.name())
 
         # change all `is_test` attributes to True
         for i in range(res.desc.num_blocks()):
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 83290ac60839b855a0348696ad6898af7335e2fc..6dedbae7a6586f862328c7f23d0aea6ba5022614 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -15,7 +15,6 @@
 from . import framework
 import numpy as np
 import contextlib
-from .framework import convert_np_dtype_to_dtype_
 from .core import VarDesc
 
 __all__ = [
@@ -264,7 +263,8 @@ class NormalInitializer(Initializer):
                 "dtype": int(var.dtype),
                 "mean": self._mean,
                 "std": self._std_dev,
-                "seed": self._seed
+                "seed": self._seed,
+                "use_mkldnn": False
             })
         var.op = op
         return op
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 55e517f1f4a48beddf62fe875c2064da09b2f6b0..af734210323913a36f861380dc38a98253aca0a1 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -555,7 +555,8 @@ def save_inference_model(dirname,
                          executor,
                          main_program=None,
                          model_filename=None,
-                         params_filename=None):
+                         params_filename=None,
+                         export_for_deployment=True):
     """
     Prune the given `main_program` to build a new program especially for inference,
     and then save it and all related parameters to given `dirname` by the `executor`.
@@ -577,6 +578,8 @@ def save_inference_model(dirname,
         params_filename(str|None): The name of file to save all related parameters.
                                    If it is setted None, parameters will be saved
                                    in separate files .
+        export_for_deployment(bool): remove the read ops that are added by py_reader
+                                    for cpp inference lib. Default True
 
     Returns:
         None
@@ -643,7 +646,8 @@ def save_inference_model(dirname,
     copy_program.desc.flush()
 
     pruned_program = copy_program.prune(targets=target_vars)
-    inference_program = pruned_program.inference_optimize()
+    inference_program = pruned_program.inference_optimize(
+        export_for_deployment=export_for_deployment)
     fetch_var_names = [v.name for v in target_vars]
 
     prepend_feed_ops(inference_program, feeded_var_names)
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 0800c02d9ec7dc51d4f22d3b04f3ae90315ebad8..b996c8368862184f9bc8b177f3b6e43aebdfb007 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -20,7 +20,9 @@ from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 from . import tensor
 from . import nn
+from . import ops
 import math
+import numpy
 from functools import reduce
 
 __all__ = [
@@ -264,10 +266,11 @@ def detection_output(loc,
         prior_box_var=prior_box_var,
         target_box=loc,
         code_type='decode_center_size')
-    old_shape = scores.shape
-    scores = nn.reshape(x=scores, shape=(-1, old_shape[-1]))
+    compile_shape = scores.shape
+    run_shape = ops.shape(scores)
+    scores = nn.flatten(x=scores, axis=2)
     scores = nn.softmax(input=scores)
-    scores = nn.reshape(x=scores, shape=old_shape)
+    scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
     scores.stop_gradient = True
     nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
@@ -677,9 +680,10 @@ def ssd_loss(location,
         raise ValueError("Only support mining_type == max_negative now.")
 
     num, num_prior, num_class = confidence.shape
+    conf_shape = ops.shape(confidence)
 
     def __reshape_to_2d(var):
-        return nn.reshape(x=var, shape=[-1, var.shape[-1]])
+        return nn.flatten(x=var, axis=2)
 
     # 1. Find matched boundding box by prior box.
     #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
@@ -690,7 +694,8 @@ def ssd_loss(location,
 
     # 2. Compute confidence for mining hard examples
     # 2.1. Get the target label based on matched indices
-    gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    gt_label = nn.reshape(
+        x=gt_label, shape=(len(gt_label.shape) - 1) * (0, ) + (-1, 1))
     gt_label.stop_gradient = True
     target_label, _ = target_assign(
         gt_label, matched_indices, mismatch_value=background_label)
@@ -701,9 +706,12 @@ def ssd_loss(location,
     target_label = __reshape_to_2d(target_label)
     target_label.stop_gradient = True
     conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
-
     # 3. Mining hard examples
-    conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior))
+    conf_loss = nn.reshape(
+        x=conf_loss,
+        shape=(num, num_prior),
+        actual_shape=ops.slice(
+            conf_shape, axes=[0], starts=[0], ends=[2]))
     conf_loss.stop_gradient = True
     neg_indices = helper.create_tmp_variable(dtype='int32')
     dtype = matched_indices.dtype
@@ -772,7 +780,11 @@ def ssd_loss(location,
     # 5.3 Compute overall weighted loss.
     loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
     # reshape to [N, Np], N is the batch size and Np is the prior box number.
-    loss = nn.reshape(x=loss, shape=[-1, num_prior])
+    loss = nn.reshape(
+        x=loss,
+        shape=(num, num_prior),
+        actual_shape=ops.slice(
+            conf_shape, axes=[0], starts=[0], ends=[2]))
     loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
     if normalize:
         normalizer = nn.reduce_sum(target_loc_weight)
@@ -1005,13 +1017,7 @@ def multi_box_head(inputs,
     """
 
     def _reshape_with_axis_(input, axis=1):
-        if not (axis > 0 and axis < len(input.shape)):
-            raise ValueError("The axis should be smaller than "
-                             "the arity of input and bigger than 0.")
-        new_shape = [
-            -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
-        ]
-        out = nn.reshape(x=input, shape=new_shape)
+        out = nn.flatten(x=input, axis=axis)
         return out
 
     def _is_list_or_tuple_(data):
@@ -1101,11 +1107,13 @@ def multi_box_head(inputs,
             stride=stride)
 
         mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
-        new_shape = [
+        compile_shape = [
             mbox_loc.shape[0],
             mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
         ]
-        mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
+        run_shape = tensor.assign(numpy.array([0, -1, 4]).astype("int32"))
+        mbox_loc_flatten = nn.reshape(
+            mbox_loc, shape=compile_shape, actual_shape=run_shape)
         mbox_locs.append(mbox_loc_flatten)
 
         # get conf
@@ -1117,11 +1125,15 @@ def multi_box_head(inputs,
             padding=pad,
             stride=stride)
         conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
-        new_shape = [
+        new_shape = [0, -1, num_classes]
+        compile_shape = [
             conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
             conf_loc.shape[3] / num_classes, num_classes
         ]
-        conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape)
+        run_shape = tensor.assign(
+            numpy.array([0, -1, num_classes]).astype("int32"))
+        conf_loc_flatten = nn.reshape(
+            conf_loc, shape=compile_shape, actual_shape=run_shape)
         mbox_confs.append(conf_loc_flatten)
 
     if len(box_results) == 1:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 0960b54123d36b0ddcced6841384399cab816f48..3e50fc91d92d0b338f2c0282e3e61bfda6e5edbc 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -112,6 +112,8 @@ __all__ = [
     'log',
     'crop',
     'rank_loss',
+    'prelu',
+    'flatten',
 ]
 
 
@@ -5361,3 +5363,123 @@ def rank_loss(label, left, right, name=None):
                 "Right": right},
         outputs={'Out': out})
     return out
+
+
+def prelu(x, mode, param_attr=None, name=None):
+    """
+    Equation:
+
+        y = \max(0, x) + alpha \min(0, x)
+
+    Args:
+        x (Variable): The input tensor.
+	  param_attr(ParamAttr|None): The parameter attribute for the learnable
+                                    weight (alpha).
+        mode (string): The mode for weight sharing
+		       all: all elements share same weight
+ 		       channel:elements in a channel share same weight
+ 		       element:each element has a weight
+	  name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically. 
+
+    Returns:
+        Variable: The output tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+         x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
+            mode = 'channel'
+            output = fluid.layers.prelu(x,mode)
+    """
+    helper = LayerHelper('prelu', **locals())
+    if mode not in ['all', 'channel', 'element']:
+        raise ValueError('mode should be one of all, channel, element.')
+    alpha_shape = [1]
+    if mode == 'channel':
+        alpha_shape = [1, x.shape[1], 1, 1]
+    elif mode == 'element':
+        alpha_shape = x.shape
+    dtype = helper.input_dtype(input_param_name='x')
+    alpha = helper.create_parameter(
+        attr=param_attr,
+        shape=alpha_shape,
+        dtype='float32',
+        is_bias=False,
+        default_initializer=Constant(1.0))
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="prelu",
+        inputs={"X": x,
+                'Alpha': alpha},
+        attrs={"mode": mode},
+        outputs={"Out": out})
+    return out
+
+
+def flatten(x, axis=1, name=None):
+    """
+    **Flatten layer**
+    Flattens the input tensor into a 2D matrix.
+
+    Examples:
+    Case 1:
+      Given
+        X.shape = (3, 100, 100, 4)
+      and
+        axis = 2
+      We get:
+        Out.shape = (3 * 100, 4 * 100)
+    
+    Case 2:
+      Given
+        X.shape = (3, 100, 100, 4)
+      and
+        axis = 0
+      We get:
+        Out.shape = (1, 3 * 100 * 100 * 4)
+
+    Args:
+        x (Variable): A tensor of rank >= axis.
+        axis (int): Indicate up to which input dimensions (exclusive) should 
+                    be flattened to the outer dimension of the output. 
+                    The value for axis must be in the range [0, R], where R
+                    is the rank of the input tensor. When axis = 0, the shape
+                    of the output tensor is (1, (d_0 X d_1 ... d_n), where the
+                    shape of the input tensor is (d_0, d_1, ... d_n).
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: A 2D tensor with the contents of the input tensor, with input
+                  dimensions up to axis flattened to the outer dimension of
+                  the output and remaining input dimensions flattened into the
+                  inner dimension of the output.
+
+    Raises:
+        ValueError: If x is not a variable.
+        ValueError: If axis is not in range [0, rank(x)]. 
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[4, 4, 3], dtype="float32")
+            out = fluid.layers.flatten(x=x, axis=2)
+    """
+    helper = LayerHelper('flatten', **locals())
+
+    if not (isinstance(x, Variable)):
+        raise ValueError("The input x should be a Variable")
+
+    if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0:
+        raise ValueError("The axis should be a int, and in range [0, rank(x)]")
+
+    out = helper.create_tmp_variable(x.dtype)
+    helper.append_op(
+        type='flatten',
+        inputs={"X": x},
+        outputs={'Out': out},
+        attrs={"axis": axis})
+    return out
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index a6a911721dfa31e5fb8d57645071af42adc968be..e7dd85ef5c3641be04261dc5d4166fa8452b4200 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -59,8 +59,8 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
-    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
-    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
+    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
+    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
index c5b9e92d69133e593a2ce223e83006eda590daa5..86ac159323a5f9f6149ce5ed4437402eb885c6bc 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
@@ -105,5 +105,107 @@ class TestCrossEntropyOp3(OpTest):
             ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
 
 
+class TestCrossEntropyOp4(OpTest):
+    """Test high rank tensor cross-entropy with discrete one-hot labels.
+    """
+
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        shape = [10, 2, 4]
+        ins_num = np.prod(np.array(shape))
+        class_num = 10
+
+        X_2d = randomize_probability(ins_num, class_num, dtype='float64')
+
+        label_2d = np.random.randint(0, class_num, (ins_num, 1), dtype="int64")
+        cross_entropy_2d = np.asmatrix(
+            [[-np.log(X_2d[i][label_2d[i][0]])] for i in range(X_2d.shape[0])],
+            dtype="float64")
+
+        X = X_2d.reshape(shape + [class_num])
+        label = label_2d.reshape(shape + [1])
+        cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": False}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
+
+
+class TestCrossEntropyOp5(OpTest):
+    """Test high rank tensor cross-entropy with vectorized soft labels.
+    """
+
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        shape = [4, 3]
+        ins_num = np.prod(np.array(shape))
+        class_num = 37
+
+        X_2d = randomize_probability(ins_num, class_num)
+        label_2d = np.random.uniform(0.1, 1.0,
+                                     [ins_num, class_num]).astype("float32")
+        label_2d /= label_2d.sum(axis=1, keepdims=True)
+        cross_entropy_2d = (-label_2d * np.log(X_2d)).sum(
+            axis=1, keepdims=True).astype("float32")
+
+        X = X_2d.reshape(shape + [class_num])
+        label = label_2d.reshape(shape + [class_num])
+        cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": True}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+
+
+class TestCrossEntropyOp6(OpTest):
+    """Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
+    """
+
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        shape = [4, 3, 2]
+        ins_num = np.prod(np.array(shape))
+        class_num = 17
+
+        X_2d = randomize_probability(ins_num, class_num)
+        label_index_2d = np.random.randint(
+            0, class_num, (ins_num), dtype="int32")
+        label_2d = np.zeros(X_2d.shape)
+        label_2d[np.arange(ins_num), label_index_2d] = 1
+
+        cross_entropy_2d = np.asmatrix(
+            [[-np.log(X_2d[i][label_index_2d[i]])]
+             for i in range(X_2d.shape[0])],
+            dtype="float32")
+
+        X = X_2d.reshape(shape + [class_num])
+        label = label_2d.reshape(shape + [class_num])
+        cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
+
+        self.inputs = {"X": X, "Label": label.astype(np.float32)}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": True}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
new file mode 100644
index 0000000000000000000000000000000000000000..8603d3a5b3b5d368fe87b8dcf9dc7363f95caf86
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -0,0 +1,196 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+import collections
+
+SEED = 1
+DTYPE = "float32"
+paddle.dataset.mnist.fetch()
+
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def get_model(batch_size):
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = cnn_model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=batch_size)
+    opt.minimize(avg_cost)
+    return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+def operator_equal(a, b):
+    for k, v in a.__dict__.iteritems():
+        if isinstance(v, fluid.framework.Program) or \
+                isinstance(v, fluid.framework.Block):
+            continue
+
+        elif isinstance(v, core.OpDesc):
+            if v.serialize_to_string() != b.__dict__[k].serialize_to_string():
+                raise ValueError("In operator_equal not equal:{0}\n".format(k))
+
+        elif isinstance(v, collections.OrderedDict):
+            v0 = sorted(v.iteritems(), key=lambda x: x[0])
+            v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
+
+            if v0 != v1:
+                raise ValueError("In operator_equal not equal:{0}\n".format(k))
+
+        elif (v != b.__dict__[k]):
+            raise ValueError("In operator_equal not equal:{0}\n".format(k))
+
+    return True
+
+
+def block_equal(a, b):
+    for k, v in a.__dict__.iteritems():
+        if isinstance(v, core.ProgramDesc) or isinstance(
+                v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
+            continue
+
+        elif k == "ops":
+            for i in range(0, len(a.ops)):
+                if not operator_equal(a.ops[i], b.ops[i]):
+                    raise ValueError("In block_equal not equal:{0}\n".format(k))
+            assert (len(a.ops) == len(b.ops))
+
+        elif isinstance(v, collections.OrderedDict):
+            v0 = sorted(v.iteritems(), key=lambda x: x[0])
+            v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
+
+            if v0 != v1:
+                raise ValueError("In block_equal not equal:{0}\n".format(k))
+
+        elif (v != b.__dict__[k]):
+            raise ValueError("In block_equal not equal:{0}\n".format(k))
+
+    return True
+
+
+def program_equal(a, b):
+    for k, v in a.__dict__.iteritems():
+        if isinstance(v, core.ProgramDesc):
+            continue
+
+        elif k == 'blocks':
+            for i in range(0, len(a.blocks)):
+                if not block_equal(a.blocks[i], b.blocks[i]):
+                    raise ValueError("In operator_equal not equal:{0}\n".format(
+                        k))
+                    return False
+            assert (len(a.blocks) == len(b.blocks))
+
+        elif (v != b.__dict__[k]):
+            raise ValueError("In program_equal not equal:{0}\n".format(k))
+
+    return True
+
+
+class TestDistMnist(unittest.TestCase):
+    def test_desc_clone(self):
+        get_model(batch_size=20)
+
+        pserver_endpoints = "127.0.0.1:9123"
+        trainers = 1
+        current_endpoint = "127.0.0.1:9123"
+        t = get_transpiler(0,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+        main = pserver_prog.clone()
+        startup = startup_prog.clone()
+
+        self.assertTrue(program_equal(main, pserver_prog))
+        self.assertTrue(program_equal(startup, startup_prog))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 1deccbe4af542cce32f0cbb4299cbe73d4df1111..4379463aca4443eb7a886ce78446440cc59f3b30 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase):
         self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
         self._python_interp = "python"
 
-    def start_pserver(self, model_file):
+    def start_pserver(self, model_file, check_error_log):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
         ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
             (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
@@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase):
             (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
              self._trainers)
 
+        ps0_pipe = subprocess.PIPE
+        ps1_pipe = subprocess.PIPE
+        if check_error_log:
+            print("ps0_cmd:", ps0_cmd)
+            print("ps1_cmd:", ps1_cmd)
+            ps0_pipe = open("/tmp/ps0_err.log", "wb")
+            ps1_pipe = open("/tmp/ps1_err.log", "wb")
+
         ps0_proc = subprocess.Popen(
-            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
         ps1_proc = subprocess.Popen(
-            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return ps0_proc, ps1_proc
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)
+
+        if not check_error_log:
+            return ps0_proc, ps1_proc, None, None
+        else:
+            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
 
     def _wait_ps_ready(self, pid):
         retry_times = 50
@@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase):
                                  (e, retry_times))
                 retry_times -= 1
 
-    def check_with_place(self, model_file, delta=1e-3):
+    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
         # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
         required_envs = {
             "PATH": os.getenv("PATH"),
@@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase):
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
             "FLAGS_cudnn_deterministic": "1"
         }
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
         # Run local to get a base line
         env_local = {"CUDA_VISIBLE_DEVICES": "0"}
         env_local.update(required_envs)
         local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
             (self._python_interp, model_file,
              "127.0.0.1:1234", "127.0.0.1:1234", 1)
-        local_proc = subprocess.Popen(
-            local_cmd.split(" "),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            env=env_local)
+        if not check_error_log:
+            local_proc = subprocess.Popen(
+                local_cmd.split(" "),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                env=env_local)
+        else:
+            print("trainer cmd:", local_cmd)
+            err_log = open("/tmp/trainer.err.log", "wb")
+            local_proc = subprocess.Popen(
+                local_cmd.split(" "),
+                stdout=subprocess.PIPE,
+                stderr=err_log,
+                env=env_local)
+
         local_proc.wait()
         out, err = local_proc.communicate()
         local_ret = out
@@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase):
         sys.stderr.write('local_stderr: %s\n' % err)
 
         # Run dist train to compare with local results
-        ps0, ps1 = self.start_pserver(model_file)
+        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
+                                                          check_error_log)
         self._wait_ps_ready(ps0.pid)
         self._wait_ps_ready(ps1.pid)
 
@@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase):
         env1.update(required_envs)
         FNULL = open(os.devnull, 'w')
 
+        tr0_pipe = subprocess.PIPE
+        tr1_pipe = subprocess.PIPE
+        if check_error_log:
+            print("tr0_cmd:", tr0_cmd)
+            print("tr1_cmd:", tr1_cmd)
+            tr0_pipe = open("/tmp/tr0_err.log", "wb")
+            tr1_pipe = open("/tmp/tr1_err.log", "wb")
+
         tr0_proc = subprocess.Popen(
             tr0_cmd.split(" "),
             stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stderr=tr0_pipe,
             env=env0)
         tr1_proc = subprocess.Popen(
             tr1_cmd.split(" "),
             stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stderr=tr1_pipe,
             env=env1)
 
         tr0_proc.wait()
@@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase):
         local_first_loss = eval(local_lines[0])[0]
         local_last_loss = eval(local_lines[1])[0]
 
+        # close trainer file
+        if check_error_log:
+            tr0_pipe.close()
+            tr1_pipe.close()
+
+            ps0_pipe.close()
+            ps1_pipe.close()
         # FIXME: use terminate() instead of sigkill.
         os.kill(ps0.pid, signal.SIGKILL)
         os.kill(ps1.pid, signal.SIGKILL)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index aab8969a96ff69d1a306506337a0e009f14758b9..55aa923f5ab229bc8e9a0b891e0ac9c2ec49d31b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -26,6 +26,12 @@ from paddle.fluid.layers.io import ListenAndServ
 from paddle.fluid.layers.io import Recv
 from paddle.fluid.layers.io import Send
 
+from paddle.fluid import core
+
+RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
+)
+RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
+
 
 class TestSendOp(unittest.TestCase):
     def test_send(self):
@@ -89,18 +95,29 @@ class TestSendOp(unittest.TestCase):
     def init_client(self, place, port):
         main = fluid.Program()
         with fluid.program_guard(main):
+            main.global_block().append_op(
+                type="fetch_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": ["127.0.0.1:{0}".format(port)],
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
             x = layers.data(
                 shape=[32, 32],
                 dtype='float32',
                 name='X',
                 append_batch_size=False)
             fluid.initializer.Constant(value=2.3)(x, main.global_block())
+
             get_var = main.global_block().create_var(
                 name="scale_0.tmp_0",  # server side var
                 dtype="float32",
                 persistable=False,
                 shape=[32, 32])
             fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
+
             Send("127.0.0.1:%d" % port, [x])
             o = Recv("127.0.0.1:%d" % port, [get_var])
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index b6f4f0726f13c804e56929836689258a66254fcf..124abf4ccde98d565b3286c72793c91fd26bb71c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
+
 import unittest
 import paddle.fluid as fluid
 from paddle.fluid.transpiler.distribute_transpiler import delete_ops
 import traceback
+import collections
 
 
 class TranspilerTest(unittest.TestCase):
@@ -51,9 +54,18 @@ class TranspilerTest(unittest.TestCase):
         self.origin_prog = main.clone()
         return main
 
-    def get_trainer(self, config=None, sync_mode=True):
-        t = self._transpiler_instance(config, sync_mode)
-        return t.get_trainer_program()
+    def get_trainer(self, config=None):
+        src = fluid.default_startup_program().clone()
+
+        t = self._transpiler_instance(config)
+
+        trainer_main = t.get_trainer_program()
+        trainer_startup = fluid.default_startup_program()
+
+        assert (src.num_blocks == 1)
+        assert (trainer_startup.num_blocks == src.num_blocks)
+
+        return trainer_main, trainer_startup
 
     def get_pserver(self, ep, config=None, sync_mode=True):
         t = self._transpiler_instance(config, sync_mode)
@@ -89,7 +101,21 @@ class TestBasicModel(TranspilerTest):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         pserver2, startup2 = self.get_pserver(self.pserver2_ep)
 
-        trainer = self.get_trainer()
+        trainer, trainer_startup = self.get_trainer()
+
+        # splited var blocks should be in startup program
+        self.assertTrue("fc_w.block0" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w.block1" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_b" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w@GRAD" not in trainer_startup.global_block().vars)
+        self.assertTrue("fc_b@GRAD" not in trainer_startup.global_block().vars)
+
+        src = [op.type for op in trainer_startup.global_block().ops]
+        dst = ['fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv', \
+               'fetch_barrier', 'concat']
+
+        self.assertEqual(src, dst)
 
         self.assertEqual([op.type for op in trainer.global_block().ops], [
             'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
@@ -140,7 +166,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
         pserver, startup = self.get_pserver(self.pserver1_ep, config)
         pserver2, startup2 = self.get_pserver(self.pserver2_ep, config)
 
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
 
         self.assertEqual([op.type for op in trainer.global_block().ops], [
             'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
@@ -224,7 +250,7 @@ class TestLRDecay(TranspilerTest):
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         self.assertEqual(len(pserver.blocks), 4)
         lr_decay_ops = [op.type for op in pserver.blocks[1].ops]
@@ -254,12 +280,12 @@ class TestLRDecayConditional(TranspilerTest):
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         serv_op = pserver.blocks[0].ops[0]
         sub_blocks = []
         optimize_blocks = []
-        for b in serv_op.attrs["optimize_blocks"]:
+        for b in serv_op.all_attrs()["optimize_blocks"]:
             optimize_blocks.append(b.idx)
         for b in pserver.blocks:
             if b.idx not in optimize_blocks:
@@ -303,7 +329,7 @@ class TestL2Decay(TranspilerTest):
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         self.assertEqual(len(pserver.blocks), 3)
         self.assertEqual([op.type for op in pserver.blocks[1].ops],
@@ -338,7 +364,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
 
     def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
 
         self.assertEqual(len(pserver.blocks), 9)
         self.assertEqual([op.type for op in pserver.blocks[1].ops], [
@@ -362,12 +388,13 @@ class TestL2DecayWithPiecewise(TranspilerTest):
 
 class TestDistLookupTableBase(TranspilerTest):
     def network_with_table(self, is_sparse, is_distributed):
+        self.table_size = 1000
+        self.emb_size = 64
+
         def emb_pool(ids):
-            table_size = 1000
-            emb_size = 64
             emb = fluid.layers.embedding(
                 input=ids,
-                size=[table_size, emb_size],
+                size=[self.table_size, self.emb_size],
                 dtype='float32',
                 param_attr='shared_w',  # share parameter
                 is_sparse=is_sparse,
@@ -412,7 +439,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["sum", "adam", "scale", "scale"])
 
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
@@ -450,7 +477,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
@@ -483,7 +510,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["adam", "scale", "scale"])
 
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
@@ -522,7 +549,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
@@ -536,6 +563,22 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
 
+class TestDistLookupTableSliceSize(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config)
+
+        self.assertTrue(self.transpiler.has_distributed_lookup_table)
+        lookup_table_var = pserver1.global_block().vars[
+            self.transpiler.table_name]
+        row_size = lookup_table_var.shape[0]
+        calc_row_size = int(math.ceil(self.table_size / self.pservers))
+        self.assertEqual(row_size, calc_row_size)
+
+
 class TestRMSPropOptimizer(TranspilerTest):
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
diff --git a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
index 3f547f3c484bf034a87823a75d946ef130a5cb70..099e6e60642e9637f8f3648696e844c667e1c406 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
@@ -22,6 +22,7 @@ def fully_connected_naive(input, weights, bias_data=None):
     w_h, w_c = weights.shape
 
     x_data = np.reshape(input, [in_n, in_c * in_h * in_w])
+    # this transpose should be implemented at C code
     w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w)))
     result = None
 
@@ -43,15 +44,11 @@ class TestFCMKLDNNOp(OpTest):
     def setUp(self):
         self.op_type = "fc"
         self.use_mkldnn = True
-        self.with_bias = True
         self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
 
         self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
 
-        self.attrs = {
-            'use_mkldnn': self.use_mkldnn,
-            'with_bias': self.with_bias
-        }
+        self.attrs = {'use_mkldnn': self.use_mkldnn, }
 
         self.outputs = {
             'Out': fully_connected_naive(self.matrix.input, self.matrix.weights)
@@ -85,13 +82,11 @@ class TestFCMKLDNNOp3(TestFCMKLDNNOp):
 
 class TestFCMKLDNNOp4(TestFCMKLDNNOp):
     def init_op_type(self):
-        self.with_bias = False
         self.matrix = MatrixGenerate(2, 32, 48, 2, 2)
 
 
 class TestFCMKLDNNOp4(TestFCMKLDNNOp):
     def init_op_type(self):
-        self.with_bias = False
         self.matrix = MatrixGenerate(2, 32, 1000, 6, 6)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bb920710a9b10f3a8159bad3b33dd15ffbada19
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def fc_refer(matrix, with_bias):
+    in_n, in_c, in_h, in_w = matrix.input.shape
+    w_i, w_o = matrix.weights.shape
+
+    x_data = np.reshape(matrix.input, [in_n, in_c * in_h * in_w])
+    w_data = np.reshape(matrix.weights, [w_i, w_o])
+    b_data = np.reshape(matrix.bias, [1, w_o])
+    result = None
+
+    if with_bias:
+        result = np.dot(x_data, w_data) + b_data
+    else:
+        result = np.dot(x_data, w_data)
+
+    return result
+
+
+class MatrixGenerate:
+    def __init__(self, mb, ic, oc, h, w):
+        self.input = np.random.random((mb, ic, h, w)).astype("float32")
+        self.weights = np.random.random((ic * h * w, oc)).astype("float32")
+        self.bias = np.random.random((1, oc)).astype("float32")
+
+
+class TestFCOp(OpTest):
+    def setUp(self):
+        self.op_type = "fc"
+        self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
+
+        self.with_bias = True
+        if self.with_bias:
+            self.inputs = {
+                'Input': self.matrix.input,
+                'W': self.matrix.weights,
+                'Bias': self.matrix.bias
+            }
+        else:
+            self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
+
+        self.attrs = {'use_mkldnn': False}
+
+        self.outputs = {'Out': fc_refer(self.matrix, self.with_bias)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFCOpBiasBoth(TestFCOp):
+    def init_shapes(self, mb, ic, oc, h, w):
+        for with_bias in {True, False}:
+            self.with_bias = with_bias
+            self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+
+
+class TestFCOp1(TestFCOpBiasBoth):
+    def init_op_type(self):
+        self.init_shapes(2, 8, 10, 1, 1)
+
+
+class TestFCOp2(TestFCOpBiasBoth):
+    def init_op_type(self):
+        self.init_shapes(4, 5, 6, 2, 2)
+
+
+class TestFCOp4(TestFCOpBiasBoth):
+    def init_op_type(self):
+        self.init_shapes(1, 32, 64, 3, 3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 8f2dac786d03334642714d221710a20833939c7a..07fd0575d333dacf309620a883e4052c6126739f 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -21,6 +21,7 @@ import paddle.fluid.nets as nets
 from paddle.fluid.framework import Program, program_guard, default_main_program
 from paddle.fluid.param_attr import ParamAttr
 import decorators
+from paddle.fluid.initializer import Constant
 
 
 class TestBook(unittest.TestCase):
@@ -465,6 +466,17 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_flatten(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x',
+                append_batch_size=False,
+                shape=[4, 4, 3],
+                dtype="float32")
+            out = layers.flatten(x, axis=1, name="flatten")
+            self.assertIsNotNone(out)
+
     def test_shape(self):
         program = Program()
         with program_guard(program):
@@ -474,6 +486,20 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_prelu(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(
+                name="input", shape=[5, 200, 100, 100], dtype="float32")
+            mode = 'channel'
+            out = layers.prelu(
+                input,
+                mode,
+                param_attr=ParamAttr(initializer=Constant(1.0)),
+                name='prelu')
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py
index ae19a553bb826002c562c15ee07759391d51b4d8..cb7de3fc93c0379ea50c88044876d6a8ee617a69 100644
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -20,30 +20,58 @@ from op_test import OpTest
 class PReluTest(OpTest):
     def setUp(self):
         self.op_type = "prelu"
-        x_np = np.random.normal(size=(10, 10)).astype("float32")
-
-        for pos, val in np.ndenumerate(x_np):
-            # Since zero point in prelu is not differentiable, avoid randomize
-            # zero.
-            while abs(val) < 1e-3:
-                x_np[pos] = np.random.normal()
-                val = x_np[pos]
-
-        x_np_sign = np.sign(x_np)
-        x_np = x_np_sign * np.maximum(x_np, .005)
-        alpha_np = np.array([.1], dtype="float32")
-        self.inputs = {'X': x_np, 'Alpha': alpha_np}
+        self.initTestCase()
+        x_np = np.random.normal(size=(3, 5, 5, 10)).astype("float32")
+
+        # Since zero point in prelu is not differentiable, avoid randomize
+        # zero.
+        x_np[np.abs(x_np) < 0.005] = 0.02
+
+        if self.attrs == {'mode': "all"}:
+            alpha_np = np.random.rand(1).astype("float32")
+            self.inputs = {'X': x_np, 'Alpha': alpha_np}
+        elif self.attrs == {'mode': "channel"}:
+            alpha_np = np.random.rand(1, x_np.shape[1], 1, 1).astype("float32")
+            self.inputs = {'X': x_np, 'Alpha': alpha_np}
+        else:
+            alpha_np = np.random.rand(*x_np.shape).astype("float32")
+            self.inputs = {'X': x_np, 'Alpha': alpha_np}
+
         out_np = np.maximum(self.inputs['X'], 0.)
         out_np = out_np + np.minimum(self.inputs['X'],
                                      0.) * self.inputs['Alpha']
         assert out_np is not self.inputs['X']
         self.outputs = {'Out': out_np}
 
+    def initTestCase(self):
+        self.attrs = {'mode': "channel"}
+
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X', 'Alpha'], 'Out')
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(['Alpha'], 'Out', no_grad_set=set('X'))
+
+    def test_check_grad_ignore_alpha(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set('Alpha'))
+
+
+class TestCase1(PReluTest):
+    def initTestCase(self):
+        self.attrs = {'mode': "all"}
+
+
+class TestCase2(PReluTest):
+    def initTestCase(self):
+        self.attrs = {'mode': "channel"}
+
+
+class TestCase3(PReluTest):
+    def initTestCase(self):
+        self.attrs = {'mode': "element"}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py
index c51a48239330621d8e008415f81361616467cabf..0997afc97a97333c914a3027103ec48733b410dc 100644
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py
@@ -17,6 +17,7 @@ import unittest
 
 from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name
 import paddle.fluid.layers as layers
+import paddle.fluid as fluid
 
 main_program = default_main_program()
 
@@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase):
         new_program = main_program.clone()
         self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
 
+    def test_program_inference_optimize(self):
+        def net():
+            reader = fluid.layers.py_reader(
+                capacity=10,
+                shapes=[[-1, 10], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'],
+                use_double_buffer=True)
+            in_data, label = fluid.layers.read_file(reader)
+            predict_label = fluid.layers.fc(in_data, size=2, act='softmax')
+            loss = fluid.layers.mean(
+                fluid.layers.cross_entropy(
+                    input=predict_label, label=label))
+
+            optimizer = fluid.optimizer.Adam()
+            optimizer.minimize(loss)
+
+        startup_program = fluid.Program()
+        main_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            net()
+        no_read_program = main_program.inference_optimize()
+        keep_read_program = main_program.inference_optimize(
+            export_for_deployment=False)
+        no_read_ops = no_read_program.global_block().ops
+        keep_read_ops = keep_read_program.global_block().ops
+        self.assertEqual(len(keep_read_ops) - len(no_read_ops), 2)
+        self.assertEqual(keep_read_ops[0].type, 'create_double_buffer_reader')
+        self.assertEqual(keep_read_ops[1].type, 'read')
+
+        for i in range(len(no_read_ops)):
+            self.assertEqual(no_read_ops[i].type, keep_read_ops[i + 2].type)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index 621dd681345d2c185c0644de917b8bcd529f9937..9853fb4e9a89944bfdf2954e3d3d86fef92ac93c 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase):
         self.assertEqual(8, len(op.attr_names()))
 
         op.set_block_attr("block_attr", program_desc.block(0))
-        self.assertEqual(0, op.block_attr("block_attr"))
+        self.assertEqual(0, op.block_attr_id("block_attr"))
 
         mul_op = block.append_op()
         mul_op.set_type("mul")
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 1bb86acdf8398fff63e5f55148ddb43b6b4da5be..ce4709f23b752cc061f3b767a262f82378b86707 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -195,6 +195,9 @@ class DistributeTranspiler(object):
         if program is None:
             program = default_main_program()
         self.origin_program = program
+        self.origin_startup_program = default_startup_program().clone()
+
+        self.startup_program = default_startup_program()
         self.trainer_num = trainers
         self.sync_mode = sync_mode
         self.trainer_id = trainer_id
@@ -205,10 +208,10 @@ class DistributeTranspiler(object):
         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
         self.has_distributed_lookup_table = self._has_distributed_lookup_table()
 
-        # split and create vars, then put splited vars in dicts for later use.
+        # step 1: split and create vars, then put splited vars in dicts for later use.
         self._init_splited_vars()
 
-        # step 3.1: insert send op to send gradient vars to parameter servers
+        # step 2: insert send op to send gradient vars to parameter servers
         ps_dispatcher.reset()
         send_vars = []
 
@@ -265,7 +268,7 @@ class DistributeTranspiler(object):
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
-        # step 3.2: insert recv op to receive parameters from parameter server
+        # step 3: insert recv op to receive parameters from parameter server
         recv_vars = []
         for _, var in enumerate(send_vars):
             recv_vars.append(self.grad_param_mapping[var])
@@ -312,6 +315,8 @@ class DistributeTranspiler(object):
                 outputs={"Out": [orig_param]},
                 attrs={"axis": 0})
 
+        self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
+
         if self.has_distributed_lookup_table:
             self._replace_lookup_table_op_with_prefetch(program,
                                                         pserver_endpoints)
@@ -328,8 +333,78 @@ class DistributeTranspiler(object):
         # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         self.origin_program.__str__()
+
         return self.origin_program
 
+    def _get_trainer_startup_program(self,
+                                     recv_vars,
+                                     eplist,
+                                     startup_program=None):
+        """
+        Get transpiled trainer side startup program.
+
+        Args:
+            startup_program(Program): Startup program.
+
+        Returns:
+            Program: trainer side startup program.
+        """
+        if startup_program is None:
+            startup_program = self.startup_program
+
+        # FIXME(gongwb): delete not need ops.
+        # note that: some parameter is not trainable and those ops can't be deleted.
+
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            # Get the eplist of recv vars
+            eps = []
+            for var in splited_var:
+                index = [v.name for v in recv_vars].index(var.name)
+                eps.append(eplist[index])
+
+            for var in splited_var:
+                if startup_program.global_block().has_var(var.name):
+                    continue
+
+                startup_program.global_block().create_var(
+                    name=var.name,
+                    persistable=False,
+                    type=var.type,
+                    dtype=var.dtype,
+                    shape=var.shape,
+                    lod_level=var.lod_level)
+
+            op = startup_program.global_block().append_op(
+                type="recv",
+                inputs={},
+                outputs={"Out": splited_var},
+                attrs={
+                    "epmap": eps,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
+        startup_program.global_block().append_op(
+            type="fetch_barrier",
+            inputs={},
+            outputs={},
+            attrs={
+                "endpoints": self.pserver_endpoints,
+                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+            })
+
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            #add concat ops to merge splited parameters received from parameter servers.
+            if len(splited_var) <= 1:
+                continue
+            orig_param = startup_program.global_block().vars[varname]
+            startup_program.global_block().append_op(
+                type="concat",
+                inputs={"X": splited_var},
+                outputs={"Out": [orig_param]},
+                attrs={"axis": 0})
+
+        return startup_program
+
     def get_pserver_program(self, endpoint):
         """
         Get parameter server side program.
@@ -530,7 +605,10 @@ class DistributeTranspiler(object):
         pserver_program._sync_with_cpp()
         return pserver_program
 
-    def get_startup_program(self, endpoint, pserver_program):
+    def get_startup_program(self,
+                            endpoint,
+                            pserver_program,
+                            startup_program=None):
         """
         Get startup program for current parameter server.
         Modify operator input variables if there are variables that
@@ -540,12 +618,17 @@ class DistributeTranspiler(object):
             endpoint (str): current pserver endpoint.
             pserver_program (Program): call get_pserver_program first and
                 pass the result here.
+            startup_program (Program): if pass None, will use
+                default_startup_program
 
         Returns:
             Program: parameter server side startup program.
         """
         s_prog = Program()
-        orig_s_prog = default_startup_program()
+        if not startup_program:
+            orig_s_prog = default_startup_program()
+        else:
+            orig_s_prog = startup_program
         s_prog.random_seed = orig_s_prog.random_seed
         params = self.param_grad_ep_mapping[endpoint]["params"]
 
@@ -568,14 +651,16 @@ class DistributeTranspiler(object):
             new_outputs = dict()
             # do not append startup op if var is not on this pserver
             op_on_pserver = False
-            for key in op.output_names:
-                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
-                if newname:
-                    op_on_pserver = True
-                    new_outputs[key] = created_var_map[newname]
-                elif op.output(key)[0] in pserver_vars:
-                    op_on_pserver = True
-                    new_outputs[key] = pserver_vars[op.output(key)[0]]
+            # TODO(gongwb): remove this line.
+            if op.type not in ["recv", "fetch_barrier", "concat"]:
+                for key in op.output_names:
+                    newname, _ = _get_splited_name_and_shape(op.output(key)[0])
+                    if newname:
+                        op_on_pserver = True
+                        new_outputs[key] = created_var_map[newname]
+                    elif op.output(key)[0] in pserver_vars:
+                        op_on_pserver = True
+                        new_outputs[key] = pserver_vars[op.output(key)[0]]
 
             if op_on_pserver:
                 # most startup program ops have no inputs
@@ -584,12 +669,12 @@ class DistributeTranspiler(object):
                 if op.type in [
                         "gaussian_random", "fill_constant", "uniform_random"
                 ]:
-                    op.attrs["shape"] = new_outputs["Out"].shape
+                    op.set_attr("shape", list(new_outputs["Out"].shape))
                 s_prog.global_block().append_op(
                     type=op.type,
                     inputs=new_inputs,
                     outputs=new_outputs,
-                    attrs=op.attrs)
+                    attrs=op.all_attrs())
         return s_prog
 
     # ====================== private transpiler functions =====================
@@ -603,7 +688,7 @@ class DistributeTranspiler(object):
         self.table_name = None
         for op in self.origin_program.global_block().ops:
             if op.type == LOOKUP_TABLE_TYPE:
-                if op.attrs['is_distributed'] is True:
+                if op.attr('is_distributed') is True:
                     if self.table_name is None:
                         self.table_name = op.input("W")[0]
                     if self.table_name != op.input("W")[0]:
@@ -877,9 +962,15 @@ class DistributeTranspiler(object):
         # create table param and grad var in pserver program
         origin_param_var = self.origin_program.global_block().vars[
             self.table_name]
+
+        zero_dim = int(
+            math.ceil(origin_param_var.shape[0] / len(self.pserver_endpoints)))
+        table_shape = list(origin_param_var.shape)
+        table_shape[0] = zero_dim
+
         param_var = pserver_program.global_block().create_var(
             name=origin_param_var.name,
-            shape=origin_param_var.shape,
+            shape=table_shape,
             dtype=origin_param_var.dtype,
             type=core.VarDesc.VarType.SELECTED_ROWS,
             persistable=True)
@@ -1008,7 +1099,6 @@ class DistributeTranspiler(object):
                     var_mapping[varname] = \
                         [program.global_block().var(orig_var.name)]
                 continue
-
             var_mapping[varname] = []
             orig_shape = orig_var.shape
             orig_dim1_flatten = 1
@@ -1263,7 +1353,7 @@ class DistributeTranspiler(object):
             type=opt_op.type,
             inputs=new_inputs,
             outputs=outputs,
-            attrs=opt_op.attrs)
+            attrs=opt_op.all_attrs())
 
     def _is_splited_grad_var(self, var, var_dict):
         grad_block = None
@@ -1296,7 +1386,7 @@ class DistributeTranspiler(object):
                     block._clone_variable(var)
 
         return block.append_op(
-            type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
+            type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs())
 
     def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
         program = optimize_block.program
@@ -1337,7 +1427,7 @@ class DistributeTranspiler(object):
             type=opt_op.type,
             inputs=inputs,
             outputs=outputs,
-            attrs=opt_op.attrs)
+            attrs=opt_op.all_attrs())
 
     def _is_op_connected(self, op1, op2):
         # If one op's input is another op's output or
@@ -1442,8 +1532,8 @@ class DistributeTranspiler(object):
         # optimize
         op_maker = core.op_proto_and_checker_maker
         optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
-        if op_maker.kOpRoleAttrName() in op.attrs and \
-            int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+        if op_maker.kOpRoleAttrName() in op.attr_names and \
+                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
             return True
         return False
 
@@ -1466,8 +1556,8 @@ class DistributeTranspiler(object):
                 # and op_role_var to get the pair.
                 for input_name in op.input_arg_names:
                     if input_name.find("@GRAD") != -1 and \
-                        op.attrs[RPC_OP_ROLE_ATTR_NAME]:
-                        param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0]
+                        op.attr(RPC_OP_ROLE_ATTR_NAME):
+                        param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
                         params_grads.append([
                             origin_var_dict[param_name],
                             origin_var_dict[input_name]