merge develop

c999895e · nhzlx · 27695029 · 896a37b6 · c999895e · c999895e
78 changed file
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -102,7 +102,6 @@ set(COMMON_FLAGS
    -fno-omit-frame-pointer
    -Wall
    -Wextra
-    -Werror
    -Wnon-virtual-dtor
    -Wdelete-non-virtual-dtor
    -Wno-unused-parameter
@@ -115,6 +114,11 @@ set(COMMON_FLAGS
    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )

+# https://github.com/PaddlePaddle/Paddle/issues/12773
+if (NOT WIN32)
+list(APPEND COMMON_FLAGS -Werror)
+endif()
+
 set(GPU_COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer

--- a/doc/fluid/design/others/graph_survey.md
+++ b/doc/fluid/design/others/graph_survey.md
@@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs):



-Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null.
+Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null.

 Symbol contains a data member, std::vector<NodeEntry> outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph.


--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -78,7 +78,7 @@ paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', '
 paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
-paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
+paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
 paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
@@ -153,6 +153,7 @@ paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'n
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
 paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -250,7 +251,6 @@ paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwarg
 paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -5,8 +5,12 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
+cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter)
+cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
+

 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
+cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto)
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+bool VarOutLinksToOp(Node* node, const std::string& op_type) {
+  for (auto* out : node->outputs) {
+    if (out->IsOp() && out->Op()->Type() == op_type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void BuildFCPattern(PDPattern* pattern) {
+  // make sure the selected MUL op has one input argument is a parameter.
+  auto* mul_parameter_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() && node->outputs.size() == 1UL &&
+               node->outputs.front()->Op()->Type() == "mul" && node->Var() &&
+               node->Var()->Persistable();  // check is a parameter
+      },
+      "mul_weight" /*name*/);
+
+  auto* mul_tmp_input_var = pattern->NewNode(
+      [](Node* node) {
+        bool result =
+            node->IsVar() && node->outputs.size() >= 1UL && node->Var() &&
+            !node->Var()->Persistable();  // this input is not an parameter.
+        if (!result) return false;
+        // check whether one output is MUL op.
+        for (auto* op : node->outputs) {
+          if (op->IsOp() && op->Op()->Type() == "mul") return true;
+        }
+        return false;
+      },
+      "mul_tmp_var" /*name*/);
+
+  // select a MUL op
+  auto* mul_op = pattern->NewNode(
+      [](Node* node) {
+        return node->IsOp() &&               // start from an Op
+               node->Op()->Type() == "mul";  // type is mul
+        // the output should be consumed only by one element_add, that check
+        // leaves in a Var PDNode.
+      },
+      "mul" /*name*/);
+
+  // make sure the MUL op's output has only one consumer and links to an
+  // ELEMENTWISE_ADD op.
+  auto* mul_out_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() &&                  // starts from a Var
+               node->outputs.size() == 1UL &&    // only has one consumer
+               node->outputs.front()->IsOp() &&  // check basic logic
+               node->Var() &&                    // not a ControlDepVar
+               node->outputs.front()->Op()->Type() ==
+                   "elementwise_add";  // a very strong validation
+      },
+      "mul_out");
+  // this check is not essential, just to make the corresponding variable Node
+  // retrival easier.
+  auto* elementwise_add_tmp_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() && node->outputs.size() >= 1UL && node->Var() &&
+               VarOutLinksToOp(node, "elementwise_add");
+      },
+      "elementwise_add_tmpvar");
+
+  // select an ELEMENTWISE_ADD op
+  auto* elementwise_add_op = pattern->NewNode(
+      [](Node* node) {
+        return node->IsOp() && node->Op()->Type() == "elementwise_add";
+      },
+      "elementwise_add" /*name*/);
+
+  // get the ELEMENTWISE_ADD op's output
+  auto* elementwise_add_out_var = pattern->NewNode(
+      [](Node* node) {
+        return node->IsVar() && node->inputs.size() == 1UL && node->Var() &&
+               node->inputs.front()->Op()->Type() == "elementwise_add";
+      },
+      "elementwise_add_out");
+
+  pattern->AddEdge(mul_parameter_var, mul_op);
+  pattern->AddEdge(mul_tmp_input_var, mul_op);
+  pattern->AddEdge(mul_op, mul_out_var);
+  pattern->AddEdge(mul_out_var, elementwise_add_op);
+  pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op);
+  pattern->AddEdge(elementwise_add_op, elementwise_add_out_var);
+}
+
+// Replace the node `from` in the links to `to`
+bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
+  for (auto*& n : *links) {
+    if (n == from) {
+      n = to;
+      return true;
+    }
+  }
+  return false;
+}
+
+std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+
+  std::unordered_set<Node*> nodes2delete;
+
+  GraphPatternDetecter gpd;
+  BuildFCPattern(gpd.mutable_pattern());
+
+#define GET_NODE(id)                                             \
+  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetriveNode(#id)), \
+                 "pattern has no Node called %s", #id);          \
+  auto* id = subgraph.at(gpd.pattern().RetriveNode(#id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+
+  auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle FC fuse";
+    // Currently, there is no FC op available, so I will just simulate the
+    // scenerio.
+    // FC's fusion is simple, just op fuse, no need to process the
+    // parameters.
+    GET_NODE(mul_tmp_var);             // x
+    GET_NODE(mul_weight);              // Y
+    GET_NODE(elementwise_add_tmpvar);  // bias
+    GET_NODE(elementwise_add_out);     // Out
+    GET_NODE(mul);                     // MUL op
+    GET_NODE(elementwise_add);         // ELEMENT_ADD op
+    GET_NODE(mul_out);                 // tmp
+#undef GET_NODE
+
+    // Create an FC Node.
+    OpDesc desc;
+    std::string fc_x_in = mul_tmp_var->Name();
+    std::string fc_Y_in = mul_weight->Name();
+    std::string fc_bias_in = elementwise_add_tmpvar->Name();
+    std::string fc_out = elementwise_add_out->Name();
+    desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
+    desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
+    desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
+    desc.SetOutput("Out", std::vector<std::string>({fc_out}));
+    desc.SetType("fc");
+    auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
+    fc_node->inputs =
+        std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
+    fc_node->outputs.push_back(elementwise_add_out);
+
+    // Update link relatons
+    PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
+    PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
+    PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
+                                elementwise_add, fc_node));
+    PADDLE_ENFORCE(
+        LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
+
+    // Drop old nodes
+    graph->RemoveNode(mul);
+    graph->RemoveNode(elementwise_add);
+    graph->RemoveNode(mul_out);  // tmp variable
+  };
+
+  gpd(graph.get(), handler);
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass);
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the MUL and ELEMENTWISE_ADD to a FCOp.
+ */
+class FCFusePass : public Pass {
+ public:
+  virtual ~FCFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetInput("Xs", inputs);
+  op->SetOutput("Ys", outputs);
+}
+
+// a->OP0->b
+// a->OP1->c
+// (b, c)->mul->d
+// (d, e)->elementwise_add->f
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e", "f"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "c") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "OP1", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"c"}));
+  SetOp(&prog, "mul", std::vector<std::string>({"b", "c"}),
+        std::vector<std::string>({"d"}));
+  SetOp(&prog, "elementwise_add", std::vector<std::string>({"d", "e"}),
+        std::vector<std::string>({"f"}));
+
+  return prog;
+}
+
+TEST(FCFusePass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("fc_fuse_pass");
+
+  int pre_nodes = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int after_nodes = graph->Nodes().size();
+
+  // Remove 3 Nodes: MUL,ELEMENTWISE_ADD, mul_out
+  // Add 1 Node: FC
+  EXPECT_EQ(pre_nodes - 2, after_nodes);
+
+  // Assert fc op in newly generated graph
+  int fc_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "fc") {
+      ++fc_count;
+    }
+  }
+  EXPECT_EQ(fc_count, 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(fc_fuse_pass);
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -98,11 +98,13 @@ class Graph {

  // Create a normal variable with non-null VarDesc.
  ir::Node *CreateVarNode(VarDesc *var_desc) {
+    PADDLE_ENFORCE(var_desc);
    return AddNode(new ir::Node(var_desc));
  }

  // Create a normal runnable operator with OpDesc.
  ir::Node *CreateOpNode(OpDesc *op_desc) {
+    PADDLE_ENFORCE(op_desc);
    return AddNode(new ir::Node(op_desc));
  }

@@ -134,6 +136,14 @@ class Graph {
    return ret;
  }

+  void RemoveNode(ir::Node *node) {
+    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
+    node_set_.erase(node);
+    nodes_.erase(node);
+  }
+
+  const ProgramDesc &program() const { return program_; }
+
 private:
  // This method takes ownership of `node`.
  ir::Node *AddNode(ir::Node *node) {
@@ -143,12 +153,6 @@ class Graph {
    return node;
  }

-  void RemoveNode(ir::Node *node) {
-    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
-    node_set_.erase(node);
-    nodes_.erase(node);
-  }
-
  // NOTE: program_ shouldn't be exposed to user.
  const ProgramDesc &program_;
  std::map<std::string, boost::any> attrs_;

--- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.cc
@@ -25,12 +25,30 @@ namespace paddle {
 namespace framework {
 namespace ir {

+size_t PDPattern::id_ = 0UL;
+
 PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
+  if (!name.empty()) {
+    PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+                      "PDNode's name should be unique, get duplicate [%s]",
+                      name);
+  }
+
  nodes_.emplace_back(new PDNode(std::move(teller), name));
  auto* cur = nodes_.back().get();
+  node_map_[name] = cur;
  return cur;
 }

+PDNode* PDPattern::RetriveNode(const std::string& id) const {
+  auto it = node_map_.find(id);
+  if (it == node_map_.end()) {
+    return nullptr;
+  }
+
+  return it->second;
+}
+
 void PDPattern::AddEdge(PDNode* a, PDNode* b) {
  PADDLE_ENFORCE(a);
  PADDLE_ENFORCE(b);
@@ -51,15 +69,18 @@ void GraphPatternDetecter::operator()(Graph* graph,
 }

 bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
+  VLOG(4) << "mark pdnodes in graph";
  if (graph.Nodes().empty()) return false;

  for (auto& node : GraphTraits::DFS(graph)) {
    for (const auto& pdnode : pattern_.nodes()) {
      if (pdnode->Tell(&node)) {
+        VLOG(4) << "pdnode " << pdnode->name() << " marked";
        pdnodes2nodes_[pdnode.get()].insert(&node);
      }
    }
  }
+  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
  return !pdnodes2nodes_.empty();
 }

@@ -67,10 +88,20 @@ struct HitGroup {
  std::unordered_map<PDNode*, Node*> roles;

  bool Match(Node* node, PDNode* pat) {
+    if (nodes_.count(node)) {
+      if (!roles.count(pat)) return false;
+      return roles[pat] == node;
+    }
    return !roles.count(pat) || roles.at(pat) == node;
  }

-  void Register(Node* node, PDNode* pat) { roles[pat] = node; }
+  void Register(Node* node, PDNode* pat) {
+    roles[pat] = node;
+    nodes_.insert(node);
+  }
+
+ private:
+  std::unordered_set<Node*> nodes_;
 };

 // Tell whether Node a links to b.
@@ -104,6 +135,7 @@ GraphPatternDetecter::DetectPatterns() {
  // Extend a PDNode to subgraphs by deducing the connection relations defined
  // in edges of PDNodes.
  for (const auto& edge : pattern_.edges()) {
+    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
    // Each role has two PDNodes, which indicates two roles.
    // Detect two Nodes that can match these two roles and they are connected.
    auto& pre_groups = bi_records[step % 2];
@@ -127,6 +159,7 @@ GraphPatternDetecter::DetectPatterns() {
        }
      }
    }
+    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
  }

  for (auto& group : bi_records[step % 2]) {

--- a/paddle/fluid/framework/ir/graph_pattern_detecter.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.h
@@ -96,7 +96,8 @@ class PDPattern {

  void AddEdge(PDNode* a, PDNode* b);

-  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
+  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
+  PDNode* RetriveNode(const std::string& id) const;

  const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
  const std::vector<edge_t>& edges() const { return edges_; }
@@ -107,8 +108,12 @@ class PDPattern {
  FRIEND_TEST(PDPattern, NewNode);
 #endif

+  static std::string NewID() { return "pdnode-" + std::to_string(id_++); }
+
  std::vector<std::unique_ptr<PDNode>> nodes_;
  std::vector<edge_t> edges_;
+  std::unordered_map<std::string, PDNode*> node_map_;
+  static size_t id_;
 };

 /*

--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -200,9 +200,11 @@ TEST(GraphTest, WriteAfterWrite) {
      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
      control_dep2 = n->inputs[1];
      ASSERT_EQ(n->inputs.size(), 2);
-      ASSERT_EQ(control_dep1, control_dep2);
    }
  }
+  ASSERT_NE(control_dep1, nullptr);
+  ASSERT_NE(control_dep2, nullptr);
+  ASSERT_EQ(control_dep1, control_dep2);
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -25,6 +25,7 @@ static const char kGraphVizPath[] = "graph_viz_path";
 std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
+  VLOG(3) << "draw IR graph viz to " << graph_viz_path;
  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
  PADDLE_ENFORCE(fout->good());
  std::ostream& sout = *fout;

--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class InferCleanGraphPass : public Pass {
+ public:
+  virtual ~InferCleanGraphPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
+    PADDLE_ENFORCE(graph.get());
+
+    auto is_valid_node = [](Node* x) {
+      return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
+    };
+
+    std::unordered_set<Node*> invalid_nodes;
+    for (auto* node : graph->Nodes()) {
+      if (is_valid_node(node)) {
+        invalid_nodes.insert(node);
+      }
+    }
+
+    // remove nodes from the graph.
+    for (auto* node : invalid_nodes) {
+      graph->RemoveNode(node);
+    }
+
+    // clean edges.
+    for (auto* node : graph->Nodes()) {
+      CleanEdges(&node->inputs, invalid_nodes);
+      CleanEdges(&node->outputs, invalid_nodes);
+    }
+
+    return graph;
+  }
+
+  void CleanEdges(std::vector<Node*>* nodes,
+                  const std::unordered_set<Node*>& to_remove) const {
+    auto it = std::remove_if(nodes->begin(), nodes->end(),
+                             [&](Node* x) { return to_remove.count(x); });
+    nodes->erase(it, nodes->end());
+  }
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(infer_clean_graph_pass,
+              paddle::framework::ir::InferCleanGraphPass);
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -34,14 +34,15 @@ class Node {

  explicit Node(VarDesc* var_desc)
      : name_(var_desc->Name()),
-        var_desc_(var_desc),
+        var_desc_(new VarDesc(*var_desc)),
        op_desc_(nullptr),
        type_(Type::kVariable) {}

  explicit Node(OpDesc* op_desc)
      : name_(op_desc->Type()),
        var_desc_(nullptr),
-        op_desc_(op_desc),
+        op_desc_(new OpDesc(*op_desc)),  // TODO(panyx0718) the pointer in the
+                                         // original OpDesc might go out.
        type_(Type::kOperation) {}

  Type NodeType() const { return type_; }
@@ -50,12 +51,12 @@ class Node {

  VarDesc* Var() {
    PADDLE_ENFORCE(type_ == Type::kVariable);
-    return var_desc_;
+    return var_desc_.get();
  }

  OpDesc* Op() {
-    PADDLE_ENFORCE(type_ == Type::kOperation);
-    return op_desc_;
+    PADDLE_ENFORCE(IsOp());
+    return op_desc_.get();
  }

  bool IsOp() const { return type_ == Type::kOperation; }
@@ -66,8 +67,8 @@ class Node {

 protected:
  const std::string name_;
-  VarDesc* var_desc_;
-  OpDesc* op_desc_;
+  std::unique_ptr<VarDesc> var_desc_;
+  std::unique_ptr<OpDesc> op_desc_;
  Type type_;

 private:

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
+cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
 cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
+  analyzer.cc
+  helper.cc
+  # passes
  fluid_to_data_flow_graph_pass.cc
  data_flow_graph_to_fluid_pass.cc
  dfg_graphviz_draw_pass.cc
  tensorrt_subgraph_pass.cc
  tensorrt_subgraph_node_mark_pass.cc
-  analyzer.cc
-  helper.cc
+  fluid_to_ir_pass.cc
  model_store_pass.cc
-  DEPS framework_proto proto_desc)
-cc_test(test_node SRCS node_tester.cc DEPS analysis gflags glog gtest)
+  DEPS framework_proto proto_desc ir_pass_manager graph pass)
+
+cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)

@@ -27,19 +31,30 @@ function (inference_analysis_test TARGET)
        endif()
        cc_test(${TARGET}
                SRCS "${analysis_test_SRCS}"
-                DEPS analysis
+                DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass
                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
    endif(WITH_TESTING)
 endfunction(inference_analysis_test)

+cc_test(test_analyzer SRCS analyzer_tester.cc DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
+		  # ir
+		  fc_fuse_pass
+		  graph_viz_pass
+		  infer_clean_graph_pass
+		  graph_pattern_detecter
+		  pass
+        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+#set_tests_properties(test_analyzer PROPERTIES DEPENDS test_word2vec)
+#inference_api_test(test_analyzer SRC analyzer_tester.cc ARGS test_word2vec)
+
 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
 inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
 inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
 inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
 inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
-inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
 inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
 #include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
@@ -24,14 +25,15 @@

 namespace paddle {

-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
+DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false,
            "Enable subgraph to TensorRT engine for acceleration");

-DEFINE_string(inference_analysis_graphviz_log_root, "./",
+DEFINE_bool(IA_enable_ir, false, "Turn on IR support");
+
+DEFINE_string(IA_graphviz_log_root, "./",
              "Graphviz debuger for data flow graphs.");

-DEFINE_string(inference_analysis_output_storage_path, "",
-              "optimized model output path");
+DEFINE_string(IA_output_storage_path, "", "optimized model output path");

 namespace inference {
 namespace analysis {
@@ -40,8 +42,34 @@ class DfgPassManagerImpl final : public DfgPassManager {
 public:
  DfgPassManagerImpl() {
    // TODO(Superjomn) set the key with pass reprs.
+    LOG(INFO)
+        << "-----------------------------------------------------------------";
+    if (FLAGS_IA_enable_ir) {
+      AddPass("fluid-to-ir-pass", new FluidToIrPass);
+    } else {
      AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
-    if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
+    }
+    TryAddTensorRtPass();
+    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_IA_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
+    LOG(INFO)
+        << "-----------------------------------------------------------------";
+  }
+
+  std::string repr() const override { return "dfg-pass-manager"; }
+  std::string description() const override { return "DFG pass manager."; }
+
+ private:
+  void AddPass(const std::string& name, Pass* pass) {
+    VLOG(3) << "Adding pass " << name;
+    Register(name, pass);
+    AddGraphvizDebugerPass(pass);
+  }
+
+  void TryAddTensorRtPass() {
+    if (FLAGS_IA_enable_tensorrt_subgraph_engine) {
      auto trt_teller = [&](const Node* node) {
        std::unordered_set<std::string> teller_set(
            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
@@ -60,20 +88,6 @@ class DfgPassManagerImpl final : public DfgPassManager {
              new TensorRTSubgraphNodeMarkPass(trt_teller));
      AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
    }
-    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
-    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
-      AddPass("model-store-pass", new ModelStorePass);
-    }
-  }
-
-  std::string repr() const override { return "dfg-pass-manager"; }
-  std::string description() const override { return "DFG pass manager."; }
-
- private:
-  void AddPass(const std::string& name, Pass* pass) {
-    LOG(INFO) << "Adding pass " << name;
-    Register(name, pass);
-    AddGraphvizDebugerPass(pass);
  }

  // Add the graphviz debuger pass if the parent pass has one.

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -43,9 +43,10 @@ namespace paddle {

 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
-DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
-DECLARE_string(inference_analysis_graphviz_log_root);
-DECLARE_string(inference_analysis_output_storage_path);
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
+DECLARE_string(IA_graphviz_log_root);
+DECLARE_string(IA_output_storage_path);
+DECLARE_bool(IA_enable_ir);

 namespace inference {
 namespace analysis {

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -14,14 +14,16 @@

 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <google/protobuf/text_format.h>
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"

 namespace paddle {
 namespace inference {
 namespace analysis {

 TEST(Analyzer, analysis_without_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
  Argument argument;
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
@@ -29,13 +31,73 @@ TEST(Analyzer, analysis_without_tensorrt) {
 }

 TEST(Analyzer, analysis_with_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
  Argument argument;
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
  analyser.Run(&argument);
 }

+void TestWord2vecPrediction(const std::string& model_path) {
+  NativeConfig config;
+  config.model_dir = model_path;
+  config.use_gpu = false;
+  config.device = 0;
+  auto predictor =
+      ::paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+          config);
+
+  // One single batch
+
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> slots(4, tensor);
+  std::vector<PaddleTensor> outputs;
+  CHECK(predictor->Run(slots, &outputs));
+
+  PADDLE_ENFORCE(outputs.size(), 1UL);
+  // Check the output buffer size and result of each tid.
+  PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+  float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
+                     0.000932706};
+  const size_t num_elements = outputs.front().data.length() / sizeof(float);
+  // The outputs' buffers are in CPU memory.
+  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+    LOG(INFO) << "data: "
+              << static_cast<float*>(outputs.front().data.data())[i];
+    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
+                   result[i]);
+  }
+}
+
+// Turn on the IR pass supportion, run a real inference and check the result.
+TEST(Analyzer, SupportIRPass) {
+  FLAGS_IA_enable_ir = true;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
+  FLAGS_IA_output_storage_path = "./analysis.out";
+
+  Argument argument(FLAGS_inference_model_dir);
+  argument.model_output_store_path.reset(new std::string("./analysis.out"));
+
+  Analyzer analyzer;
+  analyzer.Run(&argument);
+
+  // Should get the transformed model stored to ./analysis.out
+  ASSERT_TRUE(PathExists("./analysis.out"));
+
+  // Inference from this path.
+  TestWord2vecPrediction("./analysis.out");
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
+
+USE_PASS(fc_fuse_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(infer_clean_graph_pass);
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -19,14 +19,16 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
 namespace analysis {
+using ir_node_t = framework::ir::Node;
+using ir_graph_t = framework::ir::Graph;

 // It is a better idea that the inputs and outputs of this graph is set manually
 // before, but there must be a Pass that helps to prune the unnecessary ops that
 // do not contribute to the given targets, so in this pass, analysis and get the
 // inputs and outputs is OK.
 void DataFlowGraph::Build() {
-  inputs.clear();
-  outputs.clear();
+  inputs_.clear();
+  outputs_.clear();
  std::unordered_set<Node *> ins;
  std::unordered_set<Node *> outs;
  for (auto &node : nodes.nodes()) {
@@ -42,18 +44,140 @@ void DataFlowGraph::Build() {
  // similarly, the nodes that in outs but not in ins is the graphs' outputs
  for (auto *in : ins) {
    if (!outs.count(in)) {
-      inputs.push_back(in);
+      inputs_.push_back(in);
    }
  }
  for (auto *out : outs) {
-    if (!outs.count(out)) {
-      outputs.push_back(out);
+    if (!ins.count(out)) {
+      outputs_.push_back(out);
    }
  }

  Clean();
 }

+void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) {
+  // insert vars
+  // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
+  // will keep updating to its latest alias during the graph-building.
+  std::unordered_map<std::string, size_t> var2id;
+  auto &main_block = prog.blocks(framework::kRootBlockIndex);
+  for (int i = 0; i < main_block.vars_size(); i++) {
+    const auto &var = main_block.vars(i);
+    auto *v = nodes.Create(Node::Type::kValue);
+    v->SetName(var.name());
+    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
+    v->SetPbMsg(var.SerializeAsString());
+    var2id[var.name()] = v->id();
+  }
+
+  // The variables in a SSA can only write once, so if a variable is written
+  // multiple times(quite common in our ProgramDesc design), multiple alias
+  // Nodes of this variable will be created, and each will just write once.
+
+  // An set that keep all the names of the variables(the original, not alias)
+  // that have been written(as outputs). Once an Op's output variable hit the
+  // set, it should create a new alias and update the global alias for this
+  // variable. And that make a Data Flow Graph a SSA.
+  std::unordered_set<Node *> unique_written_vars;
+  for (int i = 0; i < main_block.ops_size(); i++) {
+    const auto &op = main_block.ops(i);
+    auto *o = nodes.Create(Node::Type::kFunction);
+    o->SetName(op.type());
+    static_cast<Function *>(o)->SetFuncType(op.type());
+    // Link to the original protobuf message's memory, make it easier to
+    // generate from a data flow graph to fluid ProgramDesc.
+    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
+    o->SetPbMsg(op.SerializeAsString());
+
+    // set inputs and outputs
+    for (int j = 0; j < op.inputs_size(); j++) {
+      auto &in_var = op.inputs(j);
+      for (int k = 0; k < in_var.arguments_size(); k++) {
+        auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k)));
+        in->outlinks.push_back(o);
+        o->inlinks.push_back(in);
+        unique_written_vars.insert(in);
+      }
+    }
+    for (int j = 0; j < op.outputs_size(); j++) {
+      auto &out_var = op.outputs(j);
+      for (int k = 0; k < out_var.arguments_size(); k++) {
+        auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (unique_written_vars.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] =
+              out_alias->id();  // update variable's alias Node
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
+        out->inlinks.push_back(o);
+        o->outlinks.push_back(out);
+      }
+    }
+  }
+  // Analysis and extract the inputs and outputs of this graph.
+  Build();
+}
+
+void DataFlowGraph::Build(const framework::ir::Graph &graph) {
+  // Create nodes
+  std::unordered_map<ir_node_t *, Node *> ir_node_map;
+  for (auto *ir_node : graph.Nodes()) {
+    Node *x{nullptr};
+    if (ir_node->IsOp()) {
+      PADDLE_ENFORCE(ir_node->Op());
+      VLOG(4) << "get op " << ir_node << " " << ir_node->Name();
+      x = nodes.Create(Node::Type::kFunction);
+      x->attr("ir_node").Pointer() = ir_node;
+      PADDLE_ENFORCE(ir_node->Op()->Proto());
+      x->SetName(ir_node->Op()->Proto()->type());
+      x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString());
+    } else if (ir_node->IsVar()) {
+      // Not create a Node for IR ControlDepVar, considering Inference currently
+      // just used in single thread scenerio.
+      VLOG(4) << "get var " << ir_node->Name();
+      x = nodes.Create(Node::Type::kValue);
+      x->attr("ir_node").Pointer() = ir_node;
+      x->SetName(ir_node->Name());
+      // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString());
+    } else {
+      PADDLE_THROW("Failed to create an Node from IR, unknown type");
+    }
+    ir_node_map.emplace(ir_node, x);
+  }
+  VLOG(4) << "finish creating Nodes";
+
+  VLOG(4) << "to create edge";
+  // Create links
+  for (auto *ir_node : graph.Nodes()) {
+    auto it = ir_node_map.find(ir_node);
+    // Skip ControlDepVar.
+    if (it == ir_node_map.end()) continue;
+    auto *node = it->second;
+    for (auto *x : ir_node->inputs) {
+      if (!ir_node_map.count(x)) continue;
+      node->inlinks.push_back(ir_node_map.at(x));
+    }
+    for (auto *x : ir_node->outputs) {
+      if (!ir_node_map.count(x)) continue;
+      node->outlinks.push_back(ir_node_map.at(x));
+    }
+  }
+
+  Build();
+  PADDLE_ENFORCE(!inputs_.empty(),
+                 "Can't deduce any inputs from the graph, Is the graph empty?");
+
+  ir_graph = &graph;
+  VLOG(3) << "finished build from IR";
+}
+
 void DataFlowGraph::Clean() {
  for (auto &node : nodes.nodes()) {
    std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
@@ -61,11 +185,9 @@ void DataFlowGraph::Clean() {
    std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
                                            node->outlinks.end());
    if (inlinks_set.size() < node->inlinks.size()) {
-      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
      node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
    }
    if (outlinks_set.size() < node->outlinks.size()) {
-      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
      node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
    }
  }
@@ -112,10 +234,10 @@ GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
    const std::vector<Node *> &source)
    : queue_(source.begin(), source.end()) {}

-// GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
-//     GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
-//     : queue_(std::move(other.queue_)),
-//       visited_(std::move(other.visited_)) {}
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
+    : queue_(std::move(other.queue_)),
+      visited_(std::move(other.visited_)) {}

 GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
@@ -159,7 +281,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
  if (queue_.empty()) return other.queue_.empty();
  if ((!queue_.empty()) && (!other.queue_.empty())) {
    return queue_.front() == other.queue_.front() &&
-           visited_.size() == other.visited_.size();  // here need to check the
+           visited_.size() == other.visited_.size();
    // equality of queue and
    // visited. Just a light but week implementation.
  }
@@ -174,10 +296,10 @@ GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
  for (auto *x : source) stack_.push(x);
 }

-// GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
-//     GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
-//     : stack_(std::move(other.stack_)),
-//       visited_(std::move(other.visited_)) {}
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
+    : stack_(std::move(other.stack_)),
+      visited_(std::move(other.visited_)) {}

 GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
@@ -339,7 +461,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT

 void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
  std::vector<Node *> op_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
    if (node.type() == Node::Type::kValue || node.deleted()) {
      continue;
    }

--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -26,6 +26,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>

+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/inference/analysis/graph_traits.h"
 #include "paddle/fluid/inference/analysis/node.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -41,19 +42,43 @@ namespace analysis {
 */
 struct DataFlowGraph {
  NodeMap nodes;
-  std::vector<Node *> inputs;
-  std::vector<Node *> outputs;
+  // inputs and outputs are deduced from the graph.
+  // Used to interact with IR.
+  const framework::ir::Graph *ir_graph{nullptr};

  // Extract inputs and outputs of the graph.
  void Build();

+  void Build(const framework::proto::ProgramDesc &prog);
+
+  // Build a graph from ir::Graph.
+  void Build(const framework::ir::Graph &graph);
+
+  // Get an attribute.
+  AnyAttr &Attr(const std::string &key) { return attrs_[key]; }
+
  // Output a DOT graph file for debug.
  std::string DotString() const;

  std::string HumanReadableInfo(bool show_values = true,
                                bool show_functions = true) const;

+  const std::vector<Node *> &inputs() const {
+    PADDLE_ENFORCE(!inputs_.empty(),
+                   "No inputs are deduced, need to Build() first.");
+    return inputs_;
+  }
+  const std::vector<Node *> &outputs() const {
+    PADDLE_ENFORCE(!outputs_.empty(),
+                   "No outputs are deduced, need to Build() first.");
+    return outputs_;
+  }
+
 private:
+  mutable std::vector<Node *> inputs_;
+  mutable std::vector<Node *> outputs_;
+  std::unordered_map<std::string, AnyAttr> attrs_;
+
  // Remove duplicate edges and so on.
  void Clean();
 };
@@ -70,7 +95,7 @@ struct GraphTraits<DataFlowGraph> {
      : public std::iterator<std::forward_iterator_tag, Node *> {
    NodesBFSIterator() = default;
    explicit NodesBFSIterator(const std::vector<Node *> &source);
-    // NodesBFSIterator(NodesBFSIterator &&other) noexcept;
+    NodesBFSIterator(NodesBFSIterator &&other) noexcept;
    // NOTE Heavy to use.
    NodesBFSIterator(const NodesBFSIterator &other);

@@ -93,8 +118,8 @@ struct GraphTraits<DataFlowGraph> {
  struct NodesDFSIterator
      : public std::iterator<std::forward_iterator_tag, Node *> {
    NodesDFSIterator() = default;
-    explicit NodesDFSIterator(const std::vector<Node *> &source);
-    // NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+    NodesDFSIterator(const std::vector<Node *> &source);
+    NodesDFSIterator(NodesDFSIterator &&other) noexcept;
    NodesDFSIterator(const NodesDFSIterator &other);

    Node &operator*();
@@ -116,7 +141,7 @@ struct GraphTraits<DataFlowGraph> {
  struct NodesTSIterator
      : public std::iterator<std::forward_iterator_tag, Node *> {
    NodesTSIterator() = default;
-    explicit NodesTSIterator(const std::vector<Node *> &source);
+    NodesTSIterator(const std::vector<Node *> &source);
    NodesTSIterator(NodesTSIterator &&other)
        : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
      other.cursor_ = 0;
@@ -138,7 +163,7 @@ struct GraphTraits<DataFlowGraph> {
    size_t cursor_{0};
  };

-  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
+  explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {}

  // default use BFS to visit the nodes.
  iterator_range<NodesBFSIterator> nodes() {
@@ -156,20 +181,20 @@ struct GraphTraits<DataFlowGraph> {

 private:
  NodesBFSIterator nodes_bfs_begin() {
-    return NodesBFSIterator(graph_->inputs);
+    return NodesBFSIterator(graph_.inputs());
  }
  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }

  NodesDFSIterator nodes_dfs_begin() {
-    return NodesDFSIterator(graph_->inputs);
+    return NodesDFSIterator(graph_.inputs());
  }
  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }

-  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
+  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); }
  NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }

 private:
-  DataFlowGraph *graph_;
+  const DataFlowGraph &graph_;
 };

 // Extract the inputs and outputs of a graph. The inputs and outputs of a

--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"

 namespace paddle {
@@ -24,20 +25,18 @@ TEST(DataFlowGraph, BFS) {
  auto dfg = ProgramDescToDFG(desc);
  dfg.Build();

-  for (auto *in : dfg.inputs) {
+  for (auto* in : dfg.inputs()) {
    LOG(INFO) << "inputs: " << in->name() << " "
              << static_cast<int>(in->type());
  }
-  for (auto *out : dfg.outputs) {
+  for (auto* out : dfg.outputs()) {
    LOG(INFO) << "outputs: " << out->name() << " "
              << static_cast<int>(out->type());
  }

-  GraphTraits<DataFlowGraph> trait(&dfg);
-  auto nodes = trait.nodes();
  size_t count = 0;
-  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
-    LOG(INFO) << "visiting " << it->name();
+  for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes()) {
+    LOG(INFO) << "visiting " << node.name();
    ++count;
  }
  ASSERT_EQ(count, dfg.nodes.size());
@@ -45,13 +44,11 @@ TEST(DataFlowGraph, BFS) {

 TEST(DataFlowGraph, DFS) {
  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
-  auto dfg = ProgramDescToDFG(desc);
-  dfg.Build();
-  GraphTraits<DataFlowGraph> trait(&dfg);
-  auto nodes = trait.nodes_in_DFS();
+  DataFlowGraph dfg;
+  dfg.Build(desc);
  size_t count = 0;
-  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
-    LOG(INFO) << "visiting " << it->name();
+  for (auto& node : GraphTraits<DataFlowGraph>(dfg).nodes_in_DFS()) {
+    LOG(INFO) << "visiting " << node.name();
    ++count;
  }
  ASSERT_EQ(count, dfg.nodes.size());
@@ -74,21 +71,17 @@ TEST(DataFlowGraph, TS) {
  DataFlowGraph graph;

  for (int i = 0; i < 8; i++) {
-    auto *node = graph.nodes.Create(Node::Type::kValue);
+    auto* node = graph.nodes.Create(Node::Type::kValue);
    node->SetName("node-" + std::to_string(i));
  }

  auto add_link = [&](int i, int j) {
-    Node *source = graph.nodes.GetMutable(i);
-    Node *target = graph.nodes.GetMutable(j);
+    Node* source = graph.nodes.GetMutable(i);
+    Node* target = graph.nodes.GetMutable(j);
    target->inlinks.push_back(source);
    source->outlinks.push_back(target);
  };

-  graph.inputs.push_back(graph.nodes.GetMutable(0));
-  graph.inputs.push_back(graph.nodes.GetMutable(1));
-  graph.inputs.push_back(graph.nodes.GetMutable(2));
-
  add_link(0, 4);
  add_link(0, 5);
  add_link(1, 6);
@@ -97,8 +90,9 @@ TEST(DataFlowGraph, TS) {
  add_link(4, 7);
  add_link(4, 3);
  add_link(7, 3);
+  graph.Build();

-  auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
+  auto its = GraphTraits<DataFlowGraph>(graph).nodes_in_TS();
  std::vector<int> sorted_ids;
  for (auto it = its.begin(); it != its.end(); ++it) {
    LOG(INFO) << it->name();
@@ -122,6 +116,50 @@ TEST(DataFlowGraph, TS) {
  assert_positive_sequence_pair(4, 7);
 }

+TEST(DataFlowGraph, Build_ProgramDesc) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
+  DataFlowGraph graph;
+  graph.Build(desc);
+  ASSERT_EQ(graph.nodes.size(), 38UL);
+}
+
+void SetOp(framework::ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetInput("Xs", inputs);
+  op->SetOutput("Xs", outputs);
+}
+
+TEST(DataFlowGraph, Build_IR_Graph) {
+  framework::ProgramDesc prog;
+  for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e", "f"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(framework::proto::VarType::SELECTED_ROWS);
+    if (v == "c") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "OP1", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"c"}));
+  SetOp(&prog, "mul", std::vector<std::string>({"b", "c"}),
+        std::vector<std::string>({"d"}));
+  SetOp(&prog, "elementwise_add", std::vector<std::string>({"d", "e"}),
+        std::vector<std::string>({"f"}));
+
+  DataFlowGraph graph;
+
+  framework::ir::Graph ir_graph(prog);
+
+  graph.Build(ir_graph);
+
+  ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size());
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -49,18 +49,15 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
 bool DataFlowGraphToFluidPass::Finalize() { return true; }

 void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
-  LOG(INFO) << "graph.inputs " << graph->inputs.size();
-  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+  // FilterRedundantOutputOfSubGraph(graph);
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
    if (node.deleted()) continue;

    switch (node.type()) {
      case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << node.repr();
        AddFluidOp(&node);
      } break;
      case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << node.repr() << " , "
-                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
        AddEngineOp(&node);
      } break;
      default:
@@ -72,15 +69,27 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
 }

 void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
-  auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
+  PADDLE_ENFORCE(node);
+  PADDLE_ENFORCE(node->IsFunction());
+  PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(),
+                 "node has invalid protobuf repr.");
+
  // currently only the main block is analyzed.
+  PADDLE_ENFORCE(desc_);
  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
  auto *op = main_block->add_ops();
-  *op = *ori_op;  // copy the attributes, by default, these will not be changed
+
+  if (node->pb_desc()) {
+    auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
+    *op =
+        *ori_op;  // copy the attributes, by default, these will not be changed
    // by analysis phrase.
    // The inputs and outputs of the existing ops are not changed by tensorrt
    // subgraph pass.
    // NOTE It might be changed by other passes in the long run.
+  } else {
+    op->ParseFromString(node->pb_msg());
+  }
 }

 void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
@@ -215,10 +224,9 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  framework::BlockDesc block_desc(nullptr, &proto);
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
-  LOG(INFO) << "origin variable size: "
+  VLOG(4) << "origin variable size: "
          << argument_->origin_program_desc->blocks(0).vars().size();
-  LOG(INFO) << "transformed variable size: "
-            << block_desc.Proto()->vars().size();
+  VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size();
  // copy ops.

  for (auto *node : block_node->subgraph) {
@@ -252,7 +260,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {

 Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
-      FLAGS_inference_analysis_graphviz_log_root,
+      FLAGS_IA_graphviz_log_root,
      "data_flow_graph_to_fluid_graphviz_debugger"));
 }


--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {

  auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
  std::string message;
-  LOG(INFO) << "draw to " << png_path;
+  VLOG(3) << "draw to " << png_path;
  ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
 }


--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -52,72 +52,7 @@ bool FluidToDataFlowGraphPass::Finalize() { return true; }
 void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
  PADDLE_ENFORCE(graph);
  PADDLE_ENFORCE(desc_);
-  // insert vars
-  // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
-  // will keep updating to its latest alias during the graph-building.
-  std::unordered_map<std::string, size_t> var2id;
-  auto &main_block = desc_->blocks(framework::kRootBlockIndex);
-  for (int i = 0; i < main_block.vars_size(); i++) {
-    const auto &var = main_block.vars(i);
-    auto *v = graph->nodes.Create(Node::Type::kValue);
-    v->SetName(var.name());
-    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
-    v->SetPbMsg(var.SerializeAsString());
-    var2id[var.name()] = v->id();
-  }
-
-  // The variables in a SSA can only write once, so if a variable is written
-  // multiple times(quite common in our ProgramDesc design), multiple alias
-  // Nodes of this variable will be created, and each will just write once.
-
-  // An set that keep all the names of the variables(the original, not alias)
-  // that have been written(as outputs). Once an Op's output variable hit the
-  // set, it should create a new alias and update the global alias for this
-  // variable. And that make a Data Flow Graph a SSA.
-  std::unordered_set<Node *> unique_written_vars;
-  for (int i = 0; i < main_block.ops_size(); i++) {
-    const auto &op = main_block.ops(i);
-    auto *o = graph->nodes.Create(Node::Type::kFunction);
-    o->SetName(op.type());
-    static_cast<Function *>(o)->SetFuncType(op.type());
-    // Link to the original protobuf message's memory, make it easier to
-    // generate from a data flow graph to fluid ProgramDesc.
-    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
-    o->SetPbMsg(op.SerializeAsString());
-
-    // set inputs and outputs
-    for (int j = 0; j < op.inputs_size(); j++) {
-      auto &in_var = op.inputs(j);
-      for (int k = 0; k < in_var.arguments_size(); k++) {
-        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
-        in->outlinks.push_back(o);
-        o->inlinks.push_back(in);
-        unique_written_vars.insert(in);
-      }
-    }
-    for (int j = 0; j < op.outputs_size(); j++) {
-      auto &out_var = op.outputs(j);
-      for (int k = 0; k < out_var.arguments_size(); k++) {
-        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
-        if (unique_written_vars.count(out)) {
-          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
-          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
-          out_alias->SetName(out->name());
-          out_alias->SetPbDesc(out->pb_desc());
-          out_alias->SetPbMsg(out->pb_msg());
-          var2id[out_alias->name()] =
-              out_alias->id();  // update variable's alias Node
-          LOG(INFO) << "loop found in graph, create SSA alias node ["
-                    << out_alias->repr() << "] for [" << out->repr() << "]";
-          out = out_alias;
-        }
-        out->inlinks.push_back(o);
-        o->outlinks.push_back(out);
-      }
-    }
-  }
-  // Analysis and extract the inputs and outputs of this graph.
-  graph->Build();
+  graph->Build(*desc_);
 }

 namespace {
@@ -133,7 +68,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {

 Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
-      FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger"));
+      FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger"));
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -30,7 +30,7 @@ TEST(FluidToDataFlowGraphPass, Test) {
  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
  pass.Finalize();
  ASSERT_FALSE(argument.main_dfg->DotString().empty());
-  EXPECT_FALSE(argument.main_dfg->inputs.empty());
+  EXPECT_FALSE(argument.main_dfg->inputs().empty());
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class FluidToIrPass final : public DataFlowGraphPass {
+ public:
+  FluidToIrPass() = default;
+
+  bool Initialize(Argument *argument) override {
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
+    if (argument->origin_program_desc) {
+      LOG(WARNING) << "argument's origin_program_desc is already set, might "
+                      "duplicate called";
+    }
+    // set fluid model program path
+    if (!argument->fluid_model_program_path) {
+      ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
+      argument->fluid_model_program_path.reset(
+          new std::string(*argument->fluid_model_dir + "/__model__"));
+    }
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
+    // Load program.
+    auto program = LoadProgramDesc(*argument->fluid_model_program_path);
+    argument->origin_program_desc.reset(
+        new framework::proto::ProgramDesc(program));
+    // Create main data flow graph.
+    if (!argument->main_dfg) {
+      argument->main_dfg.reset(new DataFlowGraph);
+    }
+    // Persist the ProgramDesc in graph's attribute. The IR graph just keep the
+    // address, will segfault if the original ProgramDesc destroys.
+    auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer();
+    ir_program_p = new framework::ProgramDesc(program);
+
+    argument_ = argument;
+    return true;
+  }
+
+  bool Finalize() override { return true; }
+
+  void Run(DataFlowGraph *graph) override {
+    // Call all the IR Passes
+    IRPassManager ir_passes(*static_cast<framework::ProgramDesc *>(
+        argument_->main_dfg->Attr("ir_program_desc").Pointer()));
+    ir_passes.Apply(std::vector<std::string>(
+        {// Manual update the passes here.
+         "graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass",
+         "fc_fuse_pass", "graph_viz_pass"}));
+
+    PADDLE_ENFORCE(argument_->main_dfg.get());
+    argument_->main_dfg->Build(ir_passes.graph());
+    // PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected());
+  }
+
+  std::string repr() const override { return "fluid-to-ir-pass"; }
+
+ private:
+  Argument *argument_{nullptr};
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(FluidToIrPass, Test) {
+  FluidToIrPass pass;
+  Argument argument(FLAGS_inference_model_dir);
+  pass.Initialize(&argument);
+  pass.Run(argument.main_dfg.get());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+USE_PASS(fc_fuse_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(infer_clean_graph_pass);
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <sys/stat.h>
 #include <cstdio>
 #include <fstream>
 #include <string>
@@ -151,6 +152,23 @@ static framework::proto::ProgramDesc LoadProgramDesc(
  return program_desc;
 }

+static bool FileExists(const std::string &filepath) {
+  std::ifstream file(filepath);
+  bool exists = file.is_open();
+  file.close();
+  return exists;
+}
+
+static bool PathExists(const std::string &path) {
+  struct stat statbuf;
+  if (stat(path.c_str(), &statbuf) != -1) {
+    if (S_ISDIR(statbuf.st_mode)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include <string>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+IRPassManager::IRPassManager(const ProgramDesc& program) {
+  graph_.reset(new framework::ir::Graph(program));
+}
+
+void IRPassManager::Apply(const std::vector<std::string>& passes) {
+  graph_->Set("graph_viz_path", new std::string("./1.dot"));
+  // Apply all the passes
+  std::string pre_pass;
+  for (const std::string& pass_name : passes) {
+    LOG(WARNING) << "Running IR pass [" << pass_name << "]";
+    auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
+    if (pass_name == "graph_viz_pass") {
+      std::string dot_file_path =
+          "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot";
+      pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
+    }
+    graph_ = pass->Apply(std::move(graph_));
+    pre_pass = pass_name;
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/ir_pass_manager.h
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines IRPassManager, it helps control the passes in IR. Inference
+ * phrase will load the model program and parameters from disk, that is quite
+ * different from the training phase.
+ * This manager will control the Passes and make the passes in IR work smoothly
+ * for inference.
+ */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+using framework::ProgramDesc;
+
+class IRPassManager final {
+ public:
+  IRPassManager(const ProgramDesc& program);
+
+  void Apply(const std::vector<std::string>& passes);
+
+  framework::ir::Graph& graph() const { return *graph_; }
+
+ private:
+  std::unique_ptr<framework::ir::Graph> graph_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/model_store_pass.cc
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -35,19 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) {
  std::stringstream ss;
  // NOTE these commands only works on linux.
  ss << "mkdir -p " << *argument_->model_output_store_path;
-  LOG(INFO) << "run command: " << ss.str();
+  VLOG(3) << "run command: " << ss.str();
  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
  ss.str("");

  ss << "cp " << *argument_->fluid_model_dir << "/*"
     << " " << *argument_->model_output_store_path;
-  LOG(INFO) << "run command: " << ss.str();
+  VLOG(3) << "run command: " << ss.str();
  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);

  // Store program
  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
                          "program desc is not transformed, should call "
                          "DataFlowGraphToFluidPass first.");
+  VLOG(3) << "store analyzed program to "
+          << *argument_->model_output_store_path;
  const std::string program_output_path =
      *argument_->model_output_store_path + "/__model__";
  std::ofstream file(program_output_path, std::ios::binary);
@@ -58,6 +60,8 @@ void ModelStorePass::Run(DataFlowGraph *x) {
  file.write(serialized_message.c_str(), serialized_message.size());
 }

+bool ModelStorePass::Finalize() { return true; }
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/model_store_pass.h
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -44,6 +44,8 @@ class ModelStorePass : public DataFlowGraphPass {
    model in the disk, and that model can be reloaded for prediction again.)DD";
  }

+  bool Finalize() override;
+
 private:
  Argument* argument_{nullptr};
 };

--- a/paddle/fluid/inference/analysis/model_store_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
@@ -30,7 +30,7 @@ TEST(DFG_StorePass, test) {
  argument.model_output_store_path.reset(
      new std::string("./_dfg_store_pass_tmp"));
  // disable storage in alalyzer
-  FLAGS_inference_analysis_output_storage_path = "";
+  FLAGS_IA_output_storage_path = "";
  analyzer.Run(&argument);

  ModelStorePass pass;

--- a/paddle/fluid/inference/analysis/node.h
+++ b/paddle/fluid/inference/analysis/node.h
@@ -38,7 +38,7 @@ namespace analysis {
 class NodeMap;

 // A helper class to maintain the status from Pass.
-struct NodeAttr {
+struct AnyAttr {
  using any_t =
      boost::variant<bool, float, int32_t, int64_t, void *, std::string>;
  // NOTE T should be a primary type or a struct combined by several primary
@@ -54,10 +54,9 @@ struct NodeAttr {
  void *&Pointer() { return As<void *>(); }
  std::string &String() { return As<std::string>(); }

- private:
  template <typename T>
  T &As() {
-    if (type_index_ == typeid(NodeAttr)) {
+    if (type_index_ == typeid(AnyAttr)) {
      type_index_ = typeid(T);
      any_data_ = T();
    } else {
@@ -68,7 +67,7 @@ struct NodeAttr {

 private:
  any_t any_data_;
-  std::type_index type_index_{typeid(NodeAttr)};
+  std::type_index type_index_{typeid(AnyAttr)};
 };

 /*
@@ -105,7 +104,7 @@ class Node {

  // Get an additional attribute and convert it to T data type. NOTE this will
  // silently create a new attribute if not exists.
-  NodeAttr &attr(const std::string &name) const { return attrs_[name]; }
+  AnyAttr &attr(const std::string &name) const { return attrs_[name]; }

  int id() const { return id_; }

@@ -150,7 +149,7 @@ class Node {
  Type type_{Type::kNone};
  // Mark this node is deleted by some pass.
  bool deleted_{false};
-  mutable std::unordered_map<std::string, NodeAttr> attrs_;
+  mutable std::unordered_map<std::string, AnyAttr> attrs_;
 };

 class Function;

--- a/paddle/fluid/inference/analysis/node_tester.cc
+++ b/paddle/fluid/inference/analysis/node_tester.cc
@@ -21,19 +21,19 @@ namespace inference {
 namespace analysis {

 TEST(NodeAttr, bool) {
-  NodeAttr x;
+  AnyAttr x;
  x.Bool() = true;
  ASSERT_EQ(x.Bool(), true);
 }

 TEST(NodeAttr, int32) {
-  NodeAttr x;
+  AnyAttr x;
  x.Int32() = 32;
  ASSERT_EQ(x.Int32(), 32);
 }

 TEST(NodeAttr, string) {
-  NodeAttr x;
+  AnyAttr x;
  x.String() = "Hello";
  ASSERT_EQ(x.String(), "Hello");
 }

--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -63,7 +63,7 @@ class Pass {
  // Human-readable short representation.
  virtual std::string repr() const = 0;
  // Human-readable long description.
-  virtual std::string description() const = 0;
+  virtual std::string description() const { return "No DOC"; }
 };

 // NodePass process on any Node types.

--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -22,7 +22,7 @@ namespace analysis {
 bool PassManager::Initialize(Argument* argument) {
  argument_ = argument;
  for (auto& pass : data_) {
-    LOG(INFO) << "Initializing pass " << pass->repr();
+    LOG(WARNING) << "Initializing pass [" << pass->repr() << "]";
    if (!pass->Initialize(argument)) {
      LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
      return false;
@@ -33,8 +33,9 @@ bool PassManager::Initialize(Argument* argument) {

 void DfgPassManager::RunAll() {
  PADDLE_ENFORCE(argument_);
+  LOG(INFO) << "Total " << data_.size() << " passes";
  for (auto& pass : data_) {
-    VLOG(4) << "Running pass [" << pass->repr() << "]";
+    LOG(WARNING) << "Running pass [" << pass->repr() << "]";
    pass->Run(argument_->main_dfg.get());
  }
 }
@@ -42,8 +43,7 @@ void DfgPassManager::RunAll() {
 void NodePassManager::RunAll() {
  PADDLE_ENFORCE(argument_);
  PADDLE_ENFORCE(argument_->main_dfg.get());
-  auto trait =
-      GraphTraits<DataFlowGraph>(argument_->main_dfg.get()).nodes_in_DFS();
+  auto trait = GraphTraits<DataFlowGraph>(*argument_->main_dfg).nodes_in_DFS();
  for (auto& node : trait) {
    for (auto& pass : data_) {
      pass->Run(&node);

--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -34,7 +34,7 @@ inline void MarkOutLinksInSubGraph(const Function *func) {
 }

 void SubGraphSplitter::MarkNodesInsideSubGraph() {
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes()) {
    if (node_inside_subgraph_teller_(&node)) {
      node.attr(kMarkerAttrName).Bool() = true;
      if (node.type() == Node::Type::kFunction) {
@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {

 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
  std::vector<Node *> marked_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
    if (node.attr(kMarkerAttrName).Bool()) {
      marked_nodes.push_back(&node);
    }

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -69,8 +69,8 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass {
 };

 Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
-  DFG_GraphvizDrawPass::Config config(
-      FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node");
+  DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root,
+                                      "tensorrt_marked_node");
  return new DfgDebuggerPass(config);
 }
 bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -23,7 +23,7 @@ namespace paddle {
 DEFINE_string(dirname, "", "Directory of the inference model.");

 void CompareTensorRTWithFluid(bool enable_tensorrt) {
-  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
+  FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt;

  //# 1. Create PaddlePredictor with a config.
  NativeConfig config0;

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -9,7 +9,6 @@ function(op_library TARGET)
    # op_library is a function to create op library. The interface is same as
    # cc_library. But it handle split GPU/CPU code and link some common library
    # for ops.
-    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
    set(cc_srcs)
    set(cu_srcs)
    set(hip_cu_srcs)
@@ -92,6 +91,7 @@ function(op_library TARGET)
        endif()
    endforeach()
    endif(WIN32)
+    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)

    list(LENGTH op_library_DEPS op_library_DEPS_len)
    if (${op_library_DEPS_len} GREATER 0)

--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -130,12 +130,13 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
      checkpoint_notify_id != -1,
      "when checkpoint_notify_id = -1, there should be no RPC invoke.");

-  auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
+  // TODO(tangwei12): find out why scope will be error.
+  auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
  lt_var->clear();
  lt_var->append(out_var_name);
  VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
          << out_var_name;
-  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope);
+  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
  return true;
 }


--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"

 namespace paddle {
 namespace operators {
@@ -23,6 +24,37 @@ struct MulFunctor {
  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
 };

+template <typename DeviceContext, typename T>
+void default_elementwise_mul(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y, framework::Tensor* z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        MulFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  blas.VMUL(x->numel(), x->data<T>(), y->data<T>(),
+            z->mutable_data<T>(ctx.GetPlace()));
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_mul(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
 public:
@@ -33,9 +65,11 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
    auto* y = ctx.Input<Tensor>("Y");
    auto* z = ctx.Output<Tensor>("Out");
    z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          MulFunctor<T>(), z);
+    if (x->numel() == y->numel()) {
+      elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    } else {
+      default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    }
  }
 };


--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -35,9 +35,14 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {

  if (ctx->HasInput("Bias")) {
    auto bias_dims = ctx->GetInputDim("Bias");
+    if (bias_dims.size() == 2) {
      PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim].");
      PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
                        "The shape of Bias must be [1, dim].");
+    } else if (bias_dims.size() == 1) {
+      PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1],
+                        "The shape of Bias must be [1, dim].");
+    }
  }
  PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
                 "Fully Connected input should be 2-D or 4-D tensor.");

--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -92,6 +92,7 @@ class LoadOp : public framework::OperatorBase {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);
    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
  }
 };


--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -134,6 +134,9 @@ class Blas {
  template <typename T>
  void VADD(int n, const T* x, const T* y, T* z) const;

+  template <typename T>
+  void VMUL(int n, const T* x, const T* y, T* z) const;
+
  template <typename T>
  void VCOPY(int n, const T* x, T* y) const;

@@ -202,6 +205,11 @@ class BlasT : private Blas<DeviceContext> {
    Base()->template VADD<T>(args...);
  }

+  template <typename... ARGS>
+  void VMUL(ARGS... args) const {
+    Base()->template VMUL<T>(args...);
+  }
+
  template <typename... ARGS>
  void VCOPY(ARGS... args) const {
    Base()->template VCOPY<T>(args...);

--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -82,6 +82,11 @@ struct CBlas<float> {
  static void VADD(ARGS... args) {
    platform::dynload::vsAdd(args...);
  }
+
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vsMul(args...);
+  }
 };

 template <>
@@ -142,6 +147,11 @@ struct CBlas<double> {
  static void VADD(ARGS... args) {
    platform::dynload::vdAdd(args...);
  }
+
+  template <typename... ARGS>
+  static void VMUL(ARGS... args) {
+    platform::dynload::vdMul(args...);
+  }
 };

 #else
@@ -199,6 +209,7 @@ struct CBlas<platform::float16> {
  static void SMM_GEMM(...) {
    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
  }
+  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
 #ifdef PADDLE_WITH_MKLML
  static void GEMM_BATCH(...) {
    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@@ -374,6 +385,20 @@ void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
 #endif
 }

+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
+                                            T *z) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VMUL(n, x, y, z);
+#else
+  // try to find if openblas support vmul
+  for (int i = 0; i < n; ++i) {
+    z[i] = x[i] * y[i];
+  }
+#endif
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,

--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -142,6 +142,8 @@ class SaveOp : public framework::OperatorBase {
    std::string filename = lt_var->data();
    VLOG(4) << "SaveSelectedRows get File name: " << filename;

+    MkDirRecursively(DirName(filename).c_str());
+
    auto &selectedRows = var->Get<framework::SelectedRows>();

    // get device context from pool

--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -81,8 +81,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "The source input of scatter op");
    AddInput("Ids", "The index input of scatter op where X will be updated");
-    AddInput("Updates", "The updated value of updates op");
-    AddOutput("Out", "The output of add op");
+    AddInput("Updates", "The updated value of scatter op");
+    AddOutput("Out", "The output of scatter op");
    AddComment(R"DOC(
 Scatter Operator.

@@ -90,7 +90,7 @@ This operator obtains output by updating the input on selected indices on the fi

 $$
 Out = X \\
-Out[Ids] = X[Ids] + Updates
+Out[Ids] = Updates
 $$

 )DOC");

--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -34,9 +34,9 @@ class ScatterOpKernel : public framework::OpKernel<T> {
    auto *Updates = ctx.Input<Tensor>("Updates");
    auto *Out = ctx.Output<Tensor>("Out");

-    // In place output: Out = X, Out[Ids] += Updates
+    // In place output: Out = X, Out[Ids] = Updates
    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
-    // Apply ScatterUpdate: Out[index] += Updates[:]
+    // Apply ScatterUpdate: Out[index] = Updates[:]
    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
  }
 };
@@ -55,7 +55,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
    // In place gradient: dX = dO
    framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
    dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Ids]
+    // Gradient by Gather: dUpdates = dO[Ids]
    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
  }
 };

--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -17,10 +17,10 @@
 #include <cublasXt.h>
 #include <cublas_v2.h>
 #include <cuda.h>
-#include <dlfcn.h>
 #include <mutex>  // NOLINT
 #include <type_traits>
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace platform {

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once

 #include <cudnn.h>
-#include <dlfcn.h>
 #include <mutex>  // NOLINT
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace platform {

--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -17,10 +17,10 @@ limitations under the License. */

 #include <cuda.h>
 #include <cupti.h>
-#include <dlfcn.h>
 #include <mutex>  // NOLINT

 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace platform {

--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -14,9 +14,9 @@ limitations under the License. */
 #pragma once

 #include <curand.h>
-#include <dlfcn.h>

 #include <mutex>  // NOLINT
+#include "paddle/fluid/platform/port.h"

 #include "paddle/fluid/platform/dynload/dynamic_loader.h"


--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -14,10 +14,10 @@ limitations under the License. */

 #pragma once

-#include <dlfcn.h>
 #include <mkl.h>
 #include <mutex>  // NOLINT
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace platform {
@@ -49,25 +49,27 @@ extern void* mklml_dso_handle;

 #define MKLML_ROUTINE_EACH(__macro) \
  __macro(cblas_sgemm);             \
-  __macro(cblas_saxpy);             \
-  __macro(cblas_scopy);             \
-  __macro(cblas_sgemv);             \
-  __macro(cblas_sgemm_batch);       \
  __macro(cblas_dgemm);             \
+  __macro(cblas_saxpy);             \
  __macro(cblas_daxpy);             \
+  __macro(cblas_scopy);             \
  __macro(cblas_dcopy);             \
+  __macro(cblas_sgemv);             \
  __macro(cblas_dgemv);             \
-  __macro(cblas_dgemm_batch);       \
-  __macro(vsAdd);                   \
-  __macro(vdAdd);                   \
  __macro(cblas_sgemm_alloc);       \
-  __macro(cblas_sgemm_pack);        \
-  __macro(cblas_sgemm_compute);     \
-  __macro(cblas_sgemm_free);        \
  __macro(cblas_dgemm_alloc);       \
+  __macro(cblas_sgemm_pack);        \
  __macro(cblas_dgemm_pack);        \
+  __macro(cblas_sgemm_compute);     \
  __macro(cblas_dgemm_compute);     \
+  __macro(cblas_sgemm_free);        \
  __macro(cblas_dgemm_free);        \
+  __macro(cblas_sgemm_batch);       \
+  __macro(cblas_dgemm_batch);       \
+  __macro(vsAdd);                   \
+  __macro(vdAdd);                   \
+  __macro(vsMul);                   \
+  __macro(vdMul);                   \
  __macro(MKL_Set_Num_Threads)

 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);

--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -13,12 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

-#include <dlfcn.h>
 #include <nccl.h>

 #include <mutex>  // NOLINT
-
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace platform {

--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -14,10 +14,9 @@ limitations under the License. */

 #pragma once

-#include <dlfcn.h>
 #include <mutex>  // NOLINT
-
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
 #include "warpctc/include/ctc.h"

 namespace paddle {

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -14,9 +14,6 @@ limitations under the License. */

 #pragma once

-#include <dlfcn.h>     // for dladdr
-#include <execinfo.h>  // for backtrace
-
 #ifdef __GNUC__
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__
@@ -37,6 +34,7 @@ limitations under the License. */

 #include "glog/logging.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/to_string.h"

@@ -75,7 +73,7 @@ struct EnforceNotMet : public std::exception {

      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
      sout << "PaddlePaddle Call Stacks: " << std::endl;
-
+#if !defined(_WIN32)
      void* call_stack[TRACE_STACK_LIMIT];
      auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
      auto symbols = backtrace_symbols(call_stack, size);
@@ -95,6 +93,9 @@ struct EnforceNotMet : public std::exception {
        }
      }
      free(symbols);
+#else
+      sout << "Windows not support stack backtrace yet.";
+#endif
      err_str_ = sout.str();
    }
  }

--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#if !defined(_WIN32)
+#include <dlfcn.h>     // for dladdr
+#include <execinfo.h>  // for backtrace
+#else
+#include <Shlwapi.h>
+#include <Windows.h>
+
+static void* dlsym(void* handle, const char* symbol_name) {
+  FARPROC found_symbol;
+  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
+
+  if (found_symbol == NULL) {
+    throw std::runtime_error(std::string(symbol_name) + " not found.");
+  }
+  return reinterpret_cast<void*>(found_symbol);
+}
+
+#endif
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
+set(PYBIND_DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
+          )
+if(NOT WIN32)
+list(APPEND PYBIND_DEPS parallel_executor)
+endif()
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
-           parallel_executor
+      DEPS ${PYBIND_DEPS}
      ${GLOB_OP_LIB})
  else()
    cc_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method
-           parallel_executor
+      DEPS ${PYBIND_DEPS}
      ${GLOB_OP_LIB})
-    if(NOT APPLE AND NOT ANDROID)
+    if(NOT APPLE AND NOT ANDROID AND NOT WIN32)
      target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT ANDROID)
+    endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
  endif(WITH_AMD_GPU)

  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1363,6 +1363,13 @@ class Program(object):
        self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
        self._op_role_var = []

+        # for distribute
+        self._is_distributed = False
+        self._is_chief = False
+        self._slice_vars_and_attrs = []
+        self._endpoints = []
+        self._distributed_lookup_table = None
+
    @property
    def op_role(self):
        """

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -372,6 +372,7 @@ def load_vars(executor,
        load_vars(
            executor,
            dirname=dirname,
+            main_program=main_program,
            vars=list(filter(predicate, main_program.list_vars())),
            filename=filename)
    else:
@@ -403,9 +404,12 @@ def load_vars(executor,
                inputs={},
                outputs={"Out": load_var_list},
                attrs={'file_path': os.path.join(dirname, filename)})
-
        executor.run(load_prog)

+        # load slice vars on pserver, if have it.
+        _load_slice_up_vars(executor, dirname,
+                            main_program._slice_vars_and_attrs)
+

 def load_params(executor, dirname, main_program=None, filename=None):
    """
@@ -659,11 +663,19 @@ def save_inference_model(dirname,

    save_persistables(executor, dirname, inference_program, params_filename)

+    # if there is lookup table, the trainer 0 will notify all pserver to save.
+    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        _save_lookup_tables_by_notify(executor, lookup_table_filename,
+                                      main_program._distributed_lookup_table,
+                                      main_program._endpoints)
+

 def load_inference_model(dirname,
                         executor,
                         model_filename=None,
-                         params_filename=None):
+                         params_filename=None,
+                         pserver_endpoints=None):
    """
    Load inference model from a directory

@@ -679,6 +691,10 @@ def load_inference_model(dirname,
                                   parameters were saved in a single binary
                                   file. If parameters were saved in separate
                                   files, set it as 'None'.
+        pserver_endpoints(list|None): This only need by distributed inference.
+                                    When use distributed look up table in training,
+                                    We also need it in inference.The parameter is
+                                    a list of pserver endpoints.

    Returns:
        tuple: The return of this function is a tuple with three elements:
@@ -697,12 +713,16 @@ def load_inference_model(dirname,

            exe = fluid.Executor(fluid.CPUPlace())
            path = "./infer_model"
+            endpoints = ["127.0.0.1:2023","127.0.0.1:2024"]
            [inference_program, feed_target_names, fetch_targets] =
                fluid.io.load_inference_model(dirname=path, executor=exe)
            results = exe.run(inference_program,
                          feed={feed_target_names[0]: tensor_img},
                          fetch_list=fetch_targets)

+            # if we need lookup table, we will use:
+            fluid.io.load_inference_model(dirname=path, executor=exe, pserver_endpoints=endpoints)
+
            # In this exsample, the inference program was saved in the
            # "./infer_model/__model__" and parameters were saved in
            # separate files in ""./infer_model".
@@ -729,6 +749,9 @@ def load_inference_model(dirname,
    program = Program.parse_from_string(program_desc_str)
    load_persistables(executor, dirname, program, params_filename)

+    if pserver_endpoints:
+        program = _endpoints_replacement(program, pserver_endpoints)
+
    feed_target_names = program.desc.get_feed_target_names()
    fetch_target_names = program.desc.get_fetch_target_names()
    fetch_targets = [
@@ -738,6 +761,61 @@ def load_inference_model(dirname,
    return [program, feed_target_names, fetch_targets]


+def _save_lookup_tables_by_notify(executor, dirname, lookup_table,
+                                  pserver_endpoints):
+    """
+    This function will send checkpoint notify message from Trainer 0
+    to all the pservers.
+    The checkpoint notify message contains lookup table name,
+    the absolute path on pserver to save lookup_table.
+
+    Args:
+        executor(Executor): The executor to run for send checkpoint notify.
+        dirname(str): The folder where to save.
+        lookup_table(string): the lookup table name, when use distribute
+            lookup table, we can get lookup table name by DistributeTranspiler.
+            table_name
+        ps_endpoint_list(list): the parameter server ip:port list.
+            when use distribute lookup table, we can get ps_endpoint_list by
+            distribute arguments.
+    Return:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            _save_pserver_vars_by_notify(executor=exe,
+                    dirname=param_path, lookup_table=table_name,
+                    pserver_endpoints=ps_endpoints)
+    """
+
+    pserver_notify_program = Program()
+    pserver_notify_block = pserver_notify_program.global_block()
+
+    attrs = {}
+    attrs['epmap'] = pserver_endpoints
+    attrs['dir'] = dirname
+    attrs['lookup_table'] = lookup_table
+
+    pserver_notify_block.append_op(
+        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+    executor.run(pserver_notify_program)
+
+
+def _endpoints_replacement(program, endpoints):
+    ENDPOINT_MAP = "epmap"
+    for op in program.global_block().ops:
+        if op.has_attr(ENDPOINT_MAP):
+            op.set_attr(ENDPOINT_MAP, endpoints)
+    program._sync_with_cpp()
+    return program
+
+
 def get_parameter_value(para, executor):
    """
    Get the LoDTensor value of the given parameter.
@@ -799,3 +877,46 @@ def get_parameter_value_by_name(name, executor, program=None):
        program = default_main_program()
    var = program.global_block().var(name)
    return get_parameter_value(var, executor)
+
+
+def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
+    if not slice_vars_and_attrs:
+        return
+
+    load_prog = Program()
+    load_block = load_prog.global_block()
+
+    for var_tuple in slice_vars_and_attrs:
+        orig_var = var_tuple[0]
+        start = var_tuple[1]
+        slice_var = var_tuple[2]
+        end = start + reduce(lambda x, y: x * y, slice_var.shape)
+
+        clone_orig_var = load_block.create_var(
+            name=orig_var.name,
+            type=orig_var.type,
+            shape=orig_var.shape,
+            dtype=orig_var.dtype,
+            persistable=True)
+
+        clone_slice_var = load_block.create_var(
+            name=slice_var.name,
+            type=slice_var.type,
+            shape=slice_var.shape,
+            dtype=slice_var.dtype,
+            persistable=True)
+
+        load_block.append_op(
+            type='load',
+            inputs={},
+            outputs={'Out': [clone_orig_var]},
+            attrs={'file_path': os.path.join(dirname, clone_orig_var.name)})
+        load_block.append_op(
+            type="slice",
+            inputs={'Input': clone_orig_var},
+            outputs={'Out': clone_slice_var},
+            attrs={'axes': [0],
+                   'starts': [start],
+                   'ends': [end]})
+
+    executor.run(load_prog)
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -94,6 +94,7 @@ __all__ = [
    'image_resize_short',
    'resize_bilinear',
    'gather',
+    'scatter',
    'random_crop',
    'mean_iou',
    'relu',
@@ -5036,6 +5037,47 @@ def gather(input, index):
    return out


+def scatter(input, index, updates, name=None):
+    """
+    **Scatter Layer**
+
+    Output is obtained by updating the input on selected indices on the first
+    axis.
+
+    .. math::
+
+        Out = X
+        Out[Ids] = Updates
+
+    Args:
+        input (Variable): The source input with rank>=1.
+        index (Variable): The index input with rank=1. Its dtype should be
+                          int32 or int64 as it is used as indexes.
+        updates (Variable): The updated value of scatter op.
+        name (str|None): The output variable name. Default None.
+
+    Returns:
+        output (Variable): The output is a tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.scatter(input, index, updates)
+
+    """
+    helper = LayerHelper('scatter', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="scatter",
+        inputs={"X": input,
+                "Ids": index,
+                "Updates": updates},
+        outputs={"Out": out})
+    return out
+
+
 @templatedoc()
 def random_crop(x, shape, seed=None):
    """

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -65,7 +65,6 @@ __all__ = [
    'uniform_random_batch_size_like',
    'gaussian_random',
    'gaussian_random_batch_size_like',
-    'scatter',
    'sum',
    'slice',
    'shape',

--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -46,7 +46,8 @@ def cnn_model(data):
        pool_size=2,
        pool_stride=2,
        act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant()))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.3)))
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
@@ -54,7 +55,8 @@ def cnn_model(data):
        pool_size=2,
        pool_stride=2,
        act="relu",
-        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant()))
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.2)))

    SIZE = 10
    input_shape = conv_pool_2.shape
@@ -66,8 +68,7 @@ def cnn_model(data):
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale, seed=1)))
+            initializer=fluid.initializer.Constant(value=0.1)))
    return predict



--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -129,7 +129,12 @@ class SE_ResNeXt():
            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
        stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
-        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
+        out = fluid.layers.fc(
+            input=drop,
+            size=class_dim,
+            act='softmax',
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.2)))
        return out

    def shortcut(self, input, ch_out, stride):
@@ -179,7 +184,7 @@ class SE_ResNeXt():
            act=None,
            # avoid pserver CPU init differs from GPU
            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant()),
+                initializer=fluid.initializer.Constant(value=0.2)),
            bias_attr=False)
        return fluid.layers.batch_norm(input=conv, act=act)

@@ -228,10 +233,8 @@ class DistSeResneXt2x2(TestDistRunnerBase):
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

        optimizer = fluid.optimizer.Momentum(
-            # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed.
-            #learning_rate=fluid.layers.piecewise_decay(
-            #    boundaries=bd, values=lr),
-            learning_rate=base_lr,
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
        optimizer.minimize(avg_cost)

--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -265,9 +265,9 @@ def main(role="pserver",


 if __name__ == "__main__":
-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
        print(
-            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
        )
    role = sys.argv[1]
    endpoints = sys.argv[2]
@@ -275,6 +275,8 @@ if __name__ == "__main__":
    current_endpoint = sys.argv[4]
    trainers = int(sys.argv[5])
    is_dist = True if sys.argv[6] == "TRUE" else False
+    # FIXME(typhoonzero): refine this test.
+    is_async = True if sys.argv[7] == "TRUE" else False
    main(
        role=role,
        endpoints=endpoints,

--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -27,6 +27,7 @@ import unittest
 from multiprocessing import Process
 import os
 import signal
+import six
 import collections

 SEED = 1
@@ -55,7 +56,8 @@ def cnn_model(data):
    # TODO(dzhwinter) : refine the initializer and random seed settting
    SIZE = 10
    input_shape = conv_pool_2.shape
-    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    param_shape = [six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1)
+                   ] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
@@ -108,7 +110,7 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):


 def operator_equal(a, b):
-    for k, v in a.__dict__.iteritems():
+    for k, v in six.iteritems(a.__dict__):
        if isinstance(v, fluid.framework.Program) or \
                isinstance(v, fluid.framework.Block):
            continue
@@ -118,8 +120,8 @@ def operator_equal(a, b):
                raise ValueError("In operator_equal not equal:{0}\n".format(k))

        elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(v.iteritems(), key=lambda x: x[0])
-            v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
+            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
+            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])

            if v0 != v1:
                raise ValueError("In operator_equal not equal:{0}\n".format(k))
@@ -131,7 +133,7 @@ def operator_equal(a, b):


 def block_equal(a, b):
-    for k, v in a.__dict__.iteritems():
+    for k, v in six.iteritems(a.__dict__):
        if isinstance(v, core.ProgramDesc) or isinstance(
                v, fluid.framework.Program) or isinstance(v, core.BlockDesc):
            continue
@@ -143,8 +145,8 @@ def block_equal(a, b):
            assert (len(a.ops) == len(b.ops))

        elif isinstance(v, collections.OrderedDict):
-            v0 = sorted(v.iteritems(), key=lambda x: x[0])
-            v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0])
+            v0 = sorted(six.iteritems(v), key=lambda x: x[0])
+            v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0])

            if v0 != v1:
                raise ValueError("In block_equal not equal:{0}\n".format(k))
@@ -156,7 +158,7 @@ def block_equal(a, b):


 def program_equal(a, b):
-    for k, v in a.__dict__.iteritems():
+    for k, v in six.iteritems(a.__dict__):
        if isinstance(v, core.ProgramDesc):
            continue


--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -30,7 +30,7 @@ class TestDistRunnerBase(object):
            "get_model should be implemented by child classes.")

    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
-                       trainers):
+                       trainers, sync_mode):
        # NOTE: import fluid until runtime, or else forking processes will cause error.
        import paddle
        import paddle.fluid as fluid
@@ -39,17 +39,22 @@ class TestDistRunnerBase(object):
            trainer_id=trainer_id,
            program=main_program,
            pservers=pserver_endpoints,
-            trainers=trainers)
+            trainers=trainers,
+            sync_mode=sync_mode)
        return t

-    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
-                    trainer_id):
+    def run_pserver(self,
+                    pserver_endpoints,
+                    trainers,
+                    current_endpoint,
+                    trainer_id,
+                    sync_mode=True):
        import paddle
        import paddle.fluid as fluid
        self.get_model(batch_size=2)
        t = self.get_transpiler(trainer_id,
                                fluid.default_main_program(), pserver_endpoints,
-                                trainers)
+                                trainers, sync_mode)
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        place = fluid.CPUPlace()
@@ -57,7 +62,13 @@ class TestDistRunnerBase(object):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+    def run_trainer(self,
+                    place,
+                    endpoints,
+                    trainer_id,
+                    trainers,
+                    is_dist=True,
+                    sync_mode=True):
        import paddle
        import paddle.fluid as fluid
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
@@ -65,7 +76,7 @@ class TestDistRunnerBase(object):
        if is_dist:
            t = self.get_transpiler(trainer_id,
                                    fluid.default_main_program(), endpoints,
-                                    trainers)
+                                    trainers, sync_mode)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()
@@ -106,9 +117,9 @@ def runtime_main(test_class):
    import paddle.fluid as fluid
    import paddle.fluid.core as core

-    if len(sys.argv) != 7:
+    if len(sys.argv) != 8:
        print(
-            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
        )
    role = sys.argv[1]
    endpoints = sys.argv[2]
@@ -116,34 +127,43 @@ def runtime_main(test_class):
    current_endpoint = sys.argv[4]
    trainers = int(sys.argv[5])
    is_dist = True if sys.argv[6] == "TRUE" else False
+    sync_mode = True if sys.argv[7] == "TRUE" else False

    model = test_class()
    if role == "pserver":
-        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id,
+                          sync_mode)
    else:
        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()
-        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist,
+                          sync_mode)


 import paddle.compat as cpt


 class TestDistBase(unittest.TestCase):
+    def _setup_config(self):
+        raise NotImplementedError("tests should have _setup_config implemented")
+
    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"
+        self._sync_mode = True
+        self._setup_config()

    def start_pserver(self, model_file, check_error_log):
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
-        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+             self._trainers, sync_mode_str)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+             self._trainers, sync_mode_str)

        ps0_pipe = subprocess.PIPE
        ps1_pipe = subprocess.PIPE
@@ -195,9 +215,10 @@ class TestDistBase(unittest.TestCase):
        # Run local to get a base line
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
-        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \
            (self._python_interp, model_file,
-             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+             "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str)
        if not check_error_log:
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
@@ -226,12 +247,12 @@ class TestDistBase(unittest.TestCase):
        self._wait_ps_ready(ps1.pid)

        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
-             self._trainers)
-        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+             self._trainers, sync_mode_str)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
-             self._trainers)
+             self._trainers, sync_mode_str)

        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
        env1 = {"CUDA_VISIBLE_DEVICES": "1"}

--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -17,10 +17,21 @@ import unittest
 from test_dist_base import TestDistBase


-class TestDistSeResneXt2x2(TestDistBase):
+class TestDistMnist2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
    def test_se_resnext(self):
        self.check_with_place("dist_mnist.py", delta=1e-7)


+class TestDistMnistAsync(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_mnist.py", delta=200)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase


 class TestDistSeResneXt2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
    def test_se_resnext(self):
        self.check_with_place("dist_se_resnext.py", delta=1e-7)


+class TestDistSeResneXt2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_se_resnext.py", delta=100)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -19,6 +19,9 @@ from test_dist_base import TestDistBase


 class TestDistTransformer2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
    def test_transformer(self):
        # TODO(paddle-dev): check if the delta is OK.
        # Usually start around ~8000 and converge to ~5000

--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -47,7 +47,6 @@ class TranspilerTest(unittest.TestCase):
        avg_cost = fluid.layers.mean(cost)
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
        sgd_optimizer.minimize(avg_cost)
-        return

    def get_main_program(self):
        main = fluid.Program()
@@ -95,6 +94,7 @@ class TranspilerTest(unittest.TestCase):
    def test_transpiler(self):
        main = fluid.Program()
        startup = fluid.Program()
+        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                self.transpiler_test_impl()

@@ -249,7 +249,6 @@ class TestLRDecay(TranspilerTest):
                decay_rate=0.1,
                staircase=True))
        sgd_optimizer.minimize(avg_cost)
-        return

    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -279,7 +278,6 @@ class TestLRDecayConditional(TranspilerTest):
            learning_rate=fluid.layers.piecewise_decay([10000, 20000],
                                                       [1.0, 0.5, 1.0]))
        sgd_optimizer.minimize(avg_cost)
-        return

    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -328,7 +326,6 @@ class TestL2Decay(TranspilerTest):
        avg_cost = fluid.layers.mean(cost)
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
        sgd_optimizer.minimize(avg_cost)
-        return

    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -363,7 +360,6 @@ class TestL2DecayWithPiecewise(TranspilerTest):
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
        sgd_optimizer.minimize(avg_cost)
-        return

    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -393,13 +389,14 @@ class TestDistLookupTableBase(TranspilerTest):
    def network_with_table(self, is_sparse, is_distributed):
        self.table_size = 1000
        self.emb_size = 64
+        self.lookup_table_name = 'shared_w'

        def emb_pool(ids):
            emb = fluid.layers.embedding(
                input=ids,
                size=[self.table_size, self.emb_size],
                dtype='float32',
-                param_attr='shared_w',  # share parameter
+                param_attr=self.lookup_table_name,  # share parameter
                is_sparse=is_sparse,
                is_distributed=is_distributed)
            pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
@@ -572,7 +569,7 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase):

    def transpiler_test_impl(self):
        config = fluid.DistributeTranspilerConfig()
-        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config)
+        pserver1, _ = self.get_pserver(self.pserver1_ep, config)

        self.assertTrue(self.transpiler.has_distributed_lookup_table)
        lookup_table_var = pserver1.global_block().vars[
@@ -582,6 +579,21 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase):
        self.assertEqual(row_size, calc_row_size)


+class TestDistArgsInProgram(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        trainer, _ = self.get_trainer()
+
+        self.assertTrue(trainer._is_distributed)
+        self.assertTrue(trainer._is_chief)
+        self.assertEqual(trainer._distributed_lookup_table,
+                         self.lookup_table_name)
+        self.assertEqual(trainer._endpoints,
+                         [self.pserver1_ep, self.pserver2_ep])
+
+
 class TestRMSPropOptimizer(TranspilerTest):
    def net_conf(self):
        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
@@ -595,7 +607,6 @@ class TestRMSPropOptimizer(TranspilerTest):
        avg_cost = fluid.layers.mean(cost)
        optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
        optimizer.minimize(avg_cost)
-        return

    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
@@ -612,5 +623,40 @@ class TestRMSPropOptimizer(TranspilerTest):
        self.assertEqual(moment_var.shape, (500, 1000))


+class TestLoadSliceVar(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
+        optimizer.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        pserver, _ = self.get_pserver(self.pserver1_ep)
+        pserver2, _ = self.get_pserver(self.pserver2_ep)
+
+        self.assertTrue(pserver._slice_vars_and_attrs)
+        self.assertTrue(pserver2._slice_vars_and_attrs)
+
+        for idx in xrange(len(pserver._slice_vars_and_attrs)):
+            self.assertEqual(pserver._slice_vars_and_attrs[idx][0],
+                             pserver2._slice_vars_and_attrs[idx][0])
+
+            total_numel = reduce(lambda x, y: x * y,
+                                 pserver._slice_vars_and_attrs[idx][0].shape)
+            self.assertEqual(
+                total_numel,
+                reduce(lambda x, y: x * y,
+                       pserver._slice_vars_and_attrs[idx][2].shape) + reduce(
+                           lambda x, y: x * y,
+                           pserver2._slice_vars_and_attrs[idx][2].shape))
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -18,9 +18,20 @@ from test_dist_base import TestDistBase


 class TestDistSeResneXt2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+
    def test_se_resnext(self):
        self.check_with_place("dist_word2vec.py", delta=1e-7)


+class TestDistSeResneXt2x2Async(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_word2vec.py", delta=1)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -347,6 +347,25 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(loss)
        print(str(program))

+    def test_scatter(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x',
+                shape=[3, 3],
+                append_batch_size=False,
+                dtype='float32')
+            idx = layers.data(
+                name='idx', shape=[2], append_batch_size=False, dtype='int32')
+            updates = layers.data(
+                name='updates',
+                shape=[2, 3],
+                append_batch_size=False,
+                dtype='float32')
+            out = layers.scatter(input=x, index=idx, updates=updates)
+            self.assertIsNotNone(out)
+        print(str(program))
+
    def test_lod_reset(self):
        program = Program()
        with program_guard(program):

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -215,6 +215,13 @@ class DistributeTranspiler(object):
        for param_var, grad_var in self.params_grads:
            self.param_name_to_grad_name[param_var.name] = grad_var.name

+        # add distributed attrs to program
+        self.origin_program._is_distributed = True
+        self.origin_program._endpoints = self.pserver_endpoints
+        self.origin_program._is_chief = self.trainer_id == 0
+        self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None
+
+        # split and create vars, then put splited vars in dicts for later use.
        # step 1: split and create vars, then put splited vars in dicts for later use.
        self._init_splited_vars()

@@ -369,7 +376,7 @@ class DistributeTranspiler(object):
        # FIXME(gongwb): delete not need ops.
        # note that: some parameter is not trainable and those ops can't be deleted.

-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in six.iteritems(self.param_var_mapping):
            # Get the eplist of recv vars
            eps = []
            for var in splited_var:
@@ -406,7 +413,7 @@ class DistributeTranspiler(object):
                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
            })

-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in six.iteritems(self.param_var_mapping):
            #add concat ops to merge splited parameters received from parameter servers.
            if len(splited_var) <= 1:
                continue
@@ -590,6 +597,8 @@ class DistributeTranspiler(object):
            checkpoint_block_id = self._create_checkpoint_save_block(
                pserver_program, table_opt_block.idx)

+            pserver_program._distributed_lookup_table = self.table_name
+
        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
        # not be executed, so it's safe to use optimize_block to hold the place
        if self.has_distributed_lookup_table:
@@ -616,6 +625,10 @@ class DistributeTranspiler(object):
            outputs={},
            attrs=attrs)

+        # add distributed attrs
+        pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs(
+            endpoint)
+
        pserver_program._sync_with_cpp()
        return pserver_program

@@ -689,8 +702,31 @@ class DistributeTranspiler(object):
                    inputs=new_inputs,
                    outputs=new_outputs,
                    attrs=op.all_attrs())
+
+        # add slice vars
+        s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint)
+
        return s_prog

+    def _get_slice_vars_and_attrs(self, endpoint):
+        slice_vars_and_attrs = []
+        block_suffix = "block"
+        for param in self.param_grad_ep_mapping[endpoint]["params"]:
+            orig_var_name, block_name, _ = self._get_varname_parts(param.name)
+            if not block_name:
+                continue
+
+            block_idx = int(block_name.split(block_suffix)[1])
+            orig_var = self.origin_program.global_block().vars[orig_var_name]
+
+            skip_numel = 0
+            slice_vars = self.param_var_mapping[orig_var_name]
+            for slice_var in slice_vars[:block_idx]:
+                skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
+            slice_vars_and_attrs.append([orig_var, skip_numel, param])
+
+        return slice_vars_and_attrs
+
    # ====================== private transpiler functions =====================

    def _has_distributed_lookup_table(self):
@@ -1209,8 +1245,8 @@ class DistributeTranspiler(object):
        elif op_type == "momentum":
            if varkey == "Velocity":
                return param_shape
-        elif op_type == "":
-            if varkey == "Moment":
+        elif op_type == "rmsprop":
+            if varkey in ["Moment", "MeanSquare"]:
                return param_shape
        elif op_type == "sgd":
            pass
@@ -1289,8 +1325,6 @@ class DistributeTranspiler(object):
        pserver_block = program.global_block()
        new_inputs = collections.OrderedDict()

-        # update param/grad shape first, then other inputs like
-        # moment can use the updated shape
        def _get_param_block(opt_op):
            # param is already created on global program
            param_block = None
@@ -1303,22 +1337,6 @@ class DistributeTranspiler(object):
        for key in opt_op.input_names:
            if key == "Grad":
                new_inputs[key] = merged_var
-            # For RMSProp optimizer
-            elif key == "Moment" or key == "MeanSquare":
-                param_block = _get_param_block(opt_op)
-                if not param_block:
-                    return
-                moment_var = origin_program.global_block().vars[opt_op.input(
-                    key)[0]]
-                tmpvar = pserver_block.create_var(
-                    name=moment_var.name,
-                    persistable=moment_var.persistable,
-                    dtype=moment_var.dtype,
-                    # change to use same shape as param
-                    # TODO(typhoonzero): didn't append .block in the var name,
-                    # may affect checkpoint saving? Need to verify.
-                    shape=param_block.shape)
-                new_inputs[key] = tmpvar
            elif key == "Param":
                param_block = _get_param_block(opt_op)
                if not param_block:
@@ -1346,7 +1364,7 @@ class DistributeTranspiler(object):

        for key in opt_op.input_names:
            new_shape = None
-            if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]:
+            if key in ["Param", "Grad", "LearningRate"]:
                continue
            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
            # update accumulator variable shape

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -159,7 +159,9 @@ if '${WITH_MKL}' == 'ON':
    shutil.copy('${MKLML_LIB}', libs_path)
    shutil.copy('${MKLML_IOMP_LIB}', libs_path)
    package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so']
-if '${WITH_MKLDNN}' == 'ON':
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    # only change rpath in Release mode.
+    if '${WITH_MKLDNN}' == 'ON':
        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
        # we can support mkl on mac.
        #
@@ -179,13 +181,15 @@ package_dir['paddle.libs']=libs_path
 # The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and
 # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
 # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-if "@APPLE@" == "1":
+if '${CMAKE_BUILD_TYPE}' == 'Release':
+    # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed.
+    if "@APPLE@" == "1":
        command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
-else:
+    else:
        command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
-if os.system(command) != 0:
+    if os.system(command) != 0:
        raise Exception("patch core.so failed, command: %s" % command)
-if '${WITH_FLUID_ONLY}'== 'OFF':
+    if '${WITH_FLUID_ONLY}'== 'OFF':
        # change rpath of _swig_paddle.so.
        if "@APPLE@" == "1":
            command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"