diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index dbd375aa31bfbdcb109b6302acf23b3bb3b6befe..627370cd2df7317b4d32aa967565aaf9cf0c7a08 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto glog lod_rank_table feed_fetch_method)

-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index c026e6c100a303b43650f08cd12d7260258c8f7e..c106761f72e689ff53867ecad8e36b6038173d0e 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,6 +7,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)

 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
@@ -28,6 +29,9 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+
+cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer)
+
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 629aa00cb817c4b1446e7b750ca62a7c6b1db670..8036f756b6d6506684c109ab881d546f38176a10 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -59,8 +59,8 @@ struct BroadcastOpHandle : public OpHandleBase {
   void RunImpl() override;

  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 91bdfe6134ffbd1404336c9d6d1222a505084b2b..64e83acb4dc1995800c4ca3caf81668b24a7c9fe 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -14,6 +14,8 @@

 #pragma once

+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -29,6 +31,8 @@ struct BuildStrategy {
   ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+
+  std::string debug_graphviz_path_{""};
 };

 }  // namespace details
diff --git a/paddle/fluid/framework/details/graph_builder_factory.cc b/paddle/fluid/framework/details/graph_builder_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a04b9bb63c06b40ff5c30c9792cdfad5d64d404c
--- /dev/null
+++ b/paddle/fluid/framework/details/graph_builder_factory.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
+#include <fstream>
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
+  std::unique_ptr<SSAGraphBuilder> res(
+#ifdef PADDLE_WITH_CUDA
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, nccl_ctxs_, strategy_)
+#else
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, strategy_)
+#endif
+          );  // NOLINT
+
+  if (!strategy_.debug_graphviz_path_.empty()) {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(strategy_.debug_graphviz_path_));
+    PADDLE_ENFORCE(fout->good());
+    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
+        new GraphvizSSAGraphPrinter());
+    res.reset(new SSAGraghBuilderWithPrinter(
+        std::move(fout), std::move(graphviz_printer), std::move(res)));
+  }
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/graph_builder_factory.h b/paddle/fluid/framework/details/graph_builder_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..857ab12d684e19788597e144fc0c46571d06aafc
--- /dev/null
+++ b/paddle/fluid/framework/details/graph_builder_factory.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/place.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+
+class SSAGraphBuilderFactory {
+ public:
+  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
+                         const std::string& loss_var_name,
+                         const std::unordered_set<std::string>& param_names,
+                         const std::vector<Scope*>& local_scopes,
+                         const BuildStrategy& strategy)
+      : places_(places),
+        loss_var_name_(loss_var_name),
+        param_names_(param_names),
+        local_scopes_(local_scopes),
+        strategy_(strategy) {}
+
+#ifdef PADDLE_WITH_CUDA
+  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
+    nccl_ctxs_ = nccl_ctxs;
+  }
+#endif
+
+  std::unique_ptr<SSAGraphBuilder> Create();
+
+ private:
+  std::vector<platform::Place> places_;
+  std::string loss_var_name_;
+  std::unordered_set<std::string> param_names_;
+  std::vector<Scope*> local_scopes_;
+  BuildStrategy strategy_;
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap* nccl_ctxs_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index d1683774b4d508af469447eaa1c305f5f6a18c4b..868de0f9a60a3dcad9a2c295f38707cdadcd1d21 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -30,10 +30,6 @@
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif

-DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
-              "the ssa graph path only print with GLOG_v=10,"
-              "default /tmp/graph.dot");
-
 namespace paddle {
 namespace framework {
 namespace details {
@@ -149,6 +145,7 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(

 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
+  VLOG(3) << "Building ....";
   std::unordered_map<std::string, VarDesc *> all_vars;
   for (auto *var : program.Block(0).AllVars()) {
     all_vars[var->Name()] = var;
@@ -315,11 +312,6 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    */
   AddOutputToLeafOps(&result);

-  if (VLOG_IS_ON(10)) {
-    std::ofstream fout(FLAGS_ssa_graph_path);
-    PrintGraphviz(*graph, fout);
-  }
-
   return std::unique_ptr<SSAGraph>(graph);
 }
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 79c2b79a3ff539876c53919c8ca31a587faef57a..fd4245461bce7115a581fed8f8aa226a7a9911fa 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -48,7 +48,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;

-  int GetRemoteVarDevice(const std::string &var_name) const {
+  int GetRemoteVarDeviceId(const std::string &var_name) const override {
     auto got = remote_vars_devices_.find(var_name);
     if (got != remote_vars_devices_.end()) {
       return got->second;
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index a0c321843e3fc5abcbd1ef2ce2e153250269aa7d..8e98d894b828b4162059b30f5c6a74cfc06f402e 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -41,8 +41,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
   void RunImpl() override;

  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
   const platform::NCCLContextMap &nccl_ctxs_;
 };
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index c652a2f4eb0f9b73cb19ebbd9d0809210b280ad3..4d14334cdfe06e2e805c2577458d6689e6324cc7 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -32,8 +32,8 @@ namespace framework {
 namespace details {

 struct ReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;

 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 6a567527550883add08031e50aa8de2b204cf13d..211113c7979ee95d896c0a57879f7b3ad13b36ef 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
   op_handle->AddOutput(var);
 }

-template <typename Callback>
-void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
-    for (auto &pair1 : each) {
-      for (auto &pair2 : pair1.second) {
-        callback(*pair2);
-      }
-    }
-  }
-
-  for (auto &var : graph.dep_vars_) {
-    callback(*var);
-  }
-}
-
-void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
-  size_t var_id = 0;
-  std::unordered_map<const VarHandleBase *, size_t> vars;
-
-  sout << "digraph G {\n";
-
-  IterAllVar(graph, [&](const VarHandleBase &var) {
-    auto *var_ptr = &var;
-    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
-    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
-
-    size_t cur_var_id = var_id++;
-    vars[var_ptr] = cur_var_id;
-
-    if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
-           << "\\n"
-           << var_handle_ptr->place_ << "\\n"
-           << var_handle_ptr->version_ << "\"]" << std::endl;
-    } else if (dummy_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
-    }
-  });
-
-  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
-    std::string op_name = "op_" + std::to_string(op_id++);
-    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
-         << std::endl;
-    for (auto in : op->Inputs()) {
-      std::string var_name = "var_" + std::to_string(vars[in]);
-      sout << var_name << " -> " << op_name << std::endl;
-    }
-
-    for (auto out : op->Outputs()) {
-      std::string var_name = "var_" + std::to_string(vars[out]);
-      sout << op_name << " -> " << var_name << std::endl;
-    }
-  }
-
-  sout << "}\n";
-}
-
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
   for (auto &op : graph->ops_) {
     if (!op->Outputs().empty()) {
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index 64e5d93081eb76c56898bbeb530e37364619fdbb..0da9a2128fcecb3201ed7952f1b2185fac158f98 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -30,6 +30,7 @@ class SSAGraphBuilder {
   SSAGraphBuilder() {}
   virtual ~SSAGraphBuilder() {}
   virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
+  virtual int GetRemoteVarDeviceId(const std::string &var_name) const = 0;

   DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);

@@ -55,8 +56,6 @@ class SSAGraphBuilder {
                              const platform::Place &place, size_t place_offset);

   static void AddOutputToLeafOps(SSAGraph *graph);
-
-  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22a40ca4b25cdd8ed9856b6c71bffc79561edcac
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename Callback>
+static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &each : graph.vars_) {
+    for (auto &pair1 : each) {
+      for (auto &pair2 : pair1.second) {
+        callback(*pair2);
+      }
+    }
+  }
+
+  for (auto &var : graph.dep_vars_) {
+    callback(*var);
+  }
+}
+
+void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+                                    std::ostream &sout) const {
+  size_t var_id = 0;
+  std::unordered_map<const VarHandleBase *, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    auto *var_ptr = &var;
+    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
+    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
+
+    size_t cur_var_id = var_id++;
+    vars[var_ptr] = cur_var_id;
+
+    if (var_handle_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+           << "\\n"
+           << var_handle_ptr->place_ << "\\n"
+           << var_handle_ptr->version_ << "\"]" << std::endl;
+    } else if (dummy_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+
+  size_t op_id = 0;
+  for (auto &op : graph.ops_) {
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : op->Inputs()) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : op->Outputs()) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5287be3b6a05ec7067ca433ba976b0314d05fe02
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+class SSAGraph;
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+};
+
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+};
+
+class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
+ public:
+  SSAGraghBuilderWithPrinter(std::ostream& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ref_(sout) {}
+
+  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ptr_(std::move(sout)),
+        stream_ref_(*stream_ptr_) {}
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    printer_->Print(*graph, stream_ref_);
+    return graph;
+  }
+
+  // Delegate to the wrapped builder; without this override the decorator
+  // would be left abstract by the pure virtual in SSAGraphBuilder.
+  int GetRemoteVarDeviceId(const std::string& var_name) const override {
+    return builder_->GetRemoteVarDeviceId(var_name);
+  }
+
+ private:
+  std::unique_ptr<SSAGraphPrinter> printer_;
+  std::unique_ptr<SSAGraphBuilder> builder_;
+  std::unique_ptr<std::ostream> stream_ptr_;
+  std::ostream& stream_ref_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 31882504c35162b427595a8c57f073d2eac07bcd..85dad0a46a7a9fd2aa4dcd83c7316beed35db9d7 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

+#include "paddle/fluid/framework/details/graph_builder_factory.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -101,23 +102,23 @@ ParallelExecutor::ParallelExecutor(
     var_infos.back().persistable_ = var->Persistable();
   }

-// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
-// ncclOp
-#ifdef PADDLE_WITH_CUDA
-  builder_.reset(new details::MultiDevSSAGraphBuilder(
-      member_->places_, loss_var_name, params, member_->local_scopes_,
-      member_->nccl_ctxs_.get(), build_strategy));
-
-#else
-  builder_.reset(new details::MultiDevSSAGraphBuilder(
+  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+  // ncclOp
+  details::SSAGraphBuilderFactory builder_factory(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      build_strategy));
+      build_strategy);
+#ifdef PADDLE_WITH_CUDA
+  builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
 #endif

-  auto graph = builder_->Build(main_program);
+  // Take ownership of the unique_ptr returned by Create(); calling .get() on
+  // the temporary would leave builder_ dangling once the temporary dies.
+  builder_ = builder_factory.Create();
+  if (builder_.get() == nullptr) {
+    VLOG(3) << "builder is null.";
+  }

   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places, std::move(graph)));
+      exec_strategy, member_->local_scopes_, places,
+      builder_->Build(main_program)));

   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, std::move(var_infos),
@@ -155,8 +156,8 @@ void ParallelExecutor::BCastParamsToGPUs(
         auto &nccl_ctx = member_->nccl_ctxs_->at(place);

         if (builder_.get() != nullptr &&
-            builder_->GetRemoteVarDevice(var) != -1) {
-          int place_id = builder_->GetRemoteVarDevice(var);
+            builder_->GetRemoteVarDeviceId(var) != -1) {
+          int place_id = builder_->GetRemoteVarDeviceId(var);
           platform::dynload::ncclBcast(buffer, numel, data_type, place_id,
                                        nccl_ctx.comm_, nccl_ctx.stream());
         } else {
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index b71a440d6a0c83dc6f26eb0243fabcefd4ede167..058f83f07c26224e3180d140630c08a24c40cd80 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -70,7 +70,7 @@ class ParallelExecutor {

  private:
   ParallelExecutorPrivate *member_;
-  std::unique_ptr<details::MultiDevSSAGraphBuilder> builder_;
+  std::unique_ptr<details::SSAGraphBuilder> builder_;
 };

 }  // namespace framework
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index e97ada06f06d0538f17160220e3aa3f4ffc55520..c7286dacf01659f3af0927a71856e5a6496cb877 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -15,5 +15,102 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"

 namespace paddle {
-namespace framework {}
+namespace framework {
+extern size_t SizeOfType(std::type_index type);
+void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
+}
+
+size_t Tensor::memory_size() const {
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+}
+
+void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
" + "Please check Tensor::Resize has been called first."); + int64_t size = numel() * SizeOfType(type); + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_gpu_place(place) || + platform::is_cuda_pinned_place(place)) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW( + "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); + } +#else + if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } else if (platform::is_cuda_pinned_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); + } + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +void* Tensor::mutable_data(platform::Place place) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing."); + return mutable_data(place, holder_->type()); +} + +Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +Tensor Tensor::Slice(int begin_idx, int end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_LT( + begin_idx, end_idx, + "The start row index must be lesser than the end row index."); + + if (dims_[0] == 1) { + return *this; + } else { + size_t base = numel() / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + dst.set_layout(layout_); + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + return dst; + } +} + +Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +const DDim& Tensor::dims() const { return dims_; } + +int64_t Tensor::numel() const { return product(dims_); } +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6f878541e6de1deec1829145b1b325ecd176a034..29566aaa53370b1fffc9ff9a90ae9b740b24f69e 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -54,26 +54,24 @@ class Tensor { /*! Return a pointer to mutable memory block. */ template - inline T* data(); + T* data(); /*! Return a pointer to constant memory block. */ template - inline const T* data() const; + const T* data() const; - inline bool IsInitialized() const; - - inline void switch_place(platform::Place new_place); + bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. */ template - inline T* mutable_data(platform::Place place); + T* mutable_data(platform::Place place); - inline void* mutable_data(platform::Place place, std::type_index type); + void* mutable_data(platform::Place place, std::type_index type); - inline void* mutable_data(platform::Place place); + void* mutable_data(platform::Place place); /** * @brief Return a pointer to mutable memory block. @@ -84,19 +82,19 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(DDim dims, platform::Place place); + T* mutable_data(DDim dims, platform::Place place); /*! Return the dimensions of the memory block. 
-  inline const DDim& dims() const;
+  const DDim& dims() const;

   /*! Return the numel of the memory block. */
-  inline int64_t numel() const;
+  int64_t numel() const;

   /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
+  Tensor& Resize(const DDim& dims);

   /*! The internal of two tensors share the same memory block. */
-  inline Tensor& ShareDataWith(const Tensor& src);
+  Tensor& ShareDataWith(const Tensor& src);

   /**
   * @brief   Return a sub-tensor of the given tensor.
@@ -106,7 +104,7 @@ class Tensor {
   * @param[in] end_idx   The index of the end row(exclusive) to slice.
   *                      The index number begins from 0.
   */
-  inline Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int begin_idx, int end_idx) const;

   platform::Place place() const {
     PADDLE_ENFORCE_NOT_NULL(
@@ -123,11 +121,11 @@ class Tensor {
   // memory size returns the holding memory size in byte.
   size_t memory_size() const;

-  inline void check_memory_size() const;
+  void check_memory_size() const;

-  inline DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return layout_; }

-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+  void set_layout(const DataLayout layout) { layout_ = layout; }

  private:
   /**
@@ -210,15 +208,6 @@ class Tensor {
   size_t offset_;
 };

-inline void Tensor::switch_place(platform::Place new_place) {
-  if (holder_->place() == new_place) {
-    return;
-  }
-
-  // TODO(tonyyang-svail): do memcpy here.
-  PADDLE_THROW("Not Implemented");
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 2f19ec0f0a9338e2b96d1f64eac45387bae4d1eb..96114678a9992f2975c4173c7cc003114f04d8df 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -20,21 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-extern size_t SizeOfType(std::type_index type);
-inline void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
-}
-
-inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-}
-
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
@@ -73,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) {
   return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }

-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
" - "Please check Tensor::Resize has been called first."); - int64_t size = numel() * SizeOfType(type); - /* some versions of boost::variant don't have operator!= */ - if (holder_ == nullptr || !(holder_->place() == place) || - holder_->size() < size + offset_) { - if (platform::is_cpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_gpu_place(place) || - platform::is_cuda_pinned_place(place)) { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW( - "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode."); - } -#else - if (platform::is_gpu_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } else if (platform::is_cuda_pinned_place(place)) { - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); - } - } -#endif - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -inline void* Tensor::mutable_data(platform::Place place) { - PADDLE_ENFORCE(this->holder_ != nullptr, - "Cannot invoke mutable data if current hold nothing."); - return mutable_data(place, holder_->type()); -} - -inline Tensor& Tensor::ShareDataWith(const Tensor& src) { - src.check_memory_size(); - *this = src; - return *this; -} - -inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { - check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, - "The start row index must be greater than 0."); - PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); - PADDLE_ENFORCE_LT( - begin_idx, end_idx, - "The start row index must be lesser than the end row index."); - - if (dims_[0] == 1) { - return *this; - } else { - size_t base = numel() / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - dst.set_layout(layout_); - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); - return dst; - } -} - -inline Tensor& Tensor::Resize(const DDim& dims) { - dims_ = dims; - return *this; -} - -inline const DDim& Tensor::dims() const { return dims_; } - -inline int64_t Tensor::numel() const { return product(dims_); } - inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; res.ShareDataWith(src); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 03cf417b62f96fd6812b3eac497ffdf9a484f5eb..669d1bdaa3ec194be817cdc5e1f8484770c70c68 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -553,6 +553,12 @@ All parameter, weight, gradient are variables in Paddle. 
           [](BuildStrategy &self,
              BuildStrategy::GradientScaleStrategy strategy) {
             self.gradient_scale_ = strategy;
+          })
+      .def_property(
+          "debug_graphviz_path",
+          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
+          [](BuildStrategy &self, const std::string &path) {
+            self.debug_graphviz_path_ = path;
           });

   pe.def(py::init<const std::vector<platform::Place> &,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 56f6f26803919a171f6459c909e6bb71ab63b180..221f3ddae589d9992ba7fb92975a698ca4306249 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1182,19 +1182,19 @@ def conv2d(input,

     - Input:

-      Input shape: $(N, C_{in}, H_{in}, W_{in})$
+      Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`

-      Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+      Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`

     - Output:

-      Output shape: $(N, C_{out}, H_{out}, W_{out})$
+      Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`

     Where

     .. math::

-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1

     Args:
         input(Variable): The input image with [N, C, H, W] format.
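
Usage note: the "debug_graphviz_path" property registered in the pybind.cc hunk above is the Python-facing switch for the GraphvizSSAGraphPrinter decorator that SSAGraphBuilderFactory::Create() installs. A minimal sketch of how it could be driven from Python, assuming the paddle.fluid API in this tree; the variable avg_loss is a placeholder for a loss built elsewhere:

    import paddle.fluid as fluid

    build_strategy = fluid.BuildStrategy()
    # A non-empty path switches the graphviz dump on; the default empty
    # string leaves the plain MultiDevSSAGraphBuilder in place.
    build_strategy.debug_graphviz_path = "/tmp/ssa_graph.dot"

    exe = fluid.ParallelExecutor(use_cuda=True,
                                 loss_name=avg_loss.name,
                                 build_strategy=build_strategy)

The dumped .dot file can then be rendered with graphviz, e.g. "dot -Tpng /tmp/ssa_graph.dot -o ssa_graph.png".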