From 26e32e095a6c4d643fccf2cea7675b075aad1730 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 17 Jan 2019 17:34:01 +0800 Subject: [PATCH] allow compiler to use graph test=develop --- paddle/fluid/API.spec | 2 +- .../fluid/framework/details/build_strategy.cc | 26 +-- .../fluid/framework/details/build_strategy.h | 2 +- .../fast_threaded_ssa_graph_executor.cc | 9 +- .../fast_threaded_ssa_graph_executor.h | 4 +- .../details/memory_optimize_helper_test.cc | 26 +-- .../framework/details/memory_optimize_pass.cc | 3 +- .../details/parallel_ssa_graph_executor.cc | 9 +- .../details/parallel_ssa_graph_executor.h | 4 +- .../details/threaded_ssa_graph_executor.cc | 9 +- .../details/threaded_ssa_graph_executor.h | 4 +- paddle/fluid/framework/ir/graph.h | 16 ++ paddle/fluid/framework/parallel_executor.cc | 154 ++++++++++--- paddle/fluid/framework/parallel_executor.h | 9 +- paddle/fluid/pybind/ir.cc | 3 +- paddle/fluid/pybind/pybind.cc | 10 +- python/paddle/fluid/compiler.py | 83 +++++-- .../slim/unitest/test_quantization_pass.py | 204 ++++++++++++++++++ python/paddle/fluid/executor.py | 1 + python/paddle/fluid/framework.py | 3 +- python/paddle/fluid/parallel_executor.py | 5 +- 21 files changed, 460 insertions(+), 126 deletions(-) create mode 100644 python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index f24cf96cc..711c7481d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -43,7 +43,7 @@ paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'start paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None) paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 8c6c9f35e..231abac97 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -171,7 +171,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { } std::unique_ptr BuildStrategy::Apply( - const ProgramDesc &main_program, const std::vector &places, + std::unique_ptr graph, + const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -182,7 +183,7 @@ std::unique_ptr BuildStrategy::Apply( // Create a default one if not finalized by user. 
CreatePassesFromStrategy(false); - std::unique_ptr graph(new ir::Graph(main_program)); + std::vector all_ops = graph->OriginProgram().Block(0).AllOps(); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); @@ -204,37 +205,30 @@ std::unique_ptr BuildStrategy::Apply( if (graph->Has(kAllOpDescs)) { graph->Erase(kAllOpDescs); } - const std::vector *all_op_descs = - new std::vector(main_program.Block(0).AllOps()); - graph->Set>(kAllOpDescs, - all_op_descs); // take ownership + + graph->SetNotOwned>(kAllOpDescs, + &all_ops); // take ownership pass->Erase(kAllOpDescs); - pass->SetNotOwned>(kAllOpDescs, all_op_descs); + pass->SetNotOwned>(kAllOpDescs, &all_ops); } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; pass->Erase(kAllOpDescs); - pass->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); + pass->SetNotOwned>(kAllOpDescs, &all_ops); } else if (pass->Type() == "all_reduce_deps_pass") { LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) << ", num_trainers:" << num_trainers_; pass->Erase(kAllOpDescs); - pass->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); + pass->SetNotOwned>(kAllOpDescs, &all_ops); } else if (pass->Type() == "inplace_pass") { if (graph->Has(kAllOpDescs)) { graph->Erase(kAllOpDescs); } - graph->Set>( - kAllOpDescs, - new std::vector(main_program.Block(0).AllOps())); + graph->SetNotOwned>(kAllOpDescs, &all_ops); } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (!use_cuda) { LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index e62e3edce..0ea71aa3b 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -114,7 +114,7 @@ struct BuildStrategy { // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. - std::unique_ptr Apply(const ProgramDesc &main_program, + std::unique_ptr Apply(std::unique_ptr graph, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 872bc5d65..f03646705 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -24,12 +24,11 @@ namespace details { FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph) + const std::vector &places, ir::Graph *graph) : strategy_(strategy), local_scopes_(local_scopes), places_(places), - graph_(std::move(graph)), + graph_(graph), pool_(strategy.num_threads_), prepare_pool_(1), // add one more thread for generate op_deps fetch_ctxs_(places) { @@ -110,14 +109,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( } } if (exception_.IsCaught()) { - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); exception_.ReThrow(); } } num_complete += num_comp; } // Wait FetchOps. 
- ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); return fetches; } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index c3a8b8542..970298950 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); FeedFetchList Run(const std::vector &fetch_tensors) override; const ir::Graph &Graph() const override; @@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy_; std::vector local_scopes_; std::vector places_; - std::unique_ptr graph_; + ir::Graph *graph_; std::unordered_map op_deps_; std::vector bootstrap_ops_; diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index 3cfe297a7..5389e76e0 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -228,9 +228,6 @@ TEST(CFGGraph, IRGraph) { // prepare ir graph auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership ControlFlowGraph cfg(graph); cfg.LiveVariableAnalysis(); @@ -256,9 +253,6 @@ TEST(CFGGraph, IRGraph) { TEST(SortOpLikeDescOrder, NormalTest) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto nodes = SortOpLikeDescOrder(graph); auto op_descs = prog.Block(0).AllOps(); @@ -273,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) { TEST(SortOpLikeDescOrder, RemoveOpDesc) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto nodes = graph.Nodes(); auto op_descs = prog.Block(0).AllOps(); ir::Node* found_node = nullptr; @@ -324,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) { // 3. add some op_desc TEST(SortOpLikeDescOrder, AddOpDesc) { auto prog = FillProgramDesc(); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); ir::Graph graph(prog); auto find_node_in_graph = [&](std::string s) { @@ -342,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { // cached desc different with real one // mimic the intermidiete pass modify the programdesc. 
- graph.Set(details::kAllOpDescs, all_op_descs); // take ownership - - auto op_descs = prog.Block(0).AllOps(); + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); auto op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); @@ -376,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership auto find_node_in_graph = [&](std::string s) { ir::Node* ret = nullptr; @@ -392,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { return ret; }; + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); + // remove sum node - auto op_descs = prog.Block(0).AllOps(); ir::Node* found_node = nullptr; auto nodes = graph.Nodes(); for (auto node : nodes) { @@ -454,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { auto prog = FillProgramDesc(); ir::Graph graph(prog); - const std::vector* all_op_descs = - new std::vector(prog.Block(0).AllOps()); - graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); auto find_node_in_graph = [&](std::string s) { ir::Node* ret = nullptr; @@ -470,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { return ret; }; - auto op_descs = prog.Block(0).AllOps(); // add node auto op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index fd02bc469..20d486588 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -336,5 +336,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, } // namespace paddle REGISTER_PASS(memory_optimize_pass, - paddle::framework::details::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kAllOpDescs); + paddle::framework::details::MemoryOptimizePass); diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 4c8f69c68..18b455cc6 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -20,8 +20,7 @@ namespace framework { namespace details { std::vector> -ParallelSSAGraphExecutor::SeparateMultiDevicesGraph( - std::unique_ptr &&graph) { +ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph* graph) { std::vector> graphs; graphs.reserve(places_.size()); for (size_t i = 0; i < places_.size(); ++i) { @@ -78,7 +77,7 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph( ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - const framework::ProgramDesc &main_prog, std::unique_ptr &&graph) + const framework::ProgramDesc &main_prog, ir::Graph* graph) : strategy_(std::move(strategy)), local_scopes_(std::move(local_scopes)), pool_(places.size() >= 2 ? 
new ::ThreadPool(places.size()) : nullptr), @@ -86,7 +85,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( main_prog_(main_prog), // TODO(Yancey1989): Copying graphs is not safely since it deleted the // attrs. - graphs_(SeparateMultiDevicesGraph(std::move(graph))) { + graphs_(SeparateMultiDevicesGraph(graph)) { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); auto seq_allreduce_pass = @@ -107,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { executors_.emplace_back(new details::ThreadedSSAGraphExecutor( - strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i)))); + strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get())); } } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index 1c35d45fd..a1547878a 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -32,7 +32,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { const std::vector &local_scopes, const std::vector &places, const framework::ProgramDesc &main_prog, - std::unique_ptr &&graph); + ir::Graph* graph); ~ParallelSSAGraphExecutor() final = default; const ir::Graph &Graph() const override { return *graphs_[0]; } @@ -41,7 +41,7 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { private: std::vector> SeparateMultiDevicesGraph( - std::unique_ptr &&graph); + ir::Graph* graph); ExecutionStrategy strategy_; std::vector local_scopes_; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 72acc337b..9ba295a2b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -23,9 +23,8 @@ namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph) - : graph_(std::move(graph)), + const std::vector &places, ir::Graph *graph) + : graph_(graph), pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) : nullptr), local_scopes_(local_scopes), @@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); exception_holder_.ReThrow(); } else { continue; @@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } PADDLE_ENFORCE(ready_ops.empty()); // Wait FetchOps. 
- ClearFetchOp(graph_.get(), &fetch_ops); + ClearFetchOp(graph_, &fetch_ops); return fetch_data; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 24da56c09..0867f6210 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + ir::Graph *graph); const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool @@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: - std::unique_ptr graph_; + ir::Graph *graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 296f3b839..6b8115b29 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -195,6 +195,22 @@ class Graph { return nullptr; } +<<<<<<< HEAD +======= + // Returns reference to the original program. + // WARN: After a series of passes, the current graph can be quite + // different from OriginProgram. Caller shouldn't assume much from + // the returned OriginProgram. + const ProgramDesc &OriginProgram() const { return program_; } + + void ResolveHazard( + const std::map> &var_nodes); + + private: + std::map> InitFromProgram( + const ProgramDesc &program); + +>>>>>>> polish // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 56da56600..2e68a2dd0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -184,7 +184,7 @@ std::vector &ParallelExecutor::GetLocalScopes() { ParallelExecutor::ParallelExecutor( const std::vector &places, const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, const std::string &loss_var_name, + const std::vector &graphs, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) : member_(new ParallelExecutorPrivate(places)) { @@ -216,15 +216,34 @@ ParallelExecutor::ParallelExecutor( } } +<<<<<<< HEAD + std::unique_ptr temp_owned_graph(graph); + // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. build_strategy.enable_parallel_graph_ = - EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); + EnableParallelGraphExecution(*temp_owned_graph, exec_strategy, build_strategy); if (build_strategy.enable_parallel_graph_) VLOG(0) << "The Executor would execute the graph by ParallelGraph " "Execution which can get better performance," << "you can force it off by env FLAGS_enable_parallel_graph=0"; +======= + // TODO(panyx0718): Update pass interface so we don't need this here. 
+ std::vector> temp_owned_graphs; + for (ir::Graph *g : graphs) { + temp_owned_graphs.emplace_back(g); + } +<<<<<<< HEAD +>>>>>>> fix parallel graph mode program + +======= + bool parallel_graphs = (temp_owned_graphs.size() > 1); + if (parallel_graphs) { + PADDLE_ENFORCE_EQ(temp_owned_graphs.size(), places.size()); + } + VLOG(1) << "Enable ParallelGraph Execution: " << parallel_graphs; +>>>>>>> polish if (member_->use_cuda_) { // Bcast Parameters to all GPUs @@ -236,7 +255,7 @@ ParallelExecutor::ParallelExecutor( if (nccl_id_var != nullptr) { nccl_id = nccl_id_var->GetMutable(); } - if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) { + if (parallel_graphs && member_->nranks_ > 1UL) { if (nccl_id == nullptr) { local_nccl_id_.reset(new ncclUniqueId()); platform::dynload::ncclGetUniqueId(local_nccl_id_.get()); @@ -258,44 +277,101 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp +<<<<<<< HEAD std::unique_ptr graph; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + + temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, member_->local_scopes_, member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); #else - graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, + temp_owned_graph = build_strategy.Apply(std::move(temp_owned_graph), member_->places_, loss_var_name, member_->local_scopes_, member_->nranks_, member_->use_cuda_); + +======= + std::vector compiled_graphs; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (parallel_graphs) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graphs[i]), {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); + compiled_graphs.push_back(temp_owned_graph.release()); + } + } else { + auto temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graphs[0]), member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); + compiled_graphs.push_back(temp_owned_graph.release()); + } +#else + auto temp_owned_graph = build_strategy.Apply( + std::move(temp_owned_graphs[0]), member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, member_->use_cuda_); + compiled_graphs.push_back(temp_owned_graph.release()); +>>>>>>> fix parallel graph mode program #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { +<<<<<<< HEAD graph = member_->PrepareGCAndRefCnts(std::move(graph), - static_cast(max_memory_size)); + static_cast(max_memory_size)).release(); +======= + for (size_t i = 0; i < graphs.size(); ++i) { + compiled_graphs[i] = + member_ + ->PrepareGCAndRefCnts( + std::unique_ptr(compiled_graphs[i]), + static_cast(max_memory_size)) + .release(); + } +>>>>>>> fix parallel graph mode program } // Step 3. Create vars in each scope. Passes may also create new vars. 
// skip control vars and empty vars std::vector var_infos; +<<<<<<< HEAD for (auto &node : graph->Nodes()) { if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { var_infos.emplace_back(); var_infos.back().name_ = node->Var()->Name(); var_infos.back().type_ = node->Var()->GetType(); var_infos.back().persistable_ = node->Var()->Persistable(); +======= + for (auto &graph : compiled_graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } +>>>>>>> fix parallel graph mode program } } // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { +<<<<<<< HEAD size_t graph_num = ir::GraphNum(*graph); +======= + size_t graph_num = ir::GraphNum(*compiled_graphs[0]); +>>>>>>> fix parallel graph mode program if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " +<<<<<<< HEAD << ir::GraphNum(*graph) +======= + << ir::GraphNum(*compiled_graphs[0]) +>>>>>>> fix parallel graph mode program << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -303,26 +379,42 @@ ParallelExecutor::ParallelExecutor( } } +<<<<<<< HEAD if (build_strategy.enable_parallel_graph_) { #ifdef PADDLE_WITH_CUDA // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. +======= + if (parallel_graphs) { +>>>>>>> polish member_->executor_.reset(new details::ParallelSSAGraphExecutor( +<<<<<<< HEAD exec_strategy, member_->local_scopes_, member_->places_, main_program, - std::move(graph))); + graph)); #else PADDLE_THROW( "Paddle should be compiled with CUDA for ParallelGraph Execution."); #endif + } else { + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, graph)); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, graph)); +======= + exec_strategy, member_->local_scopes_, member_->places_, + compiled_graphs)); } else { if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + compiled_graphs[0])); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + compiled_graphs[0])); +>>>>>>> fix parallel graph mode program } } @@ -452,24 +544,33 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } -bool ParallelExecutor::EnableParallelGraphExecution( - const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy) const { +ParallelExecutor::~ParallelExecutor() { + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + delete member_; +} + +bool EnableParallelGraphExecution(const ir::Graph &graph, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) { if (!FLAGS_enable_parallel_graph) return false; bool enable_parallel_graph = true; - // 
TODO(Yancey1989): support sparse update in ParallelGraph mode. - for (auto &var_desc : main_program.Block(0).AllVars()) { - if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { - enable_parallel_graph = false; - } - } - // TODO(Yancey1989): support pserver mode - for (auto &op_desc : main_program.Block(0).AllOps()) { - if (op_desc->Type() == "send" || op_desc->Type() == "recv") { - enable_parallel_graph = false; - break; + for (ir::Node *node : graph.Nodes()) { + if (node->IsVar() && node->Var()) { + // TODO(Yancey1989): support sparse update in ParallelGraph mode. + if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + break; + } + } else if (node->IsOp() && node->Op()) { + // TODO(Yancey1989): support pserver mode + if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") { + enable_parallel_graph = false; + break; + } } } @@ -481,13 +582,6 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } -ParallelExecutor::~ParallelExecutor() { - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - delete member_; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 121bbd55a..a6c0d65c0 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -46,7 +46,7 @@ class ParallelExecutor { public: explicit ParallelExecutor(const std::vector &places, const std::unordered_set &bcast_vars, - const ProgramDesc &main_program, + const std::vector &graphs, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, const ExecutionStrategy &exec_strategy, @@ -71,9 +71,6 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - bool EnableParallelGraphExecution(const ProgramDesc &main_program, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -81,5 +78,9 @@ class ParallelExecutor { #endif }; +bool EnableParallelGraphExecution(const ir::Graph &graph, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 1cd1be8e8..069750e24 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -101,7 +101,8 @@ void BindGraph(py::module *m) { [](Graph &self, Node &node) { return self.RemoveNode(&node); }) .def("retrieve_node", &Graph::RetrieveNode, return_value_policy::reference) - .def("resolve_hazard", &Graph::ResolveHazard); + .def("resolve_hazard", &Graph::ResolveHazard) + .def("origin_program_desc", &Graph::OriginProgram); } void BindNode(py::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d8e57a1ac..ccbdb1ab1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -976,6 +976,9 @@ All parameter, weight, gradient are variables in Paddle. [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); // -- python binds for parallel executor. 
+ m.def("_enable_parallel_graph_execution", + framework::EnableParallelGraphExecution); + py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( ExecutionStrategy allows the user to more preciously control how to run @@ -1213,9 +1216,10 @@ All parameter, weight, gradient are variables in Paddle. cannot be updated after being finalized.)DOC"); pe.def(py::init &, - const std::unordered_set &, const ProgramDesc &, - const std::string &, Scope *, std::vector &, - const ExecutionStrategy &, const BuildStrategy &>()) + const std::unordered_set &, + const std::vector &, const std::string &, + Scope *, std::vector &, const ExecutionStrategy &, + const BuildStrategy &>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index fa79db19e..acea09e95 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -17,6 +17,7 @@ import os import six import sys from .. import compat as cpt +from . import framework from . import core @@ -36,7 +37,7 @@ def _place_obj(place): class CompiledProgram(object): """ - Compiles a Program for execution. + Compiles to Graph for execution. 1. Users first create the program with layers. 2. Optionally, users use CompiledProgram to optimize the program before run. @@ -51,7 +52,7 @@ class CompiledProgram(object): Example: .. code-block:: python - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) compiled_prog = compiler.CompiledProgram(main).with_data_parallel( @@ -62,11 +63,25 @@ class CompiledProgram(object): fetch_list=[loss.name]) Args: - program: Program instance that contains the model logic. + program_or_graph (Graph|Program): If it's Program, it will be first + lowered to a graph for further optimizations. If it's a graph + (potentially optimized before), it will be directly used for + further optimizations. Note: graph is only supported when compiled + with with_data_parallel option. """ - def __init__(self, program): - self._program = program + def __init__(self, program_or_graph): + if isinstance(program_or_graph, core.Graph): + self._graph = program_or_graph + self._program = None + elif isinstance(program_or_graph, framework.Program): + self._graph = core.Graph(program_or_graph.desc) + self._program = program_or_graph + else: + raise ValueError("Wrong program_to_graph type: %s" % + type(program_or_graph)) + + self._program_desc = self._graph.origin_program_desc() self._scope = None self._place = None self._executor = None @@ -101,6 +116,7 @@ class CompiledProgram(object): self """ assert not self._is_data_parallel, "Already compiled with parallel." + assert not self._is_inference, "Cannot compile both data parallel and inference" self._is_data_parallel = True self._build_strategy = build_strategy self._exec_strategy = exec_strategy @@ -120,11 +136,13 @@ class CompiledProgram(object): Returns: self """ + assert not self._is_data_parallel, "Cannot compile both data parallel and inference." 
+ assert not self._is_inference, "Already compiled with inference" + assert any([ isinstance(config, InferNativeConfig), isinstance(config, InferAnalysisConfig) ]) - self._is_data_parallel = False self._is_inference = True self._infer_config = config return self @@ -173,37 +191,56 @@ class CompiledProgram(object): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) self._exec_strategy.num_threads = cpu_num * 2 - trainers_endpoints = self._program._trainers_endpoints - # FIXME(dzhwinter): enable_inplace should be after memory_optimize # if turn on python memory optimize, turn off the inplace_pass. if self._build_strategy.memory_optimize is None: - self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True + self._build_strategy.memory_optimize = False if self._program and self._program._is_mem_optimized else True if self._build_strategy.enable_inplace is None: - self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True + self._build_strategy.enable_inplace = False if self._program and self._program._is_mem_optimized else True + + + # TODO(wuyi): trainer endpoings should be passed in through + # build_strategy, not program.xxx. + if self._program and self._build_strategy.num_trainers > 1 and \ + self._program._trainers_endpoints: + tps = self._program._trainers_endpoints - if self._build_strategy.num_trainers > 1 and trainers_endpoints: assert self._build_strategy.num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" - self._build_strategy.trainers_endpoints = trainers_endpoints - - self._persistable_vars = set([ - cpt.to_text(v.name) - for v in [ - var for var in self._program.list_vars() - if var.persistable and var.type != core.VarDesc.VarType.RAW - ] - ]) + tps), "num_trainers == len(end_points)" + self._build_strategy.trainers_endpoints = tps + + self._persistable_vars = [] + for block_id in range(self._program_desc.num_blocks()): + bdesc = self._program_desc.block(block_id) + self._persistable_vars.extend([ + cpt.to_text(v.name()) for v in bdesc.all_vars() + if v.persistable() and v.type() != core.VarDesc.VarType.RAW + ]) places = list(map(_place_obj, self._places)) + + # FIXME(Yancey1989): parallel graph mode get better performance + # in GPU allreduce distributed training. Need an elegant way to + # choice the execution strategy. + enable_parallel_graph = \ + core._enable_parallel_graph_execution(self._graph, + self._exec_strategy, + self._build_strategy) and \ + self._program # only supported if compile program not graph. + + self._pe_graphs = [self._graph] + if enable_parallel_graph: + for _ in range(len(places) - 1): + self._pe_graphs.append(core.Graph(self._program_desc)) + return core.ParallelExecutor( - places, self._persistable_vars, self._program.desc, + places, + set(self._persistable_vars), self._pe_graphs, cpt.to_text(self._loss_name) if self._loss_name else six.u(''), self._scope, self._local_scopes, self._exec_strategy, self._build_strategy) def _compile_inference(self): - assert self._is_data_parallel is False return core.create_paddle_predictor(self._infer_config) def _compile(self, scope, place): diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py new file mode 100644 index 000000000..4f3fee094 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py @@ -0,0 +1,204 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. 
+# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +import unittest +import random +import numpy as np +import paddle.fluid as fluid +import six +from paddle.fluid.framework import Program +from paddle.fluid.framework import IrGraph +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.fluid import core + + +def linear_fc(num): + data = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.softmax_with_cross_entropy(fc, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.softmax_with_cross_entropy(fc, label) + loss = fluid.layers.mean(loss) + return loss + + +class TestQuantizationTransformPass(unittest.TestCase): + def setUp(self): + self.quantizable_op_and_inputs = { + 'conv2d': ['Input', 'Filter'], + 'depthwise_conv2d': ['Input', 'Filter'], + 'mul': ['X', 'Y'] + } + self.quantizable_grad_op_inputs = { + 'conv2d_grad': ['Input', 'Filter'], + 'depthwise_conv2d_grad': ['Input', 'Filter'], + 'mul_grad': ['X', 'Y'] + } + + def check_program(self, transform_pass, program): + quantized_ops = set() + for block in program.blocks: + for op in block.ops: + # check forward + if op.type in self.quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + quantized_ops.add(arg_name) + + for op in block.ops: + # check backward + if op.type in self.quantizable_grad_op_inputs: + for pname in self.quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace()) + graph = IrGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + 
scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace()) + graph = IrGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + def test_execute_graph(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt.minimize(loss) + + exe = fluid.Executor(fluid.CPUPlace()) + graph = IrGraph(core.Graph(main.desc), for_test=False) + exe.run(startup) + binary = fluid.CompiledProgram(graph.graph).with_data_parallel( + loss_name=loss.name) + for i in range(10): + loss_val = exe.run(binary, + feed={ + 'image': np.ones( + [32, 784], dtype=np.float32), + 'label': np.ones( + [32, 1], dtype=np.int64) + }, + fetch_list=[loss]) + if i == 0: + start_loss = np.sum(loss_val) + elif i == 9: + end_loss = np.sum(loss_val) + self.assertLess(end_loss, start_loss) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8815911ea..d0cdb7384 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -538,6 +538,7 @@ class Executor(object): else: # TODO(panyx0718): Can compile program to optimize executor # performance. + assert program._program, "CompiledProgram is compiled from graph, can only run with_data_parallel." 
return self._run( program._program, self._default_executor, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 15367c724..72f1eae95 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2322,7 +2322,7 @@ class Program(object): @staticmethod def _construct_from_desc(desc): """ - Construct a program from program desc. + Construct a program from program desc. (Experiment) Args: desc(core.ProgramDesc): The program desc for constructing. @@ -2332,6 +2332,7 @@ class Program(object): """ p = Program() p.desc = desc + # TODO(wangzhen): Block.vars/ops are not filled, should fix it. p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())] p._sync_with_cpp() return p diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 8586670c2..1d513c6ea 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -185,8 +185,11 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) # step7: init ParallelExecutor + # ParallelExecutor API will be deprecated, don't support parallel graph. + self._graphs = [core.Graph(main.desc)] + self.executor = core.ParallelExecutor( - places, persistable_vars, main.desc, + places, persistable_vars, self._graphs, cpt.to_text(loss_name) if loss_name else six.u(''), scope, local_scopes, exec_strategy, build_strategy) -- GitLab
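
Usage note: the user-visible effect of this patch is that CompiledProgram now accepts
either a Program or a core.Graph (a graph is only supported together with
with_data_parallel). The sketch below is adapted from the test_execute_graph case added
in this patch; the tiny fc network, the CPU place, and the feed shapes are illustrative
only and not part of the change itself. This is the path that lets graph-level passes
(such as the quantization transform exercised in the new test) hand their transformed
graph directly to the executor.

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid import core
    from paddle.fluid.framework import IrGraph

    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        image = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        fc = fluid.layers.fc(input=image, size=10)
        loss = fluid.layers.mean(
            fluid.layers.softmax_with_cross_entropy(fc, label=label))
        fluid.optimizer.Adam(learning_rate=1e-4).minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)

    # Lower the program to a graph, then compile the graph instead of the
    # Program. Graph input requires with_data_parallel().
    graph = IrGraph(core.Graph(main.desc), for_test=False)
    binary = fluid.CompiledProgram(graph.graph).with_data_parallel(
        loss_name=loss.name)

    loss_val = exe.run(binary,
                       feed={'image': np.ones([32, 784], dtype=np.float32),
                             'label': np.ones([32, 1], dtype=np.int64)},
                       fetch_list=[loss])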