From 885c4e57abdace2e769697b4b464edbfa62b19e6 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Mon, 21 Jan 2019 14:31:02 +0800 Subject: [PATCH] fea/infer memory optim2 (#14953) --- paddle/fluid/framework/ir/fc_fuse_pass.cc | 1 + paddle/fluid/framework/ir/graph_helper.cc | 143 +++- paddle/fluid/framework/ir/graph_helper.h | 17 + .../framework/ir/graph_to_program_pass.cc | 31 +- .../framework/ir/graph_to_program_pass.h | 4 + paddle/fluid/framework/ir/graph_viz_pass.cc | 2 +- paddle/fluid/framework/ir/node.h | 2 +- paddle/fluid/framework/naive_executor.cc | 7 +- .../fluid/inference/analysis/CMakeLists.txt | 1 + paddle/fluid/inference/analysis/analyzer.cc | 17 +- paddle/fluid/inference/analysis/analyzer.h | 2 +- .../inference/analysis/analyzer_tester.cc | 4 + paddle/fluid/inference/analysis/argument.h | 11 + paddle/fluid/inference/analysis/helper.h | 7 + .../inference/analysis/ir_pass_manager.cc | 1 + .../analysis/ir_passes/CMakeLists.txt | 2 +- .../analysis/ir_passes/subgraph_detector.cc | 1 - .../ir_passes/tensorrt_subgraph_pass.cc | 4 + .../inference/analysis/passes/CMakeLists.txt | 13 +- .../passes/ir_analysis_compose_pass.cc | 62 -- .../analysis/passes/ir_analysis_pass.cc | 14 +- .../analysis/passes/ir_analysis_pass.h | 3 + .../passes/ir_graph_to_program_pass.cc | 45 ++ ...pose_pass.h => ir_graph_to_program_pass.h} | 20 +- .../analysis/passes/memory_optimize_pass.cc | 647 ++++++++++++++++++ .../analysis/passes/memory_optimize_pass.h | 106 +++ .../fluid/inference/analysis/passes/passes.cc | 13 +- paddle/fluid/inference/api/CMakeLists.txt | 11 +- paddle/fluid/inference/api/analysis_config.cc | 102 ++- .../fluid/inference/api/analysis_predictor.cc | 92 ++- .../fluid/inference/api/analysis_predictor.h | 10 + .../api/analysis_predictor_tester.cc | 51 ++ paddle/fluid/inference/api/demo_ci/run.sh | 1 + paddle/fluid/inference/api/helper.h | 15 +- .../inference/api/paddle_analysis_config.h | 11 + .../inference/api/paddle_pass_builder.cc | 5 + .../fluid/inference/api/paddle_pass_builder.h | 46 +- .../fluid/inference/tests/api/CMakeLists.txt | 4 +- .../tests/api/analyzer_dam_tester.cc | 30 + .../analyzer_text_classification_tester.cc | 2 + .../tests/api/analyzer_vis_tester.cc | 9 +- .../fluid/inference/tests/api/tester_helper.h | 6 +- .../inference/tests/api/trt_models_tester.cc | 5 + paddle/fluid/inference/utils/benchmark.h | 2 +- .../fluid/inference/utils/benchmark_tester.cc | 4 +- paddle/fluid/operators/controlflow/feed_op.cc | 1 + paddle/fluid/string/pretty_log.h | 17 + 47 files changed, 1450 insertions(+), 154 deletions(-) delete mode 100644 paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc rename paddle/fluid/inference/analysis/passes/{ir_analysis_compose_pass.h => ir_graph_to_program_pass.h} (59%) create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.h diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 26eac939054..12b31da010c 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include #include +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 
d99f856d8f4..8de93cf285e 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,8 +18,10 @@ limitations under the License. */ #include #include #include +#include #include #include +#include "paddle/fluid/framework/ir/graph_traits.h" DEFINE_string(print_sub_graph_dir, "", "FLAGS_print_sub_graph_dir is used " @@ -41,7 +43,7 @@ void SortHelper( } } - VLOG(3) << "topology sort insert: " << node->Name() + VLOG(5) << "topology sort insert: " << node->Name() << " " << reinterpret_cast(node) << " input " << node->inputs.size(); ret->push_back(node); } @@ -99,12 +101,13 @@ std::vector TopologySortOperations(const Graph &graph) { return ret; } +// Build operator inlink edge table. std::map> BuildOperationAdjList( const Graph &graph) { std::map> adj_list; for (auto &n : graph.Nodes()) { - if (n->NodeType() != ir::Node::Type::kOperation) continue; + if (!n->IsOp()) continue; if (adj_list.find(n) == adj_list.end()) { adj_list[n] = std::unordered_set(); } @@ -121,6 +124,119 @@ std::map> BuildOperationAdjList( return adj_list; } +// Build operator outlink edge table. +std::map> BuildOperationOutAdjList( + const Graph &graph) { + std::map> adj_list; + + for (auto &n : graph.Nodes()) { + if (!n->IsOp()) continue; + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::unordered_set(); + } + for (auto &var : n->outputs) { + for (auto &adj_n : var->outputs) { + PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + } + } + } + return adj_list; +} + +std::vector OpDFSSort(const Graph &graph) { + auto edge_table = BuildOperationOutAdjList(graph); + std::stack stack; + for (auto &ele : edge_table) { + if (ele.first->inputs.empty()) { + // find the input ops (those without input vars) + stack.push(ele.first); + } else { + // find the ops with only persistable vars as inputs. + bool all_persistable = true; + for (auto *input : ele.first->inputs) { + if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) { + all_persistable = false; + } + } + if (all_persistable) { + stack.push(ele.first); + } + } + } + + std::vector res; + // start from the feed op and DFS + std::unordered_set unique_set; + while (!stack.empty()) { + // will start from the last feed by default. + auto cur = stack.top(); + stack.pop(); + unique_set.insert(cur); + res.push_back(cur); + + for (auto *op : edge_table[cur]) { + if (!unique_set.count(op)) { + stack.push(op); + } + } + } + return res; +} + +std::vector TopologyDfsSortOperations(const Graph &graph) { + std::vector nodes; + std::unordered_map in_degree; + + auto set_out_ops_ready = [&](Node *var) { + for (auto *op : var->outputs) { + --in_degree[op]; + } + }; + // build in_degree + for (auto *node : graph.Nodes()) { + if (node->IsOp()) { + in_degree[node] += node->inputs.size(); + } else if (node->IsVar() && node->inputs.empty()) { + // put all the inputs of the whole graph ready. 
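// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the depth-first visiting idea
// behind OpDFSSort/TopologyDfsSortOperations, reduced to plain strings. The
// toy graph "feed -> fc -> relu -> fetch" and the helper names below are made
// up; the real pass walks ir::Node* objects and also treats ops whose inputs
// are all persistable as start points.
#include <iostream>
#include <map>
#include <set>
#include <stack>
#include <string>
#include <vector>

std::vector<std::string> DfsOrder(
    const std::map<std::string, std::set<std::string>>& out_edges,
    const std::vector<std::string>& start_ops) {
  std::vector<std::string> order;
  std::set<std::string> visited;
  std::stack<std::string> pending;
  for (const auto& op : start_ops) pending.push(op);
  while (!pending.empty()) {
    auto cur = pending.top();
    pending.pop();
    if (!visited.insert(cur).second) continue;  // already emitted once
    order.push_back(cur);
    auto it = out_edges.find(cur);
    if (it == out_edges.end()) continue;
    for (const auto& next : it->second) pending.push(next);
  }
  return order;
}

int main() {
  std::map<std::string, std::set<std::string>> out_edges{
      {"feed", {"fc"}}, {"fc", {"relu"}}, {"relu", {"fetch"}}};
  for (const auto& op : DfsOrder(out_edges, {"feed"}))
    std::cout << op << "\n";  // feed, fc, relu, fetch
}
// ---------------------------------------------------------------------------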
+ set_out_ops_ready(node); + } + } + + std::deque op_queue; + // first visit + for (auto &node : OpDFSSort(graph)) { + if (node->IsOp()) { + op_queue.push_back(node); + } + } + + // traverse the graph + int num_ops = op_queue.size(); + while (num_ops) { + for (auto it = op_queue.begin(); it != op_queue.end(); it++) { + auto *&cur_op = *it; + if (!cur_op || in_degree[cur_op] > 0) continue; + // visit this node + // put all the output var of this op valid. + for (auto *out_var : cur_op->outputs) { + if (!out_var) continue; + set_out_ops_ready(out_var); + } + VLOG(8) << "visit " << cur_op->Name(); + nodes.push_back(cur_op); + + cur_op = nullptr; + num_ops--; + } + } + + return nodes; +} + size_t GraphNum(const Graph &graph) { std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; @@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) { return graph_count; } +void CleanIndividualNodes(Graph *graph) { + std::unordered_set nodes2rm; + for (auto *node : graph->Nodes()) { + if (node->inputs.empty() && node->outputs.empty()) { + nodes2rm.insert(node); + } + } + + for (auto *node : nodes2rm) { + graph->RemoveNode(node); + } +} + +std::vector TopologyVarientSort(const Graph &graph, + SortKind sort_kind) { + switch (sort_kind) { + case SortKind::TS: + return framework::ir::TopologySortOperations(graph); + default: + return framework::ir::TopologyDfsSortOperations(graph); + } +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index be525151f9f..fba4936f2c5 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph); // `graph` cannot contain circle. std::vector TopologySortOperations(const Graph &graph); +// Topological sort, but try to DFS. +std::vector TopologyDfsSortOperations(const Graph &graph); + +// Different kinds to sort the operators in a graph to a sequence. +enum class SortKind { + // Topological Search + TS = 0, + // Topological and Depth First Search + TDFS +}; + +// Several kinds of topological sort. +std::vector TopologyVarientSort(const Graph &graph, SortKind sort_kind); + +// Clean the nodes that doesn't connect to others. +void CleanIndividualNodes(Graph *graph); + // Build an adjacency list of operations for the `graph`. std::map> BuildOperationAdjList( const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 36f36933265..3372dcd181d 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" - #include "paddle/fluid/framework/program_desc.h" namespace paddle { @@ -29,6 +28,14 @@ namespace ir { std::unique_ptr GraphToProgramPass::ApplyImpl( std::unique_ptr graph) const { + // Remove the unneeded variables after memory optimization. 
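// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): how the vars2remove set that
// GraphToProgramPass reads from kGraphToProgramVarsToRemove filters the
// variables emitted into the final ProgramDesc. The variable names are
// hypothetical; the real pass calls block->add_vars() instead of printing.
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  // Variables the memory optimize pass folded into other tensors.
  std::unordered_set<std::string> vars2remove{"fc_0.tmp_0"};
  // Variables discovered while walking the graph nodes (duplicates possible).
  std::vector<std::string> graph_vars{"feed", "fc_0.tmp_0", "fc_0.tmp_1",
                                      "fc_0.tmp_1", "fetch"};
  std::unordered_set<std::string> visited_vars;
  for (const auto& name : graph_vars) {
    if (visited_vars.count(name) || vars2remove.count(name)) continue;
    visited_vars.insert(name);
    std::cout << "keep var " << name << "\n";  // feed, fc_0.tmp_1, fetch
  }
}
// ---------------------------------------------------------------------------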
+ std::unordered_set vars2remove; + if (graph->Has(kGraphToProgramVarsToRemove)) { + vars2remove = graph->Get>( + kGraphToProgramVarsToRemove); + VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes"; + } + ProgramDesc& program = Get("program"); std::unique_ptr program_pb( @@ -40,25 +47,35 @@ std::unique_ptr GraphToProgramPass::ApplyImpl( std::unordered_set visited_vars; for (ir::Node* n : graph->Nodes()) { if (n->IsVar()) { - if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 && + !vars2remove.count(n->Var()->Name())) { visited_vars.insert(n->Var()->Name()); block->add_vars()->MergeFrom(*n->Var()->Proto()); } } } - block->clear_ops(); - std::vector nodes = TopologySortOperations(*graph); + + std::vector nodes; + if (Has(kGraphToProgramSortKind)) { + // Inference Memory Optimize relays on this branch. + int sort_kind = Get(kGraphToProgramSortKind); + nodes = TopologyVarientSort( + *graph, static_cast(sort_kind)); + } else { + nodes = TopologySortOperations(*graph); + } + for (ir::Node* n : nodes) { - if (!n->Op()) { - continue; - } + if (!n->Op()) continue; + block->add_ops()->MergeFrom(*n->Op()->Proto()); } program.CopyFrom(*program_pb); return graph; } + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h index 124ec5a8e77..4c36c3a5da1 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.h +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -20,6 +20,10 @@ namespace paddle { namespace framework { namespace ir { +const char kGraphToProgramVarsToRemove[] = + "__graph_to_program_vars_to_remove__"; +const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; + class GraphToProgramPass : public Pass { protected: std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 31ed98db72c..87a28a2a66c 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -135,4 +135,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( } // namespace paddle REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) - .RequirePassAttr(paddle::framework::ir::kGraphVizPath); + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); \ No newline at end of file diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 89dcc677b57..9eade9eaa8f 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -64,7 +64,7 @@ class Node { std::string Name() const { return name_; } - VarDesc* Var() { + VarDesc* Var() const { PADDLE_ENFORCE(IsVar()); return var_desc_.get(); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 86e6b1f7d92..a37bb6f4da1 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -50,8 +50,8 @@ void NaiveExecutor::Run() { "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { - VLOG(3) << std::this_thread::get_id() << " run " << op->Type() - << " on scope " << scope_; + VLOG(4) << std::this_thread::get_id() << " run " + << op->DebugStringEx(scope_) << " on scope " << scope_; op->SetIsCalledByExecutor(false); op->Run(*scope_, place_); } @@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const 
ProgramDesc &desc, int block_id, anc = anc->parent(); } + int num_vars = 0; for (auto &var : global_block.AllVars()) { if (var->Name() == framework::kEmptyVarName) { continue; } + num_vars++; if (persistable == var->Persistable()) { if (persistable) { @@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id, } } } + VLOG(4) << "naive executor create " << num_vars << " vars"; } void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 27b6b80955e..7a795bda820 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -18,6 +18,7 @@ cc_library(analysis SRCS analyzer.cc analysis_pass DEPS ${analysis_deps} analysis_helper + ${INFER_IR_PASSES} ) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index c8ed373ee7c..d82a063d880 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -15,8 +15,8 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" #include "paddle/fluid/inference/analysis/passes/passes.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -24,13 +24,16 @@ namespace analysis { Analyzer::Analyzer() {} -void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); } +void Analyzer::Run(Argument *argument) { RunAnalysis(argument); } -void Analyzer::RunIrAnalysis(Argument *argument) { - std::vector passes({"ir_analysis_compose_pass"}); - - for (auto &pass : passes) { - PassRegistry::Global().Retreive(pass)->Run(argument); +void Analyzer::RunAnalysis(Argument *argument) { + PADDLE_ENFORCE(argument->analysis_passes_valid(), + "analsis_passes is not valid in the argument."); + for (auto &pass : argument->analysis_passes()) { + string::PrettyLogH1("--- Running analysis [%s]", pass); + auto *ptr = PassRegistry::Global().Retreive(pass); + PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); + ptr->Run(argument); } } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index b43e67f20f4..a6de18db600 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -54,7 +54,7 @@ class Analyzer final { DISABLE_COPY_AND_ASSIGN(Analyzer); protected: - void RunIrAnalysis(Argument* argument); + void RunAnalysis(Argument* argument); }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 4c84d02d867..c814ce45484 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) { argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); + argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); Analyzer analyser; analyser.Run(&argument); @@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) { argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); + 
argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); Analyzer analyser; analyser.Run(&argument); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2d8980b1d15..88ce61f9b92 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -110,16 +110,20 @@ struct Argument { // The overall Scope to work on. DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope); + // The default program, loaded from disk. DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); // The ir passes to perform in analysis phase. DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, std::vector); + DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses, + std::vector); // Pass a set of op types to enable its mkldnn kernel DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, std::unordered_set); + // Passed from config. DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); @@ -127,6 +131,13 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + // Memory optimized related. + DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); + DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + // Indicate which kind of sort algorithm is used for operators, the memory + // optimization relays on the sort algorithm. + DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); + // The program transformed by IR analysis phase. DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, framework::proto::ProgramDesc); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 269a0da9f93..de04713b531 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -28,6 +28,13 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/port.h" +#ifdef _WIN32 +#define GCC_ATTRIBUTE(attr__) ; +#else +#define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#endif +#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index e37fea38bcb..4e146422645 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -83,6 +83,7 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { PADDLE_ENFORCE(graph.get()); // Apply all the passes for (const auto &pass : passes_) { + if (pass->Type() == "graph_viz_pass") continue; PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); graph = pass->Apply(std::move(graph)); } diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 9ae5b8aa173..eb6e1768a2c 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -if (TENSORRT_FOUND) +if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index b6a5dfd087c..a64f85ee9ac 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() { auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { if (subgraph.size() <= (size_t)min_subgraph_size_) continue; - LOG(INFO) << "detect a subgraph size " << subgraph.size(); std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. 
Mark the nodes inside the sub-graph diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index bc06e78ae69..5f25303cc1e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, framework::BlockDesc block_desc(nullptr, &block_proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); + string::PrettyLogDetail("--- detect a sub-graph with %d nodes", + subgraph.size()); + for (auto *node : subgraph) { auto *op = block_desc.AppendOp(); *op->Proto() = *node->Op()->Proto(); diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index add9b70f2cd..691c336ebe4 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,11 +1,18 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass) +cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) + +cc_library(analysis_passes SRCS passes.cc DEPS + ir_graph_build_pass + ir_analysis_pass + ir_params_sync_among_devices_pass + memory_optim_pass + ir_graph_to_program_pass +) set(analysis_deps ${analysis_deps} - ir_graph_build_pass - ir_analysis_pass analysis_passes subgraph_detector CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc deleted file mode 100644 index 490189e5507..00000000000 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" -#include -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ir_pass_manager.h" -#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" -#include "paddle/fluid/string/pretty_log.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void IrAnalysisComposePass::RunImpl(Argument *argument) { - ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - ApplyIrPasses(argument); - CollectFusionStatis(argument); -} - -std::string IrAnalysisComposePass::repr() const { - return "ir-analysis-compose-pass"; -} - -void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { - std::vector passes({ - "ir_graph_build_pass", "ir_analysis_pass", - "ir_params_sync_among_devices_pass", - }); - for (const auto &pass : passes) { - VLOG(2) << "Run pass " << pass; - auto *the_pass = PassRegistry::Global().Retreive(pass); - the_pass->Run(argument); - } -} - -void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) { - if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { - LOG(INFO) << "argument has no fuse statis"; - return; - } - argument->SetFusionStatis( - argument->main_graph().Get( - framework::ir::kFuseStatisAttr)); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index e327bd39f0a..d986811a827 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" namespace paddle { @@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) { IRPassManager the_ir_manager(argument); graph = the_ir_manager.Apply(std::move(graph)); PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); - argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc( - the_ir_manager.AcquireProgram(&graph, argument->main_program()))); argument->SetMainGraph(graph.release()); + CollectFusionStatis(argument); +} + +void IrAnalysisPass::CollectFusionStatis(Argument* argument) { + if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { + LOG(INFO) << "argument has no fuse statis"; + return; + } + argument->SetFusionStatis( + argument->main_graph().Get( + framework::ir::kFuseStatisAttr)); } std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h index d8a74498075..2c2113c06d9 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -29,6 +29,9 @@ namespace analysis { class IrAnalysisPass : public AnalysisPass { public: void RunImpl(Argument* argument) override; + + void CollectFusionStatis(Argument* argument); + std::string repr() const override; }; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc new file mode 100644 index 00000000000..f1da37af3cc --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrGraphToProgramPass::RunImpl(Argument *argument) { + auto pass = + framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); + + if (argument->memory_optim_sort_kind_valid()) { + pass->Set(framework::ir::kGraphToProgramSortKind, + new int(argument->memory_optim_sort_kind())); + } + + std::unique_ptr graph(argument->main_graph_ptr()); + framework::ProgramDesc desc(argument->main_program()); + pass->SetNotOwned("program", &desc); + auto thegraph = pass->Apply(std::move(graph)); + thegraph.release(); // the argument still own the graph. 
+ + argument->SetIrAnalyzedProgram( + new framework::proto::ProgramDesc(*desc.Proto())); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h similarity index 59% rename from paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h rename to paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h index 16c6b7d84df..838ebdbc9d7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h @@ -14,31 +14,17 @@ #pragma once -#include -#include #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { namespace analysis { -/* - * The analysis pass to run a list of IR passes (like a function call). - * Currently, it should be the first pass of analysis phase. - */ -class IrAnalysisComposePass : public AnalysisPass { +class IrGraphToProgramPass : public AnalysisPass { public: - void RunImpl(Argument* argument) override; - std::string repr() const override; + void RunImpl(Argument *argument) override; - private: - void ApplyIrPasses(Argument* argument); - - void CollectFusionStatis(Argument* argument); - - // Assign a Scope for IR passes to modify the weights. - void AssignScopeToModify(Argument* argument); + std::string repr() const override { return "ir-graph-to-param-pass"; } }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc new file mode 100644 index 00000000000..57683c0b727 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -0,0 +1,647 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Graph; +using framework::ir::Node; +using framework::ir::TopologyVarientSort; +using space_table_t = MemoryOptimizePass::space_table_t; + +// Collect the lifecycles of the tensors. +// Traverse the graph in topological order. +// The traversal order also affect the lifecycles, so different sort_kind is +// used. 
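// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): what CollectLifeCycle computes,
// on a hand-written three-operator sequence. Each variable gets a
// [first_use, last_use] interval measured in operator positions of the chosen
// topological order; in the real pass, outputs of the feed op are additionally
// pinned to [0, INT_MAX] so they are never reused. Names are hypothetical.
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // One time step per operator; each entry lists the variables it reads/writes.
  std::vector<std::vector<std::string>> op_vars{
      {"x", "fc_0.tmp_0"},             // t = 0: fc
      {"fc_0.tmp_0", "relu_0.tmp_0"},  // t = 1: relu
      {"relu_0.tmp_0", "out"}};        // t = 2: fetch-like op

  std::map<std::string, std::pair<int, int>> lifecycles;
  for (int t = 0; t < static_cast<int>(op_vars.size()); ++t) {
    for (const auto& var : op_vars[t]) {
      auto it = lifecycles.find(var);
      if (it == lifecycles.end()) {
        lifecycles[var] = std::make_pair(t, t);  // first sighting
      } else {
        it->second.second = t;  // extend the last use
      }
    }
  }
  for (const auto& kv : lifecycles)
    std::cout << kv.first << " lives in [" << kv.second.first << ", "
              << kv.second.second << "]\n";
  // fc_0.tmp_0 lives in [0, 1], relu_0.tmp_0 in [1, 2], x in [0, 0], out in [2, 2]
}
// ---------------------------------------------------------------------------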
+void MemoryOptimizePass::CollectLifeCycle( + std::unordered_map* lifecycles, + int sort_kind) const { + max_lifecycle_ = 0; + for (auto* op_node : framework::ir::TopologyVarientSort( + *graph_, static_cast(sort_kind))) { + if (!op_node->IsOp()) continue; + auto reads = op_node->inputs; + auto writes = op_node->outputs; + + std::vector requires(reads.begin(), reads.end()); + requires.insert(requires.end(), writes.begin(), writes.end()); + + // Disable reuse of feed variables. + if (op_node->Name() == "feed") { + for (auto* node : op_node->outputs) { + auto var = node->Name(); + lifecycles->emplace(var, + std::make_pair(0, std::numeric_limits::max())); + } + } else { + // Normal operators. + for (const Node* node : requires) { + if (node->Var()->Persistable()) continue; + std::string var = node->Name(); + if (!lifecycles->count(var)) { + (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_); + } else { + (*lifecycles)[var].second = + std::max(max_lifecycle_, lifecycles->at(var).second); // max() + } + } + } + + ++max_lifecycle_; + } +} + +// TODO(Superjomn) Make this a general help method. +int DataTypeToSpace(framework::proto::VarType_Type type) { + switch (type) { + case framework::proto::VarType_Type_BOOL: + return sizeof(bool); + case framework::proto::VarType_Type_FP32: + return sizeof(float); + case framework::proto::VarType_Type_INT32: + return sizeof(int32_t); + case framework::proto::VarType_Type_INT64: + return sizeof(int64_t); + default: + PADDLE_THROW("Unknown data type"); + } +} + +// Collect the memory size of the tensors. +void MemoryOptimizePass::CollectVarMemorySize( + const std::unordered_map& batch_var_ave_dim, + std::unordered_map* tensor_nodes, + space_table_t* space_table) const { + // Collect tensors from graph. + for (auto* node : graph_->Nodes()) { + if (node->IsVar() && + node->Var()->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + // Parameters will not be reused. + if (node->Var()->Persistable()) continue; + (*tensor_nodes)[node->Name()] = node; + (*space_table)[node->Name()] = + DataTypeToSpace(node->Var()->GetDataType()) * + batch_var_ave_dim.at(node->Name()); + } + } +} + +// Find a sutable (big enough but smallest to avoid memory waste). +// +// Args: +// @tensor_nodes: the tensor nodes in the ir::Graph. +// @free_existing_tensors: the allocated tensor and are free. +// @space_table: the memory space of tensors. +// @tensor2use: the tensor that requires memory. +// +// Returns: +// true if found some existing tensor to reuse. +// false if no sutable tensor to reuse, one need to allocate a new tensor for +// this requirement. +// The suitable tensor for reuse is one that is approximately equal to the +// memory demand. +bool FindSuitableTensorToReuse( + const std::string& tensor, int space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + const space_table_t& space_table, + const std::vector>& var_clusters, + std::string* tensor2use) __SHOULD_USE_RESULT__; + +bool FindSuitableTensorToReuse( + const std::string& tensor, int space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + const space_table_t& space_table, + const std::vector>& var_clusters, + std::string* tensor2use) { + std::pair best_fit; + best_fit.second = std::numeric_limits::max(); + VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters"; + + // find the cluster this var belongs to. 
+ const std::unordered_set* cluster = nullptr; + for (const auto& c : var_clusters) { + if (c.count(tensor)) { + cluster = &c; + break; + } + } + PADDLE_ENFORCE_NOT_NULL(cluster, + "something wrong in memory optimization, the " + "variable %s not in the clusters.", + tensor); + + for (auto& candidate : *free_existing_tensors) { + // This is not a temporary tensor. + if (!space_table.count(candidate)) continue; + // Not in the same cluster. + if (!cluster->count(candidate)) continue; + + size_t space = space_table.at(candidate); + size_t space_diff = std::abs(space - space_required); + if (space_diff < best_fit.second) { + best_fit.first = candidate; + best_fit.second = space_diff; + } + } + + if (best_fit.second < std::numeric_limits::max()) { + *tensor2use = best_fit.first; + return true; + } + return false; +} + +// Allocate new tensor instead of reusing the existing one. +void AllocateNewTensor( + const std::string& name, size_t space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + space_table_t* space_table, + std::unordered_map* reuse_table) { + // The newly born tensor is free to be used. + free_existing_tensors->insert(name); + // Register the space it has. + PADDLE_ENFORCE(space_table->count(name)); + space_table->at(name) = std::max(space_table->at(name), space_required); + // The allocated new tensor use the memory of itself. + (*reuse_table)[name] = name; +} + +// Free a tensor and make it resuable. +// @tensor: the tensor to free. +// @free_existing_tensors: the free and allocated tensors. +// @reuse_table: a map from a fake tensor to the existing allocated tensor. +void FreeATensor(const std::string& tensor, + std::unordered_set* free_existing_tensors, + std::unordered_map* reuse_table) { + if (tensor == "feed" || tensor == "fetch") return; + // the really allocated tensor. + const auto& free_tensor = reuse_table->at(tensor); + + free_existing_tensors->insert(free_tensor); +} + +// Reuse a free existing tensor. +void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse, + size_t memory_size, + std::unordered_set* free_existing_tensors, + std::unordered_map* reuse_table, + space_table_t* reused_space_table) { + auto it = free_existing_tensors->find(tensor2reuse); + PADDLE_ENFORCE(it != free_existing_tensors->end()); + free_existing_tensors->erase(it); + (*reuse_table)[tensor] = tensor2reuse; + // Update the memory size of a reused tensor, the memory will grow if the + // required memory is larger. + (*reused_space_table)[tensor2reuse] = + std::max(reused_space_table->at(tensor2reuse), memory_size); +} + +// Calculate the memory usage. +void EvaluateMemoryUsage( + const std::unordered_map& reuse_table, + const space_table_t& space_table, + const std::unordered_map& var_batch_ave_size, + size_t* allocated, size_t* saved) { + *allocated = 0; + *saved = 0; + + for (auto elem : reuse_table) { + if (elem.first == elem.second) { + *allocated += space_table.at(elem.first); + VLOG(4) << elem.first << " <-> " << elem.second << " " + << space_table.at(elem.first) << " " + << space_table.at(elem.second); + } else { + *saved += space_table.at(elem.first); + VLOG(4) << "reuse " << elem.first << " -> " << elem.second; + } + } + VLOG(4) << "allocated " << *allocated; + VLOG(4) << "saved " << *saved; +} + +// Return saved ratio. 
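// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the best-fit choice made by
// FindSuitableTensorToReuse, with toy tensor names and sizes. The candidate
// whose size is closest to the requirement wins; the real pass additionally
// requires the candidate to sit in the same shape cluster and to be listed in
// the space table. Signed arithmetic is used here for the size difference.
#include <cstdlib>
#include <iostream>
#include <limits>
#include <map>
#include <string>

int main() {
  // Tensors whose lifecycle has ended, with their sizes in bytes.
  std::map<std::string, long long> free_tensors{{"fc_0.tmp_0", 4 * 1024},
                                                {"relu_0.tmp_0", 64 * 1024}};
  long long space_required = 60 * 1024;  // size of the tensor being born

  std::string best;
  long long best_diff = std::numeric_limits<long long>::max();
  for (const auto& kv : free_tensors) {
    long long diff = std::llabs(kv.second - space_required);
    if (diff < best_diff) {
      best_diff = diff;
      best = kv.first;
    }
  }
  // Picks relu_0.tmp_0: wasting 4KB of slack beats allocating a fresh 60KB tensor.
  std::cout << "reuse " << best << ", size difference " << best_diff << " bytes\n";
}
// ---------------------------------------------------------------------------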
+void MemoryOptimizePass::MakeReusePlan( + const std::vector>& var_clusters, + const std::unordered_map& var_batch_ave_size, + const space_table_t& space_table, + std::unordered_map* reuse_table, int sort_kind, + MemoryAllocation* memory_allocation) const { + // Clear the existing plan. + reuse_table->clear(); + + // The `space_table` stores the real memory size for each tensor. + // The `reused_space_table` stores the maximum memory size required by a + // tensor during the memory reusing, the small tensor might be reused by a + // larger tensor, and the memory size of the small one will grow. + auto reused_space_table = space_table; + + std::unordered_map life_cycles; + std::unordered_map tensor_nodes; + // The allocated tensors whose memory can be reused, they will live across the + // program execution. + std::unordered_set existing_tensors; + // The existing tensor that has been allocated, and is also free to reuse. + std::unordered_set free_existing_tensors; + + CollectLifeCycle(&life_cycles, sort_kind); + + for (int age = 0; age < max_lifecycle_; ++age) { + std::unordered_set born_tensors; + std::unordered_set dead_tensors; + // Gather the dead and born tensors. + for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end(); + elem_it++) { + if (elem_it->second.first == -1) { + continue; + } + const auto& tensor = elem_it->first; + const auto& lifecycle = elem_it->second; + VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->" + << lifecycle.second; + + // Collect newly born tensors. + if (lifecycle.first == age) { + born_tensors.insert(tensor); + } + // Collect dead tensors whose memory can be reused. + else if (lifecycle.second < age) { // NOLINT + dead_tensors.insert(tensor); + // remove to avoid duplicate process. + elem_it->second.first = -1; // avoid duplicate search + } + } + + // Reuse the dead tensors for born tensors + for (const auto& tensor : born_tensors) { + // Skip the feed and fetch tensor for that they share data with others. + std::string tensor2reuse; + if (!space_table.count(tensor)) continue; + size_t space_required = space_table.at(tensor); + if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes, + &free_existing_tensors, reused_space_table, + var_clusters, &tensor2reuse)) { + if (tensor != tensor2reuse) { + VLOG(4) << tensor << " -> " << tensor2reuse; + } + ReuseATensor(tensor, tensor2reuse, space_required, + &free_existing_tensors, reuse_table, &reused_space_table); + } else { + VLOG(4) << "allocate " << tensor; + AllocateNewTensor(tensor, space_required, tensor_nodes, + &free_existing_tensors, &reused_space_table, + reuse_table); + ReuseATensor(tensor, tensor, space_required, &free_existing_tensors, + reuse_table, &reused_space_table); + } + } + + for (const auto& tensor : dead_tensors) { + // free its memory. + FreeATensor(tensor, &free_existing_tensors, reuse_table); + } + } + + EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size, + &(memory_allocation->allocated), + &(memory_allocation->saved)); + memory_allocation->sort_kind = sort_kind; +} + +void BuildVarNodeTable(Graph* graph, + std::unordered_map* var_node_table) { + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + (*var_node_table)[node->Name()] = node; + } + } +} + +// NOTE The optimized opdesc doesn't match ir::Graph. +void UpdateOpDescsByReuse( + Graph* graph, + const std::unordered_map& reuse_table, + int sort_kind) { + // TODO(Superjomn) change here to be compatible with the runtime order. 
+ for (auto* node : TopologyVarientSort( + *graph, static_cast(sort_kind))) { + if (node->IsOp()) { + // Replace the original inputs/outputs with the reused tensors. + std::unordered_map> in_args, + out_args; + for (auto argument : node->Op()->Inputs()) { + for (const auto& x : argument.second) { + auto name = x; + if (reuse_table.count(x) && reuse_table.at(x) != x) { + name = reuse_table.at(x); + } + in_args[argument.first].push_back(name); + VLOG(4) << node->Name() << " input " << x << " -> " << name; + } + } + + for (auto argument : node->Op()->Outputs()) { + for (const auto& x : argument.second) { + auto name = x; + if (reuse_table.count(x) && reuse_table.at(x) != x) { + name = reuse_table.at(x); + } + out_args[argument.first].push_back(name); + VLOG(4) << node->Name() << " output " << x << " -> " << name; + } + } + + // Update arguments. + for (auto& arg : in_args) { + node->Op()->SetInput(arg.first, arg.second); + } + for (auto& arg : out_args) { + node->Op()->SetOutput(arg.first, arg.second); + } + node->Op()->Flush(); + } + } +} + +void MemoryOptimizePass::PerformReusePlan( + const std::unordered_map& reuse_table, + int sort_kind, std::unordered_set* vars2remove) const { + std::unordered_map var_node_table; + BuildVarNodeTable(graph_, &var_node_table); + UpdateOpDescsByReuse(graph_, reuse_table, sort_kind); + + for (auto& item : reuse_table) { + if (item.first != item.second) { + vars2remove->insert(item.first); + } + } + VLOG(2) << "to remove vars " << vars2remove->size(); +} + +std::vector split(const std::string& line, char delim) { + std::vector res; + std::string field; + std::stringstream line_stream(line); + while (std::getline(line_stream, field, delim)) { + res.emplace_back(field); + } + return res; +} + +// Deserialize the batch var shapes from the cache file. +std::vector>> DeseralizeBatchVarShapes( + const std::string& path) { + std::ifstream file(path); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path); + std::string line; + std::vector>> batch_shapes; + + while (std::getline(file, line)) { + std::map> batch; + for (const auto& var_info : split(line, ';')) { + auto fields = split(var_info, ':'); + PADDLE_ENFORCE_EQ(fields.size(), 2UL); + auto var_name = fields.front(); + auto shape_str = split(fields[1], ','); + std::vector shape; + for (const auto& v : shape_str) shape.push_back(std::stoi(v)); + batch[var_name] = shape; + } + batch_shapes.push_back(batch); + } + return batch_shapes; +} + +// Calculate the average dim of each tensor from the batch shape cache. +std::unordered_map GetBatchAverageSize( + const std::vector>>& batches) { + std::unordered_map var2size; + // The average size of the batches for each variable. + int num_batch = 0; + for (const auto& batch : batches) { + num_batch++; + for (const auto& item : batch) { + int dim = std::accumulate(item.second.begin(), item.second.end(), 1, + [](int a, int b) { return a * b; }); + var2size[item.first] += dim; + } + } + + for (auto& item : var2size) { + item.second /= num_batch; + } + + return var2size; +} + +// Analysis the batch shapes loading from the cache file. +// By splitting the variables to different clusters by analyzing their batch +// size, we can pre-schedule the changes of difference LoDTensor when different +// length of input sequences is entered. +// This should works fine for the models operating on sentences. 
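// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the text layout that
// DeseralizeBatchVarShapes() parses, one warm-up batch per line, with
// hypothetical variable names:
//
//   x:16,128;fc_0.tmp_0:16,64;out:16,10
//   x:32,128;fc_0.tmp_0:32,64;out:32,10
//
// Lines split on ';' into variables, each variable on ':' into name and shape,
// and the shape on ',' into integer dims. GetBatchAverageSize() then averages
// prod(dims) over the recorded batches, e.g. for "x":
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<std::vector<int>> x_shapes{{16, 128}, {32, 128}};
  long long total = 0;
  for (const auto& shape : x_shapes)
    total += std::accumulate(shape.begin(), shape.end(), 1LL,
                             std::multiplies<long long>());
  std::cout << "average elements for x: " << total / x_shapes.size() << "\n";  // 3072
}
// ---------------------------------------------------------------------------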
+std::vector> AnalysisBatchShapesByBatchSize( + const std::vector>>& batches) { + // collect the batch size of each shape and combine to a stringstream in + // converient to generate a hash. + std::unordered_map var_batchsize_hashes; + for (auto& batch : batches) { + for (auto& ele : batch) { + int batch_size = ele.second.front(); + // TODO(Superjomn) might consume large memory here, use combine hash. + var_batchsize_hashes[ele.first] << batch_size; + } + } + + // Split to sets by batch size sequences. + std::unordered_map> + shape_sets; + for (auto& ele : var_batchsize_hashes) { + auto hash = std::hash()(ele.second.str()); + shape_sets[hash].insert(ele.first); + } + std::vector> res; + for (auto& ele : shape_sets) { + res.emplace_back(std::move(ele.second)); + } + + VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters"; + return res; +} + +// Analysis the batch shapes loading from the cache file, and split them to +// different clusters by their size. +// This should works fine for the overall models. +std::vector> AnalysisBatchShapesBySimilarSize( + const space_table_t& space_table, + const std::vector>>& batches, + int interval = 200000) { + PADDLE_ENFORCE_GT(interval, 0); + // cluster to different clusters. + size_t max_size = 0; + for (auto& item : space_table) { + max_size = std::max(item.second, max_size); + } + VLOG(4) << "tensor max size " << max_size; + + std::vector> res; + + // cluster by intervals. + for (size_t interval_size = 0; interval_size <= max_size; + interval_size += interval) { + std::unordered_set cluster; + for (auto& item : space_table) { + if (interval_size <= item.second && + interval_size + interval > item.second) { + cluster.insert(item.first); + } + } + if (!cluster.empty()) { + res.push_back(cluster); + } + } + + VLOG(3) << "Cluster by interval and get " << res.size() << " cluster"; + return res; +} + +std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } + +void MemoryOptimizePass::RunImpl(Argument* argument) { + // When force update, should not optimize memory. + if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + return; + graph_ = argument->main_graph_ptr(); + + auto path = GetMemoryCachePath( + argument->model_dir_valid() ? argument->model_dir() : "", + argument->model_program_path_valid() ? 
argument->model_program_path() + : ""); + VLOG(3) << "Load memory cache from " << path; + if (inference::IsFileExists(path)) { + VLOG(4) << "Performing memory optimize"; + auto batches = DeseralizeBatchVarShapes(path); + auto var_batch_ave_size = GetBatchAverageSize(batches); + + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + + std::unordered_map reuse_table; + double max_saving_ratio = 0.; + + std::vector> strategies; + + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_batch_size = + AnalysisBatchShapesByBatchSize(batches); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, 1024); // interval 1kb + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, 1024 * 1024); // interval 1MB + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + } + + std::function* best_strategy{nullptr}; + + // Try all strategies to get the best result. + for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; + } + } + if (!best_strategy) { + LOG(ERROR) + << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); + + string::PrettyLogH2( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); + string::PrettyLogDetail("--- Allocated %d MB", + memory_allocation.allocated / 1024. / 1024.); + string::PrettyLogDetail("--- Saved %d MB", + memory_allocation.saved / 1024. / 1024.); + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); + } +} + +float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { + return (saved / 1024.) / (allocated / 1024. 
+ saved / 1024.); +} +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h new file mode 100644 index 00000000000..fa1ad9c8c6a --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Memory optimization pass for inference with pre-analysis of memory usage + * without GC. + * Different from training, the inference memory reuse strategies doesn't + * include GC for that overhead is too much when batch size equals one. + * + * The inference memory reuse tries to pre-determine the tensor reusing strategy + * without runtime overhead. + * + * To improve the strategy's performance, a warm-up running is introduced: + * - Before officially deploy the inference program, one should warm it up and + * generate some runtime cache, + * - Run the inference program with several batches of data, it will persist + * some runtime information about memory of tensors to disk, we call the + * information the memory reusing cache, + * - With the memory reusing cache, user can deploy the inference to a + * service, before running the model, the inference program will load the + * memory cache, analysis it and generate the best memory reusing strategy, + * and adjust the execution of the network. + * + * With the warm-up and memory reusing cache design, the memory reusing + * algorithm can analysis the real memory consume of the tensors, even with the + * flexible LoDTensor and special shape changing operators such as + * sequence-pooling. + */ +class MemoryOptimizePass : public AnalysisPass { + public: + using space_table_t = std::unordered_map; + using lifecycle_t = std::pair; + + struct MemoryAllocation { + size_t allocated; // allocated memory in byte. + size_t saved; // saved memory in byte. + int sort_kind; // the kind of the corresponding sorting algorithm. + + // Get the memory saving ratio of temporary variables. + float GetSavingRatio() const; + }; + + virtual ~MemoryOptimizePass() = default; + + protected: + void RunImpl(Argument *argument) override; + + private: + void CollectLifeCycle( + std::unordered_map *lifecycles, + int sort_kind) const; + + void CollectVarMemorySize( + const std::unordered_map &batch_var_ave_dim, + std::unordered_map *tensor_nodes, + space_table_t *space_table) const; + + // Returns percentage of saved memory. 
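// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the arithmetic behind
// MemoryAllocation::GetSavingRatio(), i.e. saved / (allocated + saved), with
// made-up numbers. The pass multiplies this by 100 when printing
// "Saved ...% memory for workspace(temporary variables)".
#include <iostream>

int main() {
  double allocated = 96.0 * 1024 * 1024;  // bytes still allocated after reuse
  double saved = 32.0 * 1024 * 1024;      // bytes folded into other tensors
  double ratio = (saved / 1024.) / (allocated / 1024. + saved / 1024.);
  std::cout << ratio << "\n";  // 0.25 -> reported as "Saved 25.00% ..."
}
// ---------------------------------------------------------------------------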
+ void MakeReusePlan( + const std::vector> &var_clusters, + const std::unordered_map &var_batch_ave_size, + const space_table_t &space_table, + std::unordered_map *reuse_table, int sort_kind, + MemoryAllocation *memory_allocation) const; + + void PerformReusePlan( + const std::unordered_map &reuse_table, + int sort_kind, std::unordered_set *vars2remove) const; + + public: + std::string repr() const override; + + private: + mutable framework::ir::Graph *graph_{nullptr}; + mutable int max_lifecycle_{-1}; +}; + +static std::string GetMemoryCachePath(const std::string &model_path, + const std::string &prog_path) { + auto path = model_path.empty() ? prog_path : model_path; + return path + ".memory_cache"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index 9245e32cee2..161b127d6d5 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -13,24 +13,31 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/passes.h" -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" namespace paddle { namespace inference { namespace analysis { + PassRegistry::PassRegistry() { + // Register manually to avoid the trivial `USE_OP` like macro for easier use + // and link. 
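// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the registration/lookup pattern
// that PassRegistry implements and that Analyzer::RunAnalysis relies on: a
// name-to-instance table filled once, then queried by the pass names carried in
// argument->analysis_passes(). The Pass struct and both passes here are toy
// stand-ins, not the real AnalysisPass types.
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct Pass {
  virtual ~Pass() = default;
  virtual void Run() = 0;
};
struct ToyMemoryOptimizePass : Pass {
  void Run() override { std::cout << "run memory_optimize_pass\n"; }
};
struct ToyGraphToProgramPass : Pass {
  void Run() override { std::cout << "run ir_graph_to_program_pass\n"; }
};

int main() {
  // Registered once, like the passes_.emplace(...) calls below.
  std::map<std::string, std::unique_ptr<Pass>> registry;
  registry.emplace("memory_optimize_pass",
                   std::unique_ptr<Pass>(new ToyMemoryOptimizePass));
  registry.emplace("ir_graph_to_program_pass",
                   std::unique_ptr<Pass>(new ToyGraphToProgramPass));

  // The analyzer walks the configured pass names and runs each in order.
  std::vector<std::string> pass_names{"memory_optimize_pass",
                                      "ir_graph_to_program_pass"};
  for (const auto& name : pass_names) registry.at(name)->Run();
}
// ---------------------------------------------------------------------------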
passes_.emplace("ir_analysis_pass", std::unique_ptr(new IrAnalysisPass)); passes_.emplace("ir_graph_build_pass", std::unique_ptr(new IrGraphBuildPass)); - passes_.emplace("ir_analysis_compose_pass", - std::unique_ptr(new IrAnalysisComposePass)); + passes_.emplace("memory_optimize_pass", + std::unique_ptr(new MemoryOptimizePass)); passes_.emplace( "ir_params_sync_among_devices_pass", std::unique_ptr(new IrParamsSyncAmongDevicesPass)); + passes_.emplace( + "ir_graph_to_program_pass", + std::unique_ptr(new IrGraphToProgramPass)); } } // namespace analysis diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 8b3838f69a8..ad0af4005ad 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,8 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass - ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB}) +set(inference_deps ${analysis_deps} + paddle_inference_api paddle_fluid_api + analysis pass naive_executor + ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) @@ -29,7 +31,8 @@ add_subdirectory(details) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor + reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps}) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder zero_copy_tensor @@ -44,7 +47,7 @@ if(WITH_TESTING) ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() -cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 965bbd0fd26..f9da3004ed8 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const { contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { model_dir_ = model_dir; + + Update(); } contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, const std::string ¶ms_file) { prog_file_ = prog_file; params_file_ = params_file; + + Update(); } void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, const std::string ¶ms_file_path) { prog_file_ = prog_file_path; params_file_ = params_file_path; + + Update(); } void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { @@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, memory_pool_init_size_mb_ = 
memory_pool_init_size_mb; device_id_ = device_id; #else - LOG(ERROR) << "Please compile with gpu to EnableGpu"; + LOG(ERROR) << "Please compile with gpu to EnableGpu()"; use_gpu_ = false; #endif + + Update(); +} +void contrib::AnalysisConfig::DisableGpu() { + use_gpu_ = false; + + Update(); } -void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; } contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(use_gpu_); CP_MEMBER(device_id_); CP_MEMBER(memory_pool_init_size_mb_); + + CP_MEMBER(enable_memory_optim_); + CP_MEMBER(memory_optim_force_update_); // TensorRT releated. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); @@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { } #undef CP_MEMBER + + Update(); } void contrib::AnalysisConfig::EnableMKLDNN() { @@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() { LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; use_mkldnn_ = false; #endif + + Update(); } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, int max_batch_size, int min_subgraph_size) { +#ifdef PADDLE_WITH_CUDA + if (!use_gpu()) { + LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; + return; + } + use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + Update(); +#else + LOG(ERROR) + << "To use TensorRT engine, please compile inference lib with GPU first."; +#endif } +// TODO(Superjomn) refactor this, buggy. void contrib::AnalysisConfig::Update() { auto info = SerializeInfoCache(); if (info == serialized_info_cache_) return; - if (use_gpu_) { - pass_builder_.reset(new GpuPassStrategy); + // Transfer pass_builder and copy the existing compatible passes. + if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) { + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy); + + if (use_tensorrt_) { + // Append after the Affine_channel_conv_fuse pass. + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); + } + } else { + pass_builder_.reset(new CpuPassStrategy); + } + } else { - pass_builder_.reset(new CpuPassStrategy); + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy( + *static_cast(pass_builder_.get()))); + + } else { + pass_builder_.reset(new CpuPassStrategy( + *static_cast(pass_builder_.get()))); + } } if (use_tensorrt_) { - if (!use_gpu_) { - LOG(ERROR) - << "TensorRT engine is not available when EnableGpu() not actived."; - } else { + const auto &passes = pass_builder_->AllPasses(); + if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") == + std::end(passes)) { // Append after the Affine_channel_conv_fuse pass. 
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } @@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() { #endif } + if (enable_memory_optim_) { + pass_builder()->AppendAnalysisPass("memory_optimize_pass"); + } + if (ir_debug_) { pass_builder()->TurnOnDebug(); } @@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() { std::string contrib::AnalysisConfig::SerializeInfoCache() { std::stringstream ss; + ss << model_dir_; + ss << prog_file_; + ss << params_file_; + ss << use_gpu_; + ss << device_id_; ss << memory_pool_init_size_mb_; ss << use_tensorrt_; ss << tensorrt_workspace_size_; ss << tensorrt_max_batchsize_; + ss << tensorrt_min_subgraph_size_; + + ss << enable_memory_optim_; + ss << memory_optim_force_update_; ss << use_mkldnn_; + for (auto &item : mkldnn_enabled_op_types_) ss << item; + ss << ";"; + + ss << model_from_memory_; + ss << enable_ir_optim_; ss << use_feed_fetch_ops_; ss << ir_debug_; + ss << specify_input_name_; + ss << cpu_math_library_num_threads_; + return ss.str(); } void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( int cpu_math_library_num_threads) { cpu_math_library_num_threads_ = cpu_math_library_num_threads; + + Update(); } float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { @@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } +void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { + enable_memory_optim_ = true; + memory_optim_force_update_ = force_update_cache; + + Update(); +} + +bool contrib::AnalysisConfig::enable_memory_optim() const { + return enable_memory_optim_; +} + void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, @@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; + + Update(); } } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3917b9b65b5..2b0cad5faa0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -24,18 +24,21 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#if PADDLE_WITH_TENSORRT -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#endif #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" +#if PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#endif + DECLARE_bool(profile); namespace paddle { @@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } + + // Collect variable shapes for memory optimization. 
+ if (need_collect_var_shapes_for_memory_optim()) { + CollectVarShapes(); + } + VLOG(3) << "predict cost: " << timer.toc() << "ms"; // All the containers in the scope will be hold in inference, but the @@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); + argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); + argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { @@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { + LOG(INFO) << "TensorRT subgraph engine is enabled"; argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); @@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } if (config_.use_mkldnn_) { + LOG(INFO) << "MKLDNN is enabled"; argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); } auto passes = config_.pass_builder()->AllPasses(); - if (!config_.ir_optim()) passes.clear(); + if (!config_.ir_optim()) { + passes.clear(); + LOG(INFO) << "ir_optim is turned off, no IR pass will be executed"; + } argument_.SetIrAnalysisPasses(passes); + argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); @@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() { if (sub_scope_) { scope_->DeleteScope(sub_scope_); } + + // TODO(Superjomn) deduce the directory path. + std::string out_path = inference::analysis::GetMemoryCachePath( + config_.model_dir(), config_.prog_file()); + if (need_collect_var_shapes_for_memory_optim()) { + SerializeBatchVarShapes(out_path); + } } std::unique_ptr AnalysisPredictor::Clone() { @@ -567,6 +591,66 @@ std::unique_ptr AnalysisPredictor::Clone() { return std::unique_ptr(x); } +void AnalysisPredictor::CollectVarShapes() { + VLOG(4) << "Collecting var shapes"; + if (batch_var_shapes_.size() >= max_shape_collect_count_) return; + std::map> var_shapes; + for (auto var_name : inference_program_->Block(0).LocalVarNames()) { + auto *var = sub_scope_->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var); + if (var->Type() == framework::VarTypeTrait::kId || + var->Type() == framework::VarTypeTrait::kId) { + auto &tensor = var->Get(); + auto shape = framework::vectorize(tensor.dims()); + var_shapes[var_name].assign(shape.begin(), shape.end()); + } + } + batch_var_shapes_.push_back(var_shapes); + LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size() + << " batch of var shapes for analysis"; +} + +void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) { + LOG(INFO) << "serialize batch var shapes to " << path; + std::ofstream file(path); + if (!file.is_open()) { + LOG(ERROR) << "failed to serialize the var shapes to " << path; + return; + } + + // The serialized data format: + // <var_name>:dim0,dim1,dim2,; + for (auto &batch : batch_var_shapes_) { + for (auto &ele : batch) { + file << ele.first << ":"; + for (size_t i = 0; i < ele.second.size() - 1; i++) { + file << ele.second[i] << ","; + } + file << ele.second.back() << ";"; + } + file << "\n"; + } +} + +bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { + if (need_collect_var_shapes_ >= 0) return
need_collect_var_shapes_; + bool need = false; + // check if the cache exists + if (!config_.enable_memory_optim()) { + need = false; + } else if (config_.enable_memory_optim() && + !inference::IsFileExists(inference::analysis::GetMemoryCachePath( + config_.model_dir(), config_.prog_file()))) { + need = true; + } else if (config_.enable_memory_optim() && + config_.memory_optim_force_update_) { + need = true; + } + + need_collect_var_shapes_ = need ? 1 : 0; + return need; +} + template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 6ca4b5e9bed..e25b5a7047b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); protected: + // For memory optimization. + bool need_collect_var_shapes_for_memory_optim(); + void CollectVarShapes(); + void SerializeBatchVarShapes(const std::string &path); + bool PrepareProgram(const std::shared_ptr &program); bool PrepareScope(const std::shared_ptr &parent_scope); bool CreateExecutor(); @@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor { // A mutex help to make Clone thread safe. std::mutex clone_mutex_; + // For memory optimization. + const size_t max_shape_collect_count_{1000}; + int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. + std::vector>> batch_var_shapes_; + private: // Some status here that help to determine the status inside the predictor. bool status_program_optimized_{false}; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 3df26cde3d5..4688e93d710 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -16,8 +16,10 @@ #include #include #include // NOLINT +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" DEFINE_string(dirname, "", "dirname to tests."); @@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) { } } +TEST(AnalysisPredictor, memory_optim) { + AnalysisConfig config(FLAGS_dirname); + config.DisableGpu(); + config.EnableMemoryOptim(true); + config.pass_builder()->TurnOnDebug(); + + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector output, output1; + + { + // The first predictor helps to cache the memory optimization strategy. + auto predictor = CreatePaddlePredictor(config); + + // Run several times to check that the parameters are not reused by mistake. + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + } + + { + output.clear(); + // The second predictor uses the cache to perform the memory optimization.
+ config.EnableMemoryOptim(false); + auto predictor = CreatePaddlePredictor(config); + + // Run with memory optimization + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + + // Run native + ASSERT_TRUE(native_predictor->Run(inputs, &output1)); + + LOG(INFO) << "the output " << inference::DescribeTensor(output.front()); + LOG(INFO) << "the native output " + << inference::DescribeTensor(output1.front()); + + inference::CompareResult(output, output1); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 9811fe2cd06..963986f245c 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -1,3 +1,4 @@ +#!/bin/bash set -x PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index cdd01cb9f06..b92781e4f2c 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,7 +15,10 @@ #pragma once #include - +#include +#if !defined(_WIN32) +#include +#endif #include #include // NOLINT #include @@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { return true; } -static std::string DescribeTensor(const PaddleTensor &tensor) { +static std::string DescribeTensor(const PaddleTensor &tensor, + int max_num_of_data = 15) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; os << " - type: "; @@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, } } +static bool IsFileExists(const std::string &path) { + std::ifstream file(path); + bool exists = file.is_open(); + file.close(); + return exists; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ae6ac69854d..1cee8904500 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -192,6 +192,13 @@ struct AnalysisConfig { */ bool model_from_memory() const { return model_from_memory_; } + /** Turn on memory optimization. + * NOTE: still in development, will be released later. + */ + void EnableMemoryOptim(bool force_update_cache = false); + /** Tell whether the memory optimization is activated. */ + bool enable_memory_optim() const; + friend class ::paddle::AnalysisPredictor; /** NOTE just for developer, not an official API, easily to be broken. @@ -232,6 +239,10 @@ struct AnalysisConfig { // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + // memory reuse related. + bool enable_memory_optim_{false}; + bool memory_optim_force_update_{false}; + bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index bc3ce72f083..039389a4cf9 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -13,6 +13,7 @@ // limitations under the License.
#include "paddle/fluid/inference/api/paddle_pass_builder.h" + #include namespace paddle { @@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } +void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { + analysis_passes_.push_back(pass); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index efe1ba106a2..d3a60d20992 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -45,6 +45,9 @@ class PaddlePassBuilder { /** Delete all the passes that has type `pass_type`. */ void DeletePass(const std::string &pass_type); + /** Append an analysis pass. */ + void AppendAnalysisPass(const std::string &pass); + /** Visualize the computation graph after each pass by generating a DOT * language file, one can draw them with the Graphviz toolkit. */ @@ -54,8 +57,18 @@ class PaddlePassBuilder { std::string DebugString(); const std::vector &AllPasses() const { return passes_; } + std::vector AnalysisPasses() const { + auto passes = analysis_passes_; + // Make sure ir_graph_to_program is the last pass, so that any + // modification of the IR will persist to the program. + passes.push_back("ir_graph_to_program_pass"); + return passes; + } protected: + std::vector analysis_passes_{ + {"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}}; std::vector passes_; }; @@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder { /** The MKLDNN control exists in both CPU and GPU mode, because there can be * still some CPU kernels running in CPU mode. */ - virtual void EnableMKLDNN() = 0; + virtual void EnableMKLDNN() {} bool use_gpu() const { return use_gpu_; } @@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder { protected: bool use_gpu_{false}; + bool use_mkldnn_{false}; }; /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode. @@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy { use_gpu_ = false; } + explicit CpuPassStrategy(const CpuPassStrategy &other) + : PassStrategy(other.AllPasses()) {} + virtual ~CpuPassStrategy() = default; void EnableMKLDNN() override { // TODO(Superjomn) Consider the way to mix CPU with GPU. #ifdef PADDLE_WITH_MKLDNN - passes_.insert(passes_.begin(), "mkldnn_placement_pass"); - - for (auto &pass : - std::vector({"depthwise_conv_mkldnn_pass", // - "conv_bias_mkldnn_fuse_pass", // - "conv3d_bias_mkldnn_fuse_pass", // - "conv_relu_mkldnn_fuse_pass", // - "conv_elementwise_add_mkldnn_fuse_pass"})) { - passes_.push_back(pass); + if (!use_mkldnn_) { + passes_.insert(passes_.begin(), "mkldnn_placement_pass"); + + for (auto &pass : std::vector( + {"depthwise_conv_mkldnn_pass", // + "conv_bias_mkldnn_fuse_pass", // + "conv3d_bias_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass"})) { + passes_.push_back(pass); + } } + use_mkldnn_ = true; +#else + use_mkldnn_ = false; #endif } - - CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} }; /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy { use_gpu_ = true; } - GpuPassStrategy(const GpuPassStrategy &other) + explicit GpuPassStrategy(const GpuPassStrategy &other) : PassStrategy(other.AllPasses()) { use_gpu_ = true; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index adbf98e9e8a..423c39813f0 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -19,7 +19,7 @@ endfunction() function(inference_analysis_api_test target install_dir filename) inference_analysis_test(${target} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) endfunction() @@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index fc87e0a8d17..4ec9404ab42 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -126,6 +126,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, std::string turn_mask_pre = "turn_mask_"; auto one_batch = data->NextBatch(); + PADDLE_ENFORCE(!one_batch.response.empty()); int size = one_batch.response[0].size(); CHECK_EQ(size, kMaxTurnLen); // turn tensor assignment @@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) { std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); @@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) { reinterpret_cast(&cfg), input_slots_all); } +// Compare results of NativeConfig and AnalysisConfig with memory optimization. +TEST(Analyzer_dam, compare_with_memory_optim) { + // The small DAM model core dumps in CI, but works locally. + if (FLAGS_max_turn_num == 9) { + contrib::AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force an update of the memory cache. + SetConfig(&cfg); + cfg.EnableMemoryOptim(true); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + + // Run a second time to use the memory cache and perform memory optimization.
+ SetConfig(&cfg1); + cfg1.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg1), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } + #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); } #endif diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 7b448a32003..2db297e2005 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -69,6 +69,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); + cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; @@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) { TEST(Analyzer_Text_Classification, compare) { AnalysisConfig cfg; SetConfig(&cfg); + cfg.EnableMemoryOptim(); std::vector> input_slots_all; SetInput(&input_slots_all); diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 5a77b53a851..f3e75ffbb59 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) { FLAGS_infer_model + "/__params__"); cfg->DisableGpu(); cfg->SwitchIrDebug(); - cfg->SwitchSpecifyInputNames(); + cfg->SwitchSpecifyInputNames(false); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } @@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); } + // cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; @@ -103,9 +105,8 @@ void profile(bool use_mkldnn = false) { size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); CHECK_EQ(numel, refer.data.size()); for (size_t i = 0; i < numel; ++i) { - CHECK_LT( - fabs(static_cast(output.data.data())[i] - refer.data[i]), - 1e-5); + EXPECT_NEAR(static_cast(output.data.data())[i], refer.data[i], + 1e-5); } } } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index ac964dc0c86..d2ca1d0b009 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include // NOLINT @@ -28,9 +29,8 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" - #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/utils/benchmark.h" @@ -91,7 +91,7 @@ void CompareResult(const std::vector &outputs, float *pdata = static_cast(out.data.data()); float *pdata_ref = static_cast(ref_out.data.data()); for (size_t j = 
0; j < size; ++j) { - EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy); + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); } break; } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9725c190329..5aca807ee3a 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) { } } +TEST(TensorRT_mobilenet, profile) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + profile(model_dir, true, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h index 76a3dd2c299..a1304cf4e77 100644 --- a/paddle/fluid/inference/utils/benchmark.h +++ b/paddle/fluid/inference/utils/benchmark.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#pragma once #include #include #include diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc index eb255474082..80763160df3 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -16,7 +16,7 @@ #include #include -using namespace paddle::inference; +using namespace paddle::inference; // NOLINT TEST(Benchmark, basic) { Benchmark benchmark; benchmark.SetName("key0"); @@ -36,4 +36,4 @@ TEST(Benchmark, PersistToFile) { benchmark.PersistToFile("1.log"); benchmark.PersistToFile("1.log"); benchmark.PersistToFile("1.log"); -} \ No newline at end of file +} diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 86b3114cb3c..0dfed7f5cc1 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase { << out_name; auto &feed_list = feed_var->Get(); + PADDLE_ENFORCE_LT(static_cast(col), feed_list.size()); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index 10c9eb80d0a..da4c1f326fb 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt, std::cerr << style << Sprintf(fmt, args...) << reset(); } +template +static void PrettyLogInfo(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::info(), fmt, args...); +} +template +static void PrettyLogDetail(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::detail(), fmt, args...); +} +template +static void PrettyLogH1(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::H1(), fmt, args...); +} +template +static void PrettyLogH2(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::H2(), fmt, args...); +} + } // namespace string } // namespace paddle -- GitLab
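The warm-up workflow described in memory_optimize_pass.h and exercised by the new tests in this patch can be driven entirely from the public AnalysisConfig API. The sketch below condenses that flow into a standalone program; it is a minimal illustration only, assuming the inference library is linked as in the demo_ci samples, and the model directory plus the dummy int64 inputs are placeholders rather than part of this patch.

// Warm-up then deployment with the new memory optimization API.
// NOTE: "./my_model_dir" and the dummy input batches are placeholders.
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  // Phase 1: warm-up. Force-rebuild the memory reusing cache.
  paddle::contrib::AnalysisConfig config("./my_model_dir");
  config.DisableGpu();
  config.EnableMemoryOptim(true /* force_update_cache */);

  {
    auto predictor =
        paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);
    // Feed a few representative batches; the collected variable shapes are
    // serialized to "./my_model_dir.memory_cache" (see GetMemoryCachePath)
    // when this predictor is destroyed.
    int64_t data[4] = {1, 2, 3, 4};
    paddle::PaddleTensor tensor;
    tensor.shape = std::vector<int>({4, 1});
    tensor.data.Reset(data, sizeof(data));
    tensor.dtype = paddle::PaddleDType::INT64;
    std::vector<paddle::PaddleTensor> inputs(4, tensor), outputs;
    for (int i = 0; i < 5; ++i) predictor->Run(inputs, &outputs);
  }

  // Phase 2: deployment. Reuse the cache without forcing an update; the
  // analysis stage loads it and applies the memory reuse plan.
  paddle::contrib::AnalysisConfig deploy_config("./my_model_dir");
  deploy_config.DisableGpu();
  deploy_config.EnableMemoryOptim();
  auto service_predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(
          deploy_config);
  return 0;
}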
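For reference, SerializeBatchVarShapes() writes the warm-up cache as plain text: one line per recorded batch, each record of the form <var_name>:dim0,dim1,...; and GetMemoryCachePath() places the file next to the model. The reader below is only a sketch of that format for debugging purposes (the path is a placeholder); the real consumer of the cache is memory_optimize_pass.cc.

// Hypothetical standalone reader for the ".memory_cache" text format.
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::ifstream file("./my_model_dir.memory_cache");  // placeholder path
  std::string line;
  while (std::getline(file, line)) {  // one line per recorded batch
    std::map<std::string, std::vector<int>> shapes;
    std::stringstream batch(line);
    std::string record;
    while (std::getline(batch, record, ';')) {  // "<var_name>:dim0,dim1,..."
      auto colon = record.find(':');
      if (colon == std::string::npos) continue;
      const std::string name = record.substr(0, colon);
      std::stringstream dims(record.substr(colon + 1));
      std::string dim;
      while (std::getline(dims, dim, ',')) {
        shapes[name].push_back(std::stoi(dim));
      }
    }
    std::cout << "batch with " << shapes.size() << " vars\n";
  }
  return 0;
}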