fea/infer memory optim2 (#14953)

885c4e57 · Yan Chunwei · GitHub · 6597ccb0 · 885c4e57 · 885c4e57
46 changed file
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -18,8 +18,10 @@ limitations under the License. */
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
+#include <stack>
 #include <unordered_map>
 #include <unordered_set>
+#include "paddle/fluid/framework/ir/graph_traits.h"
 DEFINE_string(print_sub_graph_dir, "",
              "FLAGS_print_sub_graph_dir is used "
@@ -41,7 +43,7 @@ void SortHelper(
    }
  }
-  VLOG(3) << "topology sort insert: " << node->Name()
+  VLOG(5) << "topology sort insert: " << node->Name() << " "
          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
  ret->push_back(node);
 }
@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
  return ret;
 }
+// Build operator inlink edge table.
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
  for (auto &n : graph.Nodes()) {
-    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (!n->IsOp()) continue;
    if (adj_list.find(n) == adj_list.end()) {
      adj_list[n] = std::unordered_set<ir::Node *>();
    }
@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
  return adj_list;
 }
+// Build the operator out-link edge table: every op node of `graph` is mapped
+// to the set of op nodes that consume at least one of its output variables.
+// An op whose outputs have no consumer still gets an (empty) entry.
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
+    const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> out_links;
+  for (auto &op_node : graph.Nodes()) {
+    if (!op_node->IsOp()) continue;
+    // operator[] creates the empty successor set on first access.
+    auto &successors = out_links[op_node];
+    for (auto &out_var : op_node->outputs) {
+      for (auto &consumer : out_var->outputs) {
+        // Everything that reads a variable is expected to be an operation.
+        PADDLE_ENFORCE(consumer->NodeType() == ir::Node::Type::kOperation);
+        VLOG(40) << "adj " << consumer->Name()
+                 << reinterpret_cast<void *>(consumer) << " -> "
+                 << op_node->Name() << reinterpret_cast<void *>(op_node)
+                 << "  via " << out_var->Name()
+                 << reinterpret_cast<void *>(out_var);
+        successors.insert(consumer);
+      }
+    }
+  }
+  return out_links;
+}
+// Return the op nodes of `graph` in depth-first order, following the out-link
+// edge table produced by BuildOperationOutAdjList.
+//
+// The traversal is seeded with the ops that have no input at all, or whose
+// inputs are all persistable variables. Seeds are pushed in the iteration
+// order of the edge table (a std::map keyed by node pointer), so the traversal
+// starts from the last seed pushed.
+//
+// Every reachable op appears exactly once in the result; ops that cannot be
+// reached from a seed are not returned.
+std::vector<ir::Node *> OpDFSSort(const Graph &graph) {
+  auto edge_table = BuildOperationOutAdjList(graph);
+  std::stack<Node *> stack;
+  for (auto &ele : edge_table) {
+    // An empty input list trivially satisfies "all inputs are persistable",
+    // which covers the input ops (those without input vars) as well.
+    bool all_persistable = true;
+    for (auto *input : ele.first->inputs) {
+      if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) {
+        all_persistable = false;
+        break;
+      }
+    }
+    if (all_persistable) {
+      stack.push(ele.first);
+    }
+  }
+  std::vector<Node *> res;
+  // start from the feed op and DFS
+  std::unordered_set<Node *> unique_set;
+  while (!stack.empty()) {
+    // will start from the last feed by default.
+    auto cur = stack.top();
+    stack.pop();
+    // An op reachable through several paths may have been pushed more than
+    // once before its first visit; only the first pop is allowed to emit it.
+    if (!unique_set.insert(cur).second) continue;
+    res.push_back(cur);
+    for (auto *op : edge_table[cur]) {
+      if (!unique_set.count(op)) {
+        stack.push(op);
+      }
+    }
+  }
+  return res;
+}
+// Topologically sort the op nodes of `graph`, preferring the depth-first order
+// given by OpDFSSort among the ops that are ready.
+//
+// An op is ready once its in-degree (number of input vars not yet produced)
+// has dropped to zero. Variables without a producer are treated as available
+// from the start. The candidate queue is scanned repeatedly; each scan emits
+// every op that is ready at that moment.
+//
+// Raises (via PADDLE_ENFORCE) when a full scan emits nothing while ops are
+// still pending, because no later scan could make progress either.
+std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph) {
+  std::vector<ir::Node *> nodes;
+  std::unordered_map<Node *, int> in_degree;
+  // Mark `var` as available: each op consuming it waits for one input less.
+  auto set_out_ops_ready = [&](Node *var) {
+    for (auto *op : var->outputs) {
+      --in_degree[op];
+    }
+  };
+  // build in_degree
+  for (auto *node : graph.Nodes()) {
+    if (node->IsOp()) {
+      // `+=` rather than `=`: the entry may already have been decremented by
+      // an input variable of the graph that was visited earlier.
+      in_degree[node] += node->inputs.size();
+    } else if (node->IsVar() && node->inputs.empty()) {
+      // put all the inputs of the whole graph ready.
+      set_out_ops_ready(node);
+    }
+  }
+  std::deque<Node *> op_queue;
+  // first visit; keep each op once so that it is neither emitted twice nor
+  // allowed to decrement the in-degree of its successors twice.
+  std::unordered_set<Node *> queued;
+  for (auto &node : OpDFSSort(graph)) {
+    if (node->IsOp() && queued.insert(node).second) {
+      op_queue.push_back(node);
+    }
+  }
+  // traverse the graph
+  size_t num_ops = op_queue.size();
+  while (num_ops) {
+    bool progressed = false;
+    for (auto it = op_queue.begin(); it != op_queue.end(); ++it) {
+      // Reference into the queue: visited slots are cleared in place.
+      auto *&cur_op = *it;
+      if (!cur_op || in_degree[cur_op] > 0) continue;
+      // visit this node
+      // put all the output var of this op valid.
+      for (auto *out_var : cur_op->outputs) {
+        if (!out_var) continue;
+        set_out_ops_ready(out_var);
+      }
+      VLOG(8) << "visit " << cur_op->Name();
+      nodes.push_back(cur_op);
+      cur_op = nullptr;
+      num_ops--;
+      progressed = true;
+    }
+    // Nothing changes between two scans unless an op is visited, so a scan
+    // without progress would repeat forever.
+    PADDLE_ENFORCE(progressed,
+                   "TopologyDfsSortOperations cannot make progress: the graph "
+                   "has a cycle or an operator depends on an unreachable one.");
+  }
+  return nodes;
+}
 size_t GraphNum(const Graph &graph) {
  std::unordered_set<ir::Node *> nodes(graph.Nodes());
  std::unordered_set<ir::Node *> visited_nodes;
@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
  return graph_count;
 }
+// Remove from `graph` every node that has neither inputs nor outputs.
+void CleanIndividualNodes(Graph *graph) {
+  // Collect first and erase afterwards (presumably so that the node
+  // container is not modified while it is being iterated).
+  std::unordered_set<Node *> isolated;
+  for (auto *candidate : graph->Nodes()) {
+    const bool connected =
+        !candidate->inputs.empty() || !candidate->outputs.empty();
+    if (connected) continue;
+    isolated.insert(candidate);
+  }
+  for (auto *orphan : isolated) {
+    graph->RemoveNode(orphan);
+  }
+}
+// Dispatch to one of the operator sorting algorithms: SortKind::TS selects
+// TopologySortOperations, any other value selects TopologyDfsSortOperations.
+std::vector<Node *> TopologyVarientSort(const Graph &graph,
+                                        SortKind sort_kind) {
+  if (sort_kind == SortKind::TS) {
+    return framework::ir::TopologySortOperations(graph);
+  }
+  return framework::ir::TopologyDfsSortOperations(graph);
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
 // `graph` cannot contain circle.
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
+// Topological sort, but try to DFS.
+std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph);
+// Different kinds to sort the operators in a graph to a sequence.
+// TopologyVarientSort picks the algorithm from this value.
+enum class SortKind {
+  // Plain topological sort (TopologySortOperations).
+  TS = 0,
+  // Topological sort that tries to follow a depth-first order
+  // (TopologyDfsSortOperations).
+  TDFS
+};
+// Several kinds of topological sort.
+std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
+// Remove the nodes that are not connected to any other node.
+void CleanIndividualNodes(Graph *graph);
 // Build an adjacency list of operations for the `graph`.
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    const Graph &graph);

--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@@ -20,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/program_desc.h"
 namespace paddle {
@@ -29,6 +28,14 @@ namespace ir {
 std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
    std::unique_ptr<Graph> graph) const {
+  // Remove the unneeded variables after memory optimization.
+  std::unordered_set<std::string> vars2remove;
+  if (graph->Has(kGraphToProgramVarsToRemove)) {
+    vars2remove = graph->Get<std::unordered_set<std::string>>(
+        kGraphToProgramVarsToRemove);
+    VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes";
+  }
  ProgramDesc& program = Get<ProgramDesc>("program");
  std::unique_ptr<proto::ProgramDesc> program_pb(
@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
  std::unordered_set<std::string> visited_vars;
  for (ir::Node* n : graph->Nodes()) {
    if (n->IsVar()) {
-      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
+      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 &&
+          !vars2remove.count(n->Var()->Name())) {
        visited_vars.insert(n->Var()->Name());
        block->add_vars()->MergeFrom(*n->Var()->Proto());
      }
    }
  }
  block->clear_ops();
-  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
-  for (ir::Node* n : nodes) {
+  std::vector<ir::Node*> nodes;
-    if (!n->Op()) {
+  if (Has(kGraphToProgramSortKind)) {
-      continue;
+    // Inference Memory Optimize relays on this branch.
+    int sort_kind = Get<int>(kGraphToProgramSortKind);
+    nodes = TopologyVarientSort(
+        *graph, static_cast<framework::ir::SortKind>(sort_kind));
+  } else {
+    nodes = TopologySortOperations(*graph);
  }
+  for (ir::Node* n : nodes) {
+    if (!n->Op()) continue;
    block->add_ops()->MergeFrom(*n->Op()->Proto());
  }
  program.CopyFrom(*program_pb);
  return graph;
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/ir/graph_to_program_pass.h
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.h
@@ -20,6 +20,10 @@ namespace paddle {
 namespace framework {
 namespace ir {
+const char kGraphToProgramVarsToRemove[] =
+    "__graph_to_program_vars_to_remove__";
+const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
 class GraphToProgramPass : public Pass {
 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;

--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -64,7 +64,7 @@ class Node {
  std::string Name() const { return name_; }
-  VarDesc* Var() {
+  VarDesc* Var() const {
    PADDLE_ENFORCE(IsVar());
    return var_desc_.get();
  }

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
                             "running Paddle Inference";
 #endif  // PADDLE_ON_INFERENCE
  for (auto &op : ops_) {
-    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
+    VLOG(4) << std::this_thread::get_id() << " run "
-            << " on scope " << scope_;
+            << op->DebugStringEx(scope_) << " on scope " << scope_;
    op->SetIsCalledByExecutor(false);
    op->Run(*scope_, place_);
  }
@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
    anc = anc->parent();
  }
+  int num_vars = 0;
  for (auto &var : global_block.AllVars()) {
    if (var->Name() == framework::kEmptyVarName) {
      continue;
    }
+    num_vars++;
    if (persistable == var->Persistable()) {
      if (persistable) {
@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
      }
    }
  }
+  VLOG(4) << "naive executor create " << num_vars << " vars";
 }
 void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -18,6 +18,7 @@ cc_library(analysis SRCS
  analyzer.cc
  analysis_pass
  DEPS ${analysis_deps} analysis_helper
+  ${INFER_IR_PASSES}
  )
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -15,8 +15,8 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
 #include "paddle/fluid/inference/analysis/passes/passes.h"
+#include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
 namespace inference {
@@ -24,13 +24,16 @@ namespace analysis {
 Analyzer::Analyzer() {}
-void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); }
+void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
-void Analyzer::RunIrAnalysis(Argument *argument) {
+void Analyzer::RunAnalysis(Argument *argument) {
-  std::vector<std::string> passes({"ir_analysis_compose_pass"});
+  PADDLE_ENFORCE(argument->analysis_passes_valid(),
+                 "analsis_passes is not valid in the argument.");
-  for (auto &pass : passes) {
+  for (auto &pass : argument->analysis_passes()) {
-    PassRegistry::Global().Retreive(pass)->Run(argument);
+    string::PrettyLogH1("--- Running analysis [%s]", pass);
+    auto *ptr = PassRegistry::Global().Retreive(pass);
+    PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
+    ptr->Run(argument);
  }
 }

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -54,7 +54,7 @@ class Analyzer final {
  DISABLE_COPY_AND_ASSIGN(Analyzer);
 protected:
-  void RunIrAnalysis(Argument* argument);
+  void RunAnalysis(Argument* argument);
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
  argument.SetUseGPU(false);
+  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
+                              "ir_params_sync_among_devices_pass"});
  Analyzer analyser;
  analyser.Run(&argument);
@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
  argument.SetUseGPU(false);
+  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
+                              "ir_params_sync_among_devices_pass"});
  Analyzer analyser;
  analyser.Run(&argument);

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -110,16 +110,20 @@ struct Argument {
  // The overall Scope to work on.
  DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);
+  // The default program, loaded from disk.
  DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
  // The ir passes to perform in analysis phase.
  DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
                      std::vector<std::string>);
+  DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
+                      std::vector<std::string>);
  // Pass a set of op types to enable its mkldnn kernel
  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                      std::unordered_set<std::string>);
+  // Passed from config.
  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
  DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
@@ -127,6 +131,13 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
+  // Memory optimized related.
+  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  // Indicates which kind of sort algorithm is used for operators; the memory
+  // optimization relies on the sort algorithm.
+  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
  // The program transformed by IR analysis phase.
  DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
                             framework::proto::ProgramDesc);

--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -28,6 +28,13 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
+// Wrapper around GCC-style attributes: expands to `__attribute__((attr__));`
+// everywhere except when _WIN32 is defined, where it expands to a bare `;`.
+// NOTE: both expansions already contain the terminating semicolon, so a use
+// site that appends its own `;` ends up with an extra empty declaration.
+#ifdef _WIN32
+#define GCC_ATTRIBUTE(attr__) ;
+#else
+#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#endif
+// Marks a declaration with `warn_unused_result` (no effect under _WIN32).
+#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
 namespace paddle {
 namespace inference {
 namespace analysis {

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -83,6 +83,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
  PADDLE_ENFORCE(graph.get());
  // Apply all the passes
  for (const auto &pass : passes_) {
+    if (pass->Type() == "graph_viz_pass") continue;
    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
    graph = pass->Apply(std::move(graph));
  }

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
-if (TENSORRT_FOUND)
+if (WITH_GPU AND TENSORRT_FOUND)
  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
  set(analysis_deps ${analysis_deps}

--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
  auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
  for (auto &subgraph : subgraphs) {
    if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
-    LOG(INFO) << "detect a subgraph size " << subgraph.size();
    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
    // replace this sub-graph with the first node. Two steps: 1. Create a Block
    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+#include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
 namespace inference {
@@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  framework::BlockDesc block_desc(nullptr, &block_proto);
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
+  string::PrettyLogDetail("---  detect a sub-graph with %d nodes",
+                          subgraph.size());
  for (auto *node : subgraph) {
    auto *op = block_desc.AppendOp();
    *op->Proto() = *node->Op()->Proto();

--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
+cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
 cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
+cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
-set(analysis_deps ${analysis_deps}
+cc_library(analysis_passes SRCS passes.cc DEPS
  ir_graph_build_pass
  ir_analysis_pass
+  ir_params_sync_among_devices_pass
+  memory_optim_pass
+  ir_graph_to_program_pass
+)
+set(analysis_deps ${analysis_deps}
        analysis_passes
        subgraph_detector
        CACHE INTERNAL "")
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 namespace paddle {
@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
  IRPassManager the_ir_manager(argument);
  graph = the_ir_manager.Apply(std::move(graph));
  PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
-  argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc(
-      the_ir_manager.AcquireProgram(&graph, argument->main_program())));
  argument->SetMainGraph(graph.release());
+  CollectFusionStatis(argument);
+}
+// Copy the fusion statistics stored on the main graph under
+// framework::ir::kFuseStatisAttr into `argument` (via SetFusionStatis). When
+// the graph carries no such attribute, only an INFO message is logged and
+// `argument` is left untouched.
+void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
+  if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
+    LOG(INFO) << "argument has no fuse statis";
+    return;
+  }
+  argument->SetFusionStatis(
+      argument->main_graph().Get<Argument::fusion_statis_t>(
+          framework::ir::kFuseStatisAttr));
 }
 std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
@@ -29,6 +29,9 @@ namespace analysis {
 class IrAnalysisPass : public AnalysisPass {
 public:
  void RunImpl(Argument* argument) override;
+  void CollectFusionStatis(Argument* argument);
  std::string repr() const override;
 };

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -12,49 +12,32 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
-#include <string>
+#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
-void IrAnalysisComposePass::RunImpl(Argument *argument) {
+void IrGraphToProgramPass::RunImpl(Argument *argument) {
-  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
+  auto pass =
-  ApplyIrPasses(argument);
+      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
-  CollectFusionStatis(argument);
-}
-std::string IrAnalysisComposePass::repr() const {
-  return "ir-analysis-compose-pass";
-}
-void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
+  if (argument->memory_optim_sort_kind_valid()) {
-  std::vector<std::string> passes({
+    pass->Set(framework::ir::kGraphToProgramSortKind,
-      "ir_graph_build_pass", "ir_analysis_pass",
+              new int(argument->memory_optim_sort_kind()));
-      "ir_params_sync_among_devices_pass",
-  });
-  for (const auto &pass : passes) {
-    VLOG(2) << "Run pass " << pass;
-    auto *the_pass = PassRegistry::Global().Retreive(pass);
-    the_pass->Run(argument);
  }
-}
-void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) {
+  std::unique_ptr<Graph> graph(argument->main_graph_ptr());
-  if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
+  framework::ProgramDesc desc(argument->main_program());
-    LOG(INFO) << "argument has no fuse statis";
+  pass->SetNotOwned("program", &desc);
-    return;
+  auto thegraph = pass->Apply(std::move(graph));
-  }
+  thegraph.release();  // the argument still own the graph.
-  argument->SetFusionStatis(
-      argument->main_graph().Get<Argument::fusion_statis_t>(
+  argument->SetIrAnalyzedProgram(
-          framework::ir::kFuseStatisAttr));
+      new framework::proto::ProgramDesc(*desc.Proto()));
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
@@ -14,31 +14,17 @@
 #pragma once
-#include <string>
-#include <vector>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/passes.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
-/*
+class IrGraphToProgramPass : public AnalysisPass {
- * The analysis pass to run a list of IR passes (like a function call).
- * Currently, it should be the first pass of analysis phase.
- */
-class IrAnalysisComposePass : public AnalysisPass {
 public:
-  void RunImpl(Argument* argument) override;
+  void RunImpl(Argument *argument) override;
-  std::string repr() const override;
- private:
+  std::string repr() const override { return "ir-graph-to-param-pass"; }
-  void ApplyIrPasses(Argument* argument);
-  void CollectFusionStatis(Argument* argument);
-  // Assign a Scope for IR passes to modify the weights.
-  void AssignScopeToModify(Argument* argument);
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+namespace paddle {
+namespace inference {
+namespace analysis {
+/*
+ * Memory optimization pass for inference with pre-analysis of memory usage
+ * without GC.
+ * Unlike training, the inference memory reuse strategy doesn't include GC,
+ * because that overhead is too high when the batch size equals one.
+ *
+ * The inference memory reuse tries to pre-determine the tensor reusing strategy
+ * without runtime overhead.
+ *
+ * To improve the strategy's performance, a warm-up running is introduced:
+ *   - Before officially deploy the inference program, one should warm it up and
+ *     generate some runtime cache,
+ *   - Run the inference program with several batches of data, it will persist
+ *     some runtime information about memory of tensors to disk, we call the
+ *     information the memory reusing cache,
+ *   - With the memory reusing cache, user can deploy the inference to a
+ *     service, before running the model, the inference program will load the
+ *     memory cache, analysis it and generate the best memory reusing strategy,
+ *     and adjust the execution of the network.
+ *
+ * With the warm-up and memory reusing cache design, the memory reusing
+ * algorithm can analysis the real memory consume of the tensors, even with the
+ * flexible LoDTensor and special shape changing operators such as
+ * sequence-pooling.
+ */
+// Analysis pass that plans and applies memory reuse between variables.
+// Only the interface is declared here; the behaviour notes below that go
+// beyond the signatures are assumptions to be confirmed against the .cc file.
+class MemoryOptimizePass : public AnalysisPass {
+ public:
+  // Keyed by std::string — presumably variable name -> memory size in bytes;
+  // TODO confirm.
+  using space_table_t = std::unordered_map<std::string, size_t>;
+  // A pair of ints — presumably the (first, last) position at which a
+  // variable is used in the sorted op sequence; TODO confirm.
+  using lifecycle_t = std::pair<int, int>;
+  // Summary of one reuse plan.
+  struct MemoryAllocation {
+    size_t allocated;  // allocated memory in byte.
+    size_t saved;      // saved memory in byte.
+    int sort_kind;     // the kind of the corresponding sorting algorithm.
+    // Get the memory saving ratio of temporary variables.
+    float GetSavingRatio() const;
+  };
+  virtual ~MemoryOptimizePass() = default;
+ protected:
+  // Entry point invoked by the analysis framework.
+  void RunImpl(Argument *argument) override;
+ private:
+  // Fills `lifecycles` for the op order selected by `sort_kind`.
+  void CollectLifeCycle(
+      std::unordered_map<std::string, lifecycle_t> *lifecycles,
+      int sort_kind) const;
+  // Fills `tensor_nodes` and `space_table`; `batch_var_ave_dim` is read-only
+  // input.
+  void CollectVarMemorySize(
+      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
+      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
+      space_table_t *space_table) const;
+  // Builds a reuse plan. Returns nothing: results are delivered through the
+  // `reuse_table` and `memory_allocation` output pointers (the saving ratio
+  // is then available from MemoryAllocation::GetSavingRatio).
+  void MakeReusePlan(
+      const std::vector<std::unordered_set<std::string>> &var_clusters,
+      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
+      const space_table_t &space_table,
+      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
+      MemoryAllocation *memory_allocation) const;
+  // Applies `reuse_table`; `vars2remove` is an output set of variable names
+  // (presumably those made redundant by the reuse — TODO confirm).
+  void PerformReusePlan(
+      const std::unordered_map<std::string, std::string> &reuse_table,
+      int sort_kind, std::unordered_set<std::string> *vars2remove) const;
+ public:
+  // Short textual name of the pass.
+  std::string repr() const override;
+ private:
+  // Both members are `mutable` so the const helpers above may update them.
+  mutable framework::ir::Graph *graph_{nullptr};
+  mutable int max_lifecycle_{-1};
+};
+// Return the path of the memory reuse cache file: `model_path` with the
+// ".memory_cache" suffix, or `prog_path` with that suffix when `model_path`
+// is empty.
+// Declared `inline` (not `static`) because it is defined in a header: this
+// keeps a single definition across translation units and avoids
+// unused-function warnings in files that include the header without calling
+// it.
+inline std::string GetMemoryCachePath(const std::string &model_path,
+                                      const std::string &prog_path) {
+  const std::string &base = model_path.empty() ? prog_path : model_path;
+  return base + ".memory_cache";
+}
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -13,24 +13,31 @@
 // limitations under the License.
 #include "paddle/fluid/inference/analysis/passes/passes.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
 PassRegistry::PassRegistry() {
+  // Register manually to avoid the trivial `USE_OP` like macro for easier use
+  // and link.
  passes_.emplace("ir_analysis_pass",
                  std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
  passes_.emplace("ir_graph_build_pass",
                  std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
-  passes_.emplace("ir_analysis_compose_pass",
+  passes_.emplace("memory_optimize_pass",
-                  std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
+                  std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
  passes_.emplace(
      "ir_params_sync_among_devices_pass",
      std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
+  passes_.emplace(
+      "ir_graph_to_program_pass",
+      std::unique_ptr<IrGraphToProgramPass>(new IrGraphToProgramPass));
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,8 +18,10 @@ if(APPLE)
 endif(APPLE)
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass
+set(inference_deps ${analysis_deps}
-    ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
+  paddle_inference_api paddle_fluid_api
+  analysis pass naive_executor
+  ${GLOB_PASS_LIB})
 if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
@@ -29,7 +31,8 @@ add_subdirectory(details)
 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
+  reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
           lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
           analysis_config paddle_pass_builder zero_copy_tensor
@@ -44,7 +47,7 @@ if(WITH_TESTING)
                      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
 endif()
-cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps}
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
        ARGS --dirname=${WORD2VEC_MODEL_DIR})
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
 contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
  model_dir_ = model_dir;
+  Update();
 }
 contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
                                        const std::string &params_file) {
  prog_file_ = prog_file;
  params_file_ = params_file;
+  Update();
 }
 void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
                                       const std::string &params_file_path) {
  prog_file_ = prog_file_path;
  params_file_ = params_file_path;
+  Update();
 }
 void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                           int device_id) {
@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
  memory_pool_init_size_mb_ = memory_pool_init_size_mb;
  device_id_ = device_id;
 #else
-  LOG(ERROR) << "Please compile with gpu to EnableGpu";
+  LOG(ERROR) << "Please compile with gpu to EnableGpu()";
  use_gpu_ = false;
 #endif
+  Update();
+}
+void contrib::AnalysisConfig::DisableGpu() {
+  use_gpu_ = false;  // Switch the config to CPU mode.
+  Update();  // Update() re-derives the pass builder when the device mode changed.
 }
-void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
 contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
@@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  CP_MEMBER(use_gpu_);
  CP_MEMBER(device_id_);
  CP_MEMBER(memory_pool_init_size_mb_);
+  CP_MEMBER(enable_memory_optim_);
+  CP_MEMBER(memory_optim_force_update_);
   // TensorRT related.
  CP_MEMBER(use_tensorrt_);
  CP_MEMBER(tensorrt_workspace_size_);
@@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  }
 #undef CP_MEMBER
+  Update();
 }
 void contrib::AnalysisConfig::EnableMKLDNN() {
@@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
  use_mkldnn_ = false;
 #endif
+  Update();
 }
 void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
                                                   int max_batch_size,
                                                   int min_subgraph_size) {
+#ifdef PADDLE_WITH_CUDA
+  if (!use_gpu()) {  // TensorRT is only enabled in GPU mode; otherwise the config is left untouched.
+    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    return;  // NOTE(review): the message names EnableGpu(), but the setter visible in this file is EnableUseGpu() - confirm wording.
+  }
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
   Update();  // Refresh the pass list now that the TensorRT settings are stored.
+#else
+  LOG(ERROR)
+      << "To use TensorRT engine, please compile inference lib with GPU first.";  // Non-CUDA builds only log; no member is modified.
+#endif
 }
+// TODO(Superjomn) refactor this, buggy.
 void contrib::AnalysisConfig::Update() {
  auto info = SerializeInfoCache();
  if (info == serialized_info_cache_) return;
-  if (use_gpu_) {
+  // Transfer pass_builder and copy the existing compatible passes.
+  if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
+    if (use_gpu()) {
      pass_builder_.reset(new GpuPassStrategy);
+      if (use_tensorrt_) {
+        // Append after the Affine_channel_conv_fuse pass.
+        pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
+      }
    } else {
      pass_builder_.reset(new CpuPassStrategy);
    }
-  if (use_tensorrt_) {
-    if (!use_gpu_) {
-      LOG(ERROR)
-          << "TensorRT engine is not available when EnableGpu() not actived.";
  } else {
+    if (use_gpu()) {
+      pass_builder_.reset(new GpuPassStrategy(
+          *static_cast<GpuPassStrategy *>(pass_builder_.get())));
+    } else {
+      pass_builder_.reset(new CpuPassStrategy(
+          *static_cast<CpuPassStrategy *>(pass_builder_.get())));
+    }
+  }
+  if (use_tensorrt_) {
+    const auto &passes = pass_builder_->AllPasses();
+    if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") ==
+        std::end(passes)) {
      // Append after the Affine_channel_conv_fuse pass.
      pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
    }
@@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() {
 #endif
  }
+  if (enable_memory_optim_) {
+    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
+  }
  if (ir_debug_) {
    pass_builder()->TurnOnDebug();
  }
@@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() {
 std::string contrib::AnalysisConfig::SerializeInfoCache() {  // Builds the string that Update() compares against serialized_info_cache_.
   std::stringstream ss;  // NOTE(review): fields are streamed back to back without separators, so different configs can produce the same string.
+  ss << model_dir_;
+  ss << prog_file_;
+  ss << params_file_;
   ss << use_gpu_;
+  ss << device_id_;
   ss << memory_pool_init_size_mb_;
   ss << use_tensorrt_;
   ss << tensorrt_workspace_size_;
   ss << tensorrt_max_batchsize_;
+  ss << tensorrt_min_subgraph_size_;
+  ss << enable_memory_optim_;
+  ss << memory_optim_force_update_;
   ss << use_mkldnn_;
+  for (auto &item : mkldnn_enabled_op_types_) ss << item;  // NOTE(review): unordered_set iteration order is unspecified, so equal sets may serialize differently.
+  ss << ";";  // Terminates the op-type list.
+  ss << model_from_memory_;
   ss << enable_ir_optim_;
   ss << use_feed_fetch_ops_;
   ss << ir_debug_;
+  ss << specify_input_name_;
+  ss << cpu_math_library_num_threads_;
   return ss.str();
 }
 void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
    int cpu_math_library_num_threads) {
  cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+  Update();
 }
 float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
@@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
 #endif
 }
+// Turns memory optimization on and stores `force_update_cache` as the
+// force-update flag, then calls Update() so derived state is refreshed.
+void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
+  memory_optim_force_update_ = force_update_cache;
+  enable_memory_optim_ = true;
+  Update();
+}
+bool contrib::AnalysisConfig::enable_memory_optim() const {
+  return enable_memory_optim_;  // Set to true by EnableMemoryOptim().
+}
 void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
                                             size_t prog_buffer_size,
                                             const char *param_buffer,
@@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
  prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
  params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
  model_from_memory_ = true;
+  Update();
 }
 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -24,18 +24,21 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#if PADDLE_WITH_TENSORRT
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#endif
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
+#if PADDLE_WITH_TENSORRT
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#endif
 DECLARE_bool(profile);
 namespace paddle {
@@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
+  // Collect variable shapes for memory optimization.
+  if (need_collect_var_shapes_for_memory_optim()) {
+    CollectVarShapes();
+  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";
  // All the containers in the scope will be hold in inference, but the
@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.SetUseGPU(config_.use_gpu());
  argument_.SetGPUDeviceId(config_.gpu_device_id());
+  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
+  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
  if (!config_.model_dir().empty()) {
@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
+    LOG(INFO) << "TensorRT subgraph engine is enabled";
    argument_.SetUseTensorRT(true);
    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  if (config_.use_mkldnn_) {
+    LOG(INFO) << "MKLDNN is enabled";
    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
  }
  auto passes = config_.pass_builder()->AllPasses();
-  if (!config_.ir_optim()) passes.clear();
+  if (!config_.ir_optim()) {
+    passes.clear();
+    LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
+  }
  argument_.SetIrAnalysisPasses(passes);
+  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
  Analyzer().Run(&argument_);
@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
+  // TODO(Superjomn) deduce the directory path.
+  std::string out_path = inference::analysis::GetMemoryCachePath(
+      config_.model_dir(), config_.prog_file());
+  if (need_collect_var_shapes_for_memory_optim()) {
+    SerializeBatchVarShapes(out_path);
+  }
 }
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
  return std::unique_ptr<PaddlePredictor>(x);
 }
+// Records the shape of every LoDTensor/Tensor variable of block 0 found in
+// sub_scope_ and appends the resulting name -> shape table to
+// batch_var_shapes_. Does nothing once max_shape_collect_count_ tables have
+// been recorded.
+void AnalysisPredictor::CollectVarShapes() {
+  VLOG(4) << "Collecting var shapes";
+  if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
+  std::map<std::string, std::vector<int>> var_shapes;
+  // Bind by const reference so each variable name is not copied per iteration.
+  for (const auto &var_name : inference_program_->Block(0).LocalVarNames()) {
+    auto *var = sub_scope_->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    // Only tensor-typed variables have their shape recorded; others are skipped.
+    if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
+        var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
+      // NOTE(review): both type branches read through Get<LoDTensor>() -
+      // confirm this is valid when the variable holds a plain Tensor.
+      const auto &tensor = var->Get<framework::LoDTensor>();
+      auto shape = framework::vectorize(tensor.dims());
+      var_shapes[var_name].assign(shape.begin(), shape.end());
+    }
+  }
+  batch_var_shapes_.push_back(var_shapes);
+  LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
+                       << " batch of var shapes for analysis";
+}
+// Writes every collected batch of variable shapes to `path` as text, one line
+// per batch. Each variable is emitted as
+//   <tensor_name>:dim0,dim1,...,dimN;
+// Logs an error and returns without writing when `path` cannot be opened.
+void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
+  LOG(INFO) << "serialize batch var shapes to " << path;
+  std::ofstream file(path);
+  if (!file.is_open()) {
+    LOG(ERROR) << "failed to serialize the var shapes to " << path;
+    return;
+  }
+  for (const auto &batch : batch_var_shapes_) {
+    for (const auto &ele : batch) {
+      const auto &dims = ele.second;
+      // An empty shape has no dimension to write. The previous code computed
+      // `size() - 1` on it (unsigned wrap-around) and then called back() on an
+      // empty vector, which is undefined behaviour.
+      // NOTE(review): such entries are skipped here; confirm the reader of
+      // this file does not require every variable to be present.
+      if (dims.empty()) continue;
+      file << ele.first << ":";
+      // Comma-separate the dimensions; no trailing comma before ';'.
+      for (size_t i = 0; i < dims.size(); i++) {
+        if (i != 0) file << ",";
+        file << dims[i];
+      }
+      file << ";";
+    }
+    file << "\n";
+  }
+}
+// Tells whether variable shapes should be recorded for memory optimization.
+// The answer is computed on the first call and cached in
+// need_collect_var_shapes_ (-1 undecided, 0 no, 1 yes).
+bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
+  if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;
+  bool should_collect = false;
+  if (config_.enable_memory_optim()) {
+    // Collect when no cache file exists yet, or when a refresh is forced.
+    const bool cache_exists =
+        inference::IsFileExists(inference::analysis::GetMemoryCachePath(
+            config_.model_dir(), config_.prog_file()));
+    should_collect = !cache_exists || config_.memory_optim_force_update_;
+  }
+  need_collect_var_shapes_ = should_collect ? 1 : 0;
+  return should_collect;
+}
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
    const contrib::AnalysisConfig &config) {

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor {
  void SetMkldnnThreadID(int tid);
 protected:
+  // For memory optimization.
+  bool need_collect_var_shapes_for_memory_optim();
+  void CollectVarShapes();
+  void SerializeBatchVarShapes(const std::string &path);
  bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
  bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
  bool CreateExecutor();
@@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor {
  // A mutex help to make Clone thread safe.
  std::mutex clone_mutex_;
+  // For memory optimization.
+  const size_t max_shape_collect_count_{1000};
+  int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
+  std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
 private:
  // Some status here that help to determine the status inside the predictor.
  bool status_program_optimized_{false};

--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -16,8 +16,10 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <thread>  // NOLINT
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 DEFINE_string(dirname, "", "dirname to tests.");
@@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) {
  }
 }
+TEST(AnalysisPredictor, memory_optim) {
+  AnalysisConfig config(FLAGS_dirname);
+  config.DisableGpu();
+  config.EnableMemoryOptim(true);  // force_update_cache = true.
+  config.pass_builder()->TurnOnDebug();
+  auto native_predictor =
+      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
+  // Dummy input data: four identical INT64 tensors of shape {4, 1}.
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data.Reset(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+  std::vector<PaddleTensor> inputs(4, tensor);
+  std::vector<PaddleTensor> output, output1;
+  {
+    // The first predictor helps to cache the memory optimization strategy.
+    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+    // Run several times to check that parameters are not reused by mistake.
+    for (int i = 0; i < 5; i++) {
+      ASSERT_TRUE(predictor->Run(inputs, &output));
+    }
+  }
+  {
+    output.clear();
+    // The second predictor performs the memory optimization.
+    config.EnableMemoryOptim(false);  // force_update_cache = false; memory optimization stays enabled.
+    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+    // Run with memory optimization.
+    ASSERT_TRUE(predictor->Run(inputs, &output));
+  }
+  // Run the native predictor to produce the reference output.
+  ASSERT_TRUE(native_predictor->Run(inputs, &output1));
+  LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
+  LOG(INFO) << "the native output "
+            << inference::DescribeTensor(output1.front());
+  inference::CompareResult(output, output1);
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
+#!/bin/bash
 set -x
 PADDLE_ROOT=$1
 TURN_ON_MKL=$2 # use MKL or Openblas

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -15,7 +15,10 @@
 #pragma once
 #include <glog/logging.h>
+#include <fstream>
+#if !defined(_WIN32)
+#include <sys/time.h>
+#endif
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <iterator>
@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
  return true;
 }
-static std::string DescribeTensor(const PaddleTensor &tensor) {
+static std::string DescribeTensor(const PaddleTensor &tensor,
+                                  int max_num_of_data = 15) {
  std::stringstream os;
  os << "Tensor [" << tensor.name << "]\n";
  os << " - type: ";
@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
  }
 }
+// Returns true when `path` can be opened for reading. The temporary stream
+// is closed by its destructor at the end of the expression.
+static bool IsFileExists(const std::string &path) {
+  return std::ifstream(path).is_open();
+}
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -192,6 +192,13 @@ struct AnalysisConfig {
   */
  bool model_from_memory() const { return model_from_memory_; }
+  /** Turn on memory optimization.
+   * NOTE: still in development; will be released later.
+   */
+  void EnableMemoryOptim(bool force_update_cache = false);
+  /** Tell whether the memory optimization is activated. */
+  bool enable_memory_optim() const;
  friend class ::paddle::AnalysisPredictor;
  /** NOTE just for developer, not an official API, easily to be broken.
@@ -232,6 +239,10 @@ struct AnalysisConfig {
  //  subgraph, 3 as default value.
  int tensorrt_min_subgraph_size_{3};
+  // memory reuse related.
+  bool enable_memory_optim_{false};
+  bool memory_optim_force_update_{false};
  bool use_mkldnn_{false};
  std::unordered_set<std::string> mkldnn_enabled_op_types_;

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include <glog/logging.h>
 namespace paddle {
@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
  LOG(ERROR) << "GPU not support MKLDNN yet";
 }
+// Adds `pass` to the end of the analysis pass list.
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.emplace_back(pass);
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -45,6 +45,9 @@ class PaddlePassBuilder {
  /** Delete all the passes that has type `pass_type`. */
  void DeletePass(const std::string &pass_type);
+  /** Append an analysis pass. */
+  void AppendAnalysisPass(const std::string &pass);
  /** Visualize the computation graph after each pass by generating a DOT
   * language file, one can draw them with the Graphviz toolkit.
   */
@@ -54,8 +57,18 @@ class PaddlePassBuilder {
  std::string DebugString();
  const std::vector<std::string> &AllPasses() const { return passes_; }
+  /** Returns a copy of the analysis pass list with ir_graph_to_program_pass
+   * appended as the final entry. */
+  std::vector<std::string> AnalysisPasses() const {
+    // ir_graph_to_program_pass must come last so that any modification of the
+    // IR persists to the program.
+    std::vector<std::string> ordered(analysis_passes_);
+    ordered.emplace_back("ir_graph_to_program_pass");
+    return ordered;
+  }
 protected:
+  std::vector<std::string> analysis_passes_{
+      {"ir_graph_build_pass", "ir_analysis_pass",
+       "ir_params_sync_among_devices_pass"}};
  std::vector<std::string> passes_;
 };
@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
  /** The MKLDNN control exists in both CPU and GPU mode, because there can be
   * still some CPU kernels running in CPU mode.
   */
-  virtual void EnableMKLDNN() = 0;
+  virtual void EnableMKLDNN() {}
  bool use_gpu() const { return use_gpu_; }
@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
 protected:
  bool use_gpu_{false};
+  bool use_mkldnn_{false};
 };
 /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
    use_gpu_ = false;
  }
+  explicit CpuPassStrategy(const CpuPassStrategy &other)
+      : PassStrategy(other.AllPasses()) {}
  virtual ~CpuPassStrategy() = default;
  void EnableMKLDNN() override {
 // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
+    if (!use_mkldnn_) {
      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-    for (auto &pass :
+      for (auto &pass : std::vector<std::string>(
-         std::vector<std::string>({"depthwise_conv_mkldnn_pass",    //
+               {"depthwise_conv_mkldnn_pass",    //
                "conv_bias_mkldnn_fuse_pass",    //
                "conv3d_bias_mkldnn_fuse_pass",  //
                "conv_relu_mkldnn_fuse_pass",    //
                "conv_elementwise_add_mkldnn_fuse_pass"})) {
        passes_.push_back(pass);
      }
+    }
+    use_mkldnn_ = true;
+#else
+    use_mkldnn_ = false;
 #endif
  }
-  CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
 };
 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy {
    use_gpu_ = true;
  }
-  GpuPassStrategy(const GpuPassStrategy &other)
+  explicit GpuPassStrategy(const GpuPassStrategy &other)
      : PassStrategy(other.AllPasses()) {
    use_gpu_ = true;
  }

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -19,7 +19,7 @@ endfunction()
 function(inference_analysis_api_test target install_dir filename)
    inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
 endfunction()
@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")

--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  std::string turn_mask_pre = "turn_mask_";
  auto one_batch = data->NextBatch();
+  PADDLE_ENFORCE(!one_batch.response.empty());
  int size = one_batch.response[0].size();
  CHECK_EQ(size, kMaxTurnLen);
  // turn tensor assignment
@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) {
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, FLAGS_num_threads);
@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) {
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
+// Compare the results of NativeConfig and AnalysisConfig with memory optimization enabled.
+TEST(Analyzer_dam, compare_with_memory_optim) {
+  // The small DAM model core-dumps in CI but works locally; presumably that is why this only runs when FLAGS_max_turn_num == 9.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+    // First run: force an update of the memory cache.
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim(true);
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+    // Second run: use the memory cache and perform memory optimization.
+    SetConfig(&cfg1);
+    cfg1.EnableMemoryOptim();
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
+        input_slots_all);
+  }
+}
 TEST(Analyzer_dam, compare) { compare(); }
 #ifdef PADDLE_WITH_MKLDNN
 TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
 #endif

--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Text_Classification, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg.pass_builder()->TurnOnDebug();
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) {
 TEST(Analyzer_Text_Classification, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg.EnableMemoryOptim();
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <gtest/gtest.h>
 #include <fstream>
 #include <iostream>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) {
                FLAGS_infer_model + "/__params__");
  cfg->DisableGpu();
  cfg->SwitchIrDebug();
-  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchSpecifyInputNames(false);
  // TODO(TJ): fix fusion gru
  cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
 }
@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) {
  if (use_mkldnn) {
    cfg.EnableMKLDNN();
  }
+  // cfg.pass_builder()->TurnOnDebug();
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -103,8 +105,7 @@ void profile(bool use_mkldnn = false) {
    size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
    CHECK_EQ(numel, refer.data.size());
    for (size_t i = 0; i < numel; ++i) {
-      CHECK_LT(
+      EXPECT_NEAR(static_cast<float *>(output.data.data())[i], refer.data[i],
-          fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
                  1e-5);
    }
  }

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <string>
 #include <thread>  // NOLINT
@@ -28,9 +29,8 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/inference/utils/benchmark.h"
@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
        float *pdata = static_cast<float *>(out.data.data());
        float *pdata_ref = static_cast<float *>(ref_out.data.data());
        for (size_t j = 0; j < size; ++j) {
-          EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy);
+          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
        }
        break;
      }

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) {
  }
 }
+TEST(TensorRT_mobilenet, profile) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  profile(model_dir, true, false);
+}
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#pragma once
+#pragma once
 #include <fstream>
 #include <iostream>
 #include <string>

--- a/paddle/fluid/inference/utils/benchmark_tester.cc
+++ b/paddle/fluid/inference/utils/benchmark_tester.cc
@@ -16,7 +16,7 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-using namespace paddle::inference;
+using namespace paddle::inference;  // NOLINT
 TEST(Benchmark, basic) {
  Benchmark benchmark;
  benchmark.SetName("key0");

--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase {
            << out_name;
    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
+    PADDLE_ENFORCE_LT(static_cast<size_t>(col), feed_list.size());
    auto &feed_item = feed_list.at(static_cast<size_t>(col));
    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();

--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt,
  std::cerr << style << Sprintf(fmt, args...) << reset();
 }
+template <typename... Args>
+static void PrettyLogInfo(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::info(), fmt, args...);  // Forwards to PrettyLogEndl with Style::info().
+}
+template <typename... Args>
+static void PrettyLogDetail(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::detail(), fmt, args...);  // Forwards to PrettyLogEndl with Style::detail().
+}
+template <typename... Args>
+static void PrettyLogH1(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H1(), fmt, args...);  // Forwards to PrettyLogEndl with Style::H1().
+}
+template <typename... Args>
+static void PrettyLogH2(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H2(), fmt, args...);  // Forwards to PrettyLogEndl with Style::H2().
+}
 }  // namespace string
 }  // namespace paddle