fea/infer memory optim2 (#14953)

885c4e57 · Yan Chunwei · GitHub · 6597ccb0 · 885c4e57 · 885c4e57
46 changed file
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -18,8 +18,10 @@ limitations under the License. */
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
+#include <stack>
 #include <unordered_map>
 #include <unordered_set>
+#include "paddle/fluid/framework/ir/graph_traits.h"
 DEFINE_string(print_sub_graph_dir, "",
              "FLAGS_print_sub_graph_dir is used "
@@ -41,7 +43,7 @@ void SortHelper(
    }
  }
-  VLOG(3) << "topology sort insert: " << node->Name()
+  VLOG(5) << "topology sort insert: " << node->Name() << " "
          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
  ret->push_back(node);
 }
@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
  return ret;
 }
+// Build operator inlink edge table.
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
  for (auto &n : graph.Nodes()) {
-    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (!n->IsOp()) continue;
    if (adj_list.find(n) == adj_list.end()) {
      adj_list[n] = std::unordered_set<ir::Node *>();
    }
@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
  return adj_list;
 }
+// Build the operator out-link edge table: every op node of `graph` is mapped to
+// the set of op nodes that consume at least one of its output vars. Ops whose
+// outputs have no consumers still get an (empty) entry.
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
+    const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> out_edges;
+  for (auto *producer : graph.Nodes()) {
+    if (!producer->IsOp()) continue;
+    // operator[] creates the empty entry on first access.
+    auto &consumers = out_edges[producer];
+    for (auto *out_var : producer->outputs) {
+      for (auto *consumer : out_var->outputs) {
+        // Whatever reads a var produced by an op must itself be an op.
+        PADDLE_ENFORCE(consumer->NodeType() == ir::Node::Type::kOperation);
+        VLOG(40) << "adj " << consumer->Name()
+                 << reinterpret_cast<void *>(consumer) << " -> "
+                 << producer->Name() << reinterpret_cast<void *>(producer)
+                 << "  via " << out_var->Name()
+                 << reinterpret_cast<void *>(out_var);
+        consumers.insert(consumer);
+      }
+    }
+  }
+  return out_edges;
+}
+// Collect the op nodes of `graph` in depth-first order over the op out-link
+// table built by BuildOperationOutAdjList.
+//
+// The traversal starts from the ops that have no input vars and from the ops
+// whose inputs are all persistable vars. Every op reachable from those roots is
+// returned exactly once, at the position of its first visit.
+std::vector<ir::Node *> OpDFSSort(const Graph &graph) {
+  auto edge_table = BuildOperationOutAdjList(graph);
+  std::stack<Node *> stack;
+  for (auto &ele : edge_table) {
+    // An op is a root when none of its inputs is a non-persistable var; this
+    // is vacuously true for ops without any input.
+    bool all_persistable = true;
+    for (auto *input : ele.first->inputs) {
+      if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) {
+        all_persistable = false;
+        break;
+      }
+    }
+    if (all_persistable) {
+      stack.push(ele.first);
+    }
+  }
+  std::vector<Node *> res;
+  // start from the feed op and DFS
+  std::unordered_set<Node *> unique_set;
+  while (!stack.empty()) {
+    // will start from the last feed by default.
+    auto *cur = stack.top();
+    stack.pop();
+    // An op can be pushed several times (once per predecessor that was
+    // expanded before the op itself was popped). Only the first pop counts,
+    // otherwise the op would show up more than once in the result.
+    if (!unique_set.insert(cur).second) continue;
+    res.push_back(cur);
+    for (auto *op : edge_table[cur]) {
+      if (!unique_set.count(op)) {
+        stack.push(op);
+      }
+    }
+  }
+  return res;
+}
+// Topological sort of the op nodes that tries to keep a depth-first order.
+//
+// The candidate order comes from OpDFSSort; the ops are then emitted by
+// repeatedly scanning that order and taking every op whose inputs are all
+// ready. Each op is emitted at most once. Only ops returned by OpDFSSort are
+// considered.
+//
+// Raises (PADDLE_ENFORCE) if a whole scan emits no op, which means the
+// remaining ops can never become ready (e.g. the graph contains a circle).
+std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph) {
+  std::vector<ir::Node *> nodes;
+  std::unordered_map<Node *, int> in_degree;
+  // Mark `var` as available: every op reading it waits on one input fewer.
+  auto set_out_ops_ready = [&](Node *var) {
+    for (auto *op : var->outputs) {
+      --in_degree[op];
+    }
+  };
+  // build in_degree
+  for (auto *node : graph.Nodes()) {
+    if (node->IsOp()) {
+      // `+=` instead of `=`: a graph-input var visited earlier may already
+      // have decremented this entry.
+      in_degree[node] += node->inputs.size();
+    } else if (node->IsVar() && node->inputs.empty()) {
+      // put all the inputs of the whole graph ready.
+      set_out_ops_ready(node);
+    }
+  }
+  std::deque<Node *> op_queue;
+  // Guards against the same op being queued twice; a duplicate would be
+  // visited twice and decrement the in-degree of its successors twice.
+  std::unordered_set<Node *> queued;
+  // first visit
+  for (auto &node : OpDFSSort(graph)) {
+    if (node->IsOp() && queued.insert(node).second) {
+      op_queue.push_back(node);
+    }
+  }
+  // traverse the graph
+  size_t num_ops = op_queue.size();
+  while (num_ops > 0) {
+    size_t visited_in_pass = 0;
+    for (auto *&cur_op : op_queue) {
+      // Slots of already emitted ops are reset to nullptr.
+      if (!cur_op || in_degree[cur_op] > 0) continue;
+      // visit this node
+      // put all the output var of this op valid.
+      for (auto *out_var : cur_op->outputs) {
+        if (!out_var) continue;
+        set_out_ops_ready(out_var);
+      }
+      VLOG(8) << "visit " << cur_op->Name();
+      nodes.push_back(cur_op);
+      cur_op = nullptr;
+      --num_ops;
+      ++visited_in_pass;
+    }
+    // Without progress the next scan would be identical: fail instead of
+    // looping forever.
+    PADDLE_ENFORCE(visited_in_pass > 0,
+                   "topological DFS sort is stuck, the graph may contain a "
+                   "circle or ops that can never become ready.");
+  }
+  return nodes;
+}
 size_t GraphNum(const Graph &graph) {
  std::unordered_set<ir::Node *> nodes(graph.Nodes());
  std::unordered_set<ir::Node *> visited_nodes;
@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
  return graph_count;
 }
+// Remove from `graph` every node that has neither inputs nor outputs.
+void CleanIndividualNodes(Graph *graph) {
+  // Collect first and remove afterwards, so that the node set is not modified
+  // while it is being iterated.
+  std::unordered_set<Node *> isolated;
+  for (auto *candidate : graph->Nodes()) {
+    const bool connected =
+        !candidate->inputs.empty() || !candidate->outputs.empty();
+    if (connected) continue;
+    isolated.insert(candidate);
+  }
+  for (auto *victim : isolated) {
+    graph->RemoveNode(victim);
+  }
+}
+// Sort the operators of `graph` with the algorithm selected by `sort_kind`:
+// SortKind::TS uses TopologySortOperations, any other value uses
+// TopologyDfsSortOperations.
+std::vector<Node *> TopologyVarientSort(const Graph &graph,
+                                        SortKind sort_kind) {
+  if (sort_kind == SortKind::TS) {
+    return framework::ir::TopologySortOperations(graph);
+  }
+  return framework::ir::TopologyDfsSortOperations(graph);
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
 // `graph` cannot contain circle.
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
+// Topological sort, but try to DFS.
+std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph);
+// Different kinds to sort the operators in a graph to a sequence.
+enum class SortKind {
+  // Topological Search
+  TS = 0,
+  // Topological and Depth First Search
+  TDFS
+};
+// Several kinds of topological sort.
+std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
+// Clean the nodes that don't connect to others.
+void CleanIndividualNodes(Graph *graph);
 // Build an adjacency list of operations for the `graph`.
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    const Graph &graph);

--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@@ -20,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/program_desc.h"
 namespace paddle {
@@ -29,6 +28,14 @@ namespace ir {
 std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
    std::unique_ptr<Graph> graph) const {
+  // Remove the unneeded variables after memory optimization.
+  std::unordered_set<std::string> vars2remove;
+  if (graph->Has(kGraphToProgramVarsToRemove)) {
+    vars2remove = graph->Get<std::unordered_set<std::string>>(
+        kGraphToProgramVarsToRemove);
+    VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes";
+  }
  ProgramDesc& program = Get<ProgramDesc>("program");
  std::unique_ptr<proto::ProgramDesc> program_pb(
@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
  std::unordered_set<std::string> visited_vars;
  for (ir::Node* n : graph->Nodes()) {
    if (n->IsVar()) {
-      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
+      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 &&
+          !vars2remove.count(n->Var()->Name())) {
        visited_vars.insert(n->Var()->Name());
        block->add_vars()->MergeFrom(*n->Var()->Proto());
      }
    }
  }
  block->clear_ops();
-  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
-  for (ir::Node* n : nodes) {
+  std::vector<ir::Node*> nodes;
-    if (!n->Op()) {
+  if (Has(kGraphToProgramSortKind)) {
-      continue;
+    // Inference Memory Optimize relies on this branch.
+    int sort_kind = Get<int>(kGraphToProgramSortKind);
+    nodes = TopologyVarientSort(
+        *graph, static_cast<framework::ir::SortKind>(sort_kind));
+  } else {
+    nodes = TopologySortOperations(*graph);
  }
+  for (ir::Node* n : nodes) {
+    if (!n->Op()) continue;
    block->add_ops()->MergeFrom(*n->Op()->Proto());
  }
  program.CopyFrom(*program_pb);
  return graph;
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/ir/graph_to_program_pass.h
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.h
@@ -20,6 +20,10 @@ namespace paddle {
 namespace framework {
 namespace ir {
+const char kGraphToProgramVarsToRemove[] =
+    "__graph_to_program_vars_to_remove__";
+const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
 class GraphToProgramPass : public Pass {
 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;

--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -64,7 +64,7 @@ class Node {
  std::string Name() const { return name_; }
-  VarDesc* Var() {
+  VarDesc* Var() const {
    PADDLE_ENFORCE(IsVar());
    return var_desc_.get();
  }

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
                             "running Paddle Inference";
 #endif  // PADDLE_ON_INFERENCE
  for (auto &op : ops_) {
-    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
+    VLOG(4) << std::this_thread::get_id() << " run "
-            << " on scope " << scope_;
+            << op->DebugStringEx(scope_) << " on scope " << scope_;
    op->SetIsCalledByExecutor(false);
    op->Run(*scope_, place_);
  }
@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
    anc = anc->parent();
  }
+  int num_vars = 0;
  for (auto &var : global_block.AllVars()) {
    if (var->Name() == framework::kEmptyVarName) {
      continue;
    }
+    num_vars++;
    if (persistable == var->Persistable()) {
      if (persistable) {
@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
      }
    }
  }
+  VLOG(4) << "naive executor create " << num_vars << " vars";
 }
 void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -18,6 +18,7 @@ cc_library(analysis SRCS
  analyzer.cc
  analysis_pass
  DEPS ${analysis_deps} analysis_helper
+  ${INFER_IR_PASSES}
  )
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -15,8 +15,8 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
 #include "paddle/fluid/inference/analysis/passes/passes.h"
+#include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
 namespace inference {
@@ -24,13 +24,16 @@ namespace analysis {
 Analyzer::Analyzer() {}
-void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); }
+void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
-void Analyzer::RunIrAnalysis(Argument *argument) {
+void Analyzer::RunAnalysis(Argument *argument) {
-  std::vector<std::string> passes({"ir_analysis_compose_pass"});
+  PADDLE_ENFORCE(argument->analysis_passes_valid(),
+                 "analsis_passes is not valid in the argument.");
-  for (auto &pass : passes) {
+  for (auto &pass : argument->analysis_passes()) {
-    PassRegistry::Global().Retreive(pass)->Run(argument);
+    string::PrettyLogH1("--- Running analysis [%s]", pass);
+    auto *ptr = PassRegistry::Global().Retreive(pass);
+    PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
+    ptr->Run(argument);
  }
 }

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -54,7 +54,7 @@ class Analyzer final {
  DISABLE_COPY_AND_ASSIGN(Analyzer);
 protected:
-  void RunIrAnalysis(Argument* argument);
+  void RunAnalysis(Argument* argument);
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
  argument.SetUseGPU(false);
+  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
+                              "ir_params_sync_among_devices_pass"});
  Analyzer analyser;
  analyser.Run(&argument);
@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
  argument.SetUseGPU(false);
+  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
+                              "ir_params_sync_among_devices_pass"});
  Analyzer analyser;
  analyser.Run(&argument);

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -110,16 +110,20 @@ struct Argument {
  // The overall Scope to work on.
  DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);
+  // The default program, loaded from disk.
  DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
  // The ir passes to perform in analysis phase.
  DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
                      std::vector<std::string>);
+  DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
+                      std::vector<std::string>);
  // Pass a set of op types to enable its mkldnn kernel
  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                      std::unordered_set<std::string>);
+  // Passed from config.
  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
  DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
@@ -127,6 +131,13 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
+  // Memory optimized related.
+  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  // Indicate which kind of sort algorithm is used for operators, the memory
+  // optimization relies on the sort algorithm.
+  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
  // The program transformed by IR analysis phase.
  DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
                             framework::proto::ProgramDesc);

--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -28,6 +28,13 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
+#ifdef _WIN32
+#define GCC_ATTRIBUTE(attr__) ;
+#else
+#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#endif
+#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
 namespace paddle {
 namespace inference {
 namespace analysis {

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -83,6 +83,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
  PADDLE_ENFORCE(graph.get());
  // Apply all the passes
  for (const auto &pass : passes_) {
+    if (pass->Type() == "graph_viz_pass") continue;
    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
    graph = pass->Apply(std::move(graph));
  }

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
-if (TENSORRT_FOUND)
+if (WITH_GPU AND TENSORRT_FOUND)
  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
  set(analysis_deps ${analysis_deps}

--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
  auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
  for (auto &subgraph : subgraphs) {
    if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
-    LOG(INFO) << "detect a subgraph size " << subgraph.size();
    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
    // replace this sub-graph with the first node. Two steps: 1. Create a Block
    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+#include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
 namespace inference {
@@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  framework::BlockDesc block_desc(nullptr, &block_proto);
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
+  string::PrettyLogDetail("---  detect a sub-graph with %d nodes",
+                          subgraph.size());
  for (auto *node : subgraph) {
    auto *op = block_desc.AppendOp();
    *op->Proto() = *node->Op()->Proto();

--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
+cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
 cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
+cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
-set(analysis_deps ${analysis_deps}
+cc_library(analysis_passes SRCS passes.cc DEPS
  ir_graph_build_pass
  ir_analysis_pass
+  ir_params_sync_among_devices_pass
+  memory_optim_pass
+  ir_graph_to_program_pass
+)
+set(analysis_deps ${analysis_deps}
        analysis_passes
        subgraph_detector
        CACHE INTERNAL "")
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 namespace paddle {
@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
  IRPassManager the_ir_manager(argument);
  graph = the_ir_manager.Apply(std::move(graph));
  PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
-  argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc(
-      the_ir_manager.AcquireProgram(&graph, argument->main_program())));
  argument->SetMainGraph(graph.release());
+  CollectFusionStatis(argument);
+}
+// Copy the fusion statistics stored on the main graph (attribute
+// kFuseStatisAttr) into the argument. Only logs a message when the graph does
+// not carry that attribute.
+void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
+  const bool has_statis =
+      argument->main_graph().Has(framework::ir::kFuseStatisAttr);
+  if (has_statis) {
+    argument->SetFusionStatis(
+        argument->main_graph().Get<Argument::fusion_statis_t>(
+            framework::ir::kFuseStatisAttr));
+    return;
+  }
+  LOG(INFO) << "argument has no fuse statis";
 }
 std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
@@ -29,6 +29,9 @@ namespace analysis {
 class IrAnalysisPass : public AnalysisPass {
 public:
  void RunImpl(Argument* argument) override;
+  void CollectFusionStatis(Argument* argument);
  std::string repr() const override;
 };

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -12,49 +12,32 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
-#include <string>
+#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
+#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
-void IrAnalysisComposePass::RunImpl(Argument *argument) {
+void IrGraphToProgramPass::RunImpl(Argument *argument) {
-  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
+  auto pass =
-  ApplyIrPasses(argument);
+      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
-  CollectFusionStatis(argument);
-}
-std::string IrAnalysisComposePass::repr() const {
-  return "ir-analysis-compose-pass";
-}
-void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
+  if (argument->memory_optim_sort_kind_valid()) {
-  std::vector<std::string> passes({
+    pass->Set(framework::ir::kGraphToProgramSortKind,
-      "ir_graph_build_pass", "ir_analysis_pass",
+              new int(argument->memory_optim_sort_kind()));
-      "ir_params_sync_among_devices_pass",
-  });
-  for (const auto &pass : passes) {
-    VLOG(2) << "Run pass " << pass;
-    auto *the_pass = PassRegistry::Global().Retreive(pass);
-    the_pass->Run(argument);
  }
-}
-void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) {
+  std::unique_ptr<Graph> graph(argument->main_graph_ptr());
-  if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
+  framework::ProgramDesc desc(argument->main_program());
-    LOG(INFO) << "argument has no fuse statis";
+  pass->SetNotOwned("program", &desc);
-    return;
+  auto thegraph = pass->Apply(std::move(graph));
-  }
+  thegraph.release();  // the argument still own the graph.
-  argument->SetFusionStatis(
-      argument->main_graph().Get<Argument::fusion_statis_t>(
+  argument->SetIrAnalyzedProgram(
-          framework::ir::kFuseStatisAttr));
+      new framework::proto::ProgramDesc(*desc.Proto()));
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
@@ -14,31 +14,17 @@
 #pragma once
-#include <string>
-#include <vector>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/passes.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
-/*
+class IrGraphToProgramPass : public AnalysisPass {
- * The analysis pass to run a list of IR passes (like a function call).
- * Currently, it should be the first pass of analysis phase.
- */
-class IrAnalysisComposePass : public AnalysisPass {
 public:
-  void RunImpl(Argument* argument) override;
+  void RunImpl(Argument *argument) override;
-  std::string repr() const override;
- private:
+  std::string repr() const override { return "ir-graph-to-param-pass"; }
-  void ApplyIrPasses(Argument* argument);
-  void CollectFusionStatis(Argument* argument);
-  // Assign a Scope for IR passes to modify the weights.
-  void AssignScopeToModify(Argument* argument);
 };
 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include <algorithm>
+#include <fstream>
+#include <limits>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/string/pretty_log.h"
+namespace paddle {
+namespace inference {
+namespace analysis {
+using framework::ir::Graph;
+using framework::ir::Node;
+using framework::ir::TopologyVarientSort;
+using space_table_t = MemoryOptimizePass::space_table_t;
+// Collect the lifecycles of the tensors.
+//
+// The ops are visited in the order produced by TopologyVarientSort with the
+// given sort_kind; the position of an op in that order is its time step. For
+// every non-persistable var read or written by an op, the lifecycle is the pair
+// (first step, last step) at which it is touched. The outputs of "feed" ops get
+// the lifecycle (0, INT_MAX). The traversal order affects the lifecycles, so
+// different sort_kind values can be used.
+//
+// @lifecycles: output, map from var name to its lifecycle.
+// @sort_kind: the framework::ir::SortKind to use, passed as an int.
+void MemoryOptimizePass::CollectLifeCycle(
+    std::unordered_map<std::string, lifecycle_t>* lifecycles,
+    int sort_kind) const {
+  max_lifecycle_ = 0;
+  for (auto* op_node : framework::ir::TopologyVarientSort(
+           *graph_, static_cast<framework::ir::SortKind>(sort_kind))) {
+    if (!op_node->IsOp()) continue;
+    if (op_node->Name() == "feed") {
+      // Disable reuse of feed variables: their lifecycle never ends.
+      for (auto* node : op_node->outputs) {
+        lifecycles->emplace(node->Name(),
+                            std::make_pair(0, std::numeric_limits<int>::max()));
+      }
+    } else {
+      // Normal operators: record the current step for a var seen for the first
+      // time, otherwise extend the end of its lifecycle.
+      auto touch = [&](const Node* node) {
+        if (node->Var()->Persistable()) return;
+        const std::string var = node->Name();
+        auto it = lifecycles->find(var);
+        if (it == lifecycles->end()) {
+          lifecycles->emplace(var,
+                              std::make_pair(max_lifecycle_, max_lifecycle_));
+        } else {
+          it->second.second = std::max(max_lifecycle_, it->second.second);
+        }
+      };
+      // Inputs first, then outputs; iterate in place instead of copying both
+      // vectors for every op.
+      for (const Node* node : op_node->inputs) touch(node);
+      for (const Node* node : op_node->outputs) touch(node);
+    }
+    ++max_lifecycle_;
+  }
+}
+// TODO(Superjomn) Make this a general help method.
+// Return the size in bytes of a single element of the data type `type`.
+// Raises (PADDLE_THROW) for the data types that are not listed below.
+int DataTypeToSpace(framework::proto::VarType_Type type) {
+  switch (type) {
+    case framework::proto::VarType_Type_BOOL:
+      return sizeof(bool);
+    case framework::proto::VarType_Type_INT16:
+      return sizeof(int16_t);
+    case framework::proto::VarType_Type_INT32:
+      return sizeof(int32_t);
+    case framework::proto::VarType_Type_INT64:
+      return sizeof(int64_t);
+    case framework::proto::VarType_Type_FP16:
+      // A half precision float occupies two bytes.
+      return sizeof(int16_t);
+    case framework::proto::VarType_Type_FP32:
+      return sizeof(float);
+    case framework::proto::VarType_Type_FP64:
+      return sizeof(double);
+    default:
+      PADDLE_THROW("Unknown data type");
+  }
+}
+// Collect the memory size of the tensors.
+//
+// For every non-persistable LOD_TENSOR var node of the graph, the node is
+// registered in `tensor_nodes` and its size (element size times the entry of
+// `batch_var_ave_dim` for that var) is stored in `space_table`, both keyed by
+// the var name. `batch_var_ave_dim` must contain an entry for each such var.
+void MemoryOptimizePass::CollectVarMemorySize(
+    const std::unordered_map<std::string, size_t>& batch_var_ave_dim,
+    std::unordered_map<std::string, Node*>* tensor_nodes,
+    space_table_t* space_table) const {
+  // Collect tensors from graph.
+  for (auto* candidate : graph_->Nodes()) {
+    if (!candidate->IsVar()) continue;
+    if (candidate->Var()->GetType() !=
+        framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
+      continue;
+    }
+    // Parameters will not be reused.
+    if (candidate->Var()->Persistable()) continue;
+    (*tensor_nodes)[candidate->Name()] = candidate;
+    (*space_table)[candidate->Name()] =
+        DataTypeToSpace(candidate->Var()->GetDataType()) *
+        batch_var_ave_dim.at(candidate->Name());
+  }
+}
+// Find a suitable free tensor to reuse: among the free tensors of the same
+// cluster, the one whose recorded space is closest to the required space.
+//
+// Args:
+// @tensor: the tensor that requires memory.
+// @space_required: the memory space the tensor requires.
+// @tensor_nodes: the tensor nodes in the ir::Graph (not used here).
+// @free_existing_tensors: the allocated tensor and are free.
+// @space_table: the memory space of tensors.
+// @var_clusters: the clusters of variables, a tensor is only reused by
+// tensors of the same cluster.
+// @tensor2use: output, the name of the tensor to reuse.
+//
+// Returns:
+// true if found some existing tensor to reuse.
+// false if no suitable tensor to reuse, one need to allocate a new tensor for
+// this requirement.
+// The suitable tensor for reuse is one that is approximately equal to the
+// memory demand.
+bool FindSuitableTensorToReuse(
+    const std::string& tensor, int space_required,
+    const std::unordered_map<std::string, Node*>& tensor_nodes,
+    std::unordered_set<std::string>* free_existing_tensors,
+    const space_table_t& space_table,
+    const std::vector<std::unordered_set<std::string>>& var_clusters,
+    std::string* tensor2use) __SHOULD_USE_RESULT__;
+bool FindSuitableTensorToReuse(
+    const std::string& tensor, int space_required,
+    const std::unordered_map<std::string, Node*>& tensor_nodes,
+    std::unordered_set<std::string>* free_existing_tensors,
+    const space_table_t& space_table,
+    const std::vector<std::unordered_set<std::string>>& var_clusters,
+    std::string* tensor2use) {
+  VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters";
+  // find the cluster this var belongs to.
+  const std::unordered_set<std::string>* cluster = nullptr;
+  for (const auto& c : var_clusters) {
+    if (c.count(tensor)) {
+      cluster = &c;
+      break;
+    }
+  }
+  PADDLE_ENFORCE_NOT_NULL(cluster,
+                          "something wrong in memory optimization, the "
+                          "variable %s not in the clusters.",
+                          tensor);
+  const size_t required = static_cast<size_t>(space_required);
+  // An explicit flag instead of a sentinel difference, so that any difference
+  // value is a legal best fit.
+  bool found = false;
+  size_t best_diff = std::numeric_limits<size_t>::max();
+  std::string best_candidate;
+  for (const auto& candidate : *free_existing_tensors) {
+    // This is not a temporary tensor.
+    const auto space_it = space_table.find(candidate);
+    if (space_it == space_table.end()) continue;
+    // Not in the same cluster.
+    if (!cluster->count(candidate)) continue;
+    const size_t space = space_it->second;
+    // Absolute difference of two unsigned values: subtract the smaller from
+    // the larger, a plain `space - required` wraps around when the candidate
+    // is smaller than the demand.
+    const size_t space_diff =
+        space > required ? space - required : required - space;
+    if (!found || space_diff < best_diff) {
+      found = true;
+      best_candidate = candidate;
+      best_diff = space_diff;
+    }
+  }
+  if (found) {
+    *tensor2use = best_candidate;
+  }
+  return found;
+}
+// Allocate new tensor instead of reusing the existing one.
+//
+// `name` is added to `free_existing_tensors`, its entry in `space_table`
+// (which must already exist) grows to `space_required` if that is larger, and
+// `reuse_table` maps `name` to itself.
+void AllocateNewTensor(
+    const std::string& name, size_t space_required,
+    const std::unordered_map<std::string, Node*>& tensor_nodes,
+    std::unordered_set<std::string>* free_existing_tensors,
+    space_table_t* space_table,
+    std::unordered_map<std::string, std::string>* reuse_table) {
+  // The tensor becomes part of the free set right away.
+  free_existing_tensors->insert(name);
+  // Its space must have been registered before.
+  PADDLE_ENFORCE(space_table->count(name));
+  auto& recorded_space = space_table->at(name);
+  if (recorded_space < space_required) {
+    recorded_space = space_required;
+  }
+  // A newly allocated tensor uses its own memory.
+  (*reuse_table)[name] = name;
+}
+// Free a tensor and make it reusable.
+// @tensor: the tensor to free.
+// @free_existing_tensors: the free and allocated tensors.
+// @reuse_table: a map from a fake tensor to the existing allocated tensor.
+// "feed", "fetch" and tensors that have no entry in `reuse_table` are ignored.
+void FreeATensor(const std::string& tensor,
+                 std::unordered_set<std::string>* free_existing_tensors,
+                 std::unordered_map<std::string, std::string>* reuse_table) {
+  if (tensor == "feed" || tensor == "fetch") return;
+  // A tensor that was never given a slot has nothing to release; looking it up
+  // with `at()` would throw std::out_of_range.
+  const auto slot = reuse_table->find(tensor);
+  if (slot == reuse_table->end()) return;
+  // the really allocated tensor.
+  free_existing_tensors->insert(slot->second);
+}
+// Hand the free tensor `tensor2reuse` over to `tensor`.
+// `tensor2reuse` must currently be in `free_existing_tensors`; it is taken out
+// of the free set, `tensor` is mapped to it in `reuse_table`, and its size in
+// `reused_space_table` is raised to at least `memory_size`.
+void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse,
+                  size_t memory_size,
+                  std::unordered_set<std::string>* free_existing_tensors,
+                  std::unordered_map<std::string, std::string>* reuse_table,
+                  space_table_t* reused_space_table) {
+  const auto free_slot = free_existing_tensors->find(tensor2reuse);
+  PADDLE_ENFORCE(free_slot != free_existing_tensors->end());
+  free_existing_tensors->erase(free_slot);
+  reuse_table->operator[](tensor) = tensor2reuse;
+  // The reused tensor keeps the largest size ever requested from it.
+  auto& reserved_space = reused_space_table->at(tensor2reuse);
+  if (reserved_space < memory_size) {
+    reserved_space = memory_size;
+  }
+}
+// Calculate the memory usage of a reuse plan.
+// @reuse_table: maps each tensor to the tensor whose memory it uses.
+// @space_table: the memory size of each tensor; it must contain every name
+//   that appears in `reuse_table`, otherwise `at()` throws.
+// @var_batch_ave_size: not used, kept so that the signature stays unchanged.
+// @allocated: receives the summed size of the tensors mapped to themselves.
+// @saved: receives the summed size of the tensors mapped to another tensor.
+void EvaluateMemoryUsage(
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    const space_table_t& space_table,
+    const std::unordered_map<std::string, size_t>& var_batch_ave_size,
+    size_t* allocated, size_t* saved) {
+  *allocated = 0;
+  *saved = 0;
+  // Bind by reference: iterating by value copies two strings per entry.
+  for (const auto& elem : reuse_table) {
+    if (elem.first == elem.second) {
+      *allocated += space_table.at(elem.first);
+      VLOG(4) << elem.first << " <-> " << elem.second << " "
+              << space_table.at(elem.first) << " "
+              << space_table.at(elem.second);
+    } else {
+      *saved += space_table.at(elem.first);
+      VLOG(4) << "reuse " << elem.first << " -> " << elem.second;
+    }
+  }
+  VLOG(4) << "allocated " << *allocated;
+  VLOG(4) << "saved " << *saved;
+}
+// Build a memory reuse plan and report how much memory it allocates and saves.
+// The function returns nothing; the results are written to the out-params.
+// @var_clusters: the variable clusters handed to FindSuitableTensorToReuse.
+// @var_batch_ave_size: only forwarded to EvaluateMemoryUsage.
+// @space_table: the real memory size of each tensor.
+// @reuse_table: cleared, then filled with tensor -> tensor-whose-memory-it-uses.
+// @sort_kind: forwarded to CollectLifeCycle and stored in `memory_allocation`.
+// @memory_allocation: receives `allocated`, `saved` and `sort_kind`.
+void MemoryOptimizePass::MakeReusePlan(
+    const std::vector<std::unordered_set<std::string>>& var_clusters,
+    const std::unordered_map<std::string, size_t>& var_batch_ave_size,
+    const space_table_t& space_table,
+    std::unordered_map<std::string, std::string>* reuse_table, int sort_kind,
+    MemoryAllocation* memory_allocation) const {
+  // Clear the existing plan.
+  reuse_table->clear();
+  // The `space_table` stores the real memory size for each tensor.
+  // The `reused_space_table` stores the maximum memory size required by a
+  // tensor during the memory reusing, the small tensor might be reused by a
+  // larger tensor, and the memory size of the small one will grow.
+  auto reused_space_table = space_table;
+  std::unordered_map<std::string, lifecycle_t> life_cycles;
+  // NOTE(review): never filled in this function; it is passed to the helpers
+  // as an empty map.
+  std::unordered_map<std::string, Node*> tensor_nodes;
+  // The allocated tensors whose memory can be reused, they will live across the
+  // program execution.
+  // NOTE(review): declared but not used anywhere in this function.
+  std::unordered_set<std::string> existing_tensors;
+  // The existing tensor that has been allocated, and is also free to reuse.
+  std::unordered_set<std::string> free_existing_tensors;
+  // Presumably fills `life_cycles` and updates `max_lifecycle_` -- TODO
+  // confirm against CollectLifeCycle.
+  CollectLifeCycle(&life_cycles, sort_kind);
+  // Walk through the ages; at each age the newly born tensors get their memory
+  // first, and only then the memory of the dead tensors is released.
+  for (int age = 0; age < max_lifecycle_; ++age) {
+    std::unordered_set<std::string> born_tensors;
+    std::unordered_set<std::string> dead_tensors;
+    // Gather the dead and born tensors.
+    for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end();
+         elem_it++) {
+      // A first age of -1 marks a tensor that must not be processed (again).
+      if (elem_it->second.first == -1) {
+        continue;
+      }
+      const auto& tensor = elem_it->first;
+      const auto& lifecycle = elem_it->second;
+      VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->"
+              << lifecycle.second;
+      // Collect newly born tensors.
+      if (lifecycle.first == age) {
+        born_tensors.insert(tensor);
+      }
+      // Collect dead tensors whose memory can be reused.
+      else if (lifecycle.second < age) {  // NOLINT
+        dead_tensors.insert(tensor);
+        // remove to avoid duplicate process.
+        elem_it->second.first = -1;  // avoid duplicate search
+      }
+    }
+    // Reuse the dead tensors for born tensors
+    for (const auto& tensor : born_tensors) {
+      std::string tensor2reuse;
+      // Tensors without an entry in `space_table` take no part in the plan
+      // (presumably feed/fetch and other non-temporary variables -- TODO
+      // confirm).
+      if (!space_table.count(tensor)) continue;
+      size_t space_required = space_table.at(tensor);
+      // NOTE(review): `space_required` is a size_t here but the callee takes
+      // an int, so very large sizes are narrowed.
+      if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes,
+                                    &free_existing_tensors, reused_space_table,
+                                    var_clusters, &tensor2reuse)) {
+        if (tensor != tensor2reuse) {
+          VLOG(4) << tensor << " -> " << tensor2reuse;
+        }
+        ReuseATensor(tensor, tensor2reuse, space_required,
+                     &free_existing_tensors, reuse_table, &reused_space_table);
+      } else {
+        VLOG(4) << "allocate " << tensor;
+        // Allocation puts the tensor into the free set; the ReuseATensor call
+        // right after it takes the tensor out again and marks it as in use by
+        // itself.
+        AllocateNewTensor(tensor, space_required, tensor_nodes,
+                          &free_existing_tensors, &reused_space_table,
+                          reuse_table);
+        ReuseATensor(tensor, tensor, space_required, &free_existing_tensors,
+                     reuse_table, &reused_space_table);
+      }
+    }
+    for (const auto& tensor : dead_tensors) {
+      // free its memory.
+      FreeATensor(tensor, &free_existing_tensors, reuse_table);
+    }
+  }
+  // The usage is evaluated against the grown sizes, not the original ones.
+  EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size,
+                      &(memory_allocation->allocated),
+                      &(memory_allocation->saved));
+  memory_allocation->sort_kind = sort_kind;
+}
+// Fill `var_node_table` with a name -> node entry for every variable node of
+// `graph`; when two variable nodes share a name, the one visited later wins.
+void BuildVarNodeTable(Graph* graph,
+                       std::unordered_map<std::string, Node*>* var_node_table) {
+  auto& table = *var_node_table;
+  for (auto* candidate : graph->Nodes()) {
+    if (!candidate->IsVar()) continue;
+    table[candidate->Name()] = candidate;
+  }
+}
+// Rewrite the input/output arguments of every op so that they name the reused
+// tensors given by `reuse_table`.
+// NOTE The optimized opdesc doesn't match ir::Graph.
+void UpdateOpDescsByReuse(
+    Graph* graph,
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    int sort_kind) {
+  // Translate a variable name through the reuse table; a name without an
+  // entry is kept as it is.
+  const auto resolve =
+      [&reuse_table](const std::string& original) -> std::string {
+    const auto hit = reuse_table.find(original);
+    return hit == reuse_table.end() ? original : hit->second;
+  };
+  // TODO(Superjomn) change here to be compatible with the runtime order.
+  for (auto* node : TopologyVarientSort(
+           *graph, static_cast<framework::ir::SortKind>(sort_kind))) {
+    if (!node->IsOp()) continue;
+    // Collect the renamed arguments first, the op desc is updated afterwards.
+    std::unordered_map<std::string, std::vector<std::string>> renamed_inputs;
+    std::unordered_map<std::string, std::vector<std::string>> renamed_outputs;
+    for (const auto& slot : node->Op()->Inputs()) {
+      for (const auto& x : slot.second) {
+        const std::string name = resolve(x);
+        renamed_inputs[slot.first].push_back(name);
+        VLOG(4) << node->Name() << " input " << x << " -> " << name;
+      }
+    }
+    for (const auto& slot : node->Op()->Outputs()) {
+      for (const auto& x : slot.second) {
+        const std::string name = resolve(x);
+        renamed_outputs[slot.first].push_back(name);
+        VLOG(4) << node->Name() << " output " << x << " -> " << name;
+      }
+    }
+    // Update arguments.
+    for (auto& slot : renamed_inputs) {
+      node->Op()->SetInput(slot.first, slot.second);
+    }
+    for (auto& slot : renamed_outputs) {
+      node->Op()->SetOutput(slot.first, slot.second);
+    }
+    node->Op()->Flush();
+  }
+}
+// Apply `reuse_table` to the op descs of the graph and record in `vars2remove`
+// every variable that is mapped to a different variable.
+void MemoryOptimizePass::PerformReusePlan(
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    int sort_kind, std::unordered_set<std::string>* vars2remove) const {
+  // The table is built here but not consulted again in this function.
+  std::unordered_map<std::string, Node*> var_node_table;
+  BuildVarNodeTable(graph_, &var_node_table);
+  UpdateOpDescsByReuse(graph_, reuse_table, sort_kind);
+  for (const auto& mapping : reuse_table) {
+    const bool uses_other_tensor = mapping.first != mapping.second;
+    if (uses_other_tensor) {
+      vars2remove->insert(mapping.first);
+    }
+  }
+  VLOG(2) << "to remove vars " << vars2remove->size();
+}
+// Split `line` into the fields separated by `delim`.
+// Empty fields between two delimiters are kept, a trailing delimiter does not
+// produce a final empty field, and an empty line gives no field at all.
+std::vector<std::string> split(const std::string& line, char delim) {
+  std::vector<std::string> fields;
+  std::string::size_type begin = 0;
+  while (begin < line.size()) {
+    const std::string::size_type end = line.find(delim, begin);
+    if (end == std::string::npos) {
+      // The last field runs to the end of the line.
+      fields.emplace_back(line, begin, std::string::npos);
+      break;
+    }
+    fields.emplace_back(line, begin, end - begin);
+    begin = end + 1;
+  }
+  return fields;
+}
+// Deserialize the batch var shapes from the cache file.
+// Every line of the file describes one batch as `name:d0,d1,...` records that
+// are separated by ';'. Enforces that the file can be opened and that every
+// record has exactly one ':'.
+std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
+    const std::string& path) {
+  std::ifstream cache_file(path);
+  PADDLE_ENFORCE(cache_file.is_open(), "failed to open %s  to read cache",
+                 path);
+  std::vector<std::map<std::string, std::vector<int>>> all_batches;
+  for (std::string record_line; std::getline(cache_file, record_line);) {
+    std::map<std::string, std::vector<int>> shapes_of_batch;
+    for (const auto& record : split(record_line, ';')) {
+      const auto name_and_dims = split(record, ':');
+      PADDLE_ENFORCE_EQ(name_and_dims.size(), 2UL);
+      std::vector<int> dims;
+      for (const auto& dim_text : split(name_and_dims[1], ',')) {
+        dims.push_back(std::stoi(dim_text));
+      }
+      shapes_of_batch[name_and_dims[0]] = std::move(dims);
+    }
+    all_batches.push_back(std::move(shapes_of_batch));
+  }
+  return all_batches;
+}
+// Calculate the average number of elements of each tensor from the batch
+// shape cache.
+// @batches: one name -> shape map per recorded batch.
+// Returns, for each variable, the sum of its element counts divided by the
+// total number of batches (integer division). A variable that is missing from
+// some batches is still divided by the total number of batches.
+std::unordered_map<std::string, size_t> GetBatchAverageSize(
+    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
+  std::unordered_map<std::string, size_t> var2size;
+  if (batches.empty()) return var2size;
+  for (const auto& batch : batches) {
+    for (const auto& item : batch) {
+      // Multiply in size_t: the product of the dims easily exceeds the range
+      // of int, and signed overflow is undefined behavior.
+      const size_t dim = std::accumulate(
+          item.second.begin(), item.second.end(), static_cast<size_t>(1),
+          [](size_t a, int b) { return a * static_cast<size_t>(b); });
+      var2size[item.first] += dim;
+    }
+  }
+  // The average size of the batches for each variable.
+  const size_t num_batch = batches.size();
+  for (auto& item : var2size) {
+    item.second /= num_batch;
+  }
+  return var2size;
+}
+// Analyze the batch shapes loaded from the cache file.
+// The variables are split into clusters by the sequence of their batch sizes
+// (the first dimension of the shape in every recorded batch), so that the
+// changes of different LoDTensors can be pre-scheduled when input sequences of
+// different lengths are fed.
+// This should work fine for the models operating on sentences.
+// @batches: one name -> shape map per recorded batch.
+// Returns the clusters of variable names; variables with the same batch size
+// sequence end up in the same cluster. The order of the clusters is
+// unspecified.
+std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
+    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
+  // Collect the batch size of each shape and join them into one key per
+  // variable. The sizes are separated by ',' so that e.g. (1, 23) and (12, 3)
+  // give different keys.
+  std::unordered_map<std::string, std::string> var_batchsize_keys;
+  for (const auto& batch : batches) {
+    for (const auto& ele : batch) {
+      // An empty shape has no leading dimension; record it as -1 instead of
+      // calling front() on an empty vector.
+      const int batch_size = ele.second.empty() ? -1 : ele.second.front();
+      // TODO(Superjomn) might consume large memory here, use combine hash.
+      std::string& key = var_batchsize_keys[ele.first];
+      key += std::to_string(batch_size);
+      key += ',';
+    }
+  }
+  // Split to sets by batch size sequences. The sequence itself is the key, so
+  // two different sequences never share a cluster because of a hash collision.
+  std::unordered_map<std::string, std::unordered_set<std::string>> shape_sets;
+  for (const auto& ele : var_batchsize_keys) {
+    shape_sets[ele.second].insert(ele.first);
+  }
+  std::vector<std::unordered_set<std::string>> res;
+  res.reserve(shape_sets.size());
+  for (auto& ele : shape_sets) {
+    res.emplace_back(std::move(ele.second));
+  }
+  VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters";
+  return res;
+}
+// Analyze the tensor sizes and split the variables into clusters of similar
+// size.
+// This should work fine for the overall models.
+// @space_table: the memory size of each tensor.
+// @batches: not used, kept so that the signature stays unchanged.
+// @interval: the width of one size range; must be positive (enforced).
+// Returns one cluster per non-empty range [k * interval, (k + 1) * interval),
+// ordered by ascending k.
+std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
+    const space_table_t& space_table,
+    const std::vector<std::map<std::string, std::vector<int>>>& batches,
+    int interval = 200000) {
+  PADDLE_ENFORCE_GT(interval, 0);
+  size_t max_size = 0;
+  for (const auto& item : space_table) {
+    max_size = std::max(item.second, max_size);
+  }
+  VLOG(4) << "tensor max size " << max_size;
+  // Put every tensor into the bucket of its range in a single pass, instead of
+  // scanning the whole table once per range. The ordered map keeps the buckets
+  // sorted by range index.
+  const size_t range_width = static_cast<size_t>(interval);
+  std::map<size_t, std::unordered_set<std::string>> buckets;
+  for (const auto& item : space_table) {
+    buckets[item.second / range_width].insert(item.first);
+  }
+  std::vector<std::unordered_set<std::string>> res;
+  res.reserve(buckets.size());
+  for (auto& bucket : buckets) {
+    res.push_back(std::move(bucket.second));
+  }
+  VLOG(3) << "Cluster by interval and get " << res.size() << " cluster";
+  return res;
+}
+// Returns a short textual description of this pass.
+std::string MemoryOptimizePass::repr() const {
+  return "memory optimize pass";
+}
+// Entry point of the pass: load the memory cache, try several reuse
+// strategies, and apply the one that saves the most memory to the main graph.
+// Does nothing when memory optimization is disabled, when a forced cache
+// update is requested, or when the cache file does not exist.
+void MemoryOptimizePass::RunImpl(Argument* argument) {
+  // When force update, should not optimize memory.
+  if (!argument->enable_memory_optim() || argument->memory_optim_force_update())
+    return;
+  graph_ = argument->main_graph_ptr();
+  // The cache path is derived from the model dir if it is set, otherwise from
+  // the program path.
+  auto path = GetMemoryCachePath(
+      argument->model_dir_valid() ? argument->model_dir() : "",
+      argument->model_program_path_valid() ? argument->model_program_path()
+                                           : "");
+  VLOG(3) << "Load memory cache from " << path;
+  if (inference::IsFileExists(path)) {
+    VLOG(4) << "Performing memory optimize";
+    auto batches = DeseralizeBatchVarShapes(path);
+    auto var_batch_ave_size = GetBatchAverageSize(batches);
+    std::unordered_map<std::string, Node*> tensor_nodes;
+    space_table_t space_table;
+    CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
+    // Shared by all the strategies below; each run of MakeReusePlan clears and
+    // refills it.
+    std::unordered_map<std::string, std::string> reuse_table;
+    double max_saving_ratio = 0.;
+    // Every strategy is a closure that builds a plan and returns its result.
+    // Four clustering variants are registered for each of the two sort kinds.
+    std::vector<std::function<MemoryAllocation()>> strategies;
+    for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+      // Cluster by the batch size sequence.
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_batch_size =
+            AnalysisBatchShapesByBatchSize(batches);
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches, 1024);  // interval 1kb
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches, 1024 * 1024);  // interval 1MB
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches,
+            std::numeric_limits<int>::max());  // no intervals
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+    }
+    // Points into `strategies`, which is not modified any more from here on.
+    std::function<MemoryAllocation()>* best_strategy{nullptr};
+    // Try all strategies to get the best result.
+    for (auto& strategy : strategies) {
+      auto allocation = strategy();
+      string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
+                              allocation.GetSavingRatio());
+      // Strictly greater: a strategy that saves nothing is never selected.
+      if (allocation.GetSavingRatio() > max_saving_ratio) {
+        max_saving_ratio = allocation.GetSavingRatio();
+        best_strategy = &strategy;
+      }
+    }
+    if (!best_strategy) {
+      LOG(ERROR)
+          << "This model makes poor memory optimize, skip memory optimize";
+      return;
+    }
+    // Run the best strategy once more: `reuse_table` currently holds the plan
+    // of the strategy that was tried last, not necessarily the best one.
+    auto memory_allocation = (*best_strategy)();
+    string::PrettyLogH2(
+        "--- Saved %.2f%s memory for workspace(temporary variables)",
+        memory_allocation.GetSavingRatio() * 100, "%");
+    // NOTE(review): the two messages below use %d with a floating-point
+    // argument -- confirm that the formatter handles this as intended.
+    string::PrettyLogDetail("--- Allocated %d MB",
+                            memory_allocation.allocated / 1024. / 1024.);
+    string::PrettyLogDetail("--- Saved %d MB",
+                            memory_allocation.saved / 1024. / 1024.);
+    // Attach the set of variables to remove to the main graph; presumably the
+    // graph takes ownership of the new set -- TODO confirm.
+    argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
+                               new std::unordered_set<std::string>);
+    auto& vars2remove =
+        argument->main_graph().Get<std::unordered_set<std::string>>(
+            framework::ir::kGraphToProgramVarsToRemove);
+    PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
+    argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
+  }
+}
+// Returns saved / (allocated + saved), a value in [0, 1].
+// Returns 0 when nothing was allocated or saved, instead of the NaN that the
+// division 0 / 0 would give.
+float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
+  const double total =
+      static_cast<double>(allocated) + static_cast<double>(saved);
+  if (total <= 0.) return 0.f;
+  return static_cast<float>(static_cast<double>(saved) / total);
+}
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+namespace paddle {
+namespace inference {
+namespace analysis {
+/*
+ * Memory optimization pass for inference with pre-analysis of memory usage
+ * without GC.
+ * Unlike training, the inference memory reuse strategies don't include GC,
+ * because its overhead is too high when the batch size equals one.
+ *
+ * The inference memory reuse tries to pre-determine the tensor reusing strategy
+ * without runtime overhead.
+ *
+ * To improve the strategy's performance, a warm-up run is introduced:
+ *   - Before officially deploying the inference program, one should warm it
+ *     up to generate a runtime cache.
+ *   - Run the inference program with several batches of data; it will persist
+ *     some runtime information about the memory of the tensors to disk. We
+ *     call this information the memory reusing cache.
+ *   - With the memory reusing cache, the user can deploy the inference program
+ *     to a service. Before running the model, the inference program loads the
+ *     memory cache, analyzes it, generates the best memory reusing strategy,
+ *     and adjusts the execution of the network.
+ *
+ * With the warm-up and memory reusing cache design, the memory reusing
+ * algorithm can analyze the real memory consumption of the tensors, even with
+ * the flexible LoDTensor and special shape-changing operators such as
+ * sequence-pooling.
+ */
+class MemoryOptimizePass : public AnalysisPass {
+ public:
+  // Maps a tensor name to its memory size.
+  using space_table_t = std::unordered_map<std::string, size_t>;
+  // A pair of ages describing the life of a tensor; presumably (first use,
+  // last use) -- TODO confirm against CollectLifeCycle.
+  using lifecycle_t = std::pair<int, int>;
+  // The result of one memory reuse plan. All members start at zero so that a
+  // default-constructed object never holds indeterminate values.
+  struct MemoryAllocation {
+    size_t allocated{0};  // allocated memory in byte.
+    size_t saved{0};      // saved memory in byte.
+    int sort_kind{0};     // the kind of the corresponding sorting algorithm.
+    // Get the memory saving ratio of temporary variables.
+    float GetSavingRatio() const;
+  };
+  virtual ~MemoryOptimizePass() = default;
+ protected:
+  // Run the pass on the given argument.
+  void RunImpl(Argument *argument) override;
+ private:
+  // Collect the life cycle of the tensors for the given sort kind.
+  void CollectLifeCycle(
+      std::unordered_map<std::string, lifecycle_t> *lifecycles,
+      int sort_kind) const;
+  // Collect the tensor nodes and the memory size of the tensors.
+  void CollectVarMemorySize(
+      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
+      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
+      space_table_t *space_table) const;
+  // Build a reuse plan. Nothing is returned; the plan is written to
+  // `reuse_table` and its statistics to `memory_allocation`.
+  void MakeReusePlan(
+      const std::vector<std::unordered_set<std::string>> &var_clusters,
+      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
+      const space_table_t &space_table,
+      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
+      MemoryAllocation *memory_allocation) const;
+  // Apply a reuse plan and collect the variables that can be removed.
+  void PerformReusePlan(
+      const std::unordered_map<std::string, std::string> &reuse_table,
+      int sort_kind, std::unordered_set<std::string> *vars2remove) const;
+ public:
+  // A short textual description of this pass.
+  std::string repr() const override;
+ private:
+  // Mutable because the const member functions above read and update them.
+  mutable framework::ir::Graph *graph_{nullptr};
+  mutable int max_lifecycle_{-1};
+};
+// Build the path of the memory cache file: `model_path` (or `prog_path` when
+// `model_path` is empty) followed by ".memory_cache".
+// Declared `inline` rather than `static`: a static function defined in a
+// header gives every including translation unit its own copy and triggers an
+// unused-function warning wherever it is not called.
+inline std::string GetMemoryCachePath(const std::string &model_path,
+                                      const std::string &prog_path) {
+  const std::string &base = model_path.empty() ? prog_path : model_path;
+  return base + ".memory_cache";
+}
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -13,24 +13,31 @@
 // limitations under the License.
 #include "paddle/fluid/inference/analysis/passes/passes.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
 PassRegistry::PassRegistry() {
+  // Register manually to avoid the trivial `USE_OP` like macro for easier use
+  // and link.
  passes_.emplace("ir_analysis_pass",
                  std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
  passes_.emplace("ir_graph_build_pass",
                  std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
-  passes_.emplace("ir_analysis_compose_pass",
+  passes_.emplace("memory_optimize_pass",
-                  std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
+                  std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
  passes_.emplace(
      "ir_params_sync_among_devices_pass",
      std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
+  passes_.emplace(
+      "ir_graph_to_program_pass",
+      std::unique_ptr<IrGraphToProgramPass>(new IrGraphToProgramPass));
 }
 }  // namespace analysis

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,8 +18,10 @@ if(APPLE)
 endif(APPLE)
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass
+set(inference_deps ${analysis_deps}
-    ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
+  paddle_inference_api paddle_fluid_api
+  analysis pass naive_executor
+  ${GLOB_PASS_LIB})
 if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
@@ -29,7 +31,8 @@ add_subdirectory(details)
 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
+  reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
           lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
           analysis_config paddle_pass_builder zero_copy_tensor
@@ -44,7 +47,7 @@ if(WITH_TESTING)
                      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
 endif()
-cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps}
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
        ARGS --dirname=${WORD2VEC_MODEL_DIR})
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
 contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
  model_dir_ = model_dir;
+  Update();
 }
 contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
                                        const std::string &params_file) {
  prog_file_ = prog_file;
  params_file_ = params_file;
+  Update();
 }
 void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
                                       const std::string &params_file_path) {
  prog_file_ = prog_file_path;
  params_file_ = params_file_path;
+  Update();
 }
 void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                           int device_id) {
@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
  memory_pool_init_size_mb_ = memory_pool_init_size_mb;
  device_id_ = device_id;
 #else
-  LOG(ERROR) << "Please compile with gpu to EnableGpu";
+  LOG(ERROR) << "Please compile with gpu to EnableGpu()";
  use_gpu_ = false;
 #endif
+  Update();
+}
+void contrib::AnalysisConfig::DisableGpu() {
+  use_gpu_ = false;
+  Update();
 }
-void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
 contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
@@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  CP_MEMBER(use_gpu_);
  CP_MEMBER(device_id_);
  CP_MEMBER(memory_pool_init_size_mb_);
+  CP_MEMBER(enable_memory_optim_);
+  CP_MEMBER(memory_optim_force_update_);
  // TensorRT releated.
  CP_MEMBER(use_tensorrt_);
  CP_MEMBER(tensorrt_workspace_size_);
@@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  }
 #undef CP_MEMBER
+  Update();
 }
 void contrib::AnalysisConfig::EnableMKLDNN() {
@@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
  use_mkldnn_ = false;
 #endif
+  Update();
 }
 void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
                                                   int max_batch_size,
                                                   int min_subgraph_size) {
+#ifdef PADDLE_WITH_CUDA
+  if (!use_gpu()) {
+    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    return;
+  }
  use_tensorrt_ = true;
  tensorrt_workspace_size_ = workspace_size;
  tensorrt_max_batchsize_ = max_batch_size;
  tensorrt_min_subgraph_size_ = min_subgraph_size;
  Update();
+#else
+  LOG(ERROR)
+      << "To use TensorRT engine, please compile inference lib with GPU first.";
+#endif
 }
+// TODO(Superjomn) refactor this, buggy.
 void contrib::AnalysisConfig::Update() {
  auto info = SerializeInfoCache();
  if (info == serialized_info_cache_) return;
-  if (use_gpu_) {
+  // Transfer pass_builder and copy the existing compatible passes.
+  if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
+    if (use_gpu()) {
      pass_builder_.reset(new GpuPassStrategy);
+      if (use_tensorrt_) {
+        // Append after the Affine_channel_conv_fuse pass.
+        pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
+      }
    } else {
      pass_builder_.reset(new CpuPassStrategy);
    }
-  if (use_tensorrt_) {
-    if (!use_gpu_) {
-      LOG(ERROR)
-          << "TensorRT engine is not available when EnableGpu() not actived.";
  } else {
+    if (use_gpu()) {
+      pass_builder_.reset(new GpuPassStrategy(
+          *static_cast<GpuPassStrategy *>(pass_builder_.get())));
+    } else {
+      pass_builder_.reset(new CpuPassStrategy(
+          *static_cast<CpuPassStrategy *>(pass_builder_.get())));
+    }
+  }
+  if (use_tensorrt_) {
+    const auto &passes = pass_builder_->AllPasses();
+    if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") ==
+        std::end(passes)) {
      // Append after the Affine_channel_conv_fuse pass.
      pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
    }
@@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() {
 #endif
  }
+  if (enable_memory_optim_) {
+    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
+  }
  if (ir_debug_) {
    pass_builder()->TurnOnDebug();
  }
@@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() {
 std::string contrib::AnalysisConfig::SerializeInfoCache() {
  std::stringstream ss;
+  ss << model_dir_;
+  ss << prog_file_;
+  ss << params_file_;
  ss << use_gpu_;
+  ss << device_id_;
  ss << memory_pool_init_size_mb_;
  ss << use_tensorrt_;
  ss << tensorrt_workspace_size_;
  ss << tensorrt_max_batchsize_;
+  ss << tensorrt_min_subgraph_size_;
+  ss << enable_memory_optim_;
+  ss << memory_optim_force_update_;
  ss << use_mkldnn_;
+  for (auto &item : mkldnn_enabled_op_types_) ss << item;
+  ss << ";";
+  ss << model_from_memory_;
  ss << enable_ir_optim_;
  ss << use_feed_fetch_ops_;
  ss << ir_debug_;
+  ss << specify_input_name_;
+  ss << cpu_math_library_num_threads_;
  return ss.str();
 }
 void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
    int cpu_math_library_num_threads) {
  cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+  Update();
 }
 float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
@@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
 #endif
 }
+void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
+  enable_memory_optim_ = true;
+  memory_optim_force_update_ = force_update_cache;
+  Update();
+}
+bool contrib::AnalysisConfig::enable_memory_optim() const {
+  return enable_memory_optim_;
+}
 void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
                                             size_t prog_buffer_size,
                                             const char *param_buffer,
@@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
  prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
  params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
  model_from_memory_ = true;
+  Update();
 }
 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -24,18 +24,21 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#if PADDLE_WITH_TENSORRT
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#endif
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
+#if PADDLE_WITH_TENSORRT
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#endif
 DECLARE_bool(profile);
 namespace paddle {
@@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
+  // Collect variable shapes for memory optimization.
+  if (need_collect_var_shapes_for_memory_optim()) {
+    CollectVarShapes();
+  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";
  // All the containers in the scope will be hold in inference, but the
@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.SetUseGPU(config_.use_gpu());
  argument_.SetGPUDeviceId(config_.gpu_device_id());
+  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
+  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
  if (!config_.model_dir().empty()) {
@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
+    LOG(INFO) << "TensorRT subgraph engine is enabled";
    argument_.SetUseTensorRT(true);
    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  if (config_.use_mkldnn_) {
+    LOG(INFO) << "MKLDNN is enabled";
    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
  }
  auto passes = config_.pass_builder()->AllPasses();
-  if (!config_.ir_optim()) passes.clear();
+  if (!config_.ir_optim()) {
+    passes.clear();
+    LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
+  }
  argument_.SetIrAnalysisPasses(passes);
+  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
  Analyzer().Run(&argument_);
@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
+  // TODO(Superjomn) deduce the directory path.
+  std::string out_path = inference::analysis::GetMemoryCachePath(
+      config_.model_dir(), config_.prog_file());
+  if (need_collect_var_shapes_for_memory_optim()) {
+    SerializeBatchVarShapes(out_path);
+  }
 }
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
  return std::unique_ptr<PaddlePredictor>(x);
 }
+// Records the current shape of every tensor variable of block 0 as one more
+// entry of batch_var_shapes_. Does nothing once max_shape_collect_count_
+// entries have been collected.
+void AnalysisPredictor::CollectVarShapes() {
+  VLOG(4) << "Collecting var shapes";
+  if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
+  std::map<std::string, std::vector<int>> var_shapes;
+  // Bind each name by const reference: this runs after every Run() while
+  // shapes are being collected, so avoid copying a string per variable.
+  for (const auto &var_name :
+       inference_program_->Block(0).LocalVarNames()) {
+    auto *var = sub_scope_->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    // Only tensor-like variables carry a shape worth recording.
+    // NOTE(review): a variable whose type id is framework::Tensor is also read
+    // through Get<framework::LoDTensor>() -- confirm that this is valid.
+    if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
+        var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
+      auto &tensor = var->Get<framework::LoDTensor>();
+      auto shape = framework::vectorize(tensor.dims());
+      var_shapes[var_name].assign(shape.begin(), shape.end());
+    }
+  }
+  batch_var_shapes_.push_back(var_shapes);
+  LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
+                       << " batch of var shapes for analysis";
+}
+// Writes every collected batch of variable shapes to `path` as text, one
+// batch per line.
+//
+// The serialized record format is:
+//   <tensor_name>:dim0,dim1,dim2;
+// Logs an error and returns without writing when `path` cannot be opened.
+void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
+  LOG(INFO) << "serialize batch var shapes to " << path;
+  std::ofstream file(path);
+  if (!file.is_open()) {
+    LOG(ERROR) << "failed to serialize the var shapes to " << path;
+    return;
+  }
+  for (auto &batch : batch_var_shapes_) {
+    for (auto &ele : batch) {
+      const auto &dims = ele.second;
+      // A variable with no dimensions has no "last dimension" to write. The
+      // previous `size() - 1` bound wrapped around for an empty vector and
+      // back() was undefined, so such records are skipped instead.
+      if (dims.empty()) continue;
+      file << ele.first << ":";
+      // `i + 1 < size()` cannot underflow, unlike `i < size() - 1`.
+      for (size_t i = 0; i + 1 < dims.size(); i++) {
+        file << dims[i] << ",";
+      }
+      file << dims.back() << ";";
+    }
+    file << "\n";
+  }
+}
+// Tells whether variable shapes must be collected for memory optimization.
+// The decision is made on the first call and cached in
+// need_collect_var_shapes_ (-1 undecided, 0 no, 1 yes).
+bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
+  // Reuse the cached decision when there is one.
+  if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;
+
+  bool need = false;
+  if (config_.enable_memory_optim()) {
+    // Collect when the cache file is missing, or when a refresh is forced.
+    const bool cache_missing =
+        !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
+            config_.model_dir(), config_.prog_file()));
+    need = cache_missing || config_.memory_optim_force_update_;
+  }
+
+  need_collect_var_shapes_ = need ? 1 : 0;
+  return need;
+}
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
    const contrib::AnalysisConfig &config) {

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor {
  void SetMkldnnThreadID(int tid);
 protected:
+  // For memory optimization.
+  bool need_collect_var_shapes_for_memory_optim();
+  void CollectVarShapes();
+  void SerializeBatchVarShapes(const std::string &path);
  bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
  bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
  bool CreateExecutor();
@@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor {
  // A mutex help to make Clone thread safe.
  std::mutex clone_mutex_;
+  // For memory optimization.
+  const size_t max_shape_collect_count_{1000};
+  int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
+  std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
 private:
  // Some status here that help to determine the status inside the predictor.
  bool status_program_optimized_{false};

--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -16,8 +16,10 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <thread>  // NOLINT
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 DEFINE_string(dirname, "", "dirname to tests.");
@@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) {
  }
 }
+// Runs the same input through an analysis predictor with memory optimization
+// enabled and through a native predictor, then compares the two outputs.
+TEST(AnalysisPredictor, memory_optim) {
+  AnalysisConfig config(FLAGS_dirname);
+  config.DisableGpu();
+  config.EnableMemoryOptim(true);
+  config.pass_builder()->TurnOnDebug();
+  auto native_predictor =
+      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
+  // Dummy input data: four identical int64 tensors of shape {4, 1}.
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data.Reset(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+  std::vector<PaddleTensor> inputs(4, tensor);
+  std::vector<PaddleTensor> output, output1;
+  {
+    // The first predictor helps to cache the memory optimization strategy.
+    // It lives in its own scope so that it is destroyed before the second
+    // predictor is created.
+    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+    // Run several times to check the parameters are not reused by mistake.
+    for (int i = 0; i < 5; i++) {
+      ASSERT_TRUE(predictor->Run(inputs, &output));
+    }
+  }
+  {
+    output.clear();
+    // The second predictor performs the memory optimization. Passing false
+    // keeps the optimization enabled and only clears the force-update flag.
+    config.EnableMemoryOptim(false);
+    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+    // Run with memory optimization.
+    ASSERT_TRUE(predictor->Run(inputs, &output));
+  }
+  // Run the native predictor as the reference.
+  ASSERT_TRUE(native_predictor->Run(inputs, &output1));
+  LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
+  LOG(INFO) << "the native output "
+            << inference::DescribeTensor(output1.front());
+  inference::CompareResult(output, output1);
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
+#!/bin/bash
 set -x
 PADDLE_ROOT=$1
 TURN_ON_MKL=$2 # use MKL or Openblas

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -15,7 +15,10 @@
 #pragma once
 #include <glog/logging.h>
+#include <fstream>
+#if !defined(_WIN32)
+#include <sys/time.h>
+#endif
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <iterator>
@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
  return true;
 }
-static std::string DescribeTensor(const PaddleTensor &tensor) {
+static std::string DescribeTensor(const PaddleTensor &tensor,
+                                  int max_num_of_data = 15) {
  std::stringstream os;
  os << "Tensor [" << tensor.name << "]\n";
  os << " - type: ";
@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
  }
 }
+// Returns true when `path` can be opened for reading.
+static bool IsFileExists(const std::string &path) {
+  // The temporary stream closes the file when it goes out of scope.
+  return std::ifstream(path).is_open();
+}
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -192,6 +192,13 @@ struct AnalysisConfig {
   */
  bool model_from_memory() const { return model_from_memory_; }
+  /** Turn on memory optimization.
+   * NOTE: still in development; will be released later.
+   */
+  void EnableMemoryOptim(bool force_update_cache = false);
+  /** Tell whether the memory optimization is activated. */
+  bool enable_memory_optim() const;
  friend class ::paddle::AnalysisPredictor;
  /** NOTE just for developer, not an official API, easily to be broken.
@@ -232,6 +239,10 @@ struct AnalysisConfig {
  //  subgraph, 3 as default value.
  int tensorrt_min_subgraph_size_{3};
+  // memory reuse related.
+  bool enable_memory_optim_{false};
+  bool memory_optim_force_update_{false};
  bool use_mkldnn_{false};
  std::unordered_set<std::string> mkldnn_enabled_op_types_;

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include <glog/logging.h>
 namespace paddle {
@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
  LOG(ERROR) << "GPU not support MKLDNN yet";
 }
+// Adds `pass` to the end of the list of analysis passes.
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.emplace_back(pass);
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -45,6 +45,9 @@ class PaddlePassBuilder {
  /** Delete all the passes that has type `pass_type`. */
  void DeletePass(const std::string &pass_type);
+  /** Append an analysis pass. */
+  void AppendAnalysisPass(const std::string &pass);
  /** Visualize the computation graph after each pass by generating a DOT
   * language file, one can draw them with the Graphviz toolkit.
   */
@@ -54,8 +57,18 @@ class PaddlePassBuilder {
  std::string DebugString();
  const std::vector<std::string> &AllPasses() const { return passes_; }
+  /** Returns a copy of the analysis passes with "ir_graph_to_program_pass"
+   * appended at the end.
+   */
+  std::vector<std::string> AnalysisPasses() const {
+    std::vector<std::string> ordered(analysis_passes_);
+    // ir_graph_to_program_pass must stay the last pass so that any
+    // modification of the IR persists to the program.
+    ordered.emplace_back("ir_graph_to_program_pass");
+    return ordered;
+  }
 protected:
+  std::vector<std::string> analysis_passes_{
+      {"ir_graph_build_pass", "ir_analysis_pass",
+       "ir_params_sync_among_devices_pass"}};
  std::vector<std::string> passes_;
 };
@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
  /** The MKLDNN control exists in both CPU and GPU mode, because there can be
   * still some CPU kernels running in CPU mode.
   */
-  virtual void EnableMKLDNN() = 0;
+  virtual void EnableMKLDNN() {}
  bool use_gpu() const { return use_gpu_; }
@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
 protected:
  bool use_gpu_{false};
+  bool use_mkldnn_{false};
 };
 /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
    use_gpu_ = false;
  }
+  /** Copy constructor: takes over the IR pass list and the MKLDNN state of
+   * `other`.
+   */
+  explicit CpuPassStrategy(const CpuPassStrategy &other)
+      : PassStrategy(other.AllPasses()) {
+    // Carry the flag over. Without it, a copy of an MKLDNN-enabled strategy
+    // already holds the MKLDNN passes while EnableMKLDNN() still sees
+    // use_mkldnn_ == false and inserts them a second time.
+    use_mkldnn_ = other.use_mkldnn_;
+  }
  virtual ~CpuPassStrategy() = default;
  void EnableMKLDNN() override {
 // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
+    if (!use_mkldnn_) {
      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-    for (auto &pass :
+      for (auto &pass : std::vector<std::string>(
-         std::vector<std::string>({"depthwise_conv_mkldnn_pass",    //
+               {"depthwise_conv_mkldnn_pass",    //
                "conv_bias_mkldnn_fuse_pass",    //
                "conv3d_bias_mkldnn_fuse_pass",  //
                "conv_relu_mkldnn_fuse_pass",    //
                "conv_elementwise_add_mkldnn_fuse_pass"})) {
        passes_.push_back(pass);
      }
+    }
+    use_mkldnn_ = true;
+#else
+    use_mkldnn_ = false;
 #endif
  }
-  CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
 };
 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy {
    use_gpu_ = true;
  }
-  GpuPassStrategy(const GpuPassStrategy &other)
+  explicit GpuPassStrategy(const GpuPassStrategy &other)
      : PassStrategy(other.AllPasses()) {
    use_gpu_ = true;
  }

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -19,7 +19,7 @@ endfunction()
 function(inference_analysis_api_test target install_dir filename)
    inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
 endfunction()
@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")

--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  std::string turn_mask_pre = "turn_mask_";
  auto one_batch = data->NextBatch();
+  PADDLE_ENFORCE(!one_batch.response.empty());
  int size = one_batch.response[0].size();
  CHECK_EQ(size, kMaxTurnLen);
  // turn tensor assignment
@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) {
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, FLAGS_num_threads);
@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) {
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
+// Compare result of NativeConfig and AnalysisConfig with memory optimization.
+// Compares native and analysis predictions twice: first with a forced update
+// of the memory cache, then with memory optimization enabled without the
+// force flag. Only runs when FLAGS_max_turn_num == 9.
+TEST(Analyzer_dam, compare_with_memory_optim) {
+  // The small dam will core in CI, but works in local.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    // NOTE(review): `data` is not referenced again in this test -- confirm
+    // whether constructing it is still needed.
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+    // Run the first time to force an update of the memory cache.
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim(true);
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+    // Run second time to use the memory cache and perform memory optimization.
+    SetConfig(&cfg1);
+    cfg1.EnableMemoryOptim();
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
+        input_slots_all);
+  }
+}
 TEST(Analyzer_dam, compare) { compare(); }
 #ifdef PADDLE_WITH_MKLDNN
 TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
 #endif

--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Text_Classification, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg.pass_builder()->TurnOnDebug();
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) {
 TEST(Analyzer_Text_Classification, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  cfg.EnableMemoryOptim();
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <gtest/gtest.h>
 #include <fstream>
 #include <iostream>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) {
                FLAGS_infer_model + "/__params__");
  cfg->DisableGpu();
  cfg->SwitchIrDebug();
-  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchSpecifyInputNames(false);
  // TODO(TJ): fix fusion gru
  cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
 }
@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) {
  if (use_mkldnn) {
    cfg.EnableMKLDNN();
  }
+  // cfg.pass_builder()->TurnOnDebug();
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -103,8 +105,7 @@ void profile(bool use_mkldnn = false) {
    size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
    CHECK_EQ(numel, refer.data.size());
    for (size_t i = 0; i < numel; ++i) {
-      CHECK_LT(
+      EXPECT_NEAR(static_cast<float *>(output.data.data())[i], refer.data[i],
-          fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
                  1e-5);
    }
  }

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <string>
 #include <thread>  // NOLINT
@@ -28,9 +29,8 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/inference/utils/benchmark.h"
@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
        float *pdata = static_cast<float *>(out.data.data());
        float *pdata_ref = static_cast<float *>(ref_out.data.data());
        for (size_t j = 0; j < size; ++j) {
-          EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy);
+          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
        }
        break;
      }

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) {
  }
 }
+// Calls profile() on the "mobilenet" model directory under FLAGS_infer_model.
+// NOTE(review): the meaning of the two boolean arguments is defined by
+// profile(), which is not visible in this hunk.
+TEST(TensorRT_mobilenet, profile) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  profile(model_dir, true, false);
+}
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#pragma once
+#pragma once
 #include <fstream>
 #include <iostream>
 #include <string>

--- a/paddle/fluid/inference/utils/benchmark_tester.cc
+++ b/paddle/fluid/inference/utils/benchmark_tester.cc
@@ -16,7 +16,7 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-using namespace paddle::inference;
+using namespace paddle::inference;  // NOLINT
 TEST(Benchmark, basic) {
  Benchmark benchmark;
  benchmark.SetName("key0");

--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase {
            << out_name;
    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
+    PADDLE_ENFORCE_LT(static_cast<size_t>(col), feed_list.size());
    auto &feed_item = feed_list.at(static_cast<size_t>(col));
    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();

--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt,
  std::cerr << style << Sprintf(fmt, args...) << reset();
 }
+// Forwards `fmt` and `args` to PrettyLogEndl using the Style::info() style.
+template <typename... Args>
+static void PrettyLogInfo(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::info(), fmt, args...);
+}
+// Forwards `fmt` and `args` to PrettyLogEndl using the Style::detail() style.
+template <typename... Args>
+static void PrettyLogDetail(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::detail(), fmt, args...);
+}
+// Forwards `fmt` and `args` to PrettyLogEndl using the Style::H1() style.
+template <typename... Args>
+static void PrettyLogH1(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H1(), fmt, args...);
+}
+// Forwards `fmt` and `args` to PrettyLogEndl using the Style::H2() style.
+template <typename... Args>
+static void PrettyLogH2(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H2(), fmt, args...);
+}
 }  // namespace string
 }  // namespace paddle