diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index 100a7504b8526b3587858dd7783913757ba09895..8c7d58678fd29cb25d13d64a08e6c6f26f242d8b 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
     }
     return false;
   };
+
   for (auto &node : graph) {
     for (auto *in : node->inlinks) {
       // The Value that is written by nodes inside a sub-graph shouldn't be the
@@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
                         std::vector<Node *>(outputs.begin(), outputs.end()));
 }
 
+// Filter the Intermediate results of the subgraph node.
 void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
   std::vector<Node *> op_nodes;
   for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
@@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
     for (auto *out : op_nodes[i]->outlinks) {
       if (follow_up_input_names.count(out->name())) {
         filtered_subgraph_outlinks.push_back(out);
+      } else {
+        out->SetDeleted();
       }
     }
-    PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
+    // The filtered_subgraph_outlinks may be empty.
     op_nodes[i]->outlinks = filtered_subgraph_outlinks;
   }
 }
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 8579845d51e80d73d220465d25b70944f5ad9bf2..5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
 
   // collect inputs
   std::unordered_set<std::string> input_names;
+  std::unordered_set<std::string> input_names_with_id;
   for (auto *x : func->inlinks) {
     input_names.insert(x->name());
+    input_names_with_id.insert(x->name() + std::to_string(x->id()));
   }
   desc.SetInput(
       "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
   std::unordered_set<std::string> output_names;
+  std::unordered_set<std::string> output_names_with_id;
   for (auto *x : func->outlinks) {
     output_names.insert(x->name());
+    output_names_with_id.insert(x->name() + std::to_string(x->id()));
   }
 
-  std::vector<std::string> output_temp(output_names.begin(),
-                                       output_names.end());
-  desc.SetOutput("Ys", output_temp);
+  desc.SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
   desc.SetType("tensorrt_engine");
 
   std::unordered_map<std::string, std::string> output_name_map;
@@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
       std::vector<std::string> replaced_names;
       for (int k = 0; k < in_var->arguments_size(); k++) {
         std::string arg_value = in_var->arguments(k);
-        if (input_names.count(arg_value)) {
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
           replaced_names.push_back(arg_value);
         } else {
-          replaced_names.push_back(arg_value +
-                                   std::to_string(var2id[arg_value]));
+          replaced_names.push_back(arg_value_with_id);
         }
       }
       in_var->clear_arguments();
@@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
       std::vector<std::string> replaced_names;
       for (int k = 0; k < out_var->arguments_size(); k++) {
         std::string arg_value = out_var->arguments(k);
-        if (output_names.count(arg_value)) {
-          output_name_map[arg_value] =
-              arg_value + std::to_string(var2id[arg_value]);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id.count(arg_value_with_id)) {
+          output_name_map[arg_value] = arg_value_with_id;
         }
-        replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
+        replaced_names.push_back(arg_value_with_id);
       }
       out_var->clear_arguments();
       for (size_t k = 0; k < replaced_names.size(); k++) {
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 670a8de667494c655bed15aa3e4ce8265448635a..c3a2dbf9d18f80f93381d485a3870e43411bd992 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -74,13 +74,134 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
   node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
 }
 
+// This is a simple representation of a graph.
+// The BriefNode hold the pointer of the Node.
+// This is to avoid changing the original graph
+// in the process of trt graph analysis.
+struct BriefNode {
+  explicit BriefNode(Node *n) { node = n; }
+  Node *node;
+  std::vector<BriefNode *> inlinks;
+  std::vector<BriefNode *> outlinks;
+};
+
+// Union two adjacent BriefNode.
+// Suppose we have two adjacent nodes src and dst.
+// We will perform the following operations:
+// 1. add all inputs(except src) of dst to src inlinks.
+// 2. add all outputs of dst to src outlinks.
+// 3. change all the dst's inputs and outputs
+// corresponding inlinks and outlinks to src node.
+// 4. delete all dst's inlinks and outlinks.
+void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
+                          int src_id, int dst_id) {
+  // merge the two adjacent nodes into one node.
+  BriefNode *src_node = node_map.at(src_id);
+  BriefNode *dst_node = node_map.at(dst_id);
+
+  std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(),
+                                         src_node->inlinks.end());
+  std::unordered_set<BriefNode *> outputs;
+
+  for (auto *n : src_node->outlinks) {
+    if (n != dst_node) outputs.insert(n);
+  }
+
+  // Add the inlinks and outlinks of dst node to src node.
+  std::vector<BriefNode *> dst_in_nodes = dst_node->inlinks;
+  for (BriefNode *node : dst_in_nodes) {
+    if (node != src_node) {
+      inputs.insert(node);
+    }
+  }
+
+  std::vector<BriefNode *> dst_out_nodes = dst_node->outlinks;
+  for (BriefNode *node : dst_out_nodes) {
+    outputs.insert(node);
+  }
+
+  // update the dst and src node's inlinks and outlinks.
+  src_node->inlinks =
+      std::move(std::vector<BriefNode *>(inputs.begin(), inputs.end()));
+  src_node->outlinks =
+      std::move(std::vector<BriefNode *>(outputs.begin(), outputs.end()));
+  dst_node->inlinks.clear();
+  dst_node->outlinks.clear();
+
+  auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
+    for (auto *&n : nodes) {
+      if (n == src_node || n == dst_node) {
+        n = src_node;
+      }
+    }
+  };
+  // Change all the dst inputs and outputs corresponding inlink and
+  // outlink to the src node.
+  for (auto *node : src_node->inlinks) {
+    inlink_or_outlink_cleaner(node->outlinks);
+  }
+
+  for (auto *node : src_node->outlinks) {
+    inlink_or_outlink_cleaner(node->inlinks);
+  }
+}
+
+// FlexibleDFS
+// If reverse is true, do reverse dfs.
+// If enter func is not nullptr, calls enter(node) before visiting any children
+// of node.
+// If leave func not nullptr, calls leave(node) after visiting all parents of
+// node.
+void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
+                 const std::function<bool(const BriefNode *)> &enter,
+                 const std::function<bool(const BriefNode *)> &leave) {
+  typedef struct {
+    const BriefNode *node;
+    bool leave;
+  } FNode;
+
+  std::vector<FNode> stack;
+  for (auto &node : source) {
+    stack.push_back(FNode{node, false});
+  }
+  std::unordered_set<const BriefNode *> visited;
+  while (!stack.empty()) {
+    auto fnode = stack.back();
+    stack.pop_back();
+
+    if (fnode.leave) {
+      if (leave && !leave(fnode.node)) return;
+    }
+    if (visited.count(fnode.node)) continue;
+    visited.insert(fnode.node);
+
+    if (enter && !enter(fnode.node)) return;
+
+    if (leave) stack.push_back(FNode{fnode.node, true});
+    const std::vector<BriefNode *> iter_nodes =
+        reverse == true ? fnode.node->inlinks : fnode.node->outlinks;
+    for (const BriefNode *node : iter_nodes) {
+      if (!visited.count(node)) {
+        stack.push_back(FNode{node, false});
+      }
+    }
+  }
+}
+
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+  // Run the Extract algorithm to find all subgraphs.
   std::vector<Node *> marked_nodes;
+  //  We use brief_node_map to represent the original graph in order to avoid
+  //  changing the original graph.
+  std::unordered_map<int, BriefNode *> brief_node_map;
+
   for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
+    brief_node_map[node.id()] = new BriefNode(&node);
     if (node.attr(kMarkerAttrName).Bool()) {
       marked_nodes.push_back(&node);
     }
   }
+
   // extract sub-graphs in the marked node set, use Union Find algorithm.
   node_map_t node_map;  // id to ptr
   for (auto *n : marked_nodes) {
@@ -88,11 +209,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
     n->attr(kUnionFindParent).Int32() = n->id();
     node_map[n->id()] = n;
   }
-  std::unordered_set<Node *> visited;
-  for (auto *n : marked_nodes) {
-    for (auto *out : n->outlinks) {
-      if (node_map.count(out->id())) {
-        UnionFindCombine(node_map, n->id(), out->id());
+
+  // create breif node map
+  for (auto &itr : brief_node_map) {
+    for (Node *node : itr.second->node->inlinks) {
+      itr.second->inlinks.push_back(brief_node_map[node->id()]);
+    }
+
+    for (Node *node : itr.second->node->outlinks) {
+      itr.second->outlinks.push_back(brief_node_map[node->id()]);
+    }
+  }
+
+  for (auto &itr : brief_node_map) {
+    BriefNode *brief_node = itr.second;
+
+    if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
+      VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
+      continue;
+    }
+
+    //  Our algorithm must guarantee that:
+    //  1. The graph is always directed acyclic graph（DAG）.
+    //  2. If there is a path in the subgraph from X to Y (X and Y are both
+    //  nodes in the subgraph), then all paths from X to Y are in the
+    //  subgraph.
+    //
+    //  In order to achieve the above guarantee.
+    //  For adjacent nodes src -> dst.
+    //  1. Get all dst input nodes except src.
+    //  2. Reverse DFS from those input nodes
+    //  3. If there is a path from input nodes to src,
+    //  then the src and dst nodes can not be fused into one node,
+    //  otherwise it can be done.
+
+    while (true) {
+      std::unordered_set<BriefNode *> contract_nodes;
+      for (auto *out : brief_node->outlinks) {
+        // must be an trt candidate
+        if (!out->node->attr(kMarkerAttrName).Bool()) continue;
+        // get all dst input nodes except src.
+        std::vector<BriefNode *> source_nodes;
+        for (auto *n : out->inlinks) {
+          if (n != brief_node) {
+            source_nodes.push_back(n);
+          }
+        }
+
+        // Reverse DFS from the source_nodes.
+        bool have_excess_path = false;
+        FlexibleDFS(source_nodes, true, nullptr,
+                    [&have_excess_path, brief_node](const BriefNode *n) {
+                      if (n == brief_node) {
+                        have_excess_path = true;
+                        return false;
+                      }
+                      return true;
+                    });
+        if (have_excess_path) continue;
+        contract_nodes.insert(out);
+      }
+      if (contract_nodes.empty()) break;
+
+      for (auto dst_node : contract_nodes) {
+        UnionFindCombine(node_map, brief_node->node->id(),
+                         dst_node->node->id());
+        UnionContractedNodes(brief_node_map, brief_node->node->id(),
+                             dst_node->node->id());
       }
     }
   }
@@ -128,6 +311,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
     auto io = ExtractInputAndOutputOfSubGraph(subgraph);
     block_node->inlinks = std::move(io.first);
     block_node->outlinks = std::move(io.second);
+
     for (auto *node : subgraph) {
       // TODO(Superjomn) need a unified mechanism to treat deleted node in each
       // pass.
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
index 39cc433b40fad17f4f12359d4e907a250a88bd63..531a170512f727d891aa6644ee08a60c25f16876 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) {
 
   // At least one nodes should be deleted.
   ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
-  ASSERT_EQ(6, count1);
+  ASSERT_EQ(11, count1);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index e1cace9cc1b06f036f52e82b7b86c99a02d50f50..8168cdff1b85fc05d22fbec7fac6ab8892f3a907 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter {
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         nvinfer1::ActivationType::kRELU);
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("relu (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
index 94f8b0ae5606d39a722ffe28501645c9b6fc5d2e..3330af2da6c97ad153dcecd86be4b441eac62b5e 100644
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter {
                              scale_weights.get(), power_weights.get());
 
     auto output_name = op_desc.Output("Y").front();
+    layer->setName(("batch_norm (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->weight_map[op_desc.Input("Bias").front()] =
         std::move(combile_bias_tensor);
     engine_->weight_map[op_desc.Input("Scale").front()] =
diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
index bb9627bf957b63993b2c8d23e7ec8122eb004eaf..a11dfa1e8f2dacfad067d025678911200db500fb 100644
--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter {
     axis = axis - 1;  // Remove batch dim
     layer->setAxis(axis);
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("concat (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 841a95db38ce7cf0cb5961ff04cb569ee2633e6f..0a37d3968c39d2c244bbd82161afddf6330e421d 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter {
     layer->setNbGroups(groups);
 
     auto output_name = op_desc.Output("Output").front();
+    layer->setName(("conv2d (Output: " + output_name + ")").c_str());
     engine_->weight_map[op_desc.Input("Filter").front()] =
         std::move(weight_tensor);
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {
       engine_->DeclareOutput(output_name);
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index 60a72b4eb5c75b5cd12305f13763a9a1a567213f..0a6ce568f194f03c7259e1ebf28dd6ce4df2d594 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
         shift_weights.get(), scale_weights.get(), power_weights.get());
     auto output_name = op_desc.Output("Out")[0];
 
+    layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
@@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter {
         *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
 
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("elementwise (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {  // the test framework can not determine which is the
                       // output, so place the declaration inside.
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index ad98d85aae9cf594922aca00c43718ccfbce2278..7c21ecd95da07b498eed2ab1bbdcc0e8cd184787 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter {
                                        n_output, tmp_weight.get(), bias.get());
 
     auto output_name = op_desc.Output("Out").front();
+    layer->setName(("fc (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
     if (test_mode) {
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 73f1b28ddf73403862e55d102a259d7b6cf67b1f..f9bb66a6e9f81a10368db7710108c319860e940a 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter {
     layer->setPadding(nv_paddings);
 
     auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("pool2d (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
     if (test_mode) {
       engine_->DeclareOutput(output_name);
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index bc556ab3643cefa3e45d2a8a3835937753af723f..79e75ea9a035b654f0bb7026d3a491bebe0b23c4 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
           fluid_t->mutable_data<float>(platform::CUDAPlace(
               boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
           size * sizeof(float));
-      //} else {
-      // engine->GetOutputInGPU(
-      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
-      // size * sizeof(float));
-      //}
+
       output_index += 1;
     }