diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 100a7504b8526b3587858dd7783913757ba09895..8c7d58678fd29cb25d13d64a08e6c6f26f242d8b 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT } return false; }; + for (auto &node : graph) { for (auto *in : node->inlinks) { // The Value that is written by nodes inside a sub-graph shouldn't be the @@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::vector(outputs.begin(), outputs.end())); } +// Filter the Intermediate results of the subgraph node. void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { std::vector op_nodes; for (auto &node : GraphTraits(*graph).nodes_in_TS()) { @@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { for (auto *out : op_nodes[i]->outlinks) { if (follow_up_input_names.count(out->name())) { filtered_subgraph_outlinks.push_back(out); + } else { + out->SetDeleted(); } } - PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); + // The filtered_subgraph_outlinks may be empty. op_nodes[i]->outlinks = filtered_subgraph_outlinks; } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 8579845d51e80d73d220465d25b70944f5ad9bf2..5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, // collect inputs std::unordered_set input_names; + std::unordered_set input_names_with_id; for (auto *x : func->inlinks) { input_names.insert(x->name()); + input_names_with_id.insert(x->name() + std::to_string(x->id())); } desc.SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); std::unordered_set output_names; + std::unordered_set output_names_with_id; for (auto *x : func->outlinks) { output_names.insert(x->name()); + output_names_with_id.insert(x->name() + std::to_string(x->id())); } - std::vector output_temp(output_names.begin(), - output_names.end()); - desc.SetOutput("Ys", output_temp); + desc.SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); desc.SetType("tensorrt_engine"); std::unordered_map output_name_map; @@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, std::vector replaced_names; for (int k = 0; k < in_var->arguments_size(); k++) { std::string arg_value = in_var->arguments(k); - if (input_names.count(arg_value)) { + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { replaced_names.push_back(arg_value); } else { - replaced_names.push_back(arg_value + - std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } } in_var->clear_arguments(); @@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, std::vector replaced_names; for (int k = 0; k < out_var->arguments_size(); k++) { std::string arg_value = out_var->arguments(k); - if (output_names.count(arg_value)) { - output_name_map[arg_value] = - arg_value + std::to_string(var2id[arg_value]); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id.count(arg_value_with_id)) { + output_name_map[arg_value] = arg_value_with_id; } - replaced_names.push_back(arg_value + std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } out_var->clear_arguments(); for (size_t k = 0; k < replaced_names.size(); k++) { diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 670a8de667494c655bed15aa3e4ce8265448635a..c3a2dbf9d18f80f93381d485a3870e43411bd992 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -74,13 +74,134 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; } +// This is a simple representation of a graph. +// The BriefNode hold the pointer of the Node. +// This is to avoid changing the original graph +// in the process of trt graph analysis. +struct BriefNode { + explicit BriefNode(Node *n) { node = n; } + Node *node; + std::vector inlinks; + std::vector outlinks; +}; + +// Union two adjacent BriefNode. +// Suppose we have two adjacent nodes src and dst. +// We will perform the following operations: +// 1. add all inputs(except src) of dst to src inlinks. +// 2. add all outputs of dst to src outlinks. +// 3. change all the dst's inputs and outputs +// corresponding inlinks and outlinks to src node. +// 4. delete all dst's inlinks and outlinks. +void UnionContractedNodes(const std::unordered_map &node_map, + int src_id, int dst_id) { + // merge the two adjacent nodes into one node. + BriefNode *src_node = node_map.at(src_id); + BriefNode *dst_node = node_map.at(dst_id); + + std::unordered_set inputs(src_node->inlinks.begin(), + src_node->inlinks.end()); + std::unordered_set outputs; + + for (auto *n : src_node->outlinks) { + if (n != dst_node) outputs.insert(n); + } + + // Add the inlinks and outlinks of dst node to src node. + std::vector dst_in_nodes = dst_node->inlinks; + for (BriefNode *node : dst_in_nodes) { + if (node != src_node) { + inputs.insert(node); + } + } + + std::vector dst_out_nodes = dst_node->outlinks; + for (BriefNode *node : dst_out_nodes) { + outputs.insert(node); + } + + // update the dst and src node's inlinks and outlinks. + src_node->inlinks = + std::move(std::vector(inputs.begin(), inputs.end())); + src_node->outlinks = + std::move(std::vector(outputs.begin(), outputs.end())); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); + + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (n == src_node || n == dst_node) { + n = src_node; + } + } + }; + // Change all the dst inputs and outputs corresponding inlink and + // outlink to the src node. + for (auto *node : src_node->inlinks) { + inlink_or_outlink_cleaner(node->outlinks); + } + + for (auto *node : src_node->outlinks) { + inlink_or_outlink_cleaner(node->inlinks); + } +} + +// FlexibleDFS +// If reverse is true, do reverse dfs. +// If enter func is not nullptr, calls enter(node) before visiting any children +// of node. +// If leave func not nullptr, calls leave(node) after visiting all parents of +// node. +void FlexibleDFS(const std::vector &source, bool reverse, + const std::function &enter, + const std::function &leave) { + typedef struct { + const BriefNode *node; + bool leave; + } FNode; + + std::vector stack; + for (auto &node : source) { + stack.push_back(FNode{node, false}); + } + std::unordered_set visited; + while (!stack.empty()) { + auto fnode = stack.back(); + stack.pop_back(); + + if (fnode.leave) { + if (leave && !leave(fnode.node)) return; + } + if (visited.count(fnode.node)) continue; + visited.insert(fnode.node); + + if (enter && !enter(fnode.node)) return; + + if (leave) stack.push_back(FNode{fnode.node, true}); + const std::vector iter_nodes = + reverse == true ? fnode.node->inlinks : fnode.node->outlinks; + for (const BriefNode *node : iter_nodes) { + if (!visited.count(node)) { + stack.push_back(FNode{node, false}); + } + } + } +} + std::vector> SubGraphSplitter::ExtractSubGraphs() { + // Run the Extract algorithm to find all subgraphs. std::vector marked_nodes; + // We use brief_node_map to represent the original graph in order to avoid + // changing the original graph. + std::unordered_map brief_node_map; + for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { + brief_node_map[node.id()] = new BriefNode(&node); if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } } + // extract sub-graphs in the marked node set, use Union Find algorithm. node_map_t node_map; // id to ptr for (auto *n : marked_nodes) { @@ -88,11 +209,73 @@ std::vector> SubGraphSplitter::ExtractSubGraphs() { n->attr(kUnionFindParent).Int32() = n->id(); node_map[n->id()] = n; } - std::unordered_set visited; - for (auto *n : marked_nodes) { - for (auto *out : n->outlinks) { - if (node_map.count(out->id())) { - UnionFindCombine(node_map, n->id(), out->id()); + + // create breif node map + for (auto &itr : brief_node_map) { + for (Node *node : itr.second->node->inlinks) { + itr.second->inlinks.push_back(brief_node_map[node->id()]); + } + + for (Node *node : itr.second->node->outlinks) { + itr.second->outlinks.push_back(brief_node_map[node->id()]); + } + } + + for (auto &itr : brief_node_map) { + BriefNode *brief_node = itr.second; + + if (!brief_node->node->attr(kMarkerAttrName).Bool()) { + VLOG(4) << brief_node->node->id() << " node not a trt candicate."; + continue; + } + + // Our algorithm must guarantee that: + // 1. The graph is always directed acyclic graph(DAG). + // 2. If there is a path in the subgraph from X to Y (X and Y are both + // nodes in the subgraph), then all paths from X to Y are in the + // subgraph. + // + // In order to achieve the above guarantee. + // For adjacent nodes src -> dst. + // 1. Get all dst input nodes except src. + // 2. Reverse DFS from those input nodes + // 3. If there is a path from input nodes to src, + // then the src and dst nodes can not be fused into one node, + // otherwise it can be done. + + while (true) { + std::unordered_set contract_nodes; + for (auto *out : brief_node->outlinks) { + // must be an trt candidate + if (!out->node->attr(kMarkerAttrName).Bool()) continue; + // get all dst input nodes except src. + std::vector source_nodes; + for (auto *n : out->inlinks) { + if (n != brief_node) { + source_nodes.push_back(n); + } + } + + // Reverse DFS from the source_nodes. + bool have_excess_path = false; + FlexibleDFS(source_nodes, true, nullptr, + [&have_excess_path, brief_node](const BriefNode *n) { + if (n == brief_node) { + have_excess_path = true; + return false; + } + return true; + }); + if (have_excess_path) continue; + contract_nodes.insert(out); + } + if (contract_nodes.empty()) break; + + for (auto dst_node : contract_nodes) { + UnionFindCombine(node_map, brief_node->node->id(), + dst_node->node->id()); + UnionContractedNodes(brief_node_map, brief_node->node->id(), + dst_node->node->id()); } } } @@ -128,6 +311,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); + for (auto *node : subgraph) { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 39cc433b40fad17f4f12359d4e907a250a88bd63..531a170512f727d891aa6644ee08a60c25f16876 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) { // At least one nodes should be deleted. ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock - ASSERT_EQ(6, count1); + ASSERT_EQ(11, count1); } } // namespace analysis diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index e1cace9cc1b06f036f52e82b7b86c99a02d50f50..8168cdff1b85fc05d22fbec7fac6ab8892f3a907 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter { engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("relu (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 94f8b0ae5606d39a722ffe28501645c9b6fc5d2e..3330af2da6c97ad153dcecd86be4b441eac62b5e 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); + layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Bias").front()] = std::move(combile_bias_tensor); engine_->weight_map[op_desc.Input("Scale").front()] = diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index bb9627bf957b63993b2c8d23e7ec8122eb004eaf..a11dfa1e8f2dacfad067d025678911200db500fb 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter { axis = axis - 1; // Remove batch dim layer->setAxis(axis); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("concat (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 841a95db38ce7cf0cb5961ff04cb569ee2633e6f..0a37d3968c39d2c244bbd82161afddf6330e421d 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter { layer->setNbGroups(groups); auto output_name = op_desc.Output("Output").front(); + layer->setName(("conv2d (Output: " + output_name + ")").c_str()); engine_->weight_map[op_desc.Input("Filter").front()] = std::move(weight_tensor); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 60a72b4eb5c75b5cd12305f13763a9a1a567213f..0a6ce568f194f03c7259e1ebf28dd6ce4df2d594 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the @@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter { *const_cast(Y), op_pair->second); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index ad98d85aae9cf594922aca00c43718ccfbce2278..7c21ecd95da07b498eed2ab1bbdcc0e8cd184787 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter { n_output, tmp_weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); + layer->setName(("fc (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); if (test_mode) { diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 73f1b28ddf73403862e55d102a259d7b6cf67b1f..f9bb66a6e9f81a10368db7710108c319860e940a 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter { layer->setPadding(nv_paddings); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index bc556ab3643cefa3e45d2a8a3835937753af723f..79e75ea9a035b654f0bb7026d3a491bebe0b23c4 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel { fluid_t->mutable_data(platform::CUDAPlace( boost::get(context.GetPlace()).device)), size * sizeof(float)); - //} else { - // engine->GetOutputInGPU( - // y, fluid_t->mutable_data(platform::CUDAPlace()), - // size * sizeof(float)); - //} + output_index += 1; }