diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 98bdfcc00b9f0e8f40dfc92e4021b2bd6fb19313..c4ab26a2288bb9d8f3cd54a797d2062e0606b219 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -24,7 +24,7 @@
 namespace paddle {
 
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
             "Enable subgraph to TensorRT engine for acceleration");
 
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
@@ -42,10 +42,19 @@ class DfgPassManagerImpl final : public DfgPassManager {
     // TODO(Superjomn) set the key with pass reprs.
     AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
     if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
-      auto trt_teller = [](const Node* node) {
+      auto trt_teller = [&](const Node* node) {
+        std::unordered_set<std::string> teller_set(
+            {"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
         if (!node->IsFunction()) return false;
-        return static_cast<const Function*>(node)->func_type() == "mul";
+
+        const auto* func = static_cast<const Function*>(node);
+        if (teller_set.count(func->func_type()))
+          return true;
+        else {
+          return false;
+        }
       };
+
       AddPass("tensorrt-subgraph-marker",
               new TensorRTSubgraphNodeMarkPass(trt_teller));
       AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
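The teller above is just a predicate over analysis-graph nodes: a Function node is marked for the TensorRT subgraph when its op type is on the whitelist. A minimal standalone sketch of that logic (hypothetical names, not Paddle's API); note that a static set is built once, whereas the lambda above rebuilds the whitelist on every call:

    #include <string>
    #include <unordered_set>

    // Whitelist predicate analogous to trt_teller; `op_type` stands in for
    // Function::func_type(). The static local is constructed only once.
    static bool IsTrtSupportedOp(const std::string& op_type) {
      static const std::unordered_set<std::string> kTellerSet{
          "elementwise_add", "mul", "conv2d", "pool2d", "relu"};
      return kTellerSet.count(op_type) > 0;
    }
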
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 2328d870422c5a31c22d7b09980aae35e01b2b25..aaf7ca67011fb7bd4a74f6d8f57317594c528ca4 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -23,7 +23,7 @@
 namespace paddle {
 namespace inference {
 
-DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
 DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
 
 namespace analysis {
@@ -87,34 +87,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
 }
 
 void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
-                       const framework::proto::BlockDesc &block) {
+                       framework::proto::BlockDesc *block) {
   static int counter{0};
   PADDLE_ENFORCE(node->IsFunctionBlock());
   framework::OpDesc desc;
   auto *func = static_cast<FunctionBlock *>(node);
 
   // collect inputs
-  std::vector<std::string> io;
+  std::unordered_set<std::string> input_names;
   for (auto *x : func->inlinks) {
-    io.push_back(x->name());
+    input_names.insert(x->name());
   }
-  desc.SetInput("Xs", io);
+  desc.SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
-  // collect outputs
-  io.clear();
+  std::unordered_set<std::string> output_names;
   for (auto *x : func->outlinks) {
-    io.push_back(x->name());
+    output_names.insert(x->name());
   }
-  desc.SetOutput("Ys", io);
+
+  std::vector<std::string> output_temp(output_names.begin(),
+                                       output_names.end());
+  desc.SetOutput("Ys", output_temp);
   desc.SetType("tensorrt_engine");
 
-  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
+  std::unordered_map<std::string, std::string> output_name_map;
+
+  // The following procedure is used to rename all the intermediate
+  // variables and the output variables of the subgraph.
+  // Why do we do this?
+  // During the transition from fluid OP to tensorrt OP, we map
+  // the input and output Tensor (fluid data structure) of a fluid OP
+  // to the corresponding ITensor (trt data structure) through the
+  // Tensor name. When we set up an ITensor for a variable, we must
+  // ensure that it has not been set before.
+  // If a variable in the fluid graph is not only the input of one OP
+  // but also the output of another OP, there will be problems.
+  // So we have to rename the variables in the subgraph to make sure
+  // each one is either an OP's input or an OP's output, never both.
+
+  auto subgraph_nodes = func->subgraph;
+  for (int index = 0; index < block->ops_size(); index++) {
+    framework::proto::OpDesc *op = block->mutable_ops(index);
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
+
+    std::unordered_map<std::string, int> var2id;
+    for (auto *in_var : correspond_node->inlinks) {
+      var2id[in_var->name()] = in_var->id();
+    }
+    // rename the input variables of the op inside the subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {
+        std::string arg_value = in_var->arguments(k);
+        if (input_names.count(arg_value)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value +
+                                   std::to_string(var2id[arg_value]));
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outlinks) {
+      var2id[out_var->name()] = out_var->id();
+    }
+
+    // rename the output variables of the op inside the subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        if (output_names.count(arg_value)) {
+          output_name_map[arg_value] =
+              arg_value + std::to_string(var2id[arg_value]);
+        }
+        replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+  // After the tensorrt engine has run, output_mapping helps us copy
+  // the data from the renamed ITensors back to the original output
+  // Tensors.
+  std::vector<std::string> output_mapping;
+  for (auto name : output_names) {
+    PADDLE_ENFORCE(output_name_map.count(name) != 0);
+    output_mapping.push_back(output_name_map[name]);
+  }
+
+  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
   // Set attrs
-  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
+  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
   SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
   SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
   SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
+  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
   node->SetPbMsg(desc.Proto()->SerializeAsString());
 }
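The renaming rule used by CreateTrtEngineOp is simple: append the producing node's id to the variable name, so no name can appear as both one op's output and another op's input. A sketch under hypothetical names:

    #include <string>

    // Illustrative only: a subgraph variable "fc_out" produced by the node
    // with id 7 becomes "fc_out7". For subgraph outputs, output_name_map
    // records {"fc_out" -> "fc_out7"} so the engine op can copy results
    // back to the original fluid variable.
    std::string RenameForSubgraph(const std::string& var_name, int node_id) {
      return var_name + std::to_string(node_id);
    }
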
@@ -146,15 +225,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
     LOG(INFO) << "transformed variable size: "
               << block_desc.Proto()->vars().size();
     // copy ops.
+
     for (auto *node : block_node->subgraph) {
       auto *op = block_desc.AppendOp();
       PADDLE_ENFORCE(!node->pb_msg().empty());
       op->Proto()->ParseFromString(node->pb_msg());
     }
+
     *block_desc.Proto()->mutable_vars() =
         argument_->origin_program_desc->blocks(0).vars();
     PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-    CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
+    CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
     auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
     auto *op = main_block->add_ops();
     PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 389f9e1a9148a4daf0e5b751cce5cb6325252a4e..80809d4c43ca08298bad25cf614dcb4117d3f99a 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
 
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
   std::vector<Node *> marked_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
     if (node.attr(kMarkerAttrName).Bool()) {
       marked_nodes.push_back(&node);
     }
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index d86c046f2e5b08a4c00cf6cad19627e6a196c798..8f42a37cd3f8978b917b42e8f45a128b8422aa57 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,6 +1,7 @@
 # Add TRT tests
 nv_library(tensorrt_converter
   SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
+activation_op.cc
   DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 1b6a0ad82f3ceb00cec15c28c8121adc22271b7a..41faaf7212accaaec238062b1340e8da8fa6be33 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -55,7 +55,6 @@ class OpConverter {
         it = Registry<OpConverter>::Lookup("fc");
       }
     }
-
     if (op_desc.Type().find("elementwise") != std::string::npos) {
       static std::unordered_set<std::string> add_tensor_op_set{
           "add", "mul", "sub", "div", "max", "min", "pow"};
@@ -72,6 +71,8 @@ class OpConverter {
                        "Unsupported elementwise type" + op_type);
         it =
             Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
+        PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                                op_desc.Type());
       } else {
         PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
                        "Unsupported elementwise type" + op_type);
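The elementwise branch above composes the registry key from the op type; the "_weight" suffix selects the converter used when the second operand is a persistable weight. A sketch of that key composition (hypothetical helper; the "_tensor" suffix for the tensor-tensor branch is an assumption, it is not visible in this diff):

    #include <string>

    // Hypothetical mirror of the Lookup key built above:
    // ("add", /*y_is_weight=*/true) -> "elementwise_add_weight".
    // The "_tensor" alternative is assumed for the non-weight branch.
    std::string ElementwiseConverterKey(const std::string& op_type,
                                        bool y_is_weight) {
      return "elementwise_" + op_type + (y_is_weight ? "_weight" : "_tensor");
    }
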
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 1172822e12222ded219104e3bad2613d30f891b8..ee3078876c15b06a887064f08dc0c05d450b5f77 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -55,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                     "TensorRT's tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT's tensor input requires at most 4 dimensions");
-
-  switch (shape.size()) {
-    case 2:
-      return nvinfer1::Dims2(1, shape[1]);
-    case 3:
-      return nvinfer1::Dims3(1, shape[1], shape[2]);
-    case 4:
-      return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
-    default:
-      return nvinfer1::Dims();
-  }
-  return nvinfer1::Dims();
+  PADDLE_ENFORCE_EQ(shape.size(), 4UL);
+  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
 }
 
 }  // namespace
@@ -86,6 +76,9 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
     parameters.insert(param);
   }
 
+  std::vector<std::string> output_maps =
+      context.Attr<std::vector<std::string>>("output_name_mapping");
+
   // TODO(Superjomn) replace this with a different stream
   auto *engine = Singleton<TRT_EngineManager>::Global().Create(
       max_batch, max_workspace, nullptr /*engine hold its own stream*/,
@@ -97,6 +90,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
   // Add inputs
   VLOG(4) << "declare inputs";
   for (auto &input : context.Inputs("Xs")) {
+    if (parameters.count(input)) continue;
     VLOG(4) << "declare input " << input;
     auto *var = block.FindVar(input);
     // TensorRT engine needs to create parameters. The parameter's description
@@ -122,7 +116,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
       block_desc, parameters, context.scope(), engine);
 
   // Add outputs
-  for (auto &output : context.Outputs("Ys")) {
+  for (auto &output : output_maps) {
     engine->DeclareOutput(output);
   }
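Replacing the switch makes the batch handling explicit: in TensorRT's implicit-batch mode an input's dims describe a single sample, so a fluid shape [N, C, H, W] becomes CHW dims and N is supplied separately at engine->Execute(batch_size) time. A self-contained sketch of that contract (hypothetical types, no nvinfer1 dependency):

    #include <cstdint>
    #include <vector>

    // [N, C, H, W] -> (C, H, W); the batch dimension N is deliberately
    // dropped, matching the PADDLE_ENFORCE_EQ(shape.size(), 4UL) above.
    struct ChwDims {
      int64_t c, h, w;
    };

    ChwDims ToCHW(const std::vector<int64_t>& shape) {
      return ChwDims{shape[1], shape[2], shape[3]};
    }
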
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 32d10fd8a5687ebaae1d7d75af531cbc45ef4245..2cbe1213a2f428a3ce56b06f97636baeb4b66c26 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -66,8 +66,17 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
                       context.Attr<int>("max_batch"));
 
+    std::vector<std::string> output_maps =
+        context.Attr<std::vector<std::string>>("output_name_mapping");
+
+    auto params = context.Attr<std::vector<std::string>>("parameters");
+    std::unordered_set<std::string> parameters;
+    for (const auto& param : params) {
+      parameters.insert(param);
+    }
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
+      if (parameters.count(x)) continue;
       // convert input and copy to TRT engine's buffer
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
           context.scope(), x);
@@ -82,10 +91,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 
     // Execute the engine.
     PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
     engine->Execute(FLAGS_tensorrt_engine_batch_size);
+    // Convert output tensor from engine to fluid
+    int output_index = 0;
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@@ -102,7 +113,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       // TODO(Superjomn) change this float to dtype size.
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
                   FLAGS_tensorrt_engine_batch_size;
-      engine->GetOutputInCPU(y,
+      engine->GetOutputInCPU(output_maps[output_index],
                              fluid_t->mutable_data<float>(platform::CPUPlace()),
                              size * sizeof(float));
       //} else {
@@ -110,6 +121,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       //    y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
       //    size * sizeof(float));
       //}
+      output_index += 1;
     }
 
     cudaStreamSynchronize(*engine->stream());
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 7cb1e47a1516c32fb31a7818e7203b498e31e431..37657fa0b0498986fe67027415279af1775e58b9 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -103,6 +103,9 @@ TEST(TensorRTEngineOp, manual) {
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
+                                    "output_name_mapping",
+                                    std::vector<std::string>({"z0"}));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
@@ -196,6 +199,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
       std::vector<std::string>({"y0", "y1", "y2", "y3"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
 
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
+                                    "output_name_mapping",
+                                    std::vector<std::string>({"z3"}));
+
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
 
   // Execute them.
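One invariant the kernel above relies on but never checks directly: context.Outputs("Ys") and the "output_name_mapping" attribute correspond by position, since output_index advances once per output. A sketch of that pairing (illustrative, not Paddle's API):

    #include <cassert>
    #include <string>
    #include <vector>

    // The i-th renamed engine output feeds the i-th fluid output variable,
    // so the two lists must have equal length and matching order.
    void PairOutputs(const std::vector<std::string>& ys,
                     const std::vector<std::string>& output_maps) {
      assert(ys.size() == output_maps.size());
      for (size_t i = 0; i < ys.size(); ++i) {
        // copy engine ITensor output_maps[i] into fluid variable ys[i]
      }
    }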