Commit 71636e67 authored by nhzlx

add min_subgraph_size attr to tensorrt config

test=develop
Parent a6aa8ea7
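
For orientation, a minimal usage sketch of the extended API. It assumes the contrib::AnalysisConfig interface as it stands after this commit; the model directory and the CreatePaddlePredictor call are illustrative and not part of this diff.

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::contrib::AnalysisConfig config;
  config.model_dir = "./model";  // hypothetical model directory
  config.use_gpu = true;

  // New third parameter: clusters with fewer than min_subgraph_size
  // convertible nodes stay on the native executor instead of TensorRT.
  config.EnableTensorRtEngine(1 << 20 /*workspace_size*/,
                              1 /*max_batch_size*/,
                              3 /*min_subgraph_size*/);

  auto predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);
  return predictor != nullptr ? 0 : 1;
}
```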
@@ -127,6 +127,7 @@ struct Argument {
                       std::function<bool(const framework::ir::Node*)>);
   DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
+  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
 
   // The program transformed by IR analysis phase.
   DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
......
@@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument,
                          argument->tensorrt_node_teller_ptr());
       pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
+      pass->Set("min_subgraph_size",
+                new int(argument->tensorrt_min_subgraph_size()));
     }
 
     // graph_ = pass->Apply(std::move(graph_));
......
@@ -38,7 +38,8 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
   auto teller =
       Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
 
-  SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/);
+  SubGraphFuser fuser(graph.get(), teller,
+                      Get<int>("min_subgraph_size") /*min subgraph size*/);
   fuser();
 
   for (auto *node : graph->Nodes()) {
@@ -233,4 +234,5 @@ REGISTER_PASS(tensorrt_subgraph_pass,
               paddle::inference::analysis::TensorRtSubgraphPass)
     .RequirePassAttr("tensorrt_node_teller")
     .RequirePassAttr("max_batch_size")
-    .RequirePassAttr("workspace_size");
+    .RequirePassAttr("workspace_size")
+    .RequirePassAttr("min_subgraph_size");
@@ -57,6 +57,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   use_tensorrt_ = other.use_tensorrt_;
   tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
   tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
   model_from_memory_ = other.model_from_memory_;
 
   if (use_gpu) {
@@ -89,6 +90,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
   use_tensorrt_ = other.use_tensorrt_;
   tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
   tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
   model_from_memory_ = other.model_from_memory_;
   pass_builder_ = std::move(other.pass_builder_);
@@ -105,11 +107,13 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
 }
 
 void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
-                                                   int max_batch_size) {
+                                                   int max_batch_size,
+                                                   int min_subgraph_size) {
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
-  // Append after the infer_clean pass.
+  tensorrt_min_subgraph_size_ = min_subgraph_size;
+  // Append after the conv+affine_channel fuse pass.
   pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
 }
......
@@ -328,6 +328,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetUseTensorRT(true);
     argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
+    argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
   }
 
   if (config_.use_mkldnn_) {
......
@@ -49,7 +49,7 @@ struct AnalysisConfig : public NativeConfig {
   bool use_feed_fetch_ops{true};
 
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
-                            int max_batch_size = 1);
+                            int max_batch_size = 1, int min_subgraph_size = 3);
   bool use_tensorrt() const { return use_tensorrt_; }
 
   void EnableMKLDNN();
@@ -69,8 +69,19 @@ struct AnalysisConfig : public NativeConfig {
   bool use_tensorrt_{false};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
+  // For guidance on choosing workspace_size, see:
+  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
   int tensorrt_workspace_size_;
+  // While TensorRT allows an engine optimized for a given max batch size
+  // to run at any smaller size, performance for those smaller sizes may
+  // not be as well-optimized. It is therefore best to set the max batch
+  // size equal to the runtime batch size.
   int tensorrt_max_batchsize_;
+  // Ops that can be converted into TensorRT layers are clustered into
+  // subgraphs that run as TensorRT engines. This variable controls the
+  // minimum number of nodes such a subgraph must contain; it defaults
+  // to 3.
   int tensorrt_min_subgraph_size_{3};
   std::unique_ptr<PassStrategy> pass_builder_;
   bool model_from_memory_{false};
 };
......
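
The header comments above describe the intent of each knob. As an illustrative sketch of the new threshold (the op counts here are hypothetical, not taken from this diff):

```cpp
// Suppose the node teller marks two clusters of convertible ops:
// one with 2 nodes, one with 6. With the default threshold of 3:
config.EnableTensorRtEngine(1 << 20, 1, 3);
// - the 2-node cluster stays on the native executor, since
//   engine-launch overhead would likely outweigh any TensorRT gain;
// - the 6-node cluster is fused into a single TensorRT engine.
// Lowering the threshold offloads the small cluster as well:
config.EnableTensorRtEngine(1 << 20, 1, 2);
```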