提交 71636e67 编写于 作者: N nhzlx

add min_subgraph_size attr to tensorrt config

test=develop
上级 a6aa8ea7
......@@ -127,6 +127,7 @@ struct Argument {
std::function<bool(const framework::ir::Node*)>);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
// The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
......
......@@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument,
argument->tensorrt_node_teller_ptr());
pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size",
new int(argument->tensorrt_min_subgraph_size()));
}
// graph_ = pass->Apply(std::move(graph_));
......
......@@ -38,7 +38,8 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
auto teller =
Get<SubgraphDetector::NodeInsideSubgraphTeller>("tensorrt_node_teller");
SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/);
SubGraphFuser fuser(graph.get(), teller,
Get<int>("min_subgraph_size") /*min subgraph size*/);
fuser();
for (auto *node : graph->Nodes()) {
......@@ -233,4 +234,5 @@ REGISTER_PASS(tensorrt_subgraph_pass,
paddle::inference::analysis::TensorRtSubgraphPass)
.RequirePassAttr("tensorrt_node_teller")
.RequirePassAttr("max_batch_size")
.RequirePassAttr("workspace_size");
.RequirePassAttr("workspace_size")
.RequirePassAttr("min_subgraph_size");
......@@ -57,6 +57,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
if (use_gpu) {
......@@ -89,6 +90,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_);
......@@ -105,11 +107,13 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
}
void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
int max_batch_size) {
int max_batch_size,
int min_subgraph_size) {
use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
// Append after the infer_clean pass.
tensorrt_min_subgraph_size_ = min_subgraph_size;
// Append after the conv+affine_channel fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
......
......@@ -328,6 +328,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
}
if (config_.use_mkldnn_) {
......
......@@ -49,7 +49,7 @@ struct AnalysisConfig : public NativeConfig {
bool use_feed_fetch_ops{true};
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1);
int max_batch_size = 1, int min_subgraph_size = 3);
bool use_tensorrt() const { return use_tensorrt_; }
void EnableMKLDNN();
......@@ -69,8 +69,19 @@ struct AnalysisConfig : public NativeConfig {
bool use_tensorrt_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
int tensorrt_workspace_size_;
// While TensorRT allows an engine optimized for a given max batch size
// to run at any smaller size, the performance for those smaller
// sizes may not be as well-optimized. Therefore, Max batch is best
// equivalent to the runtime batch size.
int tensorrt_max_batchsize_;
// We transform the Ops that can be converted into TRT layer in the model,
// and aggregate these Ops into subgraphs for TRT execution.
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
std::unique_ptr<PassStrategy> pass_builder_;
bool model_from_memory_{false};
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册