Unverified commit 4a69a536, authored by Zhang Jun, committed by GitHub

[inference][trt]add trt sparse weights switch (#53562)

Parent 04e5e7b7
@@ -250,6 +250,9 @@ struct Argument {
                       TensorRtAllowBuildAtRuntime,
                       bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
+  DECL_ARGUMENT_FIELD(tensorrt_use_sparse_weights,
+                      TensorRtUseSparseWeights,
+                      bool);
   DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
   DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
...
@@ -213,6 +213,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));
+      pass->Set("use_sparse_weights",
+                new bool(argument->tensorrt_use_sparse_weights()));
       // tuned trt dynamic_shape
       pass->Set("trt_shape_range_info_path",
...
@@ -523,6 +523,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
   op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
   op_desc->SetAttr("use_inspector", Get<bool>("use_inspector"));
+  op_desc->SetAttr("use_sparse_weights", Get<bool>("use_sparse_weights"));
   op_desc->SetAttr("model_precision", Get<int>("model_precision"));
   op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);
@@ -614,17 +615,14 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
     opt_input_shape = {};
   }
-  auto to_major_version = [&](int full_version) -> float {
-    return (full_version / 100) / 10.0;
-  };
-  const float compile_time_trt_version = to_major_version(TRT_VERSION);
-  const float run_time_trt_version =
-      to_major_version(tensorrt::GetInferLibVersion());
-  if (compile_time_trt_version != run_time_trt_version) {
+  const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION);
+  const float trt_runtime_version =
+      tensorrt::TrtMajorVersion(tensorrt::GetInferLibVersion());
+  if (trt_compile_version != trt_runtime_version) {
     LOG_FIRST_N(WARNING, 1)
         << "The Paddle Inference library is compiled with "
-        << compile_time_trt_version << " version TensorRT, "
-        << "but the runtime TensorRT you are using is " << run_time_trt_version
+        << trt_compile_version << " version TensorRT, "
+        << "but the runtime TensorRT you are using is " << trt_runtime_version
        << " version. "
           "This might cause serious compatibility issues. We strongly "
           "recommend using the same TRT version at runtime.";
@@ -666,6 +664,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
     trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
     trt_engine->SetDLACore(Get<int>("trt_dla_core"));
     trt_engine->SetUseInspector(Get<bool>("use_inspector"));
+    trt_engine->SetUseSparseWeights(Get<bool>("use_sparse_weights"));
     trt_engine->SetWithErnie(
         graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
         graph->Has(framework::ir::kMultiheadMatmulPass));
...
@@ -451,6 +451,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(collect_shape_range_info_);
   CP_MEMBER(shape_range_info_path_);
   CP_MEMBER(trt_use_inspector_);
+  CP_MEMBER(trt_use_sparse_weights_);
   CP_MEMBER(trt_engine_memory_sharing_);
   CP_MEMBER(trt_engine_memory_sharing_identifier_);
   // Dlnne related
@@ -805,6 +806,10 @@ void AnalysisConfig::EnableTensorRtDLA(int dla_core) {
 void AnalysisConfig::EnableTensorRtInspector() { trt_use_inspector_ = true; }
+void AnalysisConfig::EnableTensorRtSparseWeights() {
+  trt_use_sparse_weights_ = true;
+}
+
 void AnalysisConfig::Exp_DisableTensorRtOPs(
     const std::vector<std::string> &ops) {
   trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());
...
@@ -1358,6 +1358,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetTensorRtAllowBuildAtRuntime(
         config_.trt_allow_build_at_runtime());
     argument_->SetTensorRtUseInspector(config_.trt_use_inspector_);
+    argument_->SetTensorRtUseSparseWeights(config_.trt_use_sparse_weights_);
     argument_->SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
   }
...
@@ -742,6 +742,9 @@ struct PD_INFER_DECL AnalysisConfig {
   void EnableTensorRtInspector();
   bool tensorrt_inspector_enabled() { return trt_use_inspector_; }
+  void EnableTensorRtSparseWeights();
+  bool tensorrt_sparse_weights_enabled() { return trt_use_sparse_weights_; }
+
   void EnableDlnne(
       int min_subgraph_size = 3,
       int max_batch_size = 1,
@@ -1118,6 +1121,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // tune to get dynamic_shape info.
   bool trt_tuned_dynamic_shape_{false};
   bool trt_use_inspector_{false};
+  bool trt_use_sparse_weights_{false};
   // In CollectShapeInfo mode, we will collect the shape information of
   // all intermediate tensors in the compute graph and calculate the
...
@@ -207,12 +207,6 @@ void TensorRTEngine::FreezeNetwork() {
   infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
 #endif
-#if IS_TRT_VERSION_GE(8500)
-  infer_builder_config_->setPreviewFeature(
-      nvinfer1::PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805, true);
-#else
-#endif
-
   bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
   if (enable_fp16) {
     bool support_fp16 = infer_builder_->platformHasFastFp16();
@@ -363,6 +357,7 @@ void TensorRTEngine::FreezeNetwork() {
            "opt_shape, false /*disable_trt_plugin_fp16*/)'";
     }
   }
 #if IS_TRT_VERSION_GE(8200)
   if (use_inspector_) {
     infer_builder_config_->setProfilingVerbosity(
@@ -374,7 +369,9 @@ void TensorRTEngine::FreezeNetwork() {
   infer_engine_.reset(infer_builder_->buildEngineWithConfig(
       *network(), *infer_builder_config_));
 #else
-  infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
+  if (use_sparse_weights_) {
+    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
+  }
   ihost_memory_.reset(infer_builder_->buildSerializedNetwork(
       *network(), *infer_builder_config_));
   infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
...
@@ -739,6 +739,9 @@ class TensorRTEngine {
   void GetEngineInfo();
   void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
+  void SetUseSparseWeights(bool use_sparse_weights) {
+    use_sparse_weights_ = use_sparse_weights;
+  }
   void SetScope(const framework::Scope& scope) { scope_ = &scope; }
   void SetContextMemorySharing(bool context_memory_sharing) {
@@ -827,6 +830,7 @@ class TensorRTEngine {
 #endif
   std::mutex mutex_;
   bool use_inspector_;
+  bool use_sparse_weights_{false};
  public:
   thread_local static int predictor_id_per_thread;
...
@@ -96,6 +96,10 @@ static std::tuple<int, int, int> GetTrtCompileVersion() {
       NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH};
 }
+static float TrtMajorVersion(int full_version) {
+  return (full_version / 100) / 10.0;
+}
+
 template <typename T>
 struct Destroyer {
   void operator()(T* x) {
...
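For reference on the helper just added: with TensorRT's packed version encoding (roughly major * 1000 + minor * 100 + patch), a value such as 8501 maps to (8501 / 100) / 10.0 = 85 / 10.0 = 8.5, so the subgraph pass above now compares only the major.minor part of the compile-time and runtime versions and ignores patch-level differences.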
@@ -879,6 +879,10 @@ void BindAnalysisConfig(py::module *m) {
            &AnalysisConfig::EnableTensorRtInspector)
       .def("tensorrt_inspector_enabled",
            &AnalysisConfig::tensorrt_inspector_enabled)
+      .def("enable_tensorrt_sparse_weights",
+           &AnalysisConfig::EnableTensorRtSparseWeights)
+      .def("tensorrt_sparse_weights_enabled",
+           &AnalysisConfig::tensorrt_sparse_weights_enabled)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("enable_dlnne",
            &AnalysisConfig::EnableDlnne,
...
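For reference, a minimal sketch of driving the new binding from Python. The model path, GPU settings, and enable_tensorrt_engine arguments below are illustrative placeholders and not part of this patch; only enable_tensorrt_sparse_weights() and tensorrt_sparse_weights_enabled() are added by this change.

    import paddle.inference as paddle_infer

    config = paddle_infer.Config("./model_dir")  # hypothetical model directory
    config.enable_use_gpu(1000, 0)               # 1000 MB initial pool on GPU 0
    config.enable_tensorrt_engine(precision_mode=paddle_infer.PrecisionType.Half)
    config.enable_tensorrt_sparse_weights()      # switch added by this change
    assert config.tensorrt_sparse_weights_enabled()
    predictor = paddle_infer.create_predictor(config)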
@@ -84,6 +84,8 @@ class BackendPaddle:
         # enable memory optim
         if not self.args.enable_tune:
             config.enable_memory_optim()
+        if self.args.enable_trt_sparse_weights:
+            config.enable_tensorrt_sparse_weights()
         config.set_cpu_math_library_num_threads(self.args.cpu_threads)
         config.switch_ir_optim(True)
@@ -258,6 +260,9 @@ def parse_args():
     parser.add_argument('--enable_dynamic_shape', type=str2bool, default=True)
     parser.add_argument('--enable_tune', type=str2bool, default=False)
     parser.add_argument('--enable_profile', type=str2bool, default=False)
+    parser.add_argument(
+        '--enable_trt_sparse_weights', type=str2bool, default=False
+    )
     parser.add_argument('--enable_benchmark', type=str2bool, default=True)
     parser.add_argument('--save_result', type=str2bool, default=False)
     parser.add_argument('--return_result', type=str2bool, default=False)
@@ -308,6 +313,13 @@ def run_infer(model_path):
     backend.load(conf)
     backend.predict()
+
+    # run inference predictor, enable trt sparse weights
+    conf.enable_tune = False
+    conf.enable_trt_sparse_weights = True
+    backend = BackendPaddle()
+    backend.load(conf)
+    backend.predict()
 class ConvBNLayer(paddle.nn.Layer):
     def __init__(
...
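Usage note: with these changes, rerunning the benchmark script with --enable_trt_sparse_weights=True should exercise the new path end to end, since BackendPaddle reads the flag from parse_args() and calls config.enable_tensorrt_sparse_weights() before building the predictor.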