Unverified commit 638965c5, authored by denglin-github, committed by GitHub

Update DlNNE engine (#45027)

* add config param for enable_dlnne and support calibration mode
* remove useless file
* refine code and add annotation
* refine code of warning tips
Parent d7d9807e
@@ -250,6 +250,22 @@ struct Argument {
  DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
  DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
  DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int);
DECL_ARGUMENT_FIELD(dlnne_use_static_batch, DlnneUseStaticBatch, bool);
DECL_ARGUMENT_FIELD(dlnne_weight_share_mode,
DlnneWeightShareMode,
std::string);
DECL_ARGUMENT_FIELD(dlnne_disable_nodes_by_outputs,
DlnneDisableNodesByOutputs,
std::unordered_set<std::string>);
DECL_ARGUMENT_FIELD(dlnne_use_calib_mode, DlnneUseCalibMode, bool);
DECL_ARGUMENT_FIELD(dlnne_precision_mode,
DlnnePrecisionMode,
AnalysisConfig::Precision);
using dlnne_input_shape_type = std::map<std::string, std::vector<int64_t>>;
DECL_ARGUMENT_FIELD(dlnne_input_shape_dict,
DlnneInputShapeDict,
dlnne_input_shape_type);
  DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(lite_passes_filter,
...
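Each DECL_ARGUMENT_FIELD line above registers a typed field on Argument together with a getter and a setter. A simplified sketch of the pattern (illustrative only; Paddle's actual macro also performs validity tracking and other bookkeeping):

    // Simplified, hypothetical version of the field-declaration macro.
    #define DECL_ARGUMENT_FIELD(field__, Field, type__)      \
     private:                                                \
      type__ field__##_;                                     \
                                                             \
     public:                                                 \
      const type__& field__() const { return field__##_; }   \
      void Set##Field(const type__& v) { field__##_ = v; }

So argument_.SetDlnneMaxBatchSize(n) in the predictor stores a value that the subgraph pass later reads back through argument->dlnne_max_batch_size().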
@@ -209,8 +209,23 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("disable_trt_plugin_fp16",
                new bool(argument->disable_trt_plugin_fp16()));
    } else if (pass_name == "dlnne_subgraph_pass") {
auto precision_mode = argument->dlnne_precision_mode();
pass->Set("min_subgraph_size", pass->Set("min_subgraph_size",
new int(argument->dlnne_min_subgraph_size())); new int(argument->dlnne_min_subgraph_size()));
pass->Set("max_batch_size", new int(argument->dlnne_max_batch_size()));
pass->Set("use_static_batch",
new bool(argument->dlnne_use_static_batch()));
pass->Set("weight_share_mode",
new std::string(argument->dlnne_weight_share_mode()));
pass->Set("disable_nodes_by_outputs",
new std::unordered_set<std::string>(
argument->dlnne_disable_nodes_by_outputs()));
pass->Set("use_calib_mode", new bool(argument->dlnne_use_calib_mode()));
pass->Set("precision_mode",
new AnalysisConfig::Precision(precision_mode));
pass->Set("input_shape_dict",
new std::map<std::string, std::vector<int64_t>>(
argument->dlnne_input_shape_dict()));
pass->Set("program", pass->Set("program",
new framework::ProgramDesc *(&argument->main_program())); new framework::ProgramDesc *(&argument->main_program()));
} }
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace inference {
int RegisterPyFunc(const std::string& name, void* pfn);
} // namespace inference
} // namespace paddle
@@ -34,9 +34,6 @@ class Node;
namespace paddle {
namespace inference {
int ConvertGraph(std::string graph_name);
namespace analysis {
class DlnneSubgraphPass : public framework::ir::FusePassBase {
@@ -44,6 +41,8 @@ class DlnneSubgraphPass : public framework::ir::FusePassBase {
  void ApplyImpl(framework::ir::Graph *graph) const override;
 private:
void InferShapeForDlnneMainGraph() const;
bool IsDynamicOp(std::string var_name, bool use_static_batch) const;
  void CleanIntermediateOutputs(framework::ir::Node *node);
  void CreateDlnneOp(framework::ir::Node *x,
                     framework::ir::Graph *graph,
...
@@ -283,6 +283,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  // Dlnne related
  CP_MEMBER(use_dlnne_);
  CP_MEMBER(dlnne_min_subgraph_size_);
CP_MEMBER(dlnne_max_batchsize_);
CP_MEMBER(dlnne_use_static_batch_);
CP_MEMBER(dlnne_weight_share_mode_);
CP_MEMBER(dlnne_use_calib_mode_);
CP_MEMBER(dlnne_precision_mode_);
CP_MEMBER(dlnne_disable_nodes_by_outputs_);
CP_MEMBER(dlnne_input_shape_dict_);
  // MKLDNN related.
  CP_MEMBER(use_mkldnn_);
  CP_MEMBER(mkldnn_enabled_op_types_);
@@ -544,9 +551,24 @@ void AnalysisConfig::EnableTensorRtEngine(
#endif
}
void AnalysisConfig::EnableDlnne(
    int min_subgraph_size,
    int max_batch_size,
    bool use_static_batch,
    std::string weight_share_mode,
    std::unordered_set<std::string> disable_nodes_by_outputs,
    std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
    bool use_calib_mode,
    AnalysisConfig::Precision precision_mode) {
  use_dlnne_ = true;
  dlnne_min_subgraph_size_ = min_subgraph_size;
dlnne_max_batchsize_ = max_batch_size;
dlnne_use_static_batch_ = use_static_batch;
dlnne_weight_share_mode_ = weight_share_mode;
  dlnne_disable_nodes_by_outputs_ = disable_nodes_by_outputs;
dlnne_input_shape_dict_ = dlnne_input_shape_dict;
dlnne_use_calib_mode_ = use_calib_mode;
dlnne_precision_mode_ = precision_mode;
  Update();
}
...
@@ -1107,6 +1107,14 @@ void AnalysisPredictor::PrepareArgument() {
    LOG(INFO) << "Dlnne subgraph is enabled";
    argument_.SetUseDlnne(true);
    argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_);
argument_.SetDlnneMaxBatchSize(config_.dlnne_max_batchsize_);
argument_.SetDlnneUseStaticBatch(config_.dlnne_use_static_batch_);
argument_.SetDlnneWeightShareMode(config_.dlnne_weight_share_mode_);
argument_.SetDlnneDisableNodesByOutputs(
config_.dlnne_disable_nodes_by_outputs_);
argument_.SetDlnneInputShapeDict(config_.dlnne_input_shape_dict_);
argument_.SetDlnneUseCalibMode(config_.dlnne_use_calib_mode_);
argument_.SetDlnnePrecisionMode(config_.dlnne_precision_mode_);
  }
  if (config_.lite_engine_enabled()) {
...
@@ -663,7 +663,15 @@ struct PD_INFER_DECL AnalysisConfig {
  void EnableTensorRtInspector();
  bool tensorrt_inspector_enabled() { return trt_use_inspector_; }
  void EnableDlnne(
int min_subgraph_size = 3,
int max_batch_size = 1,
bool use_static_batch = false,
std::string weight_share_mode = "0",
std::unordered_set<std::string> disable_nodes_by_outputs = {},
std::map<std::string, std::vector<int64_t>> input_dict = {},
bool use_calib_mode = false,
AnalysisConfig::Precision precision_mode = Precision::kFloat32);
  bool dlnne_enabled() const { return use_dlnne_; }
  ///
@@ -1006,6 +1014,13 @@ struct PD_INFER_DECL AnalysisConfig {
  // dlnne related.
  bool use_dlnne_{false};
  int dlnne_min_subgraph_size_{3};
int dlnne_max_batchsize_{1};
std::unordered_set<std::string> dlnne_disable_nodes_by_outputs_;
bool dlnne_use_static_batch_{true};
std::string dlnne_weight_share_mode_;
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict_{};
bool dlnne_use_calib_mode_{false};
Precision dlnne_precision_mode_{Precision::kFloat32};
  // memory reuse related.
  bool enable_memory_optim_{false};
...
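With the extended signature, a caller can enable all of the new DLNNE options in one call. A minimal sketch (the model paths and input shape are made up for illustration):

    #include "paddle_inference_api.h"  // brings in paddle::AnalysisConfig

    // Hypothetical configuration; adjust paths and shapes to the real model.
    paddle::AnalysisConfig config;
    config.SetModel("model/__model__", "model/params");
    config.EnableDlnne(
        /*min_subgraph_size=*/3,
        /*max_batch_size=*/4,
        /*use_static_batch=*/true,
        /*weight_share_mode=*/"0",
        /*disable_nodes_by_outputs=*/{},
        /*input_dict=*/{{"image", {4, 3, 224, 224}}},
        /*use_calib_mode=*/false,
        /*precision_mode=*/paddle::AnalysisConfig::Precision::kFloat32);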
@@ -269,12 +269,28 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
  return config->config.tensorrt_engine_enabled();
}
void PD_EnableDlnne(
PD_AnalysisConfig* config,
int min_subgraph_size,
int max_batch_size,
bool use_static_batch,
std::string weight_share_mode,
    std::unordered_set<std::string> disable_nodes_by_outputs,
std::map<std::string, std::vector<int64_t>> dlnne_input_shape_dict,
bool use_calib_mode,
AnalysisConfig::Precision precision_mode) {
  PADDLE_ENFORCE_NOT_NULL(
      config,
      paddle::platform::errors::InvalidArgument(
          "The pointer of analysis configuration shouldn't be nullptr"));
  config->config.EnableDlnne(min_subgraph_size,
max_batch_size,
use_static_batch,
weight_share_mode,
                             disable_nodes_by_outputs,
dlnne_input_shape_dict,
use_calib_mode,
precision_mode);
}
bool PD_DlnneEnabled(const PD_AnalysisConfig* config) {
...
@@ -9,21 +9,19 @@ endforeach()
# add nne
find_path(
  DLNNE_INCLUDE_DIR dlnne.h
  PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/include/dlnne
  NO_DEFAULT_PATH)
find_library(
  DLNNE_LIB libdlnne.so
  PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/lib
  NO_DEFAULT_PATH)
find_path(CUDA_INCLUDE_DIR cuda.h $ENV{DL_SDK_DIR}/include)
find_library(
  CURT_LIB libcurt.so
  PATHS $ENV{DL_SDK_DIR} $ENV{DL_SDK_DIR}/lib
  NO_DEFAULT_PATH)
message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR})
...
@@ -28,6 +28,105 @@ void CopyTensorCpuToDevice(void* dst_ptr, void* src_ptr, int total_bytes) {
  cudaDeviceSynchronize();
}
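// ConvertType maps a Paddle DataType to its lowercase string name.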
std::string ConvertType(paddle::experimental::DataType type) {
switch (type) {
case paddle::experimental::DataType::FLOAT32: {
return "float32";
}
case paddle::experimental::DataType::INT64: {
return "int64";
}
case paddle::experimental::DataType::INT32: {
return "int32";
}
case paddle::experimental::DataType::FLOAT16: {
return "float16";
}
default: {
PADDLE_THROW(
platform::errors::Fatal("The DLNNE Calibration only support "
"float/float16/int32_t/int64_t input."));
}
}
}
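// GetDataByte returns the size in bytes of one element of the given dtype.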
int GetDataByte(paddle::experimental::DataType type) {
switch (type) {
case paddle::experimental::DataType::FLOAT32: {
return 4;
}
case paddle::experimental::DataType::INT64: {
return 8;
}
case paddle::experimental::DataType::INT32: {
return 4;
}
case paddle::experimental::DataType::FLOAT16: {
return 2;
}
default: {
PADDLE_THROW(
platform::errors::Fatal("The DLNNE Calibration only support "
"float/float16/int32_t/int64_t input."));
}
}
}
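// GenerateRandomKey returns a random 32-character alphanumeric key.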
std::string GenerateRandomKey() {
std::string str(
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::shuffle(str.begin(), str.end(), generator);
return str.substr(0, 32);
}
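// ConvertPaddle2Onnx exports the dumped Paddle subgraph as an ONNX model by
// shelling out to the paddle2onnx CLI; the conversion is skipped when the
// output file already exists.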
void ConvertPaddle2Onnx(std::string onnx_file_name,
std::string subgraph_root_path) {
if (!FileExists(onnx_file_name.c_str())) {
std::stringstream convert_cmd;
convert_cmd << "paddle2onnx --model_dir " << subgraph_root_path
<< " --save_file " << onnx_file_name << " --opset_version 11";
LOG(INFO) << convert_cmd.str();
int convert_flag = system(convert_cmd.str().c_str());
PADDLE_ENFORCE_EQ(
convert_flag,
0,
platform::errors::Unavailable("Convert paddle to onnx failed"));
}
}
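// QuantizeOnnx converts the ONNX model to rlym format and then runs int8
// quantization via the `dl` command-line tools, caching both artifacts.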
void QuantizeOnnx(std::string onnx_file_name,
std::string rlym_file_name,
std::string quantized_rlym_file_name,
std::string dataset_path,
std::string dataset_plugin_path) {
if (!FileExists(rlym_file_name.c_str())) {
std::stringstream convert_cmd;
convert_cmd << "python -m dl convert " << onnx_file_name
<< " --output-model " << rlym_file_name;
LOG(INFO) << convert_cmd.str();
int convert_flag = system(convert_cmd.str().c_str());
PADDLE_ENFORCE_EQ(
convert_flag,
0,
platform::errors::Unavailable("Convert onnx to rlym failed"));
}
if (!FileExists(quantized_rlym_file_name.c_str())) {
std::stringstream quantize_cmd;
quantize_cmd << "python -m dl quantize "
<< "--dataset " << dataset_path << " --plugin "
<< dataset_plugin_path << " " << rlym_file_name;
LOG(INFO) << quantize_cmd.str();
int quantize_flag = system(quantize_cmd.str().c_str());
PADDLE_ENFORCE_EQ(quantize_flag,
0,
platform::errors::Unavailable("quantize model failed"));
}
}
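// A hypothetical end-to-end use of the two helpers above (all paths are
// made up; the real paths come from the engine's calibration bookkeeping):
//   ConvertPaddle2Onnx("/tmp/key/model.onnx", "/tmp/key");
//   QuantizeOnnx("/tmp/key/model.onnx", "/tmp/key/model.rlym",
//                "/tmp/key/quantized.rlym", "/data/calib", "/data/plugin");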
}  // namespace inference
namespace operators {
@@ -41,7 +140,23 @@ class DlnneEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::string>(
        "engine_key",
        "The engine_key here is used to distinguish different DLNNE Engines");
    AddAttr<int32_t>("max_batch_size", "engine max_batch_size");
AddAttr<bool>("use_static_batch", "static batch fix for [?,H,W,C]");
AddAttr<std::string>("weight_share_mode",
"dlnne weight_share_mode, can be '0', '1', '2', '3', "
"'01', '23', '0123' ");
    // When use_calib_mode and enable_int8 are both true, the calibration
    // runtime starts. While calibration_mode is true, the calibration
    // runtime performs the first stage of calibration; once that stage
    // finishes, calibration_mode is set to false and the calibration
    // runtime proceeds to the second stage.
AddAttr<bool>("use_calib_mode", "dlnne use calib mode");
AddAttr<bool>("enable_int8", "dlnne enable int8");
AddAttr<bool>("calibration_mode", "dlnne calibration_mode");
AddAttr<std::string>("calibration_data_path", "calibration data path");
AddAttr<std::string>("subgraph_root_path", "subgraph root path");
AddAttr<framework::BlockDesc*>("sub_block", "the dlnne block");
    AddComment("Dlnne engine operator.");
  }
};
...
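The use_calib_mode/calibration_mode comment above describes a two-stage calibration flow. A minimal sketch of that dispatch, with hypothetical helper names (not the operator's actual implementation):

    // Illustrative stage dispatch for int8 calibration (hypothetical helpers).
    void RunDlnneEngine(bool use_calib_mode, bool enable_int8,
                        bool& calibration_mode) {
      if (use_calib_mode && enable_int8) {
        if (calibration_mode) {
          // First stage: collect calibration statistics, then clear the flag
          // so the next invocation builds the quantized engine.
          CollectCalibrationData();
          calibration_mode = false;
        } else {
          // Second stage: build and run the quantized DLNNE engine.
          BuildAndRunQuantizedEngine();
        }
      }
    }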
@@ -730,7 +730,16 @@ void BindAnalysisConfig(py::module *m) {
      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
      .def("enable_dlnne",
           &AnalysisConfig::EnableDlnne,
           py::arg("min_subgraph_size") = 3,
py::arg("max_batch_size") = 1,
py::arg("use_static_batch") = false,
py::arg("weight_share_mode") = "0",
py::arg("disable_nodes_by_outputs") =
std::unordered_set<std::string>(),
py::arg("input_shape_dict") =
std::map<std::string, std::vector<int64_t>>(),
py::arg("use_calib_mode") = false,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
.def("enable_lite_engine", .def("enable_lite_engine",
&AnalysisConfig::EnableLiteEngine, &AnalysisConfig::EnableLiteEngine,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
......