Unverified · Commit b62b384b authored by zhupengyang, committed by GitHub

refine xpu inference api (#54342)

Parent 8f65f72e
@@ -367,8 +367,10 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
       with_time_step,
       with_seq_lengths,
       with_src_mask);
-  int quant_weight_bits =
-      Has("quant_weight_bits") ? Get<int>("quant_weight_bits") : -1;
+  int quant_post_dynamic_weight_precision =
+      Has("quant_post_dynamic_weight_precision")
+          ? Get<int>("quant_post_dynamic_weight_precision")
+          : -1;

   int found_subgraph_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -421,7 +423,7 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
           w_node,
           nullptr,
           platform::errors::Fatal("w node should not be nullptr"));
-      if (quant_weight_bits == 8) {
+      if (quant_post_dynamic_weight_precision == 0) {
         PrepareWeight<int8_t>(
             graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
       } else {
......
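Note that the renamed flag changes semantics as well as name: the old attribute carried a bit width (-1/8/16), the new one carries a precision code. A hedged standalone sketch of the correspondence, assuming the code values documented in the XpuConfig struct later in this commit (0 = int8, 1 = int16, 2 = float); the helper name is hypothetical and not part of the commit:

// Hypothetical helper: map the old bit-width flag onto the new codes.
inline int WeightBitsToPrecisionCode(int quant_weight_bits) {
  if (quant_weight_bits == 8) return 0;   // old "8" selected int8 weights
  if (quant_weight_bits == 16) return 1;  // old "16" selected int16 weights
  return 1;  // old "-1" (recommended) matches the new int16 default
}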
@@ -93,6 +93,25 @@ struct Argument {
  private:                                                                  \
   type__ field__##_;

+#define DECL_POINTER_ARGUMENT_FIELD(field__, Field, type__)                \
+ public:                                                                   \
+  type__& field__() {                                                      \
+    PADDLE_ENFORCE_EQ(                                                     \
+        Has(#field__),                                                     \
+        true,                                                              \
+        platform::errors::PreconditionNotMet("There is no such field"));   \
+    return field__##_;                                                     \
+  }                                                                        \
+  void Set##Field(type__ x) {                                              \
+    field__##_ = x;                                                        \
+    valid_fields_.insert(#field__);                                        \
+  }                                                                        \
+  DECL_ARGUMENT_FIELD_VALID(field__);                                      \
+  type__* field__##_ptr() { return &field__##_; }                          \
+                                                                           \
+ private:                                                                  \
+  type__ field__##_;
+
 #define DECL_ARGUMENT_FIELD_VALID(field__) \
   bool field__##_valid() { return Has(#field__); }
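For readers unfamiliar with these macros, below is a self-contained illustration of what the new pointer variant generates, using a hypothetical MiniArgument type with assert standing in for PADDLE_ENFORCE_EQ; the members mirror the expansion of DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*):

#include <cassert>
#include <set>
#include <string>

struct MiniArgument {  // hypothetical stand-in, not Paddle's Argument
  bool Has(const std::string& k) const { return valid_fields_.count(k) > 0; }

  // Hand-expanded DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*):
  void*& xpu_l3_ptr() {        // guarded reference accessor
    assert(Has("xpu_l3_ptr"));
    return xpu_l3_ptr_;
  }
  void SetXpuL3Ptr(void* x) {  // setter also records the field as valid
    xpu_l3_ptr_ = x;
    valid_fields_.insert("xpu_l3_ptr");
  }
  bool xpu_l3_ptr_valid() const { return Has("xpu_l3_ptr"); }
  void** xpu_l3_ptr_ptr() { return &xpu_l3_ptr_; }

 private:
  void* xpu_l3_ptr_{nullptr};
  std::set<std::string> valid_fields_;
};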
@@ -276,20 +295,48 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);

   DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
-  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
   DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
-  DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool);
-  DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string);
   DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
-  DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
-  DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
   DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
-  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits,
-                      XpuQuantPostDynamicWeightBits,
+  // XpuConfig
+  DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
+  DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
+  DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
+  DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
+  DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string);
+  DECL_ARGUMENT_FIELD(xpu_conv_autotune_file_writeback,
+                      XpuConvAutotuneFileWriteback,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_level, XpuFcAutotuneLevel, int);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_file, XpuFcAutotuneFile, std::string);
+  DECL_ARGUMENT_FIELD(xpu_fc_autotune_file_writeback,
+                      XpuFcAutotuneFileWriteback,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_gemm_compute_precision, XpuGemmComputePrecision, int);
+  DECL_ARGUMENT_FIELD(xpu_transformer_softmax_optimize_level,
+                      XpuTransformerSoftmaxOptimizeLevel,
+                      int);
+  DECL_ARGUMENT_FIELD(xpu_transformer_encoder_adaptive_seqlen,
+                      XpuTransformerEncoderAdaptiveSeqlen,
+                      bool);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_static_gelu_out_threshold,
+                      XpuQuantPostStaticGeluOutThreshold,
+                      float);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_activation_method,
+                      XpuQuantPostDynamicActivationMethod,
+                      int);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_precision,
+                      XpuQuantPostDynamicWeightPrecision,
                       int);
   DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types,
                       XpuQuantPostDynamicOpTypes,
                       std::vector<std::string>);
+  DECL_ARGUMENT_FIELD(xpu_lite_l3_locked, XpuLiteL3Locked, bool);
+  DECL_ARGUMENT_FIELD(xpu_lite_enable_multi_stream,
+                      XpuLiteEnableMultiStream,
+                      bool);

   DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);
......
@@ -267,20 +267,41 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("enable_int8", new bool(lite_enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
       pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
-      pass->Set("xpu_l3_workspace_size",
-                new int(argument->xpu_l3_workspace_size()));
+      pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
+      pass->Set("xpu_l3_size", new size_t(argument->xpu_l3_size()));
+      pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
+      pass->Set("xpu_l3_autotune_size",
+                new size_t(argument->xpu_l3_autotune_size()));
+      pass->Set("xpu_stream", new void *(argument->xpu_stream()));
+      pass->Set("xpu_conv_autotune_level",
+                new int(argument->xpu_conv_autotune_level()));
+      pass->Set("xpu_conv_autotune_file",
+                new std::string(argument->xpu_conv_autotune_file()));
+      pass->Set("xpu_conv_autotune_file_writeback",
+                new bool(argument->xpu_conv_autotune_file_writeback()));
+      pass->Set("xpu_fc_autotune_level",
+                new int(argument->xpu_fc_autotune_level()));
+      pass->Set("xpu_fc_autotune_file",
+                new std::string(argument->xpu_fc_autotune_file()));
+      pass->Set("xpu_fc_autotune_file_writeback",
+                new bool(argument->xpu_fc_autotune_file_writeback()));
+      pass->Set("xpu_gemm_compute_precision",
+                new int(argument->xpu_gemm_compute_precision()));
+      pass->Set("xpu_transformer_softmax_optimize_level",
+                new int(argument->xpu_transformer_softmax_optimize_level()));
+      pass->Set("xpu_transformer_encoder_adaptive_seqlen",
+                new bool(argument->xpu_transformer_encoder_adaptive_seqlen()));
+      pass->Set(
+          "xpu_quant_post_static_gelu_out_threshold",
+          new float(argument->xpu_quant_post_static_gelu_out_threshold()));
+      pass->Set("xpu_quant_post_dynamic_activation_method",
+                new int(argument->xpu_quant_post_dynamic_activation_method()));
+      pass->Set("xpu_l3_locked", new bool(argument->xpu_lite_l3_locked()));
+      pass->Set("xpu_enable_multi_stream",
+                new bool(argument->xpu_lite_enable_multi_stream()));
       pass->Set("use_opencl", new bool(argument->use_opencl()));
       pass->Set("cpu_math_library_num_threads",
                 new int(argument->cpu_math_library_num_threads()));
-      pass->Set("locked", new bool(argument->xpu_locked()));
-      pass->Set("autotune", new bool(argument->xpu_autotune()));
-      pass->Set("autotune_file",
-                new std::string(argument->xpu_autotune_file()));
-      pass->Set("precision", new std::string(argument->xpu_precision()));
-      pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
-      pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
-      pass->Set("enable_multi_stream",
-                new bool(argument->xpu_enable_multi_stream()));
       // NNAdapter Related
       pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
       pass->Set("nnadapter_model_cache_dir",
@@ -313,12 +334,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
       pass->Set("use_fc_padding", new bool(use_fc_padding));
     } else if (pass_name == "fused_multi_transformer_xpu_pass") {
-      auto op_types = argument->xpu_quant_post_dynamic_op_types();
-      if (std::count(op_types.begin(),
-                     op_types.end(),
-                     "fused_multi_transformer") > 0) {
-        pass->Set("quant_weight_bits",
-                  new int(argument->xpu_quant_post_dynamic_weight_bits()));
+      int quant_post_dynamic_weight_precision =
+          argument->xpu_quant_post_dynamic_weight_precision();
+      if (quant_post_dynamic_weight_precision == 0) {
+        pass->Set("quant_post_dynamic_weight_precision", new int(0));
       }
     }
     pre_pass = pass_name;
......
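One contract worth noting: an attribute stored with pass->Set("key", new T(...)) must be read back with the identical T. A sketch of the matching reads on the consumer side, as the lite subgraph pass below does (the snippet assumes it runs inside a Pass member function, so Get<T> is available):

// Inside a pass (sketch): the template argument must mirror the Set call,
// e.g. size_t for "xpu_l3_size" and int for the autotune levels.
size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
int xpu_conv_autotune_level = Get<int>("xpu_conv_autotune_level");
bool writeback = Get<bool>("xpu_conv_autotune_file_writeback");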
@@ -249,17 +249,27 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  bool use_xpu = Get<bool>("use_xpu");
-  int xpu_device_id = Get<int>("xpu_device_id");
-  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
   bool use_opencl = Get<bool>("use_opencl");
   int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
-  bool locked = Get<bool>("locked");
-  bool autotune = Get<bool>("autotune");
-  std::string autotune_file = Get<std::string>("autotune_file");
-  std::string precision = Get<std::string>("precision");
-  bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
-  bool enable_multi_stream = Get<bool>("enable_multi_stream");
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_device_id = Get<int>("xpu_device_id");
+  size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
+  bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
+  bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
+  std::string xpu_conv_autotune_file =
+      Get<std::string>("xpu_conv_autotune_file");
+  int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
+  std::string xpu_transformer_encoder_precision{"int16"};
+  if (xpu_gemm_compute_precision == 0) {
+    xpu_transformer_encoder_precision = "int8";
+  } else if (xpu_gemm_compute_precision == 1) {
+    xpu_transformer_encoder_precision = "int16";
+  } else if (xpu_gemm_compute_precision == 2) {
+    xpu_transformer_encoder_precision = "int31";
+  }
+  bool xpu_transformer_encoder_adaptive_seqlen =
+      Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
+  bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");

   // NNAdapter Related
   bool use_nnadapter = Get<bool>("use_nnadapter");
   std::string nnadapter_model_cache_dir =
@@ -344,14 +354,15 @@ void LiteSubgraphPass::SetUpEngine(
   }
   config.cpu_math_library_num_threads = cpu_math_library_num_threads;

-  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
+  config.xpu_l3_size = xpu_l3_size;
   config.device_id = xpu_device_id;
-  config.locked = locked;
-  config.autotune = autotune;
-  config.autotune_file = autotune_file;
-  config.precision = precision;
-  config.adaptive_seqlen = adaptive_seqlen;
-  config.enable_multi_stream = enable_multi_stream;
+  config.xpu_l3_locked = xpu_l3_locked;
+  config.xpu_conv_autotune = xpu_conv_autotune;
+  config.xpu_conv_autotune_file = xpu_conv_autotune_file;
+  config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
+  config.xpu_transformer_encoder_adaptive_seqlen =
+      xpu_transformer_encoder_adaptive_seqlen;
+  config.xpu_enable_multi_stream = xpu_enable_multi_stream;

   // NNAdapter Related
   config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
   config.nnadapter_device_names = nnadapter_device_names;
......
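The int-to-string block above bridges the new integer gemm_compute_precision to Paddle-Lite's string-typed multi_encoder precision. Restated as a standalone helper (hypothetical name, same mapping, "int16" as the fallback for any other value):

#include <string>

// Hypothetical helper mirroring the logic above:
// 0 -> "int8", 1 -> "int16", 2 -> "int31", otherwise "int16".
static std::string EncoderPrecisionFromGemm(int gemm_compute_precision) {
  switch (gemm_compute_precision) {
    case 0:
      return "int8";
    case 2:
      return "int31";
    default:
      return "int16";
  }
}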
@@ -172,22 +172,34 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }

-void AnalysisConfig::EnableXpu(int l3_workspace_size,
-                               bool locked,
-                               bool autotune,
-                               const std::string &autotune_file,
-                               const std::string &precision,
-                               bool adaptive_seqlen,
+void AnalysisConfig::EnableXpu(int l3_size,
+                               bool l3_locked,
+                               bool conv_autotune,
+                               const std::string &conv_autotune_file,
+                               const std::string &transformer_encoder_precision,
+                               bool transformer_encoder_adaptive_seqlen,
                                bool enable_multi_stream) {
+#ifdef PADDLE_WITH_XPU
   use_xpu_ = true;
-  xpu_l3_workspace_size_ = l3_workspace_size;
-  xpu_locked_ = locked;
-  xpu_autotune_ = autotune;
-  xpu_autotune_file_ = autotune_file;
-  xpu_precision_ = precision;
-  xpu_adaptive_seqlen_ = adaptive_seqlen;
-  xpu_enable_multi_stream_ = enable_multi_stream;
+  xpu_config_.l3_size = l3_size;
+  xpu_config_.conv_autotune_level = conv_autotune;
+  xpu_config_.conv_autotune_file = conv_autotune_file;
+  if (transformer_encoder_precision == "int8") {
+    xpu_config_.gemm_compute_precision = 0;
+  } else if (transformer_encoder_precision == "int16") {
+    xpu_config_.gemm_compute_precision = 1;
+  } else if (transformer_encoder_precision == "int31") {
+    xpu_config_.gemm_compute_precision = 2;
+  }
+  xpu_config_.transformer_encoder_adaptive_seqlen =
+      transformer_encoder_adaptive_seqlen;
+  xpu_lite_l3_locked_ = l3_locked;
+  xpu_lite_enable_multi_stream_ = enable_multi_stream;
   Update();
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "To use XPU inference, please compile with option 'WITH_XPU' first."));
+#endif
 }

 void AnalysisConfig::SetXpuDeviceId(int device_id) {
@@ -195,15 +207,22 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
       true,
       platform::errors::PreconditionNotMet(
           "Should call EnableXpu before SetXpuDeviceId."));
-  xpu_device_id_ = device_id;
+  xpu_config_.device_id = device_id;
   Update();
 }

-void AnalysisConfig::SetXpuConfig(
-    int quant_post_dynamic_weight_bits,
-    const std::vector<std::string> &quant_post_dynamic_op_types) {
-  xpu_quant_post_dynamic_weight_bits_ = quant_post_dynamic_weight_bits;
-  xpu_quant_post_dynamic_op_types_ = quant_post_dynamic_op_types;
+void AnalysisConfig::SetXpuConfig(const XpuConfig &config) {
+  PADDLE_ENFORCE(use_xpu_,
+                 platform::errors::PreconditionNotMet(
+                     "Should call EnableXpu before SetXpuConfig."));
+  PADDLE_ENFORCE_LE(
+      config.l3_autotune_size,
+      config.l3_size,
+      phi::errors::InvalidArgument(
+          "l3_autotune_size(%zu) should be less than or equal to "
+          "l3_size(%zu).",
+          config.l3_autotune_size,
+          config.l3_size));
+  xpu_config_ = config;
   Update();
 }
@@ -494,16 +513,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // XPU related.
   CP_MEMBER(use_xpu_);
-  CP_MEMBER(xpu_device_id_);
-  CP_MEMBER(xpu_l3_workspace_size_);
-  CP_MEMBER(xpu_locked_);
-  CP_MEMBER(xpu_autotune_);
-  CP_MEMBER(xpu_autotune_file_);
-  CP_MEMBER(xpu_precision_);
-  CP_MEMBER(xpu_adaptive_seqlen_);
-  CP_MEMBER(xpu_enable_multi_stream_);
-  CP_MEMBER(xpu_quant_post_dynamic_weight_bits_);
-  CP_MEMBER(xpu_quant_post_dynamic_op_types_);
+  CP_MEMBER(xpu_config_);
+  CP_MEMBER(xpu_lite_l3_locked_);
+  CP_MEMBER(xpu_lite_enable_multi_stream_);

   // Lite OpenCL Related
   CP_MEMBER(use_opencl_);
@@ -1033,7 +1045,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << exec_stream_;
   ss << use_fc_padding_;
   ss << gpu_device_id_;
-  ss << xpu_device_id_;
   ss << memory_pool_init_size_mb_;

   ss << use_tensorrt_;
@@ -1080,17 +1091,26 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << use_lite_;
   ss << use_xpu_;
-  ss << xpu_l3_workspace_size_;
-  ss << xpu_locked_;
-  ss << xpu_autotune_;
-  ss << xpu_autotune_file_;
-  ss << xpu_precision_;
-  ss << xpu_adaptive_seqlen_;
-  ss << xpu_enable_multi_stream_;
-  ss << xpu_quant_post_dynamic_weight_bits_;
-  for (auto op_type : xpu_quant_post_dynamic_op_types_) {
-    ss << op_type;
-  }
+  ss << xpu_config_.device_id;
+  ss << xpu_config_.l3_size;
+  ss << xpu_config_.l3_ptr;
+  ss << xpu_config_.l3_autotune_size;
+  ss << xpu_config_.stream;
+  ss << xpu_config_.conv_autotune_level;
+  ss << xpu_config_.conv_autotune_file;
+  ss << xpu_config_.conv_autotune_file_writeback;
+  ss << xpu_config_.fc_autotune_level;
+  ss << xpu_config_.fc_autotune_file;
+  ss << xpu_config_.fc_autotune_file_writeback;
+  ss << xpu_config_.gemm_compute_precision;
+  ss << xpu_config_.transformer_softmax_optimize_level;
+  ss << xpu_config_.transformer_encoder_adaptive_seqlen;
+  ss << xpu_config_.quant_post_static_gelu_out_threshold;
+  ss << xpu_config_.quant_post_dynamic_activation_method;
+  ss << xpu_config_.quant_post_dynamic_weight_precision;
+  for (auto type : xpu_config_.quant_post_dynamic_op_types) ss << type;
+  ss << xpu_lite_l3_locked_;
+  ss << xpu_lite_enable_multi_stream_;

   ss << thread_local_stream_;
@@ -1318,16 +1338,49 @@ std::string AnalysisConfig::Summary() {
   // xpu info
   os.InsertRow({"use_xpu", use_xpu_ ? "true" : "false"});
   if (use_xpu_) {
-    os.InsertRow({"xpu_device_id", std::to_string(xpu_device_id_)});
-    os.InsertRow(
-        {"xpu_l3_workspace_size", std::to_string(xpu_l3_workspace_size_)});
-    os.InsertRow({"xpu_quant_post_dynamic_weight_bits",
-                  std::to_string(xpu_quant_post_dynamic_weight_bits_)});
-    std::vector<std::string> op_types{"xpu_quant_post_dynamic_op_types"};
-    for (auto op_type : xpu_quant_post_dynamic_op_types_) {
-      op_types.push_back(op_type);
-    }
-    os.InsertRow(op_types);
+    os.InsertRow({"xpu_device_id", std::to_string(xpu_config_.device_id)});
+    os.InsertRow({"xpu_l3_size", std::to_string(xpu_config_.l3_size)});
+    os.InsertRow(
+        {"xpu_l3_ptr",
+         std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))});
+    os.InsertRow({"xpu_l3_autotune_size",
+                  std::to_string(xpu_config_.l3_autotune_size)});
+    os.InsertRow(
+        {"xpu_stream",
+         std::to_string(reinterpret_cast<int64_t>(xpu_config_.stream))});
+    os.InsertRow({"xpu_conv_autotune_level",
+                  std::to_string(xpu_config_.conv_autotune_level)});
+    os.InsertRow({"xpu_conv_autotune_file", xpu_config_.conv_autotune_file});
+    os.InsertRow({"xpu_conv_autotune_file_writeback",
+                  std::to_string(xpu_config_.conv_autotune_file_writeback)});
+    os.InsertRow({"xpu_fc_autotune_level",
+                  std::to_string(xpu_config_.fc_autotune_level)});
+    os.InsertRow({"xpu_fc_autotune_file", xpu_config_.fc_autotune_file});
+    os.InsertRow({"xpu_fc_autotune_file_writeback",
+                  std::to_string(xpu_config_.fc_autotune_file_writeback)});
+    os.InsertRow({"xpu_gemm_compute_precision",
+                  std::to_string(xpu_config_.gemm_compute_precision)});
+    os.InsertRow(
+        {"xpu_transformer_softmax_optimize_level",
+         std::to_string(xpu_config_.transformer_softmax_optimize_level)});
+    os.InsertRow(
+        {"xpu_transformer_encoder_adaptive_seqlen",
+         std::to_string(xpu_config_.transformer_encoder_adaptive_seqlen)});
+    os.InsertRow(
+        {"xpu_quant_post_static_gelu_out_threshold",
+         std::to_string(xpu_config_.quant_post_static_gelu_out_threshold)});
+    os.InsertRow(
+        {"xpu_quant_post_dynamic_activation_method",
+         std::to_string(xpu_config_.quant_post_dynamic_activation_method)});
+    os.InsertRow(
+        {"xpu_quant_post_dynamic_weight_precision",
+         std::to_string(xpu_config_.quant_post_dynamic_weight_precision)});
+    std::vector<std::string> quant_post_dynamic_op_types_info =
+        xpu_config_.quant_post_dynamic_op_types;
+    quant_post_dynamic_op_types_info.insert(
+        quant_post_dynamic_op_types_info.begin(),
+        "xpu_quant_post_dynamic_op_types");
+    os.InsertRow(quant_post_dynamic_op_types_info);
   }
   os.InsetDivider();
......
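The two preconditions enforced above drive the call order for users: EnableXpu() must come first, and any autotune slice must fit inside the configured L3 budget. A minimal usage sketch (model paths hypothetical; header name assumed from the standard inference package layout). Note that SetXpuConfig() replaces the whole stored struct, so device_id is best set on the XpuConfig itself rather than via an earlier SetXpuDeviceId() call:

#include "paddle_inference_api.h"  // assumption: standard Paddle Inference header

int main() {
  paddle_infer::Config config;
  config.SetModel("model_dir/model", "model_dir/params");  // hypothetical paths
  config.EnableXpu();  // required before SetXpuConfig / SetXpuDeviceId

  paddle_infer::XpuConfig xpu_config;
  xpu_config.device_id = 0;
  xpu_config.l3_size = 16 * 1024 * 1024;
  xpu_config.l3_autotune_size = 4 * 1024 * 1024;  // must satisfy <= l3_size
  config.SetXpuConfig(xpu_config);                // throws if the check fails

  auto predictor = paddle_infer::CreatePredictor(config);
  return predictor ? 0 : 1;
}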
@@ -389,7 +389,7 @@ bool AnalysisPredictor::Init(
   }
 #endif

 #if defined(PADDLE_WITH_XPU)
-  if (config_.use_xpu_) {
+  if (config_.use_xpu_ && !config_.use_lite_) {
     private_context_ = true;
     if (!status_is_cloned_ && config_.external_stream_enabled()) {
       predictor_stream_ = config_.GetExecStream();
@@ -1418,14 +1418,8 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetLitePassesFilter(config_.lite_passes_filter_);
     argument_->SetLiteOpsFilter(config_.lite_ops_filter_);
     argument_->SetLiteZeroCopy(config_.lite_zero_copy_);
-    argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
-    argument_->SetXpuLocked(config_.xpu_locked_);
-    argument_->SetXpuAutotune(config_.xpu_autotune_);
-    argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
-    argument_->SetXpuPrecision(config_.xpu_precision_);
-    argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
-    argument_->SetXpuDeviceId(config_.xpu_device_id_);
-    argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+    argument_->SetXpuLocked(config_.xpu_lite_l3_locked_);
+    argument_->SetXpuEnableMultiStream(config_.xpu_lite_enable_multi_stream_);
     argument_->SetUseOpenCL(config_.use_opencl_);
     // NNAdapter related
     argument_->SetUseNNAdapter(config_.NNAdapter().use_nnadapter);
@@ -1506,21 +1500,36 @@ void AnalysisPredictor::PrepareArgument() {
   }
 #endif

-#ifdef PADDLE_WITH_XPU
   argument_->SetUseXpu(config_.use_xpu_);
-  argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
-  argument_->SetXpuLocked(config_.xpu_locked_);
-  argument_->SetXpuAutotune(config_.xpu_autotune_);
-  argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
-  argument_->SetXpuPrecision(config_.xpu_precision_);
-  argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
-  argument_->SetXpuDeviceId(config_.xpu_device_id_);
-  argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
-  argument_->SetXpuQuantPostDynamicWeightBits(
-      config_.xpu_quant_post_dynamic_weight_bits_);
+  argument_->SetXpuDeviceId(config_.xpu_config_.device_id);
+  argument_->SetXpuL3Size(config_.xpu_config_.l3_size);
+  argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr);
+  argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size);
+  argument_->SetXpuStream(config_.xpu_config_.stream);
+  argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level);
+  argument_->SetXpuConvAutotuneFile(config_.xpu_config_.conv_autotune_file);
+  argument_->SetXpuConvAutotuneFileWriteback(
+      config_.xpu_config_.conv_autotune_file_writeback);
+  argument_->SetXpuFcAutotuneLevel(config_.xpu_config_.fc_autotune_level);
+  argument_->SetXpuFcAutotuneFile(config_.xpu_config_.fc_autotune_file);
+  argument_->SetXpuFcAutotuneFileWriteback(
+      config_.xpu_config_.fc_autotune_file_writeback);
+  argument_->SetXpuGemmComputePrecision(
+      config_.xpu_config_.gemm_compute_precision);
+  argument_->SetXpuTransformerSoftmaxOptimizeLevel(
+      config_.xpu_config_.transformer_softmax_optimize_level);
+  argument_->SetXpuTransformerEncoderAdaptiveSeqlen(
+      config_.xpu_config_.transformer_encoder_adaptive_seqlen);
+  argument_->SetXpuQuantPostStaticGeluOutThreshold(
+      config_.xpu_config_.quant_post_static_gelu_out_threshold);
+  argument_->SetXpuQuantPostDynamicActivationMethod(
+      config_.xpu_config_.quant_post_dynamic_activation_method);
+  argument_->SetXpuQuantPostDynamicWeightPrecision(
+      config_.xpu_config_.quant_post_dynamic_weight_precision);
   argument_->SetXpuQuantPostDynamicOpTypes(
-      config_.xpu_quant_post_dynamic_op_types_);
-#endif
+      config_.xpu_config_.quant_post_dynamic_op_types);
+  argument_->SetXpuLiteL3Locked(config_.xpu_lite_l3_locked_);
+  argument_->SetXpuLiteEnableMultiStream(config_.xpu_lite_enable_multi_stream_);

   auto *pass_builder = config_.pass_builder();
   // TODO(inference): Need to reconstruct the pass_builder, pass should be
@@ -2076,9 +2085,36 @@ bool AnalysisPredictor::ZeroCopyRun() {
   }
 #endif

+#ifdef PADDLE_WITH_XPU
+  InferXPUContext *infer_xpu_ctx = nullptr;
+  if (config_.use_xpu_ && !config_.use_lite_) {
+    PADDLE_ENFORCE(
+        private_context_,
+        paddle::platform::errors::Fatal(
+            "Must use private context if run predictor on xpu place."));
+    auto *dev_ctxs = reinterpret_cast<const std::map<
+        phi::Place,
+        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
+        this->GetDeviceContexts());
+    infer_xpu_ctx =
+        static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
+    infer_xpu_ctx->SetStream(predictor_stream_);
+    infer_xpu_ctx->SetL3Info(config_.xpu_config_.l3_size,
+                             config_.xpu_config_.l3_ptr,
+                             config_.xpu_config_.l3_autotune_size,
+                             place_);
+  }
+#endif
+
   executor_->Run();
   inference::DisplayMemoryInfo(place_, "after run");

+#ifdef PADDLE_WITH_XPU
+  if (config_.use_xpu_ && !config_.use_lite_ && infer_xpu_ctx != nullptr) {
+    infer_xpu_ctx->L3CacheAutotune();
+  }
+#endif
+
   if (config_.shape_range_info_collected()) {
     CollectShapeRangeInfo();
   }
@@ -2148,18 +2184,6 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
 bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
 #ifdef PADDLE_WITH_XPU
-  PADDLE_ENFORCE(
-      private_context_,
-      paddle::platform::errors::Fatal(
-          "Must use private context if run predictor with external config."));
-  auto *dev_ctxs = reinterpret_cast<const std::map<
-      phi::Place,
-      std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
-      this->GetDeviceContexts());
-  auto *dev_ctx =
-      static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
-
   auto xpu_runtime_config =
       reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
   auto *stream = xpu_runtime_config->stream;
@@ -2167,12 +2191,10 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
     paddle::platform::XPUStreamSync(
         static_cast<paddle::xpuStream>(predictor_stream_));
     predictor_stream_ = stream;
-    dev_ctx->SetStream(stream);
   }

-  size_t l3_size = xpu_runtime_config->l3_size;
-  void *l3_ptr = xpu_runtime_config->l3_ptr;
-  size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size;
+  auto l3_size = xpu_runtime_config->l3_size;
+  auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
   PADDLE_ENFORCE_LE(
       l3_autotune_size,
       l3_size,
@@ -2180,11 +2202,11 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
           l3_autotune_size,
           l3_size));
-  dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size, place_);
+  config_.xpu_config_.l3_size = l3_size;
+  config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
+  config_.xpu_config_.l3_autotune_size = l3_autotune_size;

-  bool ret = ZeroCopyRun();
-  dev_ctx->L3CacheAutotune();
-  return ret;
+  return ZeroCopyRun();
 #endif
   return false;
 }
......
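With the context bookkeeping moved into ZeroCopyRun(), ExpRunWithRuntimeConfig() now only folds the per-run overrides back into xpu_config_. A hedged sketch of the caller side, following the RUN_WITH_RUNTIME_CONFIG macro in the test at the end of this commit; the field set is taken from the code above, and the exact XpuRuntimeConfig declaration is not part of this diff:

// Assumes a predictor created as in the SetXpuConfig sketch earlier.
paddle_infer::experimental::XpuRuntimeConfig runtime_config;
runtime_config.stream = nullptr;      // nullptr: keep the current stream
runtime_config.l3_size = 10 * 1024 * 1024;
runtime_config.l3_ptr = nullptr;      // nullptr: let the runtime allocate L3
runtime_config.l3_autotune_size = 0;  // 0 disables L3 autotune
paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(
    predictor.get(), &runtime_config);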
@@ -76,6 +76,77 @@ struct LiteNNAdapterConfig {
   LiteNNAdapterConfig& Disable();
 };

+struct PD_INFER_DECL XpuConfig {
+  // Select which xpu device to run model.
+  int device_id{0};
+
+  // Available l3 size (Byte)
+  // For kunlun1, max l3_size is 16773120 Byte
+  // For kunlun2, max l3_size is 67104768 Byte
+  size_t l3_size{0};
+  // If l3_ptr is not nullptr, it is used as l3 buffer.
+  // If l3_ptr is nullptr, a new l3 buffer will be created.
+  void* l3_ptr{nullptr};
+  // Available l3 size for autotune.
+  // If l3_autotune_size is 0, autotune is closed.
+  // Note: The remaining l3 size (l3_size - l3_autotune_size) is for
+  // kernels (both paddle/xdnn kernels).
+  size_t l3_autotune_size{0};
+
+  // Stream for execution.
+  // If stream is nullptr, the default stream will be used.
+  void* stream{nullptr};
+
+  // Conv autotune level. Default 0 means no autotune.
+  // Note: Paddle-Lite only.
+  int conv_autotune_level{0};
+  // Base conv autotune info is read from conv_autotune_file.
+  // Note: Paddle-Lite only.
+  std::string conv_autotune_file;
+  // Whether to write new conv autotune info to conv_autotune_file.
+  // Note: Paddle-Lite only.
+  bool conv_autotune_file_writeback{false};
+
+  // Fc autotune level. Optional values are 0-9. Default 0 means no autotune.
+  // Note: Paddle-Lite only.
+  int fc_autotune_level{0};
+  // Base fc autotune info is read from fc_autotune_file.
+  // Note: Paddle-Lite only.
+  std::string fc_autotune_file;
+  // Whether to write new fc autotune info to fc_autotune_file.
+  // Note: Paddle-Lite only.
+  bool fc_autotune_file_writeback{false};
+
+  // Gemm compute precision. Optional values are 0(int8), 1(int16), 2(int31).
+  // Note: gemm_compute_precision has no effect on quanted ops of quant model.
+  // Note: Paddle-Lite only.
+  int gemm_compute_precision{1};
+  // Which method to optimize softmax in transformer structure. Optional
+  // values are 0, 1, 2.
+  // Note: Paddle-Lite only.
+  int transformer_softmax_optimize_level{0};
+  // Whether to enable the adaptive_seqlen optimization on transformer
+  // encoders.
+  // Note: Paddle-Lite only.
+  bool transformer_encoder_adaptive_seqlen{true};
+
+  // Gelu out max threshold is limited to quant_post_static_gelu_out_threshold
+  // if use static post-quantization.
+  // Note: Paddle-Lite only.
+  float quant_post_static_gelu_out_threshold{10.f};
+  // Activation method if use dynamic post-quantization.
+  // For kunlun1, optional values are 0(per_tensor), 1(per_batch), 2(per_head).
+  // For kunlun2, optional values are 0(per_tensor) or non-zero(every_16).
+  // Note: Paddle-Lite only.
+  int quant_post_dynamic_activation_method{0};
+  // Preprocess weights to quant_post_dynamic_weight_precision if use dynamic
+  // post-quantization. Optional values are 0, 1, 2.
+  // * If 0, preprocess weights to int8.
+  // * If 1, preprocess weights to int16.
+  // * If 2, preprocess weights to float.
+  // Note: PaddleInference only.
+  int quant_post_dynamic_weight_precision{1};
+  // Op types to apply dynamic post-quantization to.
+  std::vector<std::string> quant_post_dynamic_op_types;
+};
+
 struct DistConfig {
   bool use_dist_model() const { return use_dist_model_; }
   void EnableDistModel(bool use_dist_model) {
@@ -271,42 +342,46 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \brief Turn on XPU.
   ///
-  /// \param l3_workspace_size The size of the video memory allocated by the l3
+  /// \param l3_size The size of the video memory allocated by the l3
   /// cache, the maximum is 16M.
-  /// \param locked Whether the allocated L3 cache can be locked. If false,
+  /// \param l3_locked Whether the allocated L3 cache can be locked. If false,
   /// it means that the L3 cache is not locked, and the allocated L3
   /// cache can be shared by multiple models, and multiple models
   /// sharing the L3 cache will be executed sequentially on the card.
-  /// \param autotune Whether to autotune the conv operator in the model. If
-  /// true, when the conv operator of a certain dimension is executed
+  /// \param conv_autotune Whether to autotune the conv operator in the model.
+  /// If true, when the conv operator of a certain dimension is executed
   /// for the first time, it will automatically search for a better
   /// algorithm to improve the performance of subsequent conv operators
   /// of the same dimension.
-  /// \param autotune_file Specify the path of the autotune file. If
-  /// autotune_file is specified, the algorithm specified in the
+  /// \param conv_autotune_file Specify the path of the autotune file. If
+  /// conv_autotune_file is specified, the algorithm specified in the
   /// file will be used and autotune will not be performed again.
-  /// \param precision Calculation accuracy of multi_encoder
-  /// \param adaptive_seqlen Is the input of multi_encoder variable length
+  /// \param transformer_encoder_precision Calculation accuracy of
+  /// multi_encoder
+  /// \param transformer_encoder_adaptive_seqlen Whether the input of
+  /// multi_encoder is variable length
   /// \param enable_multi_stream Whether to enable the multi stream of xpu.
   ///
-  void EnableXpu(int l3_workspace_size = 0xfffc00,
-                 bool locked = false,
-                 bool autotune = true,
-                 const std::string& autotune_file = "",
-                 const std::string& precision = "int16",
-                 bool adaptive_seqlen = false,
+  void EnableXpu(int l3_size = 0xfffc00,
+                 bool l3_locked = false,
+                 bool conv_autotune = true,
+                 const std::string& conv_autotune_file = "",
+                 const std::string& transformer_encoder_precision = "int16",
+                 bool transformer_encoder_adaptive_seqlen = false,
                  bool enable_multi_stream = false);

   ///
   /// \brief configs of XPU
   ///
-  /// \param quant_post_dynamic_weight_bits Weight bits used in dynamic post
-  /// quantization. Optional value: -1, 8, 16. Default value is -1, means using
-  /// the recommended way.
-  /// \param quant_post_dynamic_op_types Ops used in dynamic post quantization.
+  /// \param config Configs for xpu. See XpuConfig for more details.
   ///
-  void SetXpuConfig(
-      int quant_post_dynamic_weight_bits = -1,
-      const std::vector<std::string>& quant_post_dynamic_op_types = {});
+  void SetXpuConfig(const XpuConfig& config);
+
+  ///
+  /// \brief Get configs of xpu
+  ///
+  /// \return XpuConfig The configs of xpu.
+  ///
+  XpuConfig xpu_config() { return xpu_config_; }

   ///
   /// \brief configs of IPU
@@ -462,7 +537,7 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   /// \return int The XPU device id.
   ///
-  int xpu_device_id() const { return xpu_device_id_; }
+  int xpu_device_id() const { return xpu_config_.device_id; }

   /// \brief Get the number of IPU device .
   ///
   /// \return int The number of IPU device.
@@ -1191,16 +1266,9 @@ struct PD_INFER_DECL AnalysisConfig {
   // XPU related.
   bool use_xpu_{false};
-  int xpu_device_id_{0};
-  int xpu_l3_workspace_size_{0};
-  bool xpu_locked_;
-  bool xpu_autotune_;
-  std::string xpu_autotune_file_;
-  std::string xpu_precision_;
-  bool xpu_adaptive_seqlen_;
-  bool xpu_enable_multi_stream_;
-  int xpu_quant_post_dynamic_weight_bits_{-1};
-  std::vector<std::string> xpu_quant_post_dynamic_op_types_;
+  XpuConfig xpu_config_;
+  bool xpu_lite_l3_locked_{false};
+  bool xpu_lite_enable_multi_stream_{false};

   // LITE OPENCL SETTINGS
   bool use_opencl_{false};
......
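Of the new fields, only the quant_post_dynamic_* group is consumed by Paddle Inference itself; the rest are forwarded to Paddle-Lite, per the comments above. A hedged sketch selecting int8 dynamic post-quantized weights; the op type string comes from the old pass-side check removed earlier in this commit, and whether the new pass still consults quant_post_dynamic_op_types is not shown in this diff:

// Hedged sketch: request int8 dynamic post-quantized weights.
paddle_infer::XpuConfig xpu_config;
xpu_config.quant_post_dynamic_weight_precision = 0;  // 0: int8, 1: int16, 2: float
xpu_config.quant_post_dynamic_op_types = {"fused_multi_transformer"};
// config.SetXpuConfig(xpu_config);  // after config.EnableXpu(), as above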
@@ -47,6 +47,7 @@ namespace paddle_infer {
 using PrecisionType = paddle::AnalysisConfig::Precision;
 using Config = paddle::AnalysisConfig;
 using DistConfig = paddle::DistConfig;
+using XpuConfig = paddle::XpuConfig;

 ///
 /// \class Predictor
......
@@ -154,20 +154,20 @@ void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
 }

 void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
-                        int32_t l3_workspace_size,
-                        PD_Bool locked,
-                        PD_Bool autotune,
-                        const char* autotune_file,
-                        const char* precision,
-                        PD_Bool adaptive_seqlen,
+                        int32_t l3_size,
+                        PD_Bool l3_locked,
+                        PD_Bool conv_autotune,
+                        const char* conv_autotune_file,
+                        const char* transformer_encoder_precision,
+                        PD_Bool transformer_encoder_adaptive_seqlen,
                         PD_Bool enable_multi_stream) {
   CHECK_AND_CONVERT_PD_CONFIG;
-  config->EnableXpu(l3_workspace_size,
-                    locked,
-                    autotune,
-                    autotune_file,
-                    precision,
-                    adaptive_seqlen,
+  config->EnableXpu(l3_size,
+                    l3_locked,
+                    conv_autotune,
+                    conv_autotune_file,
+                    transformer_encoder_precision,
+                    transformer_encoder_adaptive_seqlen,
                     enable_multi_stream);
 }
......
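A hedged sketch of the same call through the C API. Only PD_ConfigEnableXpu's new signature is shown in this diff; PD_ConfigCreate and PD_ConfigDestroy are assumed from the existing C API surface:

// Assumption: PD_ConfigCreate/PD_ConfigDestroy exist as in the current C API.
PD_Config* config = PD_ConfigCreate();
PD_ConfigEnableXpu(config,
                   /*l3_size=*/16 * 1024 * 1024,
                   /*l3_locked=*/0,
                   /*conv_autotune=*/1,
                   /*conv_autotune_file=*/"",
                   /*transformer_encoder_precision=*/"int16",
                   /*transformer_encoder_adaptive_seqlen=*/0,
                   /*enable_multi_stream=*/0);
PD_ConfigDestroy(config);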
@@ -193,23 +193,22 @@ func (config *Config) EnableORTOptimization() {
 ///
 /// \brief Turn on XPU.
 ///
-/// \param l3_workspace_size The size of the video memory allocated by the l3 cache, the maximum is 16M.
-/// \param locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
-/// \param autotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
-/// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
-/// \param precision Calculation accuracy of multi_encoder
-/// \param adaptive_seqlen Is the input of multi_encoder variable length
+/// \param l3Size The size of the video memory allocated by the l3 cache, the maximum is 16M.
+/// \param l3Locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
+/// \param convAutotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
+/// \param convAutotuneFile Specify the path of the autotune file. If convAutotuneFile is specified, the algorithm specified in the file will be used and autotune will not be performed again.
+/// \param transformerEncoderPrecision Calculation accuracy of multi_encoder
+/// \param transformerEncoderAdaptiveSeqlen Whether the input of multi_encoder is variable length
 /// \param enable_multi_stream Whether to enable the multi stream of xpu
 ///
-func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool, enableMultiStream bool) {
-	cAutotuneFile := C.CString(autotuneFile)
-	cPrecision := C.CString(precision)
+func (config *Config) EnableXpu(l3Size int32, l3Locked bool, convAutotune bool, convAutotuneFile string, transformerEncoderPrecision string, transformerEncoderAdaptiveSeqlen bool, enableMultiStream bool) {
+	cConvAutotuneFile := C.CString(convAutotuneFile)
+	cTransformerEncoderPrecision := C.CString(transformerEncoderPrecision)
 	defer func() {
-		C.free(unsafe.Pointer(cAutotuneFile))
-		C.free(unsafe.Pointer(cPrecision))
+		C.free(unsafe.Pointer(cConvAutotuneFile))
+		C.free(unsafe.Pointer(cTransformerEncoderPrecision))
 	}()
-	C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune),
-		cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
+	C.PD_ConfigEnableXpu(config.c, C.int32_t(l3Size), cvtGoBoolToPD(l3Locked), cvtGoBoolToPD(convAutotune), cConvAutotuneFile, cTransformerEncoderPrecision, cvtGoBoolToPD(transformerEncoderAdaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
 }
......
@@ -56,16 +56,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
 #endif

 #ifdef LITE_SUBGRAPH_WITH_XPU
-  // Deprecated in Paddle-Lite release/v2.8
-  lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
-      cfg.xpu_l3_workspace_size);
-  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size,
-                                          cfg.locked);
-  lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file);
-  lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision,
-                                               cfg.adaptive_seqlen);
+  lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
+  lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
+                                        cfg.xpu_conv_autotune_file);
+  lite_cxx_config.set_xpu_multi_encoder_method(
+      cfg.xpu_transformer_encoder_precision,
+      cfg.xpu_transformer_encoder_adaptive_seqlen);
   lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
-  if (cfg.enable_multi_stream) {
+  if (cfg.xpu_enable_multi_stream) {
     lite_cxx_config.enable_xpu_multi_stream();
   }
 #endif
......
@@ -41,13 +41,13 @@ struct EngineConfig {
   int device_id = 0;

   // for xpu
-  size_t xpu_l3_workspace_size;
-  bool locked = false;
-  bool autotune = true;
-  std::string autotune_file = "";
-  std::string precision = "int16";
-  bool adaptive_seqlen = false;
-  bool enable_multi_stream = false;
+  size_t xpu_l3_size{0};
+  bool xpu_l3_locked = false;
+  bool xpu_conv_autotune = true;
+  std::string xpu_conv_autotune_file = "";
+  std::string xpu_transformer_encoder_precision = "int16";
+  bool xpu_transformer_encoder_adaptive_seqlen = false;
+  bool xpu_enable_multi_stream = false;

   // for x86 or arm
   int cpu_math_library_num_threads{1};
......
@@ -21,6 +21,7 @@
     *paddle::internal*;
     *paddle::get_version*;
     *paddle::LiteNNAdapterConfig*;
+    *paddle::XpuConfig*;
     *paddle::AnalysisConfig::*;
     *paddle::PaddlePredictor::*;
     *paddle::CreatePaddlePredictor*;
......
@@ -108,6 +108,7 @@ void BindPaddlePredictor(py::module *m);
 void BindNativeConfig(py::module *m);
 void BindNativePredictor(py::module *m);
 void BindLiteNNAdapterConfig(py::module *m);
+void BindXpuConfig(py::module *m);
 void BindAnalysisConfig(py::module *m);
 void BindAnalysisPredictor(py::module *m);
 void BindZeroCopyTensor(py::module *m);
@@ -476,6 +477,7 @@ void BindInferenceApi(py::module *m) {
   BindNativeConfig(m);
   BindNativePredictor(m);
   BindLiteNNAdapterConfig(m);
+  BindXpuConfig(m);
   BindAnalysisConfig(m);
   BindAnalysisPredictor(m);
   BindPaddleInferPredictor(m);
@@ -756,21 +758,21 @@ void BindAnalysisConfig(py::module *m) {
 #endif
       .def("enable_xpu",
            &AnalysisConfig::EnableXpu,
-           py::arg("l3_workspace_size") = 16 * 1024 * 1024,
-           py::arg("locked") = false,
-           py::arg("autotune") = true,
-           py::arg("autotune_file") = "",
-           py::arg("precision") = "int16",
-           py::arg("adaptive_seqlen") = false,
+           py::arg("l3_size") = 16 * 1024 * 1024,
+           py::arg("l3_locked") = false,
+           py::arg("conv_autotune") = true,
+           py::arg("conv_autotune_file") = "",
+           py::arg("transformer_encoder_precision") = "int16",
+           py::arg("transformer_encoder_adaptive_seqlen") = false,
            py::arg("enable_multi_stream") = false)
       .def("set_xpu_device_id",
            &AnalysisConfig::SetXpuDeviceId,
            py::arg("device_id") = 0)
-      .def(
-          "set_xpu_config",
-          &AnalysisConfig::SetXpuConfig,
-          py::arg("quant_post_dynamic_weight_bits") = -1,
-          py::arg("quant_post_dynamic_op_types") = std::vector<std::string>({}))
+      .def("set_xpu_config",
+           [](AnalysisConfig &self, const paddle_infer::XpuConfig &xpu_config) {
+             self.SetXpuConfig(xpu_config);
+           })
+      .def("xpu_config", &AnalysisConfig::xpu_config)
       .def("enable_custom_device",
            &AnalysisConfig::EnableCustomDevice,
            py::arg("device_type"),
@@ -1000,6 +1002,38 @@ void BindLiteNNAdapterConfig(py::module *m) {
       .def("disable", &LiteNNAdapterConfig::Disable);
 }

+void BindXpuConfig(py::module *m) {
+  py::class_<XpuConfig>(*m, "XpuConfig")
+      .def(py::init<>())
+      .def_readwrite("device_id", &XpuConfig::device_id)
+      .def_readwrite("l3_ptr", &XpuConfig::l3_ptr)
+      .def_readwrite("l3_size", &XpuConfig::l3_size)
+      .def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size)
+      .def_readwrite("stream", &XpuConfig::stream)
+      .def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level)
+      .def_readwrite("conv_autotune_file", &XpuConfig::conv_autotune_file)
+      .def_readwrite("conv_autotune_file_writeback",
+                     &XpuConfig::conv_autotune_file_writeback)
+      .def_readwrite("fc_autotune_level", &XpuConfig::fc_autotune_level)
+      .def_readwrite("fc_autotune_file", &XpuConfig::fc_autotune_file)
+      .def_readwrite("fc_autotune_file_writeback",
+                     &XpuConfig::fc_autotune_file_writeback)
+      .def_readwrite("gemm_compute_precision",
+                     &XpuConfig::gemm_compute_precision)
+      .def_readwrite("transformer_softmax_optimize_level",
+                     &XpuConfig::transformer_softmax_optimize_level)
+      .def_readwrite("transformer_encoder_adaptive_seqlen",
+                     &XpuConfig::transformer_encoder_adaptive_seqlen)
+      .def_readwrite("quant_post_static_gelu_out_threshold",
+                     &XpuConfig::quant_post_static_gelu_out_threshold)
+      .def_readwrite("quant_post_dynamic_activation_method",
+                     &XpuConfig::quant_post_dynamic_activation_method)
+      .def_readwrite("quant_post_dynamic_weight_precision",
+                     &XpuConfig::quant_post_dynamic_weight_precision)
+      .def_readwrite("quant_post_dynamic_op_types",
+                     &XpuConfig::quant_post_dynamic_op_types);
+}
+
 #ifdef PADDLE_WITH_MKLDNN
 void BindMkldnnQuantizerConfig(py::module *m) {
   py::class_<MkldnnQuantizerConfig> quantizer_config(*m,
......
@@ -30,6 +30,7 @@ from paddle.fluid.core import (
     get_trt_runtime_version,
     get_num_bytes_of_data_type,
     PredictorPool,
+    XpuConfig,
 )

 __all__ = [  # noqa
@@ -47,4 +48,5 @@ __all__ = [  # noqa
     'get_trt_runtime_version',
     'get_num_bytes_of_data_type',
     'PredictorPool',
+    'XpuConfig',
 ]
@@ -1476,6 +1476,15 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
   endif()

   if(WITH_XPU)
+    inference_analysis_test(
+      xpu_config_resnet50_test
+      SRCS
+      xpu_config_resnet50_test.cc
+      EXTRA_DEPS
+      paddle_inference_shared
+      python
+      ARGS
+      --infer_model=${RESNET50_MODEL_DIR})
     inference_analysis_test(
       xpu_runtime_config_resnet50_test
       SRCS
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/tester_helper.h"
namespace paddle_infer {
static const std::vector<float> TRUTH_VALUES = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f,
-633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f,
-242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f,
-133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f,
-316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f,
-447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f,
-8.42969f, 131.365f, -648.051f};
void PrepareInput(std::shared_ptr<Predictor> predictor) {
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({batch, channel, height, width});
input_t->CopyFromCpu(input.data());
}
void CompareOutput(std::shared_ptr<Predictor> predictor) {
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR(
(data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3);
}
}
TEST(xpu_config, inference) {
size_t l3_size = 10 * 1024 * 1024;
XpuConfig xpu_config;
xpu_config.l3_size = l3_size;
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
config.SetXpuConfig(xpu_config);
XpuConfig xpu_config_test = config.xpu_config();
CHECK_EQ(xpu_config_test.l3_size, l3_size);
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
TEST(xpu_config, lite) {
size_t l3_size = 10 * 1024 * 1024;
XpuConfig xpu_config;
xpu_config.l3_size = l3_size;
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
config.SetXpuConfig(xpu_config);
config.EnableLiteEngine();
XpuConfig xpu_config_test = config.xpu_config();
CHECK_EQ(xpu_config_test.l3_size, l3_size);
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
} // namespace paddle_infer
@@ -63,7 +63,7 @@ void CompareOutput(std::shared_ptr<Predictor> predictor) {
   }
 }

-Config XpuConfig() {
+Config InferXpuConfig() {
   std::string model_dir = FLAGS_infer_model + "/" + "model";
   Config config;
   config.SetModel(model_dir + "/model", model_dir + "/params");
@@ -72,7 +72,7 @@ Config XpuConfig() {
 }

 TEST(resnet50_xpu, basic) {
-  Config config = XpuConfig();
+  Config config = InferXpuConfig();
   auto predictor = CreatePredictor(config);
   PrepareInput(predictor);
   predictor->Run();
@@ -80,7 +80,7 @@ TEST(resnet50_xpu, basic) {
 }

 #define RUN_WITH_RUNTIME_CONFIG(idx_, config_)                             \
-  Config config##idx_ = XpuConfig();                                       \
+  Config config##idx_ = InferXpuConfig();                                  \
   auto predictor##idx_ = CreatePredictor(config##idx_);                    \
   PrepareInput(predictor##idx_);                                           \
   experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
......