Unverified commit b62b384b authored by zhupengyang, committed by GitHub

refine xpu inference api (#54342)

Parent 8f65f72e
......@@ -367,8 +367,10 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
with_time_step,
with_seq_lengths,
with_src_mask);
int quant_weight_bits =
Has("quant_weight_bits") ? Get<int>("quant_weight_bits") : -1;
int quant_post_dynamic_weight_precision =
    Has("quant_post_dynamic_weight_precision")
        ? Get<int>("quant_post_dynamic_weight_precision")
        : -1;
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
......@@ -421,7 +423,7 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant(
w_node,
nullptr,
platform::errors::Fatal("w node should not be nullptr"));
if (quant_weight_bits == 8) {
if (quant_post_dynamic_weight_precision == 0) {
PrepareWeight<int8_t>(
graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
} else {
......
......@@ -93,6 +93,25 @@ struct Argument {
private: \
type__ field__##_;
#define DECL_POINTER_ARGUMENT_FIELD(field__, Field, type__) \
public: \
type__& field__() { \
PADDLE_ENFORCE_EQ( \
Has(#field__), \
true, \
platform::errors::PreconditionNotMet("There is no such field")); \
return field__##_; \
} \
void Set##Field(type__ x) { \
field__##_ = x; \
valid_fields_.insert(#field__); \
} \
DECL_ARGUMENT_FIELD_VALID(field__); \
type__* field__##_ptr() { return &field__##_; } \
\
private: \
type__ field__##_;
#define DECL_ARGUMENT_FIELD_VALID(field__) \
bool field__##_valid() { return Has(#field__); }
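Editor's note: to make the new macro concrete, here is an approximate expansion of DECL_POINTER_ARGUMENT_FIELD for the xpu_l3_ptr field declared below, obtained by substituting into the macro above (an illustrative sketch, not part of this commit):

// Sketch: approximate expansion of
// DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*)
public:
 void*& xpu_l3_ptr() {
   PADDLE_ENFORCE_EQ(
       Has("xpu_l3_ptr"),
       true,
       platform::errors::PreconditionNotMet("There is no such field"));
   return xpu_l3_ptr_;
 }
 void SetXpuL3Ptr(void* x) {
   xpu_l3_ptr_ = x;                     // stores the raw pointer value itself
   valid_fields_.insert("xpu_l3_ptr");  // marks the field as set
 }
 bool xpu_l3_ptr_valid() { return Has("xpu_l3_ptr"); }
 void** xpu_l3_ptr_ptr() { return &xpu_l3_ptr_; }

private:
 void* xpu_l3_ptr_;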
......@@ -276,20 +295,48 @@ struct Argument {
DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool);
DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool);
DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string);
DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits,
XpuQuantPostDynamicWeightBits,
// XpuConfig
DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_file_writeback,
XpuConvAutotuneFileWriteback,
bool);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_level, XpuFcAutotuneLevel, int);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_file, XpuFcAutotuneFile, std::string);
DECL_ARGUMENT_FIELD(xpu_fc_autotune_file_writeback,
XpuFcAutotuneFileWriteback,
bool);
DECL_ARGUMENT_FIELD(xpu_gemm_compute_precision, XpuGemmComputePrecision, int);
DECL_ARGUMENT_FIELD(xpu_transformer_softmax_optimize_level,
XpuTransformerSoftmaxOptimizeLevel,
int);
DECL_ARGUMENT_FIELD(xpu_transformer_encoder_adaptive_seqlen,
XpuTransformerEncoderAdaptiveSeqlen,
bool);
DECL_ARGUMENT_FIELD(xpu_quant_post_static_gelu_out_threshold,
XpuQuantPostStaticGeluOutThreshold,
float);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_activation_method,
XpuQuantPostDynamicActivationMethod,
int);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_precision,
XpuQuantPostDynamicWeightPrecision,
int);
DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types,
XpuQuantPostDynamicOpTypes,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(xpu_lite_l3_locked, XpuLiteL3Locked, bool);
DECL_ARGUMENT_FIELD(xpu_lite_enable_multi_stream,
XpuLiteEnableMultiStream,
bool);
DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);
......
......@@ -267,20 +267,41 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("enable_int8", new bool(lite_enable_int8));
pass->Set("use_gpu", new bool(argument->use_gpu()));
pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
pass->Set("xpu_l3_workspace_size",
new int(argument->xpu_l3_workspace_size()));
pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
pass->Set("xpu_l3_size", new size_t(argument->xpu_l3_size()));
pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
pass->Set("xpu_l3_autotune_size",
new size_t(argument->xpu_l3_autotune_size()));
pass->Set("xpu_stream", new void *(argument->xpu_stream()));
pass->Set("xpu_conv_autotune_level",
new int(argument->xpu_conv_autotune_level()));
pass->Set("xpu_conv_autotune_file",
new std::string(argument->xpu_conv_autotune_file()));
pass->Set("xpu_conv_autotune_file_writeback",
new bool(argument->xpu_conv_autotune_file_writeback()));
pass->Set("xpu_fc_autotune_level",
new int(argument->xpu_fc_autotune_level()));
pass->Set("xpu_fc_autotune_file",
new std::string(argument->xpu_fc_autotune_file()));
pass->Set("xpu_fc_autotune_file_writeback",
new bool(argument->xpu_fc_autotune_file_writeback()));
pass->Set("xpu_gemm_compute_precision",
new int(argument->xpu_gemm_compute_precision()));
pass->Set("xpu_transformer_softmax_optimize_level",
new int(argument->xpu_transformer_softmax_optimize_level()));
pass->Set("xpu_transformer_encoder_adaptive_seqlen",
new bool(argument->xpu_transformer_encoder_adaptive_seqlen()));
pass->Set(
"xpu_quant_post_static_gelu_out_threshold",
new float(argument->xpu_quant_post_static_gelu_out_threshold()));
pass->Set("xpu_quant_post_dynamic_activation_method",
new int(argument->xpu_quant_post_dynamic_activation_method()));
pass->Set("xpu_l3_locked", new bool(argument->xpu_lite_l3_locked()));
pass->Set("xpu_enable_multi_stream",
new bool(argument->xpu_lite_enable_multi_stream()));
pass->Set("use_opencl", new bool(argument->use_opencl()));
pass->Set("cpu_math_library_num_threads",
new int(argument->cpu_math_library_num_threads()));
pass->Set("locked", new bool(argument->xpu_locked()));
pass->Set("autotune", new bool(argument->xpu_autotune()));
pass->Set("autotune_file",
new std::string(argument->xpu_autotune_file()));
pass->Set("precision", new std::string(argument->xpu_precision()));
pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen()));
pass->Set("xpu_device_id", new int(argument->xpu_device_id()));
pass->Set("enable_multi_stream",
new bool(argument->xpu_enable_multi_stream()));
// NNAdapter Related
pass->Set("use_nnadapter", new bool(argument->use_nnadapter()));
pass->Set("nnadapter_model_cache_dir",
......@@ -313,12 +334,10 @@ void IRPassManager::CreatePasses(Argument *argument,
bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
pass->Set("use_fc_padding", new bool(use_fc_padding));
} else if (pass_name == "fused_multi_transformer_xpu_pass") {
auto op_types = argument->xpu_quant_post_dynamic_op_types();
if (std::count(op_types.begin(),
op_types.end(),
"fused_multi_transformer") > 0) {
pass->Set("quant_weight_bits",
new int(argument->xpu_quant_post_dynamic_weight_bits()));
int quant_post_dynamic_weight_precision =
argument->xpu_quant_post_dynamic_weight_precision();
if (quant_post_dynamic_weight_precision == 0) {
pass->Set("quant_post_dynamic_weight_precision ", new int(0));
}
}
pre_pass = pass_name;
......
......@@ -249,17 +249,27 @@ void LiteSubgraphPass::SetUpEngine(
bool use_gpu = Get<bool>("use_gpu");
bool enable_int8 = Get<bool>("enable_int8");
bool use_xpu = Get<bool>("use_xpu");
int xpu_device_id = Get<int>("xpu_device_id");
int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
bool use_opencl = Get<bool>("use_opencl");
int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
bool locked = Get<bool>("locked");
bool autotune = Get<bool>("autotune");
std::string autotune_file = Get<std::string>("autotune_file");
std::string precision = Get<std::string>("precision");
bool adaptive_seqlen = Get<bool>("adaptive_seqlen");
bool enable_multi_stream = Get<bool>("enable_multi_stream");
bool use_xpu = Get<bool>("use_xpu");
int xpu_device_id = Get<int>("xpu_device_id");
size_t xpu_l3_size = Get<size_t>("xpu_l3_size");
bool xpu_l3_locked = Get<bool>("xpu_l3_locked");
bool xpu_conv_autotune = Get<int>("xpu_conv_autotune_level") > 0;
std::string xpu_conv_autotune_file =
Get<std::string>("xpu_conv_autotune_file");
int xpu_gemm_compute_precision = Get<int>("xpu_gemm_compute_precision");
std::string xpu_transformer_encoder_precision{"int16"};
if (xpu_gemm_compute_precision == 0) {
xpu_transformer_encoder_precision = "int8";
} else if (xpu_gemm_compute_precision == 1) {
xpu_transformer_encoder_precision = "int16";
} else if (xpu_gemm_compute_precision == 2) {
xpu_transformer_encoder_precision = "int31";
}
bool xpu_transformer_encoder_adaptive_seqlen =
Get<bool>("xpu_transformer_encoder_adaptive_seqlen");
bool xpu_enable_multi_stream = Get<bool>("xpu_enable_multi_stream");
// NNAdapter Related
bool use_nnadapter = Get<bool>("use_nnadapter");
std::string nnadapter_model_cache_dir =
......@@ -344,14 +354,15 @@ void LiteSubgraphPass::SetUpEngine(
}
config.cpu_math_library_num_threads = cpu_math_library_num_threads;
config.xpu_l3_workspace_size = xpu_l3_workspace_size;
config.xpu_l3_size = xpu_l3_size;
config.device_id = xpu_device_id;
config.locked = locked;
config.autotune = autotune;
config.autotune_file = autotune_file;
config.precision = precision;
config.adaptive_seqlen = adaptive_seqlen;
config.enable_multi_stream = enable_multi_stream;
config.xpu_l3_locked = xpu_l3_locked;
config.xpu_conv_autotune = xpu_conv_autotune;
config.xpu_conv_autotune_file = xpu_conv_autotune_file;
config.xpu_transformer_encoder_precision = xpu_transformer_encoder_precision;
config.xpu_transformer_encoder_adaptive_seqlen =
xpu_transformer_encoder_adaptive_seqlen;
config.xpu_enable_multi_stream = xpu_enable_multi_stream;
// NNAdapter Related
config.nnadapter_model_cache_dir = nnadapter_model_cache_dir;
config.nnadapter_device_names = nnadapter_device_names;
......
......@@ -172,22 +172,34 @@ void AnalysisConfig::DisableFCPadding() {
Update();
}
void AnalysisConfig::EnableXpu(int l3_workspace_size,
bool locked,
bool autotune,
const std::string &autotune_file,
const std::string &precision,
bool adaptive_seqlen,
void AnalysisConfig::EnableXpu(int l3_size,
bool l3_locked,
bool conv_autotune,
const std::string &conv_autotune_file,
const std::string &transformer_encoder_precision,
bool transformer_encoder_adaptive_seqlen,
bool enable_multi_stream) {
#ifdef PADDLE_WITH_XPU
use_xpu_ = true;
xpu_l3_workspace_size_ = l3_workspace_size;
xpu_locked_ = locked;
xpu_autotune_ = autotune;
xpu_autotune_file_ = autotune_file;
xpu_precision_ = precision;
xpu_adaptive_seqlen_ = adaptive_seqlen;
xpu_enable_multi_stream_ = enable_multi_stream;
xpu_config_.l3_size = l3_size;
xpu_config_.conv_autotune_level = conv_autotune;
xpu_config_.conv_autotune_file = conv_autotune_file;
if (transformer_encoder_precision == "int8") {
xpu_config_.gemm_compute_precision = 0;
} else if (transformer_encoder_precision == "int16") {
xpu_config_.gemm_compute_precision = 1;
} else if (transformer_encoder_precision == "int31") {
xpu_config_.gemm_compute_precision = 2;
}
xpu_config_.transformer_encoder_adaptive_seqlen =
transformer_encoder_adaptive_seqlen;
xpu_lite_l3_locked_ = l3_locked;
xpu_lite_enable_multi_stream_ = enable_multi_stream;
Update();
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"To use XPU inference, please compile with option 'WITH_XPU' first."));
#endif
}
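Editor's note: for callers migrating to the renamed parameters, a minimal before/after sketch on an existing AnalysisConfig named config (values are the header defaults; parameter names are taken from the old and new signatures in this diff):

// Old call (before this commit):
//   config.EnableXpu(/*l3_workspace_size=*/0xfffc00, /*locked=*/false,
//                    /*autotune=*/true, /*autotune_file=*/"",
//                    /*precision=*/"int16", /*adaptive_seqlen=*/false,
//                    /*enable_multi_stream=*/false);
// New call (same positional order, renamed parameters):
config.EnableXpu(/*l3_size=*/0xfffc00,
                 /*l3_locked=*/false,
                 /*conv_autotune=*/true,
                 /*conv_autotune_file=*/"",
                 /*transformer_encoder_precision=*/"int16",
                 /*transformer_encoder_adaptive_seqlen=*/false,
                 /*enable_multi_stream=*/false);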
void AnalysisConfig::SetXpuDeviceId(int device_id) {
......@@ -195,15 +207,22 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
true,
platform::errors::PreconditionNotMet(
"Should call EnableXpu before SetXpuDeviceId."));
xpu_device_id_ = device_id;
xpu_config_.device_id = device_id;
Update();
}
void AnalysisConfig::SetXpuConfig(
int quant_post_dynamic_weight_bits,
const std::vector<std::string> &quant_post_dynamic_op_types) {
xpu_quant_post_dynamic_weight_bits_ = quant_post_dynamic_weight_bits;
xpu_quant_post_dynamic_op_types_ = quant_post_dynamic_op_types;
void AnalysisConfig::SetXpuConfig(const XpuConfig &config) {
PADDLE_ENFORCE(use_xpu_,
platform::errors::PreconditionNotMet(
"Should call EnableXpu before SetXpuConfig."));
PADDLE_ENFORCE_LE(
config.l3_autotune_size,
config.l3_size,
phi::errors::InvalidArgument(
"l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
config.l3_autotune_size,
config.l3_size));
xpu_config_ = config;
Update();
}
......@@ -494,16 +513,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
// XPU related.
CP_MEMBER(use_xpu_);
CP_MEMBER(xpu_device_id_);
CP_MEMBER(xpu_l3_workspace_size_);
CP_MEMBER(xpu_locked_);
CP_MEMBER(xpu_autotune_);
CP_MEMBER(xpu_autotune_file_);
CP_MEMBER(xpu_precision_);
CP_MEMBER(xpu_adaptive_seqlen_);
CP_MEMBER(xpu_enable_multi_stream_);
CP_MEMBER(xpu_quant_post_dynamic_weight_bits_);
CP_MEMBER(xpu_quant_post_dynamic_op_types_);
CP_MEMBER(xpu_config_);
CP_MEMBER(xpu_lite_l3_locked_);
CP_MEMBER(xpu_lite_enable_multi_stream_);
// Lite OpenCL Related
CP_MEMBER(use_opencl_);
......@@ -1033,7 +1045,6 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << exec_stream_;
ss << use_fc_padding_;
ss << gpu_device_id_;
ss << xpu_device_id_;
ss << memory_pool_init_size_mb_;
ss << use_tensorrt_;
......@@ -1080,17 +1091,26 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << use_lite_;
ss << use_xpu_;
ss << xpu_l3_workspace_size_;
ss << xpu_locked_;
ss << xpu_autotune_;
ss << xpu_autotune_file_;
ss << xpu_precision_;
ss << xpu_adaptive_seqlen_;
ss << xpu_enable_multi_stream_;
ss << xpu_quant_post_dynamic_weight_bits_;
for (auto op_type : xpu_quant_post_dynamic_op_types_) {
ss << op_type;
}
ss << xpu_config_.device_id;
ss << xpu_config_.l3_size;
ss << xpu_config_.l3_ptr;
ss << xpu_config_.l3_autotune_size;
ss << xpu_config_.stream;
ss << xpu_config_.conv_autotune_level;
ss << xpu_config_.conv_autotune_file;
ss << xpu_config_.conv_autotune_file_writeback;
ss << xpu_config_.fc_autotune_level;
ss << xpu_config_.fc_autotune_file;
ss << xpu_config_.fc_autotune_file_writeback;
ss << xpu_config_.gemm_compute_precision;
ss << xpu_config_.transformer_softmax_optimize_level;
ss << xpu_config_.transformer_encoder_adaptive_seqlen;
ss << xpu_config_.quant_post_static_gelu_out_threshold;
ss << xpu_config_.quant_post_dynamic_activation_method;
ss << xpu_config_.quant_post_dynamic_weight_precision;
for (auto type : xpu_config_.quant_post_dynamic_op_types) ss << type;
ss << xpu_lite_l3_locked_;
ss << xpu_lite_enable_multi_stream_;
ss << thread_local_stream_;
......@@ -1318,16 +1338,49 @@ std::string AnalysisConfig::Summary() {
// xpu info
os.InsertRow({"use_xpu", use_xpu_ ? "true" : "false"});
if (use_xpu_) {
os.InsertRow({"xpu_device_id", std::to_string(xpu_device_id_)});
os.InsertRow({"xpu_device_id", std::to_string(xpu_config_.device_id)});
os.InsertRow({"xpu_l3_size", std::to_string(xpu_config_.l3_size)});
os.InsertRow(
{"xpu_l3_workspace_size", std::to_string(xpu_l3_workspace_size_)});
os.InsertRow({"xpu_quant_post_dynamic_weight_bits",
std::to_string(xpu_quant_post_dynamic_weight_bits_)});
std::vector<std::string> op_types{"xpu_quant_post_dynamic_op_types"};
for (auto op_type : xpu_quant_post_dynamic_op_types_) {
op_types.push_back(op_type);
}
os.InsertRow(op_types);
{"xpu_l3_ptr",
std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))});
os.InsertRow(
{"xpu_l3_autotune_size", std::to_string(xpu_config_.l3_autotune_size)});
os.InsertRow(
{"xpu_stream",
std::to_string(reinterpret_cast<int64_t>(xpu_config_.stream))});
os.InsertRow({"xpu_conv_autotune_level",
std::to_string(xpu_config_.conv_autotune_level)});
os.InsertRow({"xpu_conv_autotune_file", xpu_config_.conv_autotune_file});
os.InsertRow({"xpu_conv_autotune_file_writeback",
std::to_string(xpu_config_.conv_autotune_file_writeback)});
os.InsertRow({"xpu_fc_autotune_level",
std::to_string(xpu_config_.fc_autotune_level)});
os.InsertRow({"xpu_fc_autotune_file", xpu_config_.fc_autotune_file});
os.InsertRow({"xpu_fc_autotune_file_writeback",
std::to_string(xpu_config_.fc_autotune_file_writeback)});
os.InsertRow({"xpu_gemm_compute_precision",
std::to_string(xpu_config_.gemm_compute_precision)});
os.InsertRow(
{"xpu_transformer_softmax_optimize_level",
std::to_string(xpu_config_.transformer_softmax_optimize_level)});
os.InsertRow(
{"xpu_transformer_encoder_adaptive_seqlen",
std::to_string(xpu_config_.transformer_encoder_adaptive_seqlen)});
os.InsertRow(
{"xpu_quant_post_static_gelu_out_threshold",
std::to_string(xpu_config_.quant_post_static_gelu_out_threshold)});
os.InsertRow(
{"xpu_quant_post_dynamic_activation_method",
std::to_string(xpu_config_.quant_post_dynamic_activation_method)});
os.InsertRow(
{"xpu_quant_post_dynamic_weight_precision ",
std::to_string(xpu_config_.quant_post_dynamic_weight_precision)});
std::vector<std::string> quant_post_dynamic_op_types_info =
xpu_config_.quant_post_dynamic_op_types;
quant_post_dynamic_op_types_info.insert(
quant_post_dynamic_op_types_info.begin(),
"xpu_quant_post_dynamic_op_types");
os.InsertRow(quant_post_dynamic_op_types_info);
}
os.InsetDivider();
......
......@@ -389,7 +389,7 @@ bool AnalysisPredictor::Init(
}
#endif
#if defined(PADDLE_WITH_XPU)
if (config_.use_xpu_) {
if (config_.use_xpu_ && !config_.use_lite_) {
private_context_ = true;
if (!status_is_cloned_ && config_.external_stream_enabled()) {
predictor_stream_ = config_.GetExecStream();
......@@ -1418,14 +1418,8 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetLitePassesFilter(config_.lite_passes_filter_);
argument_->SetLiteOpsFilter(config_.lite_ops_filter_);
argument_->SetLiteZeroCopy(config_.lite_zero_copy_);
argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
argument_->SetXpuLocked(config_.xpu_locked_);
argument_->SetXpuAutotune(config_.xpu_autotune_);
argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
argument_->SetXpuPrecision(config_.xpu_precision_);
argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
argument_->SetXpuDeviceId(config_.xpu_device_id_);
argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
argument_->SetXpuLocked(config_.xpu_lite_l3_locked_);
argument_->SetXpuEnableMultiStream(config_.xpu_lite_enable_multi_stream_);
argument_->SetUseOpenCL(config_.use_opencl_);
// NNAdapter related
argument_->SetUseNNAdapter(config_.NNAdapter().use_nnadapter);
......@@ -1506,21 +1500,36 @@ void AnalysisPredictor::PrepareArgument() {
}
#endif
#ifdef PADDLE_WITH_XPU
argument_->SetUseXpu(config_.use_xpu_);
argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
argument_->SetXpuLocked(config_.xpu_locked_);
argument_->SetXpuAutotune(config_.xpu_autotune_);
argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
argument_->SetXpuPrecision(config_.xpu_precision_);
argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
argument_->SetXpuDeviceId(config_.xpu_device_id_);
argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
argument_->SetXpuQuantPostDynamicWeightBits(
config_.xpu_quant_post_dynamic_weight_bits_);
argument_->SetXpuDeviceId(config_.xpu_config_.device_id);
argument_->SetXpuL3Size(config_.xpu_config_.l3_size);
argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr);
argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size);
argument_->SetXpuStream(config_.xpu_config_.stream);
argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level);
argument_->SetXpuConvAutotuneFile(config_.xpu_config_.conv_autotune_file);
argument_->SetXpuConvAutotuneFileWriteback(
config_.xpu_config_.conv_autotune_file_writeback);
argument_->SetXpuFcAutotuneLevel(config_.xpu_config_.fc_autotune_level);
argument_->SetXpuFcAutotuneFile(config_.xpu_config_.fc_autotune_file);
argument_->SetXpuFcAutotuneFileWriteback(
config_.xpu_config_.fc_autotune_file_writeback);
argument_->SetXpuGemmComputePrecision(
config_.xpu_config_.gemm_compute_precision);
argument_->SetXpuTransformerSoftmaxOptimizeLevel(
config_.xpu_config_.transformer_softmax_optimize_level);
argument_->SetXpuTransformerEncoderAdaptiveSeqlen(
config_.xpu_config_.transformer_encoder_adaptive_seqlen);
argument_->SetXpuQuantPostStaticGeluOutThreshold(
config_.xpu_config_.quant_post_static_gelu_out_threshold);
argument_->SetXpuQuantPostDynamicActivationMethod(
config_.xpu_config_.quant_post_dynamic_activation_method);
argument_->SetXpuQuantPostDynamicWeightPrecision(
config_.xpu_config_.quant_post_dynamic_weight_precision);
argument_->SetXpuQuantPostDynamicOpTypes(
config_.xpu_quant_post_dynamic_op_types_);
#endif
config_.xpu_config_.quant_post_dynamic_op_types);
argument_->SetXpuLiteL3Locked(config_.xpu_lite_l3_locked_);
argument_->SetXpuLiteEnableMultiStream(config_.xpu_lite_enable_multi_stream_);
auto *pass_builder = config_.pass_builder();
// TODO(inference): Need to reconstruct the pass_builder, pass should be
......@@ -2076,9 +2085,36 @@ bool AnalysisPredictor::ZeroCopyRun() {
}
#endif
#ifdef PADDLE_WITH_XPU
InferXPUContext *infer_xpu_ctx = nullptr;
if (config_.use_xpu_ && !config_.use_lite_) {
PADDLE_ENFORCE(
private_context_,
paddle::platform::errors::Fatal(
"Must use private context when running the predictor on an XPU place."));
auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts());
infer_xpu_ctx =
static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
infer_xpu_ctx->SetStream(predictor_stream_);
infer_xpu_ctx->SetL3Info(config_.xpu_config_.l3_size,
config_.xpu_config_.l3_ptr,
config_.xpu_config_.l3_autotune_size,
place_);
}
#endif
executor_->Run();
inference::DisplayMemoryInfo(place_, "after run");
#ifdef PADDLE_WITH_XPU
if (config_.use_xpu_ && !config_.use_lite_ && infer_xpu_ctx != nullptr) {
infer_xpu_ctx->L3CacheAutotune();
}
#endif
if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
}
......@@ -2148,18 +2184,6 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
#ifdef PADDLE_WITH_XPU
PADDLE_ENFORCE(
private_context_,
paddle::platform::errors::Fatal(
"Must use private context if run predictor with external config."));
auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts());
auto *dev_ctx =
static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
auto xpu_runtime_config =
reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
auto *stream = xpu_runtime_config->stream;
......@@ -2167,12 +2191,10 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
paddle::platform::XPUStreamSync(
static_cast<paddle::xpuStream>(predictor_stream_));
predictor_stream_ = stream;
dev_ctx->SetStream(stream);
}
size_t l3_size = xpu_runtime_config->l3_size;
void *l3_ptr = xpu_runtime_config->l3_ptr;
size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size;
auto l3_size = xpu_runtime_config->l3_size;
auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
PADDLE_ENFORCE_LE(
l3_autotune_size,
l3_size,
......@@ -2180,11 +2202,11 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
"l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
l3_autotune_size,
l3_size));
dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size, place_);
config_.xpu_config_.l3_size = l3_size;
config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
config_.xpu_config_.l3_autotune_size = l3_autotune_size;
bool ret = ZeroCopyRun();
dev_ctx->L3CacheAutotune();
return ret;
return ZeroCopyRun();
#endif
return false;
}
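Editor's note: a hedged sketch of driving this path from user code. The XpuRuntimeConfig fields are the ones read above; the InternalUtils::RunWithRuntimeConfig entry point and its second argument are inferred from the RUN_WITH_RUNTIME_CONFIG macro in the test at the end of this commit, and external_stream stands for a caller-created XPU stream (an assumption):

paddle_infer::experimental::XpuRuntimeConfig xpu_runtime_config;
xpu_runtime_config.stream = external_stream;    // hypothetical caller-owned stream
xpu_runtime_config.l3_size = 10 * 1024 * 1024;  // per-run L3 budget
xpu_runtime_config.l3_ptr = nullptr;            // let the runtime create the L3 buffer
xpu_runtime_config.l3_autotune_size = 0;        // must be <= l3_size (enforced above)
// Copies the L3 settings into config_.xpu_config_, then calls ZeroCopyRun().
paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(
    predictor.get(), &xpu_runtime_config);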
......
......@@ -76,6 +76,77 @@ struct LiteNNAdapterConfig {
LiteNNAdapterConfig& Disable();
};
struct PD_INFER_DECL XpuConfig {
// Select which xpu device to run model.
int device_id{0};
// Available l3 size (in bytes).
// For kunlun1, max l3_size is 16773120 bytes.
// For kunlun2, max l3_size is 67104768 bytes.
size_t l3_size{0};
// If l3_ptr is not nullptr, it is used as the l3 buffer.
// If l3_ptr is nullptr, a new l3 buffer will be created.
void* l3_ptr{nullptr};
// Available l3 size for autotune.
// If l3_autotune_size is 0, autotune is disabled.
// Note: The remaining l3 size (l3_size - l3_autotune_size) is for
// kernels (both paddle/xdnn kernels).
size_t l3_autotune_size{0};
// Stream for execution.
// If stream is nullptr, the default stream will be used.
void* stream{nullptr};
// Conv autotune level. Default 0 means no autotune.
// Note: Paddle-Lite only.
int conv_autotune_level{0};
// Base conv autotune info is read from conv_autotune_file.
// Note: Paddle-Lite only.
std::string conv_autotune_file;
// Whether write new conv autotune info to conv_autotune_file.
// Note: Paddle-Lite only.
bool conv_autotune_file_writeback{false};
// Fc autotune level. Optional values are 0-9. Default 0 means no autotune.
// Note: Paddle-Lite only.
int fc_autotune_level{0};
// Base fc autotune info is read from fc_autotune_file.
// Note: Paddle-Lite only.
std::string fc_autotune_file;
// Whether write new fc autotune info to fc_autotune_file.
// Note: Paddle-Lite only.
bool fc_autotune_file_writeback{false};
// Gemm compute precision. Optional values are 0(int8), 1(int16), 2(int31).
// Note: "gemm_compute_precision" has no effect on quantized ops of a quant
// model.
// Note: Paddle-Lite only.
int gemm_compute_precision{1};
// Method used to optimize softmax in the transformer structure. Optional
// values are 0, 1, 2. Note: Paddle-Lite only.
int transformer_softmax_optimize_level{0};
// Whether enable adaptive_seqlen optimize on transformer encoder.
// Note: Paddle-Lite only.
bool transformer_encoder_adaptive_seqlen{true};
// The gelu output max threshold is limited to
// quant_post_static_gelu_out_threshold if static post-quantization is used.
// Note: Paddle-Lite only.
float quant_post_static_gelu_out_threshold{10.f};
// Activation quantization method when dynamic post-quantization is used.
// For kunlun1, optional values are 0(per_tensor), 1(per_batch), 2(per_head).
// For kunlun2, optional values are 0(per_tensor) or non-zero(every_16).
// Note: Paddle-Lite only.
int quant_post_dynamic_activation_method{0};
// Preprocess weights to quant_post_dynamic_weight_precision when dynamic
// post-quantization is used. Optional values are 0, 1, 2.
// * If 0, preprocess weights to int8.
// * If 1, preprocess weights to int16.
// * If 2, preprocess weights to float.
// Note: PaddleInference only.
int quant_post_dynamic_weight_precision{1};
std::vector<std::string> quant_post_dynamic_op_types;
};
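Editor's note: to make the new struct concrete, a minimal usage sketch mirroring the test added by this commit (editor's illustration, assuming an XPU build; the include path may vary by install):

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("model_dir/model", "model_dir/params");
  config.EnableXpu();  // must be called before SetXpuConfig

  paddle_infer::XpuConfig xpu_config;
  xpu_config.device_id = 0;
  xpu_config.l3_size = 10 * 1024 * 1024;  // 10 MB L3 budget
  xpu_config.l3_autotune_size = 0;        // 0 disables L3 autotune
  config.SetXpuConfig(xpu_config);        // enforces l3_autotune_size <= l3_size

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... set inputs, predictor->Run(), fetch outputs ...
  return 0;
}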
struct DistConfig {
bool use_dist_model() const { return use_dist_model_; }
void EnableDistModel(bool use_dist_model) {
......@@ -271,42 +342,46 @@ struct PD_INFER_DECL AnalysisConfig {
/// \brief Turn on XPU.
///
/// \param l3_workspace_size The size of the video memory allocated by the l3
/// cache, the maximum is 16M.
/// \param locked Whether the allocated L3 cache can be locked. If false,
/// \param l3_size The size of the video memory allocated by the l3
/// cache, the maximum is 16M.
/// \param l3_locked Whether the allocated L3 cache can be locked. If false,
/// it means that the L3 cache is not locked, and the allocated L3
/// cache can be shared by multiple models, and multiple models
/// sharing the L3 cache will be executed sequentially on the card.
/// \param autotune Whether to autotune the conv operator in the model. If
/// true, when the conv operator of a certain dimension is executed
/// \param conv_autotune Whether to autotune the conv operator in the model.
/// If true, when the conv operator of a certain dimension is executed
/// for the first time, it will automatically search for a better
/// algorithm to improve the performance of subsequent conv operators
/// of the same dimension.
/// \param autotune_file Specify the path of the autotune file. If
/// \param conv_autotune_file Specify the path of the autotune file. If
/// autotune_file is specified, the algorithm specified in the
/// file will be used and autotune will not be performed again.
/// \param precision Calculation accuracy of multi_encoder
/// \param adaptive_seqlen Is the input of multi_encoder variable length
/// \param enable_multi_stream Whether to enable the multi stream of xpu.
///
void EnableXpu(int l3_workspace_size = 0xfffc00,
bool locked = false,
bool autotune = true,
const std::string& autotune_file = "",
const std::string& precision = "int16",
bool adaptive_seqlen = false,
/// \param transformer_encoder_precision Calculation accuracy of multi_encoder
/// \param transformer_encoder_adaptive_seqlen Whether the input of
/// multi_encoder is variable length
/// \param enable_multi_stream Whether to enable the multi stream of xpu.
///
void EnableXpu(int l3_size = 0xfffc00,
bool l3_locked = false,
bool conv_autotune = true,
const std::string& conv_autotune_file = "",
const std::string& transformer_encoder_precision = "int16",
bool transformer_encoder_adaptive_seqlen = false,
bool enable_multi_stream = false);
///
/// \brief configs of XPU
///
/// \param quant_post_dynamic_weight_bits Weight bits used in dynamic post
/// quantization. Optional value: -1, 8, 16. Default value is -1, means using
/// the recommended way. \param quant_post_dynamic_op_types Ops used in
/// dynamic post quantization.
/// \param config Configs for xpu. See XpuConfig for more details.
///
void SetXpuConfig(const XpuConfig& config);
///
/// \brief Get configs of xpu
///
/// \return XpuConfig The configs of xpu.
///
void SetXpuConfig(
int quant_post_dynamic_weight_bits = -1,
const std::vector<std::string>& quant_post_dynamic_op_types = {});
XpuConfig xpu_config() { return xpu_config_; }
///
/// \brief configs of IPU
......@@ -462,7 +537,7 @@ struct PD_INFER_DECL AnalysisConfig {
///
/// \return int The XPU device id.
///
int xpu_device_id() const { return xpu_device_id_; }
int xpu_device_id() const { return xpu_config_.device_id; }
/// \brief Get the number of IPU device .
///
/// \return int The number of IPU device.
......@@ -1191,16 +1266,9 @@ struct PD_INFER_DECL AnalysisConfig {
// XPU related.
bool use_xpu_{false};
int xpu_device_id_{0};
int xpu_l3_workspace_size_{0};
bool xpu_locked_;
bool xpu_autotune_;
std::string xpu_autotune_file_;
std::string xpu_precision_;
bool xpu_adaptive_seqlen_;
bool xpu_enable_multi_stream_;
int xpu_quant_post_dynamic_weight_bits_{-1};
std::vector<std::string> xpu_quant_post_dynamic_op_types_;
XpuConfig xpu_config_;
bool xpu_lite_l3_locked_{false};
bool xpu_lite_enable_multi_stream_{false};
// LITE OPENCL SETTINGS
bool use_opencl_{false};
......
......@@ -47,6 +47,7 @@ namespace paddle_infer {
using PrecisionType = paddle::AnalysisConfig::Precision;
using Config = paddle::AnalysisConfig;
using DistConfig = paddle::DistConfig;
using XpuConfig = paddle::XpuConfig;
///
/// \class Predictor
......
......@@ -154,20 +154,20 @@ void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) {
}
void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
int32_t l3_workspace_size,
PD_Bool locked,
PD_Bool autotune,
const char* autotune_file,
const char* precision,
PD_Bool adaptive_seqlen,
int32_t l3_size,
PD_Bool l3_locked,
PD_Bool conv_autotune,
const char* conv_autotune_file,
const char* transformer_encoder_precision,
PD_Bool transformer_encoder_adaptive_seqlen,
PD_Bool enable_multi_stream) {
CHECK_AND_CONVERT_PD_CONFIG;
config->EnableXpu(l3_workspace_size,
locked,
autotune,
autotune_file,
precision,
adaptive_seqlen,
config->EnableXpu(l3_size,
l3_locked,
conv_autotune,
conv_autotune_file,
transformer_encoder_precision,
transformer_encoder_adaptive_seqlen,
enable_multi_stream);
}
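Editor's note: for C-API users, a corresponding sketch of the renamed call (PD_ConfigCreate and PD_ConfigDestroy are assumed from the existing C API; PD_Bool values shown as 0/1):

PD_Config* config = PD_ConfigCreate();
PD_ConfigEnableXpu(config,
                   /*l3_size=*/0xfffc00,
                   /*l3_locked=*/0,
                   /*conv_autotune=*/1,
                   /*conv_autotune_file=*/"",
                   /*transformer_encoder_precision=*/"int16",
                   /*transformer_encoder_adaptive_seqlen=*/0,
                   /*enable_multi_stream=*/0);
PD_ConfigDestroy(config);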
......
......@@ -193,23 +193,22 @@ func (config *Config) EnableORTOptimization() {
///
/// \brief Turn on XPU.
///
/// \param l3_workspace_size The size of the video memory allocated by the l3 cache, the maximum is 16M.
/// \param locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
/// \param autotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
/// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
/// \param precision Calculation accuracy of multi_encoder
/// \param adaptive_seqlen Is the input of multi_encoder variable length
/// \param l3Size The size of the video memory allocated by the l3 cache, the maximum is 16M.
/// \param l3Locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card.
/// \param convAutotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
/// \param convAutotuneFile Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again.
/// \param transformerEncoderPrecision Calculation accuracy of multi_encoder
/// \param transformerEncoderAdaptiveSeqlen Whether the input of multi_encoder is variable length
/// \param enable_multi_stream Whether to enable the multi stream of xpu
///
func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool, enableMultiStream bool) {
cAutotuneFile := C.CString(autotuneFile)
cPrecision := C.CString(precision)
func (config *Config) EnableXpu(l3Size int32, l3Locked bool, convAutotune bool, convAutotuneFile string, transformerEncoderPrecision string, transformerEncoderAdaptiveSeqlen bool, enableMultiStream bool) {
cConvAutotuneFile := C.CString(convAutotuneFile)
cTransformerEncoderPrecision := C.CString(transformerEncoderPrecision)
defer func() {
C.free(unsafe.Pointer(cAutotuneFile))
C.free(unsafe.Pointer(cPrecision))
C.free(unsafe.Pointer(cConvAutotuneFile))
C.free(unsafe.Pointer(cTransformerEncoderPrecision))
}()
C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune),
cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
C.PD_ConfigEnableXpu(config.c, C.int32_t(l3Size), cvtGoBoolToPD(l3Locked), cvtGoBoolToPD(convAutotune), cConvAutotuneFile, cTransformerEncoderPrecision, cvtGoBoolToPD(transformerEncoderAdaptiveSeqlen), cvtGoBoolToPD(enableMultiStream))
}
///
......
......@@ -56,16 +56,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
#endif
#ifdef LITE_SUBGRAPH_WITH_XPU
// Deprecated in Paddle-Lite release/v2.8
lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size);
lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size,
cfg.locked);
lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file);
lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision,
cfg.adaptive_seqlen);
lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_size, cfg.xpu_l3_locked);
lite_cxx_config.set_xpu_conv_autotune(cfg.xpu_conv_autotune,
cfg.xpu_conv_autotune_file);
lite_cxx_config.set_xpu_multi_encoder_method(
cfg.xpu_transformer_encoder_precision,
cfg.xpu_transformer_encoder_adaptive_seqlen);
lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id);
if (cfg.enable_multi_stream) {
if (cfg.xpu_enable_multi_stream) {
lite_cxx_config.enable_xpu_multi_stream();
}
#endif
......
......@@ -41,13 +41,13 @@ struct EngineConfig {
int device_id = 0;
// for xpu
size_t xpu_l3_workspace_size;
bool locked = false;
bool autotune = true;
std::string autotune_file = "";
std::string precision = "int16";
bool adaptive_seqlen = false;
bool enable_multi_stream = false;
size_t xpu_l3_size{0};
bool xpu_l3_locked = false;
bool xpu_conv_autotune = true;
std::string xpu_conv_autotune_file = "";
std::string xpu_transformer_encoder_precision = "int16";
bool xpu_transformer_encoder_adaptive_seqlen = false;
bool xpu_enable_multi_stream = false;
// for x86 or arm
int cpu_math_library_num_threads{1};
......
......@@ -21,6 +21,7 @@
*paddle::internal*;
*paddle::get_version*;
*paddle::LiteNNAdapterConfig*;
*paddle::XpuConfig*;
*paddle::AnalysisConfig::*;
*paddle::PaddlePredictor::*;
*paddle::CreatePaddlePredictor*;
......
......@@ -108,6 +108,7 @@ void BindPaddlePredictor(py::module *m);
void BindNativeConfig(py::module *m);
void BindNativePredictor(py::module *m);
void BindLiteNNAdapterConfig(py::module *m);
void BindXpuConfig(py::module *m);
void BindAnalysisConfig(py::module *m);
void BindAnalysisPredictor(py::module *m);
void BindZeroCopyTensor(py::module *m);
......@@ -476,6 +477,7 @@ void BindInferenceApi(py::module *m) {
BindNativeConfig(m);
BindNativePredictor(m);
BindLiteNNAdapterConfig(m);
BindXpuConfig(m);
BindAnalysisConfig(m);
BindAnalysisPredictor(m);
BindPaddleInferPredictor(m);
......@@ -756,21 +758,21 @@ void BindAnalysisConfig(py::module *m) {
#endif
.def("enable_xpu",
&AnalysisConfig::EnableXpu,
py::arg("l3_workspace_size") = 16 * 1024 * 1024,
py::arg("locked") = false,
py::arg("autotune") = true,
py::arg("autotune_file") = "",
py::arg("precision") = "int16",
py::arg("adaptive_seqlen") = false,
py::arg("l3_size") = 16 * 1024 * 1024,
py::arg("l3_locked") = false,
py::arg("conv_autotune") = true,
py::arg("conv_autotune_file") = "",
py::arg("transformer_encoder_precision") = "int16",
py::arg("transformer_encoder_adaptive_seqlen") = false,
py::arg("enable_multi_stream") = false)
.def("set_xpu_device_id",
&AnalysisConfig::SetXpuDeviceId,
py::arg("device_id") = 0)
.def(
"set_xpu_config",
&AnalysisConfig::SetXpuConfig,
py::arg("quant_post_dynamic_weight_bits") = -1,
py::arg("quant_post_dynamic_op_types") = std::vector<std::string>({}))
.def("set_xpu_config",
[](AnalysisConfig &self, const paddle_infer::XpuConfig &xpu_config) {
self.SetXpuConfig(xpu_config);
})
.def("xpu_config", &AnalysisConfig::xpu_config)
.def("enable_custom_device",
&AnalysisConfig::EnableCustomDevice,
py::arg("device_type"),
......@@ -1000,6 +1002,38 @@ void BindLiteNNAdapterConfig(py::module *m) {
.def("disable", &LiteNNAdapterConfig::Disable);
}
void BindXpuConfig(py::module *m) {
py::class_<XpuConfig>(*m, "XpuConfig")
.def(py::init<>())
.def_readwrite("device_id", &XpuConfig::device_id)
.def_readwrite("l3_ptr", &XpuConfig::l3_ptr)
.def_readwrite("l3_size", &XpuConfig::l3_size)
.def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size)
.def_readwrite("stream", &XpuConfig::stream)
.def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level)
.def_readwrite("conv_autotune_file", &XpuConfig::conv_autotune_file)
.def_readwrite("conv_autotune_file_writeback",
&XpuConfig::conv_autotune_file_writeback)
.def_readwrite("fc_autotune_level", &XpuConfig::fc_autotune_level)
.def_readwrite("fc_autotune_file", &XpuConfig::fc_autotune_file)
.def_readwrite("fc_autotune_file_writeback",
&XpuConfig::fc_autotune_file_writeback)
.def_readwrite("gemm_compute_precision",
&XpuConfig::gemm_compute_precision)
.def_readwrite("transformer_softmax_optimize_level",
&XpuConfig::transformer_softmax_optimize_level)
.def_readwrite("transformer_encoder_adaptive_seqlen",
&XpuConfig::transformer_encoder_adaptive_seqlen)
.def_readwrite("quant_post_static_gelu_out_threshold",
&XpuConfig::quant_post_static_gelu_out_threshold)
.def_readwrite("quant_post_dynamic_activation_method",
&XpuConfig::quant_post_dynamic_activation_method)
.def_readwrite("quant_post_dynamic_weight_precision",
&XpuConfig::quant_post_dynamic_weight_precision)
.def_readwrite("quant_post_dynamic_op_types",
&XpuConfig::quant_post_dynamic_op_types);
}
#ifdef PADDLE_WITH_MKLDNN
void BindMkldnnQuantizerConfig(py::module *m) {
py::class_<MkldnnQuantizerConfig> quantizer_config(*m,
......
......@@ -30,6 +30,7 @@ from paddle.fluid.core import (
get_trt_runtime_version,
get_num_bytes_of_data_type,
PredictorPool,
XpuConfig,
)
__all__ = [ # noqa
......@@ -47,4 +48,5 @@ __all__ = [ # noqa
'get_trt_runtime_version',
'get_num_bytes_of_data_type',
'PredictorPool',
'XpuConfig',
]
......@@ -1476,6 +1476,15 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
endif()
if(WITH_XPU)
inference_analysis_test(
xpu_config_resnet50_test
SRCS
xpu_config_resnet50_test.cc
EXTRA_DEPS
paddle_inference_shared
python
ARGS
--infer_model=${RESNET50_MODEL_DIR})
inference_analysis_test(
xpu_runtime_config_resnet50_test
SRCS
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/tester_helper.h"
namespace paddle_infer {
static const std::vector<float> TRUTH_VALUES = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f,
-633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f,
-242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f,
-133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f,
-316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f,
-447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f,
-8.42969f, 131.365f, -648.051f};
void PrepareInput(std::shared_ptr<Predictor> predictor) {
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({batch, channel, height, width});
input_t->CopyFromCpu(input.data());
}
void CompareOutput(std::shared_ptr<Predictor> predictor) {
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR(
(data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3);
}
}
TEST(xpu_config, inference) {
size_t l3_size = 10 * 1024 * 1024;
XpuConfig xpu_config;
xpu_config.l3_size = l3_size;
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
config.SetXpuConfig(xpu_config);
XpuConfig xpu_config_test = config.xpu_config();
CHECK_EQ(xpu_config_test.l3_size, l3_size);
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
TEST(xpu_config, lite) {
size_t l3_size = 10 * 1024 * 1024;
XpuConfig xpu_config;
xpu_config.l3_size = l3_size;
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
config.SetXpuConfig(xpu_config);
config.EnableLiteEngine();
XpuConfig xpu_config_test = config.xpu_config();
CHECK_EQ(xpu_config_test.l3_size, l3_size);
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
} // namespace paddle_infer
......@@ -63,7 +63,7 @@ void CompareOutput(std::shared_ptr<Predictor> predictor) {
}
}
Config XpuConfig() {
Config InferXpuConfig() {
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
......@@ -72,7 +72,7 @@ Config XpuConfig() {
}
TEST(resnet50_xpu, basic) {
Config config = XpuConfig();
Config config = InferXpuConfig();
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
......@@ -80,7 +80,7 @@ TEST(resnet50_xpu, basic) {
}
#define RUN_WITH_RUNTIME_CONFIG(idx_, config_) \
Config config##idx_ = XpuConfig(); \
Config config##idx_ = InferXpuConfig(); \
auto predictor##idx_ = CreatePredictor(config##idx_); \
PrepareInput(predictor##idx_); \
experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
......