Unverified commit 52ad918b, authored by AlbertVan, committed by GitHub

[XPU] add context_gm_size in XpuConfig, don't alloc gm in pass. (#54674)

Parent commit: f38e126e
......@@ -9,9 +9,13 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
set(XPU_XFT_LIB_NAME "libxft.so")
set(XPU_XPTI_LIB_NAME "libxpti.so")
set(XPU_BASE_DATE "20230602")
if(NOT DEFINED XPU_BASE_DATE)
set(XPU_BASE_DATE "20230602")
endif()
set(XPU_XCCL_BASE_VERSION "1.0.49.2")
set(XPU_XFT_BASE_VERSION "latest")
if(NOT DEFINED XPU_XFT_BASE_VERSION)
set(XPU_XFT_BASE_VERSION "20230602")
endif()
set(XPU_XPTI_BASE_VERSION "0.0.1")
if(NOT DEFINED XPU_BASE_URL)
......
......@@ -265,7 +265,7 @@ void PrepareWeight(phi::DenseTensor* weight,
}
// Find max
int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(0);
int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
int size = weight_fp32.numel();
auto* weight_data = weight_fp32.data<float>();
float max_val = FindMaxAbs(weight_data, size);
......
......@@ -303,6 +303,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
DECL_ARGUMENT_FIELD(xpu_context_gm_size, XpuContextGmSize, int);
DECL_POINTER_ARGUMENT_FIELD(xpu_context, XpuContext, void*);
DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
......
......@@ -272,6 +272,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
pass->Set("xpu_l3_autotune_size",
new size_t(argument->xpu_l3_autotune_size()));
pass->Set("xpu_context_gm_size",
new int(argument->xpu_context_gm_size()));
pass->Set("xpu_context", new void *(argument->xpu_context()));
pass->Set("xpu_stream", new void *(argument->xpu_stream()));
pass->Set("xpu_conv_autotune_level",
......
......@@ -1096,6 +1096,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << xpu_config_.l3_size;
ss << xpu_config_.l3_ptr;
ss << xpu_config_.l3_autotune_size;
ss << xpu_config_.context_gm_size;
ss << xpu_config_.context;
ss << xpu_config_.stream;
ss << xpu_config_.conv_autotune_level;
......@@ -1347,6 +1348,8 @@ std::string AnalysisConfig::Summary() {
std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))});
os.InsertRow(
{"xpu_l3_autotune_size", std::to_string(xpu_config_.l3_autotune_size)});
os.InsertRow(
{"xpu_context_gm_size", std::to_string(xpu_config_.context_gm_size)});
os.InsertRow(
{"xpu_context",
std::to_string(reinterpret_cast<int64_t>(xpu_config_.context))});
......
......@@ -93,6 +93,10 @@
#include "paddle/fluid/platform/device/ipu/paddle_ipu_handler.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_info.h"
#endif
namespace paddle {
namespace {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -436,6 +440,7 @@ void AnalysisPredictor::InitPlace() {
#endif // LITE_SUBGRAPH_WITH_XPU
} else {
#ifdef PADDLE_WITH_XPU
phi::backends::xpu::SetXPUDeviceId(config_.xpu_device_id());
place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
#else
PADDLE_THROW(platform::errors::Unavailable(
......@@ -509,7 +514,8 @@ void AnalysisPredictor::InitDeviceContexts() {
device_contexts_.emplace(
place_, std::async(std::launch::deferred, [=] {
auto &instance = memory::allocation::AllocatorFacade::Instance();
auto *xpu_context = new InferXPUContext(place_);
auto *xpu_context =
new InferXPUContext(place_, config_.xpu_config().context_gm_size);
xpu_context->SetAllocator(instance.GetAllocator(place_).get());
xpu_context->SetGenerator(
phi::DefaultXPUGenerator(place_.GetDeviceId()).get());
......@@ -1504,6 +1510,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetXpuL3Size(config_.xpu_config_.l3_size);
argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr);
argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size);
argument_->SetXpuContextGmSize(config_.xpu_config_.context_gm_size);
argument_->SetXpuContext(config_.xpu_config_.context);
argument_->SetXpuStream(config_.xpu_config_.stream);
argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level);
......
......@@ -27,8 +27,15 @@ InferGPUContext::InferGPUContext(const phi::Place& place)
#endif
#ifdef PADDLE_WITH_XPU
InferXPUContext::InferXPUContext(const phi::Place& place)
: phi::XPUContext(place) {}
// Inference-side XPU context.
// context_gm_size: reserved global-memory size (bytes) handed to the
// underlying xdnn context via the "XPUAPI_DEFAULT_SIZE" option. A negative
// value (the default, -1) passes an empty option string, which leaves the
// library's own default in effect.
InferXPUContext::InferXPUContext(const phi::Place& place, int context_gm_size)
    : phi::XPUContext(place) {
  // Compute the option value once instead of branching on two set_option
  // calls; an empty string restores the xdnn default.
  const std::string gm_option = context_gm_size >= 0
                                    ? std::to_string(context_gm_size)
                                    : std::string();
  x_context()->set_option("XPUAPI_DEFAULT_SIZE", gm_option.c_str());
}
void* InferXPUContext::Alloc(phi::TensorBase* tensor,
phi::DataType dtype,
......
......@@ -52,7 +52,7 @@ class InferGPUContext : public phi::GPUContext {
#ifdef PADDLE_WITH_XPU
class InferXPUContext : public phi::XPUContext {
public:
explicit InferXPUContext(const phi::Place& place);
explicit InferXPUContext(const phi::Place& place, int context_gm_size = -1);
void* Alloc(phi::TensorBase* tensor,
phi::DataType dtype,
......
......@@ -93,6 +93,10 @@ struct PD_INFER_DECL XpuConfig {
// kernels (both paddle/xdnn kernels)
size_t l3_autotune_size{0};
// Reserved xpu global memory size for xpu_context;
// If not set(-1), default memory size for xpu_context is 128MB in XPU2 or
// 64MB in XPU1. If set 1*1024*1024, memory size for xpu_context will be 1MB;
int context_gm_size{-1};
// xpu_context(from baidu::xpu::api::create_context) for execution.
// If context is nullptr, new context will be created by default.
void* context{nullptr};
......
......@@ -1009,6 +1009,7 @@ void BindXpuConfig(py::module *m) {
.def_readwrite("l3_ptr", &XpuConfig::l3_ptr)
.def_readwrite("l3_size", &XpuConfig::l3_size)
.def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size)
.def_readwrite("context_gm_size", &XpuConfig::context_gm_size)
.def_readwrite("context", &XpuConfig::context)
.def_readwrite("stream", &XpuConfig::stream)
.def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level)
......
......@@ -27,7 +27,7 @@ inline std::string get_type_name() {
name = name.substr(name.find(key));
assert(!name.empty() && "Unable to find the template parameter!");
name = name.substr(key.size());
assert(name.back() == "]" && "Name doesn't end in the substitution key!");
assert(name.back() == ']' && "Name doesn't end in the substitution key!");
auto sem_pos = name.find_first_of(";");
if (sem_pos == std::string::npos)
name.pop_back();
......
......@@ -151,6 +151,7 @@ struct XPUContext::Impl {
LOG_FIRST_N(WARNING, 1)
<< "Please NOTE: xpu device: " << static_cast<int>(place_.device);
context_ = xpu::create_context();
context_->set_option("XPUAPI_DEFAULT_SIZE", "1");
xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache();
}
......
......@@ -187,6 +187,9 @@ void MemcpySyncD2D(void* dst,
XPUVersion get_xpu_version(int dev_id) {
uint64_t v = 0;
if (dev_id == -1) {
dev_id = GetXPUCurrentDeviceId();
}
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
if (v == K100 || v == K200) {
......
......@@ -68,7 +68,7 @@ bool is_in_xpu_black_list(const std::string& fluid_op_name) {
bool is_xpu_kp_support_op(const std::string& fluid_op_name,
const phi::DataType type) {
if (is_in_xpu_black_list(fluid_op_name)) return false;
auto v = get_xpu_version(0);
auto v = get_xpu_version(-1);
auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1)
? phi::backends::xpu::get_kl1_ops()
: phi::backends::xpu::get_kp_ops();
......@@ -84,7 +84,7 @@ bool is_xpu_kp_support_op(const std::string& fluid_op_name,
bool is_xpu_support_op(const std::string& fluid_op_name,
const phi::DataType type) {
if (is_in_xpu_black_list(fluid_op_name)) return false;
auto v = get_xpu_version(0);
auto v = get_xpu_version(-1);
auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
: get_kl2_ops();
if (ops.find(fluid_op_name) != ops.end() &&
......
Markdown is supported.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register to comment.