未验证 提交 52ad918b 编写于 作者: A AlbertVan 提交者: GitHub

[XPU] add context_gm_size in XpuConfig, don't alloc gm in pass. (#54674)

上级 f38e126e
...@@ -9,9 +9,13 @@ set(XPU_RT_LIB_NAME "libxpurt.so") ...@@ -9,9 +9,13 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
set(XPU_XFT_LIB_NAME "libxft.so") set(XPU_XFT_LIB_NAME "libxft.so")
set(XPU_XPTI_LIB_NAME "libxpti.so") set(XPU_XPTI_LIB_NAME "libxpti.so")
set(XPU_BASE_DATE "20230602") if(NOT DEFINED XPU_BASE_DATE)
set(XPU_BASE_DATE "20230602")
endif()
set(XPU_XCCL_BASE_VERSION "1.0.49.2") set(XPU_XCCL_BASE_VERSION "1.0.49.2")
set(XPU_XFT_BASE_VERSION "latest") if(NOT DEFINED XPU_XFT_BASE_VERSION)
set(XPU_XFT_BASE_VERSION "20230602")
endif()
set(XPU_XPTI_BASE_VERSION "0.0.1") set(XPU_XPTI_BASE_VERSION "0.0.1")
if(NOT DEFINED XPU_BASE_URL) if(NOT DEFINED XPU_BASE_URL)
......
...@@ -265,7 +265,7 @@ void PrepareWeight(phi::DenseTensor* weight, ...@@ -265,7 +265,7 @@ void PrepareWeight(phi::DenseTensor* weight,
} }
// Find max // Find max
int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(0); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
int size = weight_fp32.numel(); int size = weight_fp32.numel();
auto* weight_data = weight_fp32.data<float>(); auto* weight_data = weight_fp32.data<float>();
float max_val = FindMaxAbs(weight_data, size); float max_val = FindMaxAbs(weight_data, size);
......
...@@ -303,6 +303,7 @@ struct Argument { ...@@ -303,6 +303,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t); DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*); DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t); DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
DECL_ARGUMENT_FIELD(xpu_context_gm_size, XpuContextGmSize, int);
DECL_POINTER_ARGUMENT_FIELD(xpu_context, XpuContext, void*); DECL_POINTER_ARGUMENT_FIELD(xpu_context, XpuContext, void*);
DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*); DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int); DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
......
...@@ -272,6 +272,8 @@ void IRPassManager::CreatePasses(Argument *argument, ...@@ -272,6 +272,8 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr())); pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
pass->Set("xpu_l3_autotune_size", pass->Set("xpu_l3_autotune_size",
new size_t(argument->xpu_l3_autotune_size())); new size_t(argument->xpu_l3_autotune_size()));
pass->Set("xpu_context_gm_size",
new int(argument->xpu_context_gm_size()));
pass->Set("xpu_context", new void *(argument->xpu_context())); pass->Set("xpu_context", new void *(argument->xpu_context()));
pass->Set("xpu_stream", new void *(argument->xpu_stream())); pass->Set("xpu_stream", new void *(argument->xpu_stream()));
pass->Set("xpu_conv_autotune_level", pass->Set("xpu_conv_autotune_level",
......
...@@ -1096,6 +1096,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ...@@ -1096,6 +1096,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << xpu_config_.l3_size; ss << xpu_config_.l3_size;
ss << xpu_config_.l3_ptr; ss << xpu_config_.l3_ptr;
ss << xpu_config_.l3_autotune_size; ss << xpu_config_.l3_autotune_size;
ss << xpu_config_.context_gm_size;
ss << xpu_config_.context; ss << xpu_config_.context;
ss << xpu_config_.stream; ss << xpu_config_.stream;
ss << xpu_config_.conv_autotune_level; ss << xpu_config_.conv_autotune_level;
...@@ -1347,6 +1348,8 @@ std::string AnalysisConfig::Summary() { ...@@ -1347,6 +1348,8 @@ std::string AnalysisConfig::Summary() {
std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))}); std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))});
os.InsertRow( os.InsertRow(
{"xpu_l3_autotune_size", std::to_string(xpu_config_.l3_autotune_size)}); {"xpu_l3_autotune_size", std::to_string(xpu_config_.l3_autotune_size)});
os.InsertRow(
{"xpu_context_gm_size", std::to_string(xpu_config_.context_gm_size)});
os.InsertRow( os.InsertRow(
{"xpu_context", {"xpu_context",
std::to_string(reinterpret_cast<int64_t>(xpu_config_.context))}); std::to_string(reinterpret_cast<int64_t>(xpu_config_.context))});
......
...@@ -93,6 +93,10 @@ ...@@ -93,6 +93,10 @@
#include "paddle/fluid/platform/device/ipu/paddle_ipu_handler.h" #include "paddle/fluid/platform/device/ipu/paddle_ipu_handler.h"
#endif #endif
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_info.h"
#endif
namespace paddle { namespace paddle {
namespace { namespace {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -436,6 +440,7 @@ void AnalysisPredictor::InitPlace() { ...@@ -436,6 +440,7 @@ void AnalysisPredictor::InitPlace() {
#endif // LITE_SUBGRAPH_WITH_XPU #endif // LITE_SUBGRAPH_WITH_XPU
} else { } else {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
phi::backends::xpu::SetXPUDeviceId(config_.xpu_device_id());
place_ = paddle::platform::XPUPlace(config_.xpu_device_id()); place_ = paddle::platform::XPUPlace(config_.xpu_device_id());
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
...@@ -509,7 +514,8 @@ void AnalysisPredictor::InitDeviceContexts() { ...@@ -509,7 +514,8 @@ void AnalysisPredictor::InitDeviceContexts() {
device_contexts_.emplace( device_contexts_.emplace(
place_, std::async(std::launch::deferred, [=] { place_, std::async(std::launch::deferred, [=] {
auto &instance = memory::allocation::AllocatorFacade::Instance(); auto &instance = memory::allocation::AllocatorFacade::Instance();
auto *xpu_context = new InferXPUContext(place_); auto *xpu_context =
new InferXPUContext(place_, config_.xpu_config().context_gm_size);
xpu_context->SetAllocator(instance.GetAllocator(place_).get()); xpu_context->SetAllocator(instance.GetAllocator(place_).get());
xpu_context->SetGenerator( xpu_context->SetGenerator(
phi::DefaultXPUGenerator(place_.GetDeviceId()).get()); phi::DefaultXPUGenerator(place_.GetDeviceId()).get());
...@@ -1504,6 +1510,7 @@ void AnalysisPredictor::PrepareArgument() { ...@@ -1504,6 +1510,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetXpuL3Size(config_.xpu_config_.l3_size); argument_->SetXpuL3Size(config_.xpu_config_.l3_size);
argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr); argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr);
argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size); argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size);
argument_->SetXpuContextGmSize(config_.xpu_config_.context_gm_size);
argument_->SetXpuContext(config_.xpu_config_.context); argument_->SetXpuContext(config_.xpu_config_.context);
argument_->SetXpuStream(config_.xpu_config_.stream); argument_->SetXpuStream(config_.xpu_config_.stream);
argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level); argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level);
......
...@@ -27,8 +27,15 @@ InferGPUContext::InferGPUContext(const phi::Place& place) ...@@ -27,8 +27,15 @@ InferGPUContext::InferGPUContext(const phi::Place& place)
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
InferXPUContext::InferXPUContext(const phi::Place& place) InferXPUContext::InferXPUContext(const phi::Place& place, int context_gm_size)
: phi::XPUContext(place) {} : phi::XPUContext(place) {
if (context_gm_size >= 0) {
x_context()->set_option("XPUAPI_DEFAULT_SIZE",
std::to_string(context_gm_size).c_str());
} else {
x_context()->set_option("XPUAPI_DEFAULT_SIZE", "");
}
}
void* InferXPUContext::Alloc(phi::TensorBase* tensor, void* InferXPUContext::Alloc(phi::TensorBase* tensor,
phi::DataType dtype, phi::DataType dtype,
......
...@@ -52,7 +52,7 @@ class InferGPUContext : public phi::GPUContext { ...@@ -52,7 +52,7 @@ class InferGPUContext : public phi::GPUContext {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
class InferXPUContext : public phi::XPUContext { class InferXPUContext : public phi::XPUContext {
public: public:
explicit InferXPUContext(const phi::Place& place); explicit InferXPUContext(const phi::Place& place, int context_gm_size = -1);
void* Alloc(phi::TensorBase* tensor, void* Alloc(phi::TensorBase* tensor,
phi::DataType dtype, phi::DataType dtype,
......
...@@ -93,6 +93,10 @@ struct PD_INFER_DECL XpuConfig { ...@@ -93,6 +93,10 @@ struct PD_INFER_DECL XpuConfig {
// kernels (both paddle/xdnn kernels) // kernels (both paddle/xdnn kernels)
size_t l3_autotune_size{0}; size_t l3_autotune_size{0};
// Reserved xpu global memory size for xpu_context.
// If not set (-1), the default memory size for xpu_context is 128MB in XPU2 or
// 64MB in XPU1. If set to 1*1024*1024, the memory size for xpu_context will be 1MB.
int context_gm_size{-1};
// xpu_context(from baidu::xpu::api::create_context) for execution. // xpu_context(from baidu::xpu::api::create_context) for execution.
// If context is nullptr, new context will be created by default. // If context is nullptr, new context will be created by default.
void* context{nullptr}; void* context{nullptr};
......
...@@ -1009,6 +1009,7 @@ void BindXpuConfig(py::module *m) { ...@@ -1009,6 +1009,7 @@ void BindXpuConfig(py::module *m) {
.def_readwrite("l3_ptr", &XpuConfig::l3_ptr) .def_readwrite("l3_ptr", &XpuConfig::l3_ptr)
.def_readwrite("l3_size", &XpuConfig::l3_size) .def_readwrite("l3_size", &XpuConfig::l3_size)
.def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size) .def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size)
.def_readwrite("context_gm_size", &XpuConfig::context_gm_size)
.def_readwrite("context", &XpuConfig::context) .def_readwrite("context", &XpuConfig::context)
.def_readwrite("stream", &XpuConfig::stream) .def_readwrite("stream", &XpuConfig::stream)
.def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level) .def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level)
......
...@@ -27,7 +27,7 @@ inline std::string get_type_name() { ...@@ -27,7 +27,7 @@ inline std::string get_type_name() {
name = name.substr(name.find(key)); name = name.substr(name.find(key));
assert(!name.empty() && "Unable to find the template parameter!"); assert(!name.empty() && "Unable to find the template parameter!");
name = name.substr(key.size()); name = name.substr(key.size());
assert(name.back() == "]" && "Name doesn't end in the substitution key!"); assert(name.back() == ']' && "Name doesn't end in the substitution key!");
auto sem_pos = name.find_first_of(";"); auto sem_pos = name.find_first_of(";");
if (sem_pos == std::string::npos) if (sem_pos == std::string::npos)
name.pop_back(); name.pop_back();
......
...@@ -151,6 +151,7 @@ struct XPUContext::Impl { ...@@ -151,6 +151,7 @@ struct XPUContext::Impl {
LOG_FIRST_N(WARNING, 1) LOG_FIRST_N(WARNING, 1)
<< "Please NOTE: xpu device: " << static_cast<int>(place_.device); << "Please NOTE: xpu device: " << static_cast<int>(place_.device);
context_ = xpu::create_context(); context_ = xpu::create_context();
context_->set_option("XPUAPI_DEFAULT_SIZE", "1");
xpu_version_ = backends::xpu::get_xpu_version(place_.device); xpu_version_ = backends::xpu::get_xpu_version(place_.device);
SetL3Cache(); SetL3Cache();
} }
......
...@@ -187,6 +187,9 @@ void MemcpySyncD2D(void* dst, ...@@ -187,6 +187,9 @@ void MemcpySyncD2D(void* dst,
XPUVersion get_xpu_version(int dev_id) { XPUVersion get_xpu_version(int dev_id) {
uint64_t v = 0; uint64_t v = 0;
if (dev_id == -1) {
dev_id = GetXPUCurrentDeviceId();
}
PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id));
if (v == K100 || v == K200) { if (v == K100 || v == K200) {
......
...@@ -68,7 +68,7 @@ bool is_in_xpu_black_list(const std::string& fluid_op_name) { ...@@ -68,7 +68,7 @@ bool is_in_xpu_black_list(const std::string& fluid_op_name) {
bool is_xpu_kp_support_op(const std::string& fluid_op_name, bool is_xpu_kp_support_op(const std::string& fluid_op_name,
const phi::DataType type) { const phi::DataType type) {
if (is_in_xpu_black_list(fluid_op_name)) return false; if (is_in_xpu_black_list(fluid_op_name)) return false;
auto v = get_xpu_version(0); auto v = get_xpu_version(-1);
auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1)
? phi::backends::xpu::get_kl1_ops() ? phi::backends::xpu::get_kl1_ops()
: phi::backends::xpu::get_kp_ops(); : phi::backends::xpu::get_kp_ops();
...@@ -84,7 +84,7 @@ bool is_xpu_kp_support_op(const std::string& fluid_op_name, ...@@ -84,7 +84,7 @@ bool is_xpu_kp_support_op(const std::string& fluid_op_name,
bool is_xpu_support_op(const std::string& fluid_op_name, bool is_xpu_support_op(const std::string& fluid_op_name,
const phi::DataType type) { const phi::DataType type) {
if (is_in_xpu_black_list(fluid_op_name)) return false; if (is_in_xpu_black_list(fluid_op_name)) return false;
auto v = get_xpu_version(0); auto v = get_xpu_version(-1);
auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops() auto& ops = (v == phi::backends::xpu::XPUVersion::XPU1) ? get_kl1_ops()
: get_kl2_ops(); : get_kl2_ops();
if (ops.find(fluid_op_name) != ops.end() && if (ops.find(fluid_op_name) != ops.end() &&
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册