Unverified commit d0d7d01f authored by zhupengyang, committed by GitHub

set xpu context at runtime (#54587)

Parent 4ee3815e
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/xpu/xpu_info.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/assign_kernel.h"
#include "paddle/phi/kernels/cast_kernel.h"
......@@ -264,17 +265,7 @@ void PrepareWeight(phi::DenseTensor* weight,
}
// Find max
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
const auto& dev_ctxs = pool.device_contexts();
auto place = phi::XPUPlace(); // xpu:0
for (auto it = dev_ctxs.begin(); it != dev_ctxs.end(); it++) {
if (it->first.GetType() == phi::AllocationType::XPU) { // maybe xpu:1
place = it->first;
}
}
phi::XPUContext* xpu_ctx = static_cast<phi::XPUContext*>(pool.Get(place));
int max_ptr_size = xpu_ctx->x_context()->max_ptr_size();
int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(0);
int size = weight_fp32.numel();
auto* weight_data = weight_fp32.data<float>();
float max_val = FindMaxAbs(weight_data, size);
......
......@@ -303,6 +303,7 @@ struct Argument {
DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*);
DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t);
DECL_POINTER_ARGUMENT_FIELD(xpu_context, XpuContext, void*);
DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int);
DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string);
......
......@@ -272,6 +272,7 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr()));
pass->Set("xpu_l3_autotune_size",
new size_t(argument->xpu_l3_autotune_size()));
pass->Set("xpu_context", new void *(argument->xpu_context()));
pass->Set("xpu_stream", new void *(argument->xpu_stream()));
pass->Set("xpu_conv_autotune_level",
new int(argument->xpu_conv_autotune_level()));
......
......@@ -1095,6 +1095,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << xpu_config_.l3_size;
ss << xpu_config_.l3_ptr;
ss << xpu_config_.l3_autotune_size;
ss << xpu_config_.context;
ss << xpu_config_.stream;
ss << xpu_config_.conv_autotune_level;
ss << xpu_config_.conv_autotune_file;
......@@ -1345,6 +1346,9 @@ std::string AnalysisConfig::Summary() {
std::to_string(reinterpret_cast<int64_t>(xpu_config_.l3_ptr))});
os.InsertRow(
{"xpu_l3_autotune_size", std::to_string(xpu_config_.l3_autotune_size)});
os.InsertRow(
{"xpu_context",
std::to_string(reinterpret_cast<int64_t>(xpu_config_.context))});
os.InsertRow(
{"xpu_stream",
std::to_string(reinterpret_cast<int64_t>(xpu_config_.stream))});
......
......@@ -394,11 +394,10 @@ bool AnalysisPredictor::Init(
if (!status_is_cloned_ && config_.external_stream_enabled()) {
predictor_stream_ = config_.GetExecStream();
}
auto *global_context = static_cast<phi::XPUContext *>(
platform::DeviceContextPool::Instance().Get(place_));
auto global_stream = global_context->stream();
if (predictor_stream_ == nullptr) {
predictor_stream_ = global_stream;
auto *global_context = static_cast<phi::XPUContext *>(
platform::DeviceContextPool::Instance().Get(place_));
predictor_stream_ = global_context->stream();
}
InitDeviceContexts();
}
......@@ -1505,6 +1504,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetXpuL3Size(config_.xpu_config_.l3_size);
argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr);
argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size);
argument_->SetXpuContext(config_.xpu_config_.context);
argument_->SetXpuStream(config_.xpu_config_.stream);
argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level);
argument_->SetXpuConvAutotuneFile(config_.xpu_config_.conv_autotune_file);
......@@ -2098,6 +2098,10 @@ bool AnalysisPredictor::ZeroCopyRun() {
this->GetDeviceContexts());
infer_xpu_ctx =
static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
auto *x_context = static_cast<xpu::Context *>(config_.xpu_config_.context);
if (x_context != nullptr) {
infer_xpu_ctx->SetXContext(x_context);
}
infer_xpu_ctx->SetStream(predictor_stream_);
infer_xpu_ctx->SetL3Info(config_.xpu_config_.l3_size,
config_.xpu_config_.l3_ptr,
......@@ -2186,6 +2190,8 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
#ifdef PADDLE_WITH_XPU
auto xpu_runtime_config =
reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
config_.xpu_config_.context = xpu_runtime_config->context;
auto *stream = xpu_runtime_config->stream;
if (stream != nullptr && stream != predictor_stream_) {
paddle::platform::XPUStreamSync(
......
......@@ -75,6 +75,22 @@ void* InferXPUContext::Alloc(phi::TensorBase* tensor,
}
}
void InferXPUContext::SetXContext(xpu::Context* x_context) {
auto* old_x_context = this->x_context();
if (old_x_context != x_context) {
if (l3_owned_ && l3_size_ > 0 &&
(x_context->_l3_mgr.get_size() != l3_size_ ||
x_context->_l3_mgr.get_ptr() != l3_ptr_)) {
xpu_free(l3_ptr_);
}
old_x_context->_l3_mgr.set(nullptr, 0);
l3_size_ = x_context->_l3_mgr.get_size();
l3_ptr_ = x_context->_l3_mgr.get_ptr();
l3_owned_ = false;
phi::XPUContext::SetXContext(x_context);
}
}
void InferXPUContext::SetL3Info(size_t l3_size,
void* l3_ptr,
size_t l3_autotune_size,
......
......@@ -60,6 +60,8 @@ class InferXPUContext : public phi::XPUContext {
bool pinned = false,
bool fake_alloc = false) const override;
void SetXContext(xpu::Context* x_context);
void SetL3Info(size_t l3_size,
void* l3_ptr,
size_t l3_autotune_size,
......
......@@ -93,6 +93,9 @@ struct PD_INFER_DECL XpuConfig {
// kernels (both paddle/xdnn kernels)
size_t l3_autotune_size{0};
// xpu_context(from baidu::xpu::api::create_context) for execution.
// If context is nullptr, new context will be created by default.
void* context{nullptr};
// Stream for execution.
// If stream is nullptr, default stream will be used.
void* stream{nullptr};
......
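
The new `context` field lets a predictor reuse an externally created `xpu::Context` from construction onward. A minimal sketch of that flow, assuming `AnalysisConfig::SetXpuConfig` is the setter for this struct; the helper function and include path below are illustrative and not part of this patch:

// Sketch, assuming AnalysisConfig::SetXpuConfig installs this struct.
#include <memory>
#include <string>
#include "paddle_inference_api.h"  // public inference header

std::shared_ptr<paddle_infer::Predictor> BuildPredictorWithExternalContext(
    const std::string& model_dir, void* external_xpu_context) {
  paddle_infer::Config config(model_dir);
  config.EnableXpu();

  paddle::XpuConfig xpu_config;
  xpu_config.context = external_xpu_context;  // e.g. baidu::xpu::api::create_context()
  xpu_config.stream = nullptr;                // fall back to the context's own stream
  config.SetXpuConfig(xpu_config);

  return paddle_infer::CreatePredictor(config);
}
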
......@@ -472,9 +472,25 @@ class Tensor;
using Config = paddle::AnalysisConfig;
namespace experimental {
struct XpuRuntimeConfig {
// xpu_context(from baidu::xpu::api::create_context) for execution.
// If context is nullptr, default context is used.
void* context{nullptr};
// Stream for execution.
// Note: It takes priority over the stream in "context"
void* stream{nullptr};
// Available l3 size (bytes)
// For kunlun1, the max l3_size is 16773120 bytes
// For kunlun2, the max l3_size is 67104768 bytes
// Note: If it differs from the l3_size in "context", a new l3 buffer is
// allocated.
size_t l3_size{16773120};
// If l3_ptr is not nullptr, it is used as l3 buffer.
// If l3_ptr is nullptr, new l3 buffer will be created.
void* l3_ptr{nullptr};
// Available l3 size for autotune.
// If l3_autotune_size is 0, autotune is disabled.
// Note: The remaining l3 size (l3_size - l3_autotune_size) is for
// kernels (both paddle/xdnn kernels)
size_t l3_autotune_size{0};
};
......
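
To illustrate the priority rule noted above (an explicit stream overrides the stream carried by the supplied context), a short sketch that fills the struct with both a caller-owned context and a caller-owned stream; passing the filled config to the predictor follows the RUN_WITH_RUNTIME_CONFIG pattern used in the tests below:

// Sketch only: the helper name is illustrative, not part of this patch.
paddle_infer::experimental::XpuRuntimeConfig MakeRuntimeConfig() {
  void* external_stream = nullptr;
  xpu_stream_create(&external_stream);                          // caller-owned stream
  auto* external_context = baidu::xpu::api::create_context();   // caller-owned context

  paddle_infer::experimental::XpuRuntimeConfig runtime_config;
  runtime_config.context = external_context;
  runtime_config.stream = external_stream;  // takes priority over external_context's stream
  runtime_config.l3_size = 0;                // no L3 buffer for this run
  runtime_config.l3_ptr = nullptr;
  runtime_config.l3_autotune_size = 0;
  return runtime_config;
}
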
......@@ -1009,6 +1009,7 @@ void BindXpuConfig(py::module *m) {
.def_readwrite("l3_ptr", &XpuConfig::l3_ptr)
.def_readwrite("l3_size", &XpuConfig::l3_size)
.def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size)
.def_readwrite("context", &XpuConfig::context)
.def_readwrite("stream", &XpuConfig::stream)
.def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level)
.def_readwrite("conv_autotune_file", &XpuConfig::conv_autotune_file)
......
......@@ -103,6 +103,9 @@ struct XPUContext::Impl {
// Set external stream for context
void SetStream(void* stream) {
if (context_->xpu_stream != nullptr && stream_owned_) {
xpu_stream_destroy(context_->xpu_stream);
}
stream_owned_ = false;
context_->set_stream(static_cast<XPUStream>(stream));
}
......@@ -152,7 +155,22 @@ struct XPUContext::Impl {
SetL3Cache();
}
void SetXContext(xpu::Context* context) { context_ = context; }
void SetXContext(xpu::Context* context) {
if (context_ != nullptr) {
backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait(context_->xpu_stream);
if (context_->xpu_stream != nullptr && stream_owned_) {
xpu_stream_destroy(context_->xpu_stream);
stream_owned_ = false;
context_->xpu_stream = nullptr;
}
if (owned_) {
xpu::destroy_context(context_);
}
}
context_ = context;
owned_ = false;
}
void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; }
......
......@@ -198,6 +198,24 @@ XPUVersion get_xpu_version(int dev_id) {
}
}
int get_xpu_max_ptr_size(int dev_id) {
auto xpu_version = get_xpu_version(dev_id);
int max_ptr_size = 0;
switch (xpu_version) {
case XPUVersion::XPU1:
max_ptr_size = 4;
break;
case XPUVersion::XPU2:
max_ptr_size = 6;
break;
default:
PADDLE_THROW(phi::errors::InvalidArgument(
"Only support get max ptr size of XPU1 or XPU2."));
break;
}
return max_ptr_size;
}
} // namespace xpu
} // namespace backends
} // namespace phi
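
The returned max-ptr size is the number of float slots the device expects for a per-tensor "max" buffer (4 on XPU1, 6 on XPU2). A small sketch of how a caller such as the quant_utils change above can size that buffer; the helper below is illustrative, not part of this patch:

// Sketch: size a per-tensor "max" buffer from the device generation.
#include <vector>
#include "paddle/phi/backends/xpu/xpu_info.h"

std::vector<float> BuildMaxBuffer(float max_abs_val, int dev_id = 0) {
  // 4 float slots on XPU1, 6 on XPU2; throws for other generations.
  int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(dev_id);
  return std::vector<float>(max_ptr_size, max_abs_val);
}
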
......@@ -95,6 +95,8 @@ class XPUDeviceGuard {
enum XPUVersion { XPU1, XPU2 };
XPUVersion get_xpu_version(int dev_id);
int get_xpu_max_ptr_size(int dev_id);
} // namespace xpu
} // namespace backends
} // namespace phi
......@@ -89,7 +89,12 @@ TEST(resnet50_xpu, basic) {
CHECK_EQ(predictor##idx_->GetExecStream(), config_.stream);
TEST(runtime_stream, null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config;
xpu_runtime_config.context = nullptr;
xpu_runtime_config.stream = nullptr;
xpu_runtime_config.l3_size = 0;
xpu_runtime_config.l3_ptr = nullptr;
xpu_runtime_config.l3_autotune_size = 0;
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
......@@ -98,26 +103,45 @@ TEST(runtime_stream, new_stream) {
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config;
xpu_runtime_config.context = nullptr;
xpu_runtime_config.stream = stream;
xpu_runtime_config.l3_size = 0;
xpu_runtime_config.l3_ptr = nullptr;
xpu_runtime_config.l3_autotune_size = 0;
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config;
xpu_runtime_config.context = nullptr;
xpu_runtime_config.stream = nullptr;
xpu_runtime_config.l3_size = 0;
xpu_runtime_config.l3_ptr = nullptr;
xpu_runtime_config.l3_autotune_size = 0;
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
}
TEST(runtime_stream, null_and_new_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config0 = {nullptr, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config0;
xpu_runtime_config0.context = nullptr;
xpu_runtime_config0.stream = nullptr;
xpu_runtime_config0.l3_size = 0;
xpu_runtime_config0.l3_ptr = nullptr;
xpu_runtime_config0.l3_autotune_size = 0;
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config1 = {
stream, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config1;
xpu_runtime_config1.context = nullptr;
xpu_runtime_config1.stream = stream;
xpu_runtime_config1.l3_size = 0;
xpu_runtime_config1.l3_ptr = nullptr;
xpu_runtime_config1.l3_autotune_size = 0;
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
}
......@@ -128,7 +152,12 @@ TEST(runtime_stream, 2_new_same_stream) {
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config;
xpu_runtime_config.context = nullptr;
xpu_runtime_config.stream = stream;
xpu_runtime_config.l3_size = 0;
xpu_runtime_config.l3_ptr = nullptr;
xpu_runtime_config.l3_autotune_size = 0;
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
......@@ -140,11 +169,21 @@ TEST(runtime_stream, 2_new_different_stream) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config0;
xpu_runtime_config0.context = nullptr;
xpu_runtime_config0.stream = stream0;
xpu_runtime_config0.l3_size = 0;
xpu_runtime_config0.l3_ptr = nullptr;
xpu_runtime_config0.l3_autotune_size = 0;
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config1;
xpu_runtime_config1.context = nullptr;
xpu_runtime_config1.stream = stream1;
xpu_runtime_config1.l3_size = 0;
xpu_runtime_config1.l3_ptr = nullptr;
xpu_runtime_config1.l3_autotune_size = 0;
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
......@@ -167,12 +206,22 @@ TEST(runtime_stream, 2_thread) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config0;
xpu_runtime_config0.context = nullptr;
xpu_runtime_config0.stream = stream0;
xpu_runtime_config0.l3_size = 0;
xpu_runtime_config0.l3_ptr = nullptr;
xpu_runtime_config0.l3_autotune_size = 0;
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
experimental::XpuRuntimeConfig xpu_runtime_config1;
xpu_runtime_config1.context = nullptr;
xpu_runtime_config1.stream = stream1;
xpu_runtime_config1.l3_size = 0;
xpu_runtime_config1.l3_ptr = nullptr;
xpu_runtime_config1.l3_autotune_size = 0;
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
......@@ -189,4 +238,19 @@ TEST(runtime_stream, 2_thread) {
xpu_stream_destroy(stream1);
}
TEST(runtime_context, new_context) {
auto* context = baidu::xpu::api::create_context();
CHECK_NOTNULL(context);
{
experimental::XpuRuntimeConfig xpu_runtime_config;
xpu_runtime_config.context = context;
xpu_runtime_config.stream = nullptr;
xpu_runtime_config.l3_size = 0;
xpu_runtime_config.l3_ptr = nullptr;
xpu_runtime_config.l3_autotune_size = 0;
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
baidu::xpu::api::destroy_context(context);
}
} // namespace paddle_infer