From d0d7d01fbdde5a554161b8adb3b3cee34ca8dbf1 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Wed, 14 Jun 2023 13:10:14 +0800 Subject: [PATCH] set xpu context at runtime (#54587) --- paddle/fluid/framework/ir/xpu/quant_utils.cc | 13 +-- paddle/fluid/inference/analysis/argument.h | 1 + .../inference/analysis/ir_pass_manager.cc | 1 + paddle/fluid/inference/api/analysis_config.cc | 4 + .../fluid/inference/api/analysis_predictor.cc | 14 ++- paddle/fluid/inference/api/infer_context.cc | 16 ++++ paddle/fluid/inference/api/infer_context.h | 2 + .../inference/api/paddle_analysis_config.h | 3 + paddle/fluid/inference/api/paddle_api.h | 16 ++++ paddle/fluid/pybind/inference_api.cc | 1 + paddle/phi/backends/xpu/xpu_context.cc | 20 ++++- paddle/phi/backends/xpu/xpu_info.cc | 18 ++++ paddle/phi/backends/xpu/xpu_info.h | 2 + .../api/xpu_runtime_config_resnet50_test.cc | 86 ++++++++++++++++--- 14 files changed, 170 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index 643e0e33744..65d080930be 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/xpu/xpu_info.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/assign_kernel.h" #include "paddle/phi/kernels/cast_kernel.h" @@ -264,17 +265,7 @@ void PrepareWeight(phi::DenseTensor* weight, } // Find max - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - const auto& dev_ctxs = pool.device_contexts(); - auto place = phi::XPUPlace(); // xpu:0 - for (auto it = dev_ctxs.begin(); it != dev_ctxs.end(); it++) { - if (it->first.GetType() == phi::AllocationType::XPU) { // maybe xpu:1 - place = it->first; - } - } - phi::XPUContext* xpu_ctx = static_cast(pool.Get(place)); - int 
max_ptr_size = xpu_ctx->x_context()->max_ptr_size(); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(0); int size = weight_fp32.numel(); auto* weight_data = weight_fp32.data(); float max_val = FindMaxAbs(weight_data, size); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2b5b066a3a3..b52d6813628 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -303,6 +303,7 @@ struct Argument { DECL_ARGUMENT_FIELD(xpu_l3_size, XpuL3Size, size_t); DECL_POINTER_ARGUMENT_FIELD(xpu_l3_ptr, XpuL3Ptr, void*); DECL_ARGUMENT_FIELD(xpu_l3_autotune_size, XpuL3AutotuneSize, size_t); + DECL_POINTER_ARGUMENT_FIELD(xpu_context, XpuContext, void*); DECL_POINTER_ARGUMENT_FIELD(xpu_stream, XpuStream, void*); DECL_ARGUMENT_FIELD(xpu_conv_autotune_level, XpuConvAutotuneLevel, int); DECL_ARGUMENT_FIELD(xpu_conv_autotune_file, XpuConvAutotuneFile, std::string); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index d6936684165..4e58e32f82d 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -272,6 +272,7 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("xpu_l3_ptr", new void *(argument->xpu_l3_ptr())); pass->Set("xpu_l3_autotune_size", new size_t(argument->xpu_l3_autotune_size())); + pass->Set("xpu_context", new void *(argument->xpu_context())); pass->Set("xpu_stream", new void *(argument->xpu_stream())); pass->Set("xpu_conv_autotune_level", new int(argument->xpu_conv_autotune_level())); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index e05c1f0ca9b..87b97df6377 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -1095,6 +1095,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << 
xpu_config_.l3_size; ss << xpu_config_.l3_ptr; ss << xpu_config_.l3_autotune_size; + ss << xpu_config_.context; ss << xpu_config_.stream; ss << xpu_config_.conv_autotune_level; ss << xpu_config_.conv_autotune_file; @@ -1345,6 +1346,9 @@ std::string AnalysisConfig::Summary() { std::to_string(reinterpret_cast(xpu_config_.l3_ptr))}); os.InsertRow( {"xpu_l3_autotune_size", std::to_string(xpu_config_.l3_autotune_size)}); + os.InsertRow( + {"xpu_context", + std::to_string(reinterpret_cast(xpu_config_.context))}); os.InsertRow( {"xpu_stream", std::to_string(reinterpret_cast(xpu_config_.stream))}); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index c43ee1df887..373a9b5ffb3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -394,11 +394,10 @@ bool AnalysisPredictor::Init( if (!status_is_cloned_ && config_.external_stream_enabled()) { predictor_stream_ = config_.GetExecStream(); } - auto *global_context = static_cast( - platform::DeviceContextPool::Instance().Get(place_)); - auto global_stream = global_context->stream(); if (predictor_stream_ == nullptr) { - predictor_stream_ = global_stream; + auto *global_context = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + predictor_stream_ = global_context->stream(); } InitDeviceContexts(); } @@ -1505,6 +1504,7 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetXpuL3Size(config_.xpu_config_.l3_size); argument_->SetXpuL3Ptr(config_.xpu_config_.l3_ptr); argument_->SetXpuL3AutotuneSize(config_.xpu_config_.l3_autotune_size); + argument_->SetXpuContext(config_.xpu_config_.context); argument_->SetXpuStream(config_.xpu_config_.stream); argument_->SetXpuConvAutotuneLevel(config_.xpu_config_.conv_autotune_level); argument_->SetXpuConvAutotuneFile(config_.xpu_config_.conv_autotune_file); @@ -2098,6 +2098,10 @@ bool AnalysisPredictor::ZeroCopyRun() { 
this->GetDeviceContexts()); infer_xpu_ctx = static_cast(dev_ctxs->at(place_).get().get()); + auto *x_context = static_cast(config_.xpu_config_.context); + if (x_context != nullptr) { + infer_xpu_ctx->SetXContext(x_context); + } infer_xpu_ctx->SetStream(predictor_stream_); infer_xpu_ctx->SetL3Info(config_.xpu_config_.l3_size, config_.xpu_config_.l3_ptr, @@ -2186,6 +2190,8 @@ bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) { #ifdef PADDLE_WITH_XPU auto xpu_runtime_config = reinterpret_cast(config); + + config_.xpu_config_.context = xpu_runtime_config->context; auto *stream = xpu_runtime_config->stream; if (stream != nullptr && stream != predictor_stream_) { paddle::platform::XPUStreamSync( diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index bda0eecec0b..952554fc28d 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -75,6 +75,22 @@ void* InferXPUContext::Alloc(phi::TensorBase* tensor, } } +void InferXPUContext::SetXContext(xpu::Context* x_context) { + auto* old_x_context = this->x_context(); + if (old_x_context != x_context) { + if (l3_owned_ && l3_size_ > 0 && + (x_context->_l3_mgr.get_size() != l3_size_ || + x_context->_l3_mgr.get_ptr() != l3_ptr_)) { + xpu_free(l3_ptr_); + } + old_x_context->_l3_mgr.set(nullptr, 0); + l3_size_ = x_context->_l3_mgr.get_size(); + l3_ptr_ = x_context->_l3_mgr.get_ptr(); + l3_owned_ = false; + phi::XPUContext::SetXContext(x_context); + } +} + void InferXPUContext::SetL3Info(size_t l3_size, void* l3_ptr, size_t l3_autotune_size, diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index 121399dca5a..a7fa06447f9 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -60,6 +60,8 @@ class InferXPUContext : public phi::XPUContext { bool pinned = false, bool fake_alloc = false) const override; + void 
SetXContext(xpu::Context* x_context); + void SetL3Info(size_t l3_size, void* l3_ptr, size_t l3_autotune_size, diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 11ba4feaecb..c771983e27f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -93,6 +93,9 @@ struct PD_INFER_DECL XpuConfig { // kernels (both paddle/xdnn kernels) size_t l3_autotune_size{0}; + // xpu_context(from baidu::xpu::api::create_context) for execution. + // If context is nullptr, new context will be created by default. + void* context{nullptr}; // Stream for execution. // If stream is nullptr, default stream will be used. void* stream{nullptr}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 211f6b59539..a53600769a8 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -472,9 +472,25 @@ class Tensor; using Config = paddle::AnalysisConfig; namespace experimental { struct XpuRuntimeConfig { + // xpu_context(from baidu::xpu::api::create_context) for execution. + // If context is nullptr, default context is used. + void* context{nullptr}; + // Stream for execution. + // Note: It has a higher priority than stream in "context" void* stream{nullptr}; + // Available l3 size (Byte) + // For kunlun1, max l3_size is 16773120 Byte + // For kunlun2, max l3_size is 67104768 Byte + // Note: If it is different from l3_size in "context", new l3 buffer is + // malloced. size_t l3_size{16773120}; + // If l3_ptr is not nullptr, it is used as l3 buffer. + // If l3_ptr is nullptr, new l3 buffer will be created. void* l3_ptr{nullptr}; + // Available l3 size for autotune. + // If l3_autotune_size is 0, autotune is disabled. 
+ // Note: The remaining l3 size (l3_size - l3_autotune_size) is for + // kernels (both paddle/xdnn kernels) size_t l3_autotune_size{0}; }; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index a7c2c9d580c..ad504419725 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -1009,6 +1009,7 @@ void BindXpuConfig(py::module *m) { .def_readwrite("l3_ptr", &XpuConfig::l3_ptr) .def_readwrite("l3_size", &XpuConfig::l3_size) .def_readwrite("l3_autotune_size", &XpuConfig::l3_autotune_size) + .def_readwrite("context", &XpuConfig::context) .def_readwrite("stream", &XpuConfig::stream) .def_readwrite("conv_autotune_level", &XpuConfig::conv_autotune_level) .def_readwrite("conv_autotune_file", &XpuConfig::conv_autotune_file) diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 34f640e66a6..fc318ef7164 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -103,6 +103,9 @@ struct XPUContext::Impl { // Set external stream for context void SetStream(void* stream) { + if (context_->xpu_stream != nullptr && stream_owned_) { + xpu_stream_destroy(context_->xpu_stream); + } stream_owned_ = false; context_->set_stream(static_cast(stream)); } @@ -152,7 +155,22 @@ struct XPUContext::Impl { SetL3Cache(); } - void SetXContext(xpu::Context* context) { context_ = context; } + void SetXContext(xpu::Context* context) { + if (context_ != nullptr) { + backends::xpu::XPUDeviceGuard guard(place_.GetDeviceId()); + xpu_wait(context_->xpu_stream); + if (context_->xpu_stream != nullptr && stream_owned_) { + xpu_stream_destroy(context_->xpu_stream); + stream_owned_ = false; + context_->xpu_stream = nullptr; + } + if (owned_) { + xpu::destroy_context(context_); + } + } + context_ = context; + owned_ = false; + } void SetBkclContext(xpu::BKCLContext_t context) { bkcl_context_ = context; } diff --git a/paddle/phi/backends/xpu/xpu_info.cc 
b/paddle/phi/backends/xpu/xpu_info.cc index 121c05a6069..4ce32d6a306 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -198,6 +198,24 @@ XPUVersion get_xpu_version(int dev_id) { } } +int get_xpu_max_ptr_size(int dev_id) { + auto xpu_version = get_xpu_version(dev_id); + int max_ptr_size = 0; + switch (xpu_version) { + case XPUVersion::XPU1: + max_ptr_size = 4; + break; + case XPUVersion::XPU2: + max_ptr_size = 6; + break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Only support get max ptr size of XPU1 or XPU2.")); + break; + } + return max_ptr_size; +} + } // namespace xpu } // namespace backends } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h index bbd13193cd4..b4fbdec7a93 100644 --- a/paddle/phi/backends/xpu/xpu_info.h +++ b/paddle/phi/backends/xpu/xpu_info.h @@ -95,6 +95,8 @@ class XPUDeviceGuard { enum XPUVersion { XPU1, XPU2 }; XPUVersion get_xpu_version(int dev_id); +int get_xpu_max_ptr_size(int dev_id); + } // namespace xpu } // namespace backends } // namespace phi diff --git a/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc index f1a0cef01cb..b9ab6ea68d7 100644 --- a/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc +++ b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc @@ -89,7 +89,12 @@ TEST(resnet50_xpu, basic) { CHECK_EQ(predictor##idx_->GetExecStream(), config_.stream); TEST(runtime_stream, null_stream) { - experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config; + xpu_runtime_config.context = nullptr; + xpu_runtime_config.stream = nullptr; + xpu_runtime_config.l3_size = 0; + xpu_runtime_config.l3_ptr = nullptr; + xpu_runtime_config.l3_autotune_size = 0; RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); } @@ -98,26 +103,45 @@ TEST(runtime_stream, new_stream) { 
xpu_stream_create(&stream); CHECK_NOTNULL(stream); { - experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config; + xpu_runtime_config.context = nullptr; + xpu_runtime_config.stream = stream; + xpu_runtime_config.l3_size = 0; + xpu_runtime_config.l3_ptr = nullptr; + xpu_runtime_config.l3_autotune_size = 0; RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); } xpu_stream_destroy(stream); } TEST(runtime_stream, 2_null_stream) { - experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config; + xpu_runtime_config.context = nullptr; + xpu_runtime_config.stream = nullptr; + xpu_runtime_config.l3_size = 0; + xpu_runtime_config.l3_ptr = nullptr; + xpu_runtime_config.l3_autotune_size = 0; RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config); } TEST(runtime_stream, null_and_new_stream) { - experimental::XpuRuntimeConfig xpu_runtime_config0 = {nullptr, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config0; + xpu_runtime_config0.context = nullptr; + xpu_runtime_config0.stream = nullptr; + xpu_runtime_config0.l3_size = 0; + xpu_runtime_config0.l3_ptr = nullptr; + xpu_runtime_config0.l3_autotune_size = 0; void* stream = nullptr; xpu_stream_create(&stream); CHECK_NOTNULL(stream); { - experimental::XpuRuntimeConfig xpu_runtime_config1 = { - stream, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config1; + xpu_runtime_config1.context = nullptr; + xpu_runtime_config1.stream = stream; + xpu_runtime_config1.l3_size = 0; + xpu_runtime_config1.l3_ptr = nullptr; + xpu_runtime_config1.l3_autotune_size = 0; RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0); RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1); } @@ -128,7 +152,12 @@ TEST(runtime_stream, 2_new_same_stream) { void* stream = nullptr; xpu_stream_create(&stream); CHECK_NOTNULL(stream); - experimental::XpuRuntimeConfig 
xpu_runtime_config = {stream, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config; + xpu_runtime_config.context = nullptr; + xpu_runtime_config.stream = stream; + xpu_runtime_config.l3_size = 0; + xpu_runtime_config.l3_ptr = nullptr; + xpu_runtime_config.l3_autotune_size = 0; { RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config); @@ -140,11 +169,21 @@ TEST(runtime_stream, 2_new_different_stream) { void* stream0 = nullptr; xpu_stream_create(&stream0); CHECK_NOTNULL(stream0); - experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config0; + xpu_runtime_config0.context = nullptr; + xpu_runtime_config0.stream = stream0; + xpu_runtime_config0.l3_size = 0; + xpu_runtime_config0.l3_ptr = nullptr; + xpu_runtime_config0.l3_autotune_size = 0; void* stream1 = nullptr; xpu_stream_create(&stream1); CHECK_NOTNULL(stream1); - experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config1; + xpu_runtime_config1.context = nullptr; + xpu_runtime_config1.stream = stream1; + xpu_runtime_config1.l3_size = 0; + xpu_runtime_config1.l3_ptr = nullptr; + xpu_runtime_config1.l3_autotune_size = 0; { RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0); RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1); @@ -167,12 +206,22 @@ TEST(runtime_stream, 2_thread) { void* stream0 = nullptr; xpu_stream_create(&stream0); CHECK_NOTNULL(stream0); - experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config0; + xpu_runtime_config0.context = nullptr; + xpu_runtime_config0.stream = stream0; + xpu_runtime_config0.l3_size = 0; + xpu_runtime_config0.l3_ptr = nullptr; + xpu_runtime_config0.l3_autotune_size = 0; void* stream1 = nullptr; xpu_stream_create(&stream1); CHECK_NOTNULL(stream1); - experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, 
nullptr, 0}; + experimental::XpuRuntimeConfig xpu_runtime_config1; + xpu_runtime_config1.context = nullptr; + xpu_runtime_config1.stream = stream1; + xpu_runtime_config1.l3_size = 0; + xpu_runtime_config1.l3_ptr = nullptr; + xpu_runtime_config1.l3_autotune_size = 0; { RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0); @@ -189,4 +238,19 @@ TEST(runtime_stream, 2_thread) { xpu_stream_destroy(stream1); } +TEST(runtime_context, new_context) { + auto* context = baidu::xpu::api::create_context(); + CHECK_NOTNULL(context); + { + experimental::XpuRuntimeConfig xpu_runtime_config; + xpu_runtime_config.context = context; + xpu_runtime_config.stream = nullptr; + xpu_runtime_config.l3_size = 0; + xpu_runtime_config.l3_ptr = nullptr; + xpu_runtime_config.l3_autotune_size = 0; + RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); + } + baidu::xpu::api::destroy_context(context); +} + } // namespace paddle_infer -- GitLab