From e135069de04fd5b67b718cff14c86d00cee991a6 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Mon, 22 May 2023 13:09:07 +0800 Subject: [PATCH] [xpu][infer] support runtime configs (#53595) --- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 9 +- .../fluid/inference/api/analysis_predictor.cc | 106 +++++----- .../fluid/inference/api/analysis_predictor.h | 3 + paddle/fluid/inference/api/infer_context.cc | 128 +++++++++++- paddle/fluid/inference/api/infer_context.h | 32 ++- paddle/fluid/inference/api/paddle_api.h | 11 +- .../fluid/inference/api/resource_manager.cc | 121 ----------- paddle/fluid/inference/api/resource_manager.h | 52 +---- paddle/phi/backends/CMakeLists.txt | 2 +- paddle/phi/backends/xpu/xpu_context.cc | 10 +- paddle/phi/backends/xpu/xpu_l3_strategy.cc | 153 ++++++++++++++ paddle/phi/backends/xpu/xpu_l3_strategy.h | 53 +++++ paddle/phi/core/device_context.cc | 7 +- paddle/phi/core/device_context.h | 10 +- test/cpp/inference/api/CMakeLists.txt | 11 + .../api/analysis_predictor_tester.cc | 55 ----- .../api/xpu_runtime_config_resnet50_test.cc | 192 ++++++++++++++++++ 17 files changed, 648 insertions(+), 307 deletions(-) create mode 100644 paddle/phi/backends/xpu/xpu_l3_strategy.cc create mode 100644 paddle/phi/backends/xpu/xpu_l3_strategy.h create mode 100644 test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 91fe7dae50b..40af4f8c000 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -566,12 +566,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, } else { conv_bias.push_back(0); } - if (conv->Op()->HasAttr("padding_algorithm")) { - conv2d_xpu_op_desc.SetAttr( - "padding_algorithm", - PADDLE_GET_CONST(std::string, - conv->Op()->GetAttr("padding_algorithm"))); - } + conv2d_xpu_op_desc.SetAttr( + "padding_algorithm", + conv->Op()->GetAttrIfExists("padding_algorithm")); auto conv_paddings = PADDLE_GET_CONST(std::vector, conv->Op()->GetAttr("paddings")); if (conv_paddings.size() == 2) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6ab15be533a..849c7ee48ae 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -389,25 +389,21 @@ bool AnalysisPredictor::Init( } #endif #if defined(PADDLE_WITH_XPU) - if (config_.use_xpu_ && config_.use_external_stream_) { + if (config_.use_xpu_) { private_context_ = true; - } - if (private_context_) { - if (!status_is_cloned_) { + if (!status_is_cloned_ && config_.external_stream_enabled()) { predictor_stream_ = config_.GetExecStream(); } - // NOTE: If the external_stream equals to global_device_contexts's stream, - // then fallback. 
- auto global_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); - if (predictor_stream_ != global_stream) { - InitResourceManager(predictor_stream_); - InitDeviceContexts(); + auto *global_context = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + auto global_stream = global_context->stream(); + if (predictor_stream_ == nullptr) { + predictor_stream_ = global_stream; } + InitDeviceContexts(); } #endif + inference::DisplayMemoryInfo(place_, "Init predictor"); return true; } @@ -492,15 +488,12 @@ void AnalysisPredictor::InitResourceManager(void *stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); -#elif defined(PADDLE_WITH_XPU) - predictor_stream_ = - ResourceManager::Instance().InitXPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -// Init GPUContext. #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // Init GPUContext. if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( place_, std::async(std::launch::deferred, [=] { @@ -512,12 +505,10 @@ void AnalysisPredictor::InitDeviceContexts() { })); } #endif -#if defined(PADDLE_WITH_XPU) +#ifdef PADDLE_WITH_XPU if (place_.GetType() == phi::AllocationType::XPU) { device_contexts_.emplace( place_, std::async(std::launch::deferred, [=] { - auto *xpu_resource = - ResourceManager::Instance().GetXPUResource(predictor_stream_); auto &instance = memory::allocation::AllocatorFacade::Instance(); auto *xpu_context = new InferXPUContext(place_); xpu_context->SetAllocator(instance.GetAllocator(place_).get()); @@ -530,15 +521,11 @@ void AnalysisPredictor::InitDeviceContexts() { instance.GetZeroAllocator(place_).get()); xpu_context->SetHostZeroAllocator( instance.GetZeroAllocator(platform::CPUPlace()).get()); - xpu_context->SetStream(xpu_resource->GetStream()); - xpu_context->SetDriverVersion(xpu_resource->GetDriverVersion()); - xpu_context->SetRuntimeVersion(xpu_resource->GetRuntimeVersion()); - xpu_context->SetXpuVersion(xpu_resource->GetXpuVersion()); + xpu_context->SetStream(predictor_stream_); return std::unique_ptr(xpu_context); })); } #endif - // TODO(Inference): Support other backends. } void *AnalysisPredictor::GetExecStream() const { @@ -591,6 +578,11 @@ const void *AnalysisPredictor::GetDeviceContexts() const { bool AnalysisPredictor::PrepareScope( const std::shared_ptr &parent_scope) { +#ifdef PADDLE_WITH_XPU + // Set "XPU_PADDLE_L3_SIZE" to "0" to avoid malloc l3 cache when xpu_context + // init. 
+ setenv("XPU_PADDLE_L3_SIZE", "0", 0); +#endif if (parent_scope) { PADDLE_ENFORCE_NOT_NULL( parent_scope, @@ -1513,6 +1505,7 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetCustomDeviceId(config_.custom_device_id()); } #endif + #ifdef PADDLE_WITH_XPU argument_->SetUseXpu(config_.use_xpu_); argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); @@ -2153,29 +2146,45 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { } #endif -bool AnalysisPredictor::ExpRunWithExternalStream(void *stream) { -#if defined(PADDLE_WITH_XPU) - if (!private_context_) { - PADDLE_THROW(platform::errors::Fatal( - "Please use config.SetExecStream to init resources, and then we " - "will bind resources to execution stream.")); - } - if (stream != predictor_stream_) { +bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) { +#ifdef PADDLE_WITH_XPU + PADDLE_ENFORCE( + private_context_, + paddle::platform::errors::Fatal( + "Must use private context if run predictor with external config.")); + + auto *dev_ctxs = reinterpret_cast>> *>( + this->GetDeviceContexts()); + auto *dev_ctx = + static_cast(dev_ctxs->at(place_).get().get()); + + auto xpu_runtime_config = + reinterpret_cast(config); + auto *stream = xpu_runtime_config->stream; + if (stream != nullptr && stream != predictor_stream_) { paddle::platform::XPUStreamSync( static_cast(predictor_stream_)); - ResourceManager::Instance().XpuResourceReBindStream(predictor_stream_, - stream); predictor_stream_ = stream; - - auto *dev_ctxs = reinterpret_cast>> *>( - this->GetDeviceContexts()); - auto *dev_ctx = - static_cast(dev_ctxs->at(place_).get().get()); dev_ctx->SetStream(stream); } - return ZeroCopyRun(); + + size_t l3_size = xpu_runtime_config->l3_size; + void *l3_ptr = xpu_runtime_config->l3_ptr; + size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size; + PADDLE_ENFORCE_LE( + l3_autotune_size, + l3_size, + phi::errors::InvalidArgument( + "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).", + l3_autotune_size, + l3_size)); + dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size); + + bool ret = ZeroCopyRun(); + dev_ctx->L3CacheAutotune(); + return ret; #endif return false; } @@ -2543,10 +2552,6 @@ AnalysisPredictor::~AnalysisPredictor() { if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } -#elif defined(PADDLE_WITH_XPU) - if (predictor_stream_ != nullptr) { - ResourceManager::Instance().DestroyXPUResource(predictor_stream_); - } #endif if (place_.GetType() != phi::AllocationType::UNDEFINED) { @@ -3057,10 +3062,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, #endif return false; } -bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, - void *stream) { + +bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p, + void *config) { auto pred = dynamic_cast(p->predictor_.get()); - return pred->ExpRunWithExternalStream(stream); + return pred->ExpRunWithRuntimeConfig(config); } void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 185aadbc6ab..558c0f21892 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -228,6 +228,9 @@ class AnalysisPredictor : public PaddlePredictor { // Note: Can only be used under thread_local semantics. 
bool ExpRunWithExternalStream(void *stream); + // Note: Can only be used under thread_local semantics. + bool ExpRunWithRuntimeConfig(void *config); + /// /// \brief Get the execution stream on devices with a concept of stream, /// otherwise returns nullptr. diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index b56adddfa4e..6c963f49363 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -13,7 +13,11 @@ // limitations under the License. #include "paddle/fluid/inference/api/infer_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#ifdef PADDLE_WITH_XPU +#include "xpu/runtime.h" +#endif +#include "glog/logging.h" namespace paddle { @@ -22,9 +26,129 @@ InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif -#if defined(PADDLE_WITH_XPU) +#ifdef PADDLE_WITH_XPU InferXPUContext::InferXPUContext(const phi::Place& place) : phi::XPUContext(place) {} + +void* InferXPUContext::Alloc(phi::TensorBase* tensor, + phi::DataType dtype, + size_t requested_size, + bool pinned, + bool fake_alloc) const { + size_t size = tensor->numel() * phi::SizeOf(tensor->dtype()); + if (l3_autotune_size_ > 0 && holder_map_.empty()) { + void* data_ptr = + DeviceContext::Alloc(tensor, dtype, requested_size, pinned, fake_alloc); + phi::XPUL3CacheBlock* l3_block = nullptr; + phi::Allocation* holder = + reinterpret_cast(tensor)->Holder().get(); + if (holder_l3_blocks_.count(holder) == 0) { + l3_block = new phi::XPUL3CacheBlock(); + holder_l3_blocks_[holder] = l3_block; + l3_blocks_.push_back(l3_block); + } else { + l3_block = holder_l3_blocks_[holder]; + } + l3_block->Record(size); + return data_ptr; + } else if (l3_autotune_size_ > 0 && !holder_map_.empty()) { + phi::Allocation* holder = + reinterpret_cast(tensor)->Holder().get(); + auto holder_iter = holder_map_.find(holder); + if (holder_iter != holder_map_.end()) { + auto& holder_pair = holder_iter->second; + auto* swap_holder = holder_pair.first; + bool& swap_holder_is_l3 = holder_pair.second; + if (swap_holder_is_l3 && swap_holder->size() >= size) { + swap(*holder, *swap_holder); + swap_holder_is_l3 = false; + } else if (!swap_holder_is_l3 && holder->size() < size) { + swap(*holder, *swap_holder); + swap_holder_is_l3 = true; + } + } + return DeviceContext::Alloc( + tensor, dtype, requested_size, pinned, fake_alloc); + } else { + return DeviceContext::Alloc( + tensor, dtype, requested_size, pinned, fake_alloc); + } +} + +void InferXPUContext::SetL3Info(size_t l3_size, + void* l3_ptr, + size_t l3_autotune_size) { + if (l3_ptr == nullptr) { + if (l3_size_ != l3_size) { + if (l3_owned_) { + xpu_free(l3_ptr_); + } + if (l3_size > 0) { + xpu_malloc(&l3_ptr_, l3_size, XPU_MEM_L3); + if (l3_ptr_ != nullptr) { + VLOG(3) << "remalloc l3(" << l3_size << ") success."; + l3_size_ = l3_size; + l3_owned_ = true; + l3_autotune_size_ = l3_autotune_size; + } else { + VLOG(3) << "malloc l3(" << l3_size << ") failed. 
No l3 will be used."; + l3_size_ = 0; + l3_owned_ = false; + l3_autotune_size_ = 0; + } + } + } + } else { + if (l3_owned_) { + xpu_free(l3_ptr_); + } + l3_ptr_ = l3_ptr; + l3_size_ = l3_size; + l3_autotune_size_ = l3_autotune_size; + } + if (l3_autotune_size_ == 0) { + x_context()->_l3_mgr.set(l3_ptr_, l3_size_); + } +} + +void InferXPUContext::L3CacheAutotune() { + if (l3_autotune_size_ == 0) return; + if (holder_map_.empty()) { + l3_plan_.RunAutotune(l3_blocks_, l3_size_); + auto* plan = l3_plan_.plan(); + int8_t* cur_l3_ptr = reinterpret_cast(l3_ptr_); + for (size_t i = 0; i < l3_blocks_.size(); i++) { + size_t block_size = plan->at(i); + if (block_size > 0) { + l3_blocks_[i]->Set(cur_l3_ptr, block_size); + cur_l3_ptr += block_size; + } + } + x_context()->_l3_mgr.set( + reinterpret_cast(l3_ptr_) + l3_size_ - plan->back(), + plan->back()); + + for (auto holder_l3_block : holder_l3_blocks_) { + auto* l3_block = holder_l3_block.second; + if (l3_block->size() > 0) { + auto* holder = holder_l3_block.first; + auto place = holder->place(); + phi::Allocation* l3_holder = + new phi::Allocation(l3_block->data(), l3_block->size(), place); + holder_map_[holder] = std::make_pair(l3_holder, true); + } + } + } else { + for (auto& holders : holder_map_) { + auto* holder = holders.first; + auto& holder_pair = holders.second; + if (!holder_pair.second) { + swap(*holder, *(holder_pair.first)); + holder_pair.second = true; + } + } + } +} #endif } // namespace paddle diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index 130fd8c8d48..ebc55098c97 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -15,6 +15,9 @@ #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/place.h" +#ifdef PADDLE_WITH_XPU +#include "paddle/phi/backends/xpu/xpu_l3_strategy.h" +#endif namespace paddle { @@ -46,14 +49,33 @@ class InferGPUContext : public phi::GPUContext { }; #endif -#if defined(PADDLE_WITH_XPU) +#ifdef PADDLE_WITH_XPU class InferXPUContext : public phi::XPUContext { public: explicit InferXPUContext(const phi::Place& place); - using phi::XPUContext::SetDriverVersion; - using phi::XPUContext::SetRuntimeVersion; - using phi::XPUContext::SetStream; - using phi::XPUContext::SetXpuVersion; + + void* Alloc(phi::TensorBase* tensor, + phi::DataType dtype, + size_t requested_size = 0, + bool pinned = false, + bool fake_alloc = false) const override; + + void SetL3Info(size_t l3_size, void* l3_ptr, size_t l3_autotune_size); + + void L3CacheAutotune(); + + private: + size_t l3_size_{0}; + void* l3_ptr_{nullptr}; + bool l3_owned_{false}; + size_t l3_autotune_size_{0}; + mutable std::vector l3_blocks_; + mutable std::unordered_map + holder_l3_blocks_; + mutable std::unordered_map> + holder_map_; + phi::XPUL3Planner l3_plan_; }; #endif } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index de9108aef77..c136635f889 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -471,6 +471,13 @@ class Predictor; class Tensor; using Config = paddle::AnalysisConfig; namespace experimental { +struct XpuRuntimeConfig { + void* stream{nullptr}; + size_t l3_size{16773120}; + void* l3_ptr{nullptr}; + size_t l3_autotune_size{0}; +}; + // Unstable interface, may be modified or deleted in the future. 
class PD_INFER_DECL InternalUtils { public: @@ -479,8 +486,8 @@ class PD_INFER_DECL InternalUtils { cudaStream_t stream); static bool RunWithExternalStream(paddle_infer::Predictor* pred, hipStream_t stream); - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - void* stream); + static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config); + static void UpdateConfigInterleaved(paddle_infer::Config* c, bool with_interleaved); diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 7968b8d7d9d..3f06ee5722a 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -41,9 +41,6 @@ #include "paddle/phi/backends/dynload/cusparse.h" #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_XPU -#include "paddle/phi/backends/xpu/xpu_info.h" -#endif namespace paddle { namespace internal { @@ -451,123 +448,5 @@ int ResourceManager::RefCount(void* stream) const { if (ref_count_.count(stream) == 0) return 0; return ref_count_.at(stream); } -#endif - -#if defined(PADDLE_WITH_XPU) -// XPUContextResource -XPUContextResource::XPUContextResource(const phi::Place& place, void* stream) - : place_(place) { - InitXPUResource(stream); -} - -XPUContextResource::~XPUContextResource() {} - -void XPUContextResource::InitXPUResource(void* stream) { - phi::backends::xpu::XPUDeviceGuard guard(place_.device); - if (stream) { - owned_stream_ = false; - stream_ = stream; - } - InitXpuProperties(); -} - -void XPUContextResource::InitXpuProperties() { - phi::backends::xpu::XPUDeviceGuard guard(place_.device); - driver_version_ = phi::backends::xpu::GetDriverVersion(); - runtime_version_ = phi::backends::xpu::GetRuntimeVersion(); - xpu_version_ = - static_cast(phi::backends::xpu::get_xpu_version(place_.device)); -} -void* XPUContextResource::GetStream() const { return stream_; } - -int XPUContextResource::GetDriverVersion() const { return driver_version_; } - -int XPUContextResource::GetRuntimeVersion() const { return runtime_version_; } - -int XPUContextResource::GetXpuVersion() const { return xpu_version_; } - -void XPUContextResource::ReBindStream(void* stream) { - owned_stream_ = false; - stream_ = stream; -} -// XPUContextResource End. 
- -// Resource Manager -void* ResourceManager::InitXPUResource(const phi::Place& place, void* stream) { - std::lock_guard lock_gurad(xpu_mutex_); - if (xpu_resources_.count(stream)) { - Increase(stream); - return stream; - } else { - std::unique_ptr resource{ - new XPUContextResource(place, stream)}; - void* s = resource->GetStream(); - ref_count_[s] = 1; - xpu_resources_.emplace(s, std::move(resource)); - return s; - } -} - -XPUContextResource* ResourceManager::GetXPUResource(void* stream) const { - PADDLE_ENFORCE_EQ(xpu_resources_.count(stream), - true, - platform::errors::InvalidArgument( - "The stream[%p] not found in xpu_resources.", stream)); - return xpu_resources_.at(stream).get(); -} - -void ResourceManager::XpuResourceReBindStream(void* old_stream, - void* new_stream) { - PADDLE_ENFORCE_EQ( - xpu_resources_.count(old_stream), - true, - platform::errors::InvalidArgument( - "The stream[%p] not found in xpu_resources.", old_stream)); - auto xpu_resource = std::move(xpu_resources_.at(old_stream)); - DestroyXPUResource(old_stream); - PADDLE_ENFORCE_EQ( - ref_count_.count(old_stream), - 0, - platform::errors::Fatal("xpu resources rebind stream failed.")); - - xpu_resource->ReBindStream(new_stream); - ref_count_[new_stream]++; - xpu_resources_.emplace(new_stream, std::move(xpu_resource)); -} - -void ResourceManager::DestroyXPUResource(void* stream) { - PADDLE_ENFORCE_EQ(xpu_resources_.count(stream), - true, - platform::errors::InvalidArgument( - "The stream[%p] not found in xpu_resources.", stream)); - Decrease(stream); -} - -void ResourceManager::Decrease(void* stream) { - PADDLE_ENFORCE_EQ(ref_count_.count(stream), - true, - platform::errors::InvalidArgument( - "The stream[%p] not found in ref_count.", stream)); - --ref_count_[stream]; - if (ref_count_[stream] == 0) { - ref_count_.erase(stream); - xpu_resources_.erase(stream); - } -} - -void ResourceManager::Increase(void* stream) { - PADDLE_ENFORCE_EQ(ref_count_.count(stream), - true, - platform::errors::InvalidArgument( - "The stream[%p] not found in ref_count.", stream)); - ++ref_count_[stream]; -} - -int ResourceManager::RefCount(void* stream) const { - if (ref_count_.count(stream) == 0) return 0; - return ref_count_.at(stream); -} -// Resource Manager End. 
- #endif } // namespace paddle diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 3b4c66d3190..e14de1c2ffc 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -124,33 +124,6 @@ class GPUContextResource { }; #endif -#if defined(PADDLE_WITH_XPU) -class XPUContextResource { - public: - explicit XPUContextResource(const phi::Place& place, void* stream); - ~XPUContextResource(); - phi::Place Place() const; - void* GetStream() const; - int GetDriverVersion() const; - int GetRuntimeVersion() const; - int GetXpuVersion() const; - void ReBindStream(void* stream); - - private: - void InitXPUResource(void* stream); - void InitXpuProperties(); - - private: - bool owned_stream_{true}; - void* stream_; - phi::Place place_; - - int driver_version_; - int runtime_version_; - int xpu_version_; -}; // class XPUContextResource -#endif - class ResourceManager { public: ResourceManager() = default; @@ -168,9 +141,8 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -// GPU Resource #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - + // GPU Resource public: void* InitGPUResource(const phi::Place& place, void* stream); void DestroyGPUResource(void* stream); @@ -190,28 +162,6 @@ class ResourceManager { gpu_resources_; #endif -// XPU Resource -#if defined(PADDLE_WITH_XPU) - - public: - void* InitXPUResource(const phi::Place& place, void* stream); - void DestroyXPUResource(void* stream); - XPUContextResource* GetXPUResource(void* stream) const; - int RefCount(void* stream) const; - void XpuResourceReBindStream(void* old_stream, void* new_stream); - - private: - void Decrease(void* stream); - void Increase(void* stream); - - private: - std::mutex xpu_mutex_; - // a stream corresponding to a series of resource. 
- std::map> ref_count_; - std::map> - xpu_resources_; -#endif - private: DISABLE_COPY_AND_ASSIGN(ResourceManager); }; diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index bb4ccd9a46e..828437c8f2a 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -38,7 +38,7 @@ endif() if(WITH_XPU) list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc) list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc - xpu/xpu2_op_list.cc) + xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc) endif() if(WITH_MKLDNN) diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 4c0727088f6..44f247ff259 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -42,11 +42,13 @@ struct XPUContext::Impl { auto selected_xpus = backends::xpu::GetXPUSelectedDevices(); for (unsigned int i = 0; i < selected_xpus.size(); i++) { if (place_.GetDeviceId() == selected_xpus[i]) { - if (l3ptrs[place_.GetDeviceId()] == nullptr) { - xpu_malloc(static_cast(&l3ptrs[place_.GetDeviceId()]), - l3_size, - XPU_MEM_L3); + if (l3ptrs[place_.GetDeviceId()] != nullptr) { + xpu_free(l3ptrs[place_.GetDeviceId()]); + l3ptrs[place_.GetDeviceId()] = nullptr; } + xpu_malloc(static_cast(&l3ptrs[place_.GetDeviceId()]), + l3_size, + XPU_MEM_L3); if (l3ptrs[place_.GetDeviceId()] != nullptr) { context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size); VLOG(3) << "xpu place " << static_cast(place_.GetDeviceId()) diff --git a/paddle/phi/backends/xpu/xpu_l3_strategy.cc b/paddle/phi/backends/xpu/xpu_l3_strategy.cc new file mode 100644 index 00000000000..eab256a3eda --- /dev/null +++ b/paddle/phi/backends/xpu/xpu_l3_strategy.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/backends/xpu/xpu_l3_strategy.h" +#include "glog/logging.h" + +namespace phi { + +void XPUL3CacheBlock::Set(void* addr, size_t size) { + if (addr == nullptr || size == 0) { + LOG(FATAL) << "Set XPUL3CacheBlock Size as Zero"; + } + addr_ = addr; + size_ = size; +} + +void XPUL3Planner::RunAutotune( + const std::vector& l3_block_dict, size_t l3_size) { + if (l3_block_dict.size() == 0 || l3_size <= 0 || !plan_.empty()) { + return; + } + VLOG(3) << "AutoTune XPU L3 Cache Block Start."; + struct node { + size_t weights = 0; + size_t scores = 0; + std::vector choices{0}; + }; + std::vector> records; + std::vector record_map; + size_t total_scores = 0; + for (size_t block_idx = 0; block_idx < l3_block_dict.size(); block_idx++) { + XPUL3CacheBlock* cur_block = l3_block_dict[block_idx]; + std::vector& history = cur_block->history_; + auto history_size = history.size(); + size_t score = 0; + VLOG(3) << "Block Idx is " << block_idx; + if (history_size > 1) { + std::vector block_nodes{node()}; + std::sort(history.begin(), history.end()); + for (size_t i = 0; i < history_size; i++) { + VLOG(3) << "Size History : " << i << " is " << history[i]; + if (history[i] > l3_size) { + break; + } + score += history[i]; + if (i == history_size - 1 || history[i + 1] != history[i]) { + node cur_node; + cur_node.weights = history[i]; + cur_node.choices = {history[i]}; + cur_node.scores = score; + block_nodes.push_back(cur_node); + VLOG(3) << "Node Weights is:" << cur_node.weights + << ", Node Scores is: " << score; + } + } + total_scores += score; + records.push_back(block_nodes); + record_map.push_back(block_idx); + } + } + if (records.size() <= 0) { + return; + } + std::vector res(records[0]); + for (size_t block_idx = 1; block_idx < records.size(); block_idx++) { + std::vector new_nodes; + for (size_t node_idx = 0; node_idx < records[block_idx].size(); + node_idx++) { + for (size_t res_idx = 0; res_idx < res.size(); res_idx++) { + node cur_node; + size_t cur_weights = + records[block_idx][node_idx].weights + res[res_idx].weights; + if (cur_weights > l3_size) { + break; + } + cur_node.scores = + records[block_idx][node_idx].scores + res[res_idx].scores; + cur_node.weights = cur_weights; + cur_node.choices = res[res_idx].choices; + cur_node.choices.push_back(records[block_idx][node_idx].choices[0]); + new_nodes.push_back(cur_node); + } + } + struct { + bool operator()(node a, node b) const { + if (a.weights < b.weights) { + return true; + } else if (a.weights == b.weights) { + return a.scores > b.scores; + } else { + return false; + } + } + } customLess; + + std::sort(new_nodes.begin(), new_nodes.end(), customLess); + std::vector stay(new_nodes.size(), true); + for (int i = new_nodes.size() - 1; i >= 0; i--) { + for (int j = i - 1; j >= 0; j--) { + if (new_nodes[j].scores >= new_nodes[i].scores) { + stay[i] = false; + break; + } + } + } + res.clear(); + for (size_t i = 0; i < new_nodes.size(); i++) { + if (stay[i] == true) { + res.push_back(new_nodes[i]); + } + } + VLOG(3) << "XPU L3 Block IDX is " << block_idx + << ", Choices before filter are " << new_nodes.size() + << ", Choices after filter are " << res.size(); + } + // final result: res.back().choices + // std::vector record_map; + for (size_t i = 0; i < res.back().choices.size(); i++) { + VLOG(3) << "BLOCK IDX is " << i << ", Acquired L3 Size is " + << res.back().choices[i]; + } + double l3_global_ratio = static_cast(res.back().scores) / + static_cast(total_scores); + VLOG(3) << "Tensor Space in L3 / Tensor Space in Global :" + << 
l3_global_ratio * 100 << " %"; + + size_t block_l3_size = + std::accumulate(res.back().choices.begin(), res.back().choices.end(), 0); + size_t xdnn_ctx_l3_size = (l3_size - block_l3_size) / 64 * 64; + + VLOG(3) << "Block L3 Size : " << block_l3_size + << ", XDNN Ctx L3 Size : " << xdnn_ctx_l3_size; + + plan_.resize(l3_block_dict.size() + 1, 0); + for (size_t i = 0; i < res.back().choices.size(); i++) { + plan_[record_map[i]] = res.back().choices[i]; + } + plan_[l3_block_dict.size()] = xdnn_ctx_l3_size; + VLOG(3) << "AutoTune XPU L3 Cache Block End."; +} + +} // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_l3_strategy.h b/paddle/phi/backends/xpu/xpu_l3_strategy.h new file mode 100644 index 00000000000..e1ff3cd0278 --- /dev/null +++ b/paddle/phi/backends/xpu/xpu_l3_strategy.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace phi { + +struct XPUL3CacheBlock { + public: + void Clear() { + addr_ = nullptr; + size_ = 0; + history_.clear(); + } + void Set(void* addr, size_t size); + void Record(size_t size) { history_.push_back(size); } + void* data() { return addr_; } + size_t size() { return size_; } + + private: + void* addr_{nullptr}; + size_t size_{0}; + + public: + std::vector history_; +}; + +class XPUL3Planner { + public: + void RunAutotune(const std::vector& l3_block_dict, + size_t l3_size); + + std::vector* plan() { return &plan_; } + + private: + std::vector plan_; +}; + +} // namespace phi diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 5c8fc75ff0e..cdbfb557039 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -393,11 +393,8 @@ template T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size, bool pinned) const { - if (pinned) { - return impl_->Alloc( - tensor, GetPinnedPlace(GetPlace()), requested_size, pinned); - } - return impl_->Alloc(tensor, GetPlace(), requested_size, pinned); + DataType dtype = phi::CppTypeToDataType::Type(); + return static_cast(this->Alloc(tensor, dtype, requested_size, pinned)); } void* DeviceContext::HostAlloc(TensorBase* tensor, diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index e88cb4a93be..b6703925517 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -145,11 +145,11 @@ class PADDLE_API DeviceContext { /** * @brief Allocate device memory for tensor. 
*/ - void* Alloc(TensorBase*, - DataType dtype, - size_t requested_size = 0, - bool pinned = false, - bool fake_alloc = false) const; + virtual void* Alloc(TensorBase*, + DataType dtype, + size_t requested_size = 0, + bool pinned = false, + bool fake_alloc = false) const; template T* Alloc(TensorBase* tensor, diff --git a/test/cpp/inference/api/CMakeLists.txt b/test/cpp/inference/api/CMakeLists.txt index 3c310e68065..02d869e2a3c 100644 --- a/test/cpp/inference/api/CMakeLists.txt +++ b/test/cpp/inference/api/CMakeLists.txt @@ -1461,6 +1461,17 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST) --repeat=10) endif() + if(WITH_XPU) + inference_analysis_test( + xpu_runtime_config_resnet50_test + SRCS + xpu_runtime_config_resnet50_test.cc + EXTRA_DEPS + paddle_inference_shared + ARGS + --infer_model=${RESNET50_MODEL_DIR}) + endif() + set(inference_deps ${analysis_deps} paddle_inference_api analysis naive_executor ${GLOB_PASS_LIB}) diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index d187fb9d173..71a694cf33b 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -17,10 +17,6 @@ #if defined(PADDLE_WITH_CUDA) #include #endif -#if defined(PADDLE_WITH_XPU) -#include "xpu/runtime.h" -#include "xpu/xdnn.h" -#endif #include #include @@ -671,57 +667,6 @@ TEST(Predictor, Streams) { } #endif -#if defined(PADDLE_WITH_XPU) -TEST(Predictor, XPUStreams) { - // external stream - { - auto context = baidu::xpu::api::create_context(); - xpu_stream_create(&context->xpu_stream); - - Config config; - config.SetModel(FLAGS_dirname); - config.EnableXpu(); - config.SetExecStream(static_cast(context->xpu_stream)); - CHECK_EQ(config.external_stream_enabled(), true); - - auto predictor = CreatePredictor(config); - auto stream = predictor->GetExecStream(); - CHECK_EQ(static_cast(context->xpu_stream), stream); - CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream)); - CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 1); - } - - // 2 predictor on 2 stream - { - auto context1 = baidu::xpu::api::create_context(); - xpu_stream_create(&context1->xpu_stream); - - Config config; - config.SetModel(FLAGS_dirname); - config.EnableXpu(); - config.SetExecStream(static_cast(context1->xpu_stream)); - auto predictor = CreatePredictor(config); - auto stream1 = predictor->GetExecStream(); - CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream1)); - CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream1), 1); - - auto context2 = baidu::xpu::api::create_context(); - xpu_stream_create(&context2->xpu_stream); - - Config config2; - config2.SetModel(FLAGS_dirname); - config2.EnableXpu(); - config2.SetExecStream(static_cast(context2->xpu_stream)); - auto predictor2 = CreatePredictor(config2); - auto stream2 = predictor2->GetExecStream(); - CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream2)); - CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream2), 1); - - CHECK_NE(stream1, stream2); - } -} -#endif - TEST(AnalysisPredictor, OutputHookFunc) { auto hookfunc = [](const std::string& type, const std::string& var_name, diff --git a/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc new file mode 100644 index 00000000000..88989847411 --- /dev/null +++ b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2023 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "gflags/gflags.h" +#include "test/cpp/inference/api/tester_helper.h" +#include "xpu/runtime.h" +#include "xpu/xdnn.h" + +namespace paddle_infer { + +static const std::vector TRUTH_VALUES = { + 127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f, + -633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f, + -242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f, + -133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f, + -316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f, + -447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f, + 551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f, + -8.42969f, 131.365f, -648.051f}; + +void PrepareInput(std::shared_ptr predictor) { + const int batch = 1; + const int channel = 3; + const int height = 318; + const int width = 318; + const int input_num = batch * channel * height * width; + std::vector input(input_num, 1); + auto input_names = predictor->GetInputNames(); + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch, channel, height, width}); + input_t->CopyFromCpu(input.data()); +} + +void CompareOutput(std::shared_ptr predictor) { + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + size_t out_num = std::accumulate( + output_shape.begin(), output_shape.end(), 1, std::multiplies()); + + std::vector out_data; + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + + float* data_o = out_data.data(); + for (size_t j = 0; j < out_num; j += 10) { + EXPECT_NEAR( + (data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3); + } +} + +Config XpuConfig() { + std::string model_dir = FLAGS_infer_model + "/" + "model"; + Config config; + config.SetModel(model_dir + "/model", model_dir + "/params"); + config.EnableXpu(); + return config; +} + +TEST(resnet50_xpu, basic) { + Config config = XpuConfig(); + auto predictor = CreatePredictor(config); + PrepareInput(predictor); + predictor->Run(); + CompareOutput(predictor); +} + +#define RUN_WITH_RUNTIME_CONFIG(idx_, config_) \ + Config config##idx_ = XpuConfig(); \ + auto predictor##idx_ = CreatePredictor(config##idx_); \ + PrepareInput(predictor##idx_); \ + experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \ + &config_); \ + CompareOutput(predictor##idx_); \ + CHECK_EQ(predictor##idx_->GetExecStream(), config_.stream); + +TEST(runtime_stream, null_stream) { + experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0}; + RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); +} + +TEST(runtime_stream, new_stream) { + void* stream = nullptr; + xpu_stream_create(&stream); + CHECK_NOTNULL(stream); + { + experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0}; + 
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); + } + xpu_stream_destroy(stream); +} + +TEST(runtime_stream, 2_null_stream) { + experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0}; + RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); + RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config); +} + +TEST(runtime_stream, null_and_new_stream) { + experimental::XpuRuntimeConfig xpu_runtime_config0 = {nullptr, 0, nullptr, 0}; + void* stream = nullptr; + xpu_stream_create(&stream); + CHECK_NOTNULL(stream); + { + experimental::XpuRuntimeConfig xpu_runtime_config1 = { + stream, 0, nullptr, 0}; + RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0); + RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1); + } + xpu_stream_destroy(stream); +} + +TEST(runtime_stream, 2_new_same_stream) { + void* stream = nullptr; + xpu_stream_create(&stream); + CHECK_NOTNULL(stream); + experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0}; + { + RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config); + RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config); + } + xpu_stream_destroy(stream); +} + +TEST(runtime_stream, 2_new_different_stream) { + void* stream0 = nullptr; + xpu_stream_create(&stream0); + CHECK_NOTNULL(stream0); + experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0}; + void* stream1 = nullptr; + xpu_stream_create(&stream1); + CHECK_NOTNULL(stream1); + experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0}; + { + RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0); + RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1); + } + xpu_stream_destroy(stream0); + xpu_stream_destroy(stream1); +} + +void RunPredictorWithRuntimeConfig( + std::shared_ptr predictor, + experimental::XpuRuntimeConfig runtime_config) { + PrepareInput(predictor); + experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(), + &runtime_config); + CompareOutput(predictor); + CHECK_EQ(predictor->GetExecStream(), runtime_config.stream); +} + +TEST(runtime_stream, 2_thread) { + void* stream0 = nullptr; + xpu_stream_create(&stream0); + CHECK_NOTNULL(stream0); + experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0}; + + void* stream1 = nullptr; + xpu_stream_create(&stream1); + CHECK_NOTNULL(stream1); + experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0}; + + { + RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0); + RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1); + std::thread t0( + RunPredictorWithRuntimeConfig, predictor0, xpu_runtime_config0); + std::thread t1( + RunPredictorWithRuntimeConfig, predictor1, xpu_runtime_config1); + t0.join(); + t1.join(); + } + + xpu_stream_destroy(stream0); + xpu_stream_destroy(stream1); +} + +} // namespace paddle_infer -- GitLab
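Usage sketch for the runtime-config API introduced by this patch, following the pattern in test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc. This is a minimal, illustrative example: the public header name, model paths, and L3 sizes below are placeholder assumptions and will differ per deployment; only the XpuRuntimeConfig fields and RunWithRuntimeConfig call come from the patch itself.

// Minimal sketch: run one inference with an external XPU stream and an L3
// budget supplied through the new experimental::XpuRuntimeConfig.
#include "paddle_inference_api.h"  // assumed public inference header
#include "xpu/runtime.h"

int main() {
  paddle_infer::Config config;
  config.SetModel("./resnet50/model", "./resnet50/params");  // placeholder paths
  config.EnableXpu();
  auto predictor = paddle_infer::CreatePredictor(config);

  void* stream = nullptr;
  xpu_stream_create(&stream);  // optional; a nullptr stream falls back to the global stream

  paddle_infer::experimental::XpuRuntimeConfig runtime_config;
  runtime_config.stream = stream;
  runtime_config.l3_size = 16773120;    // total L3 budget in bytes (struct default)
  runtime_config.l3_ptr = nullptr;      // nullptr: the XPU context mallocs and owns the L3 buffer
  runtime_config.l3_autotune_size = 0;  // > 0 enables per-tensor L3 block autotune;
                                        // must be <= l3_size (enforced at run time)

  // ... set inputs via predictor->GetInputHandle(...) ...
  paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(
      predictor.get(), &runtime_config);
  // ... fetch outputs via predictor->GetOutputHandle(...) ...

  xpu_stream_destroy(stream);
  return 0;
}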