Unverified commit e135069d, authored by zhupengyang, committed by GitHub

[xpu][infer] support runtime configs (#53595)

Parent d327d3e1
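For orientation before the diff: this commit adds a per-run XPU runtime config (an optional external stream plus an L3 cache budget) that is handed to a predictor through `InternalUtils::RunWithRuntimeConfig`. Below is a minimal usage sketch based on the `XpuRuntimeConfig` struct and the `xpu_runtime_config_resnet50_test.cc` added further down in this diff; the model paths, stream, and field values are placeholders, not part of the commit.

```cpp
#include <string>
#include "paddle_inference_api.h"  // Config, Predictor, experimental::InternalUtils

// Sketch only: field values are illustrative; see the XpuRuntimeConfig struct
// and the new test file introduced in this diff for the real usage pattern.
void RunOnceWithXpuRuntimeConfig(const std::string& model_dir,
                                 void* external_stream /* may be nullptr */) {
  paddle_infer::Config config;
  config.SetModel(model_dir + "/model", model_dir + "/params");
  config.EnableXpu();
  auto predictor = paddle_infer::CreatePredictor(config);

  paddle_infer::experimental::XpuRuntimeConfig runtime_config;
  runtime_config.stream = external_stream;    // nullptr keeps the predictor's stream
  runtime_config.l3_size = 16 * 1024 * 1024;  // L3 budget in bytes for this run
  runtime_config.l3_ptr = nullptr;            // let the context allocate/own the L3 buffer
  runtime_config.l3_autotune_size = 0;        // 0 disables L3 autotune

  // ... set inputs via predictor->GetInputHandle(...) ...
  paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(
      predictor.get(), &runtime_config);
}
```

With `l3_autotune_size > 0`, the first run records per-tensor size history in `InferXPUContext::Alloc`, and `L3CacheAutotune` then plans which tensor holders get swapped into L3 for subsequent runs.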
......@@ -566,12 +566,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
} else {
conv_bias.push_back(0);
}
if (conv->Op()->HasAttr("padding_algorithm")) {
conv2d_xpu_op_desc.SetAttr(
"padding_algorithm",
PADDLE_GET_CONST(std::string,
conv->Op()->GetAttr("padding_algorithm")));
}
conv2d_xpu_op_desc.SetAttr(
"padding_algorithm",
conv->Op()->GetAttrIfExists<std::string>("padding_algorithm"));
auto conv_paddings =
PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("paddings"));
if (conv_paddings.size() == 2) {
......
......@@ -389,25 +389,21 @@ bool AnalysisPredictor::Init(
}
#endif
#if defined(PADDLE_WITH_XPU)
if (config_.use_xpu_ && config_.use_external_stream_) {
if (config_.use_xpu_) {
private_context_ = true;
}
if (private_context_) {
if (!status_is_cloned_) {
if (!status_is_cloned_ && config_.external_stream_enabled()) {
predictor_stream_ = config_.GetExecStream();
}
// NOTE: If the external_stream equals to global_device_contexts's stream,
// then fallback.
auto global_stream =
static_cast<phi::XPUContext *>(
platform::DeviceContextPool::Instance().Get(place_))
->stream();
if (predictor_stream_ != global_stream) {
InitResourceManager(predictor_stream_);
InitDeviceContexts();
auto *global_context = static_cast<phi::XPUContext *>(
platform::DeviceContextPool::Instance().Get(place_));
auto global_stream = global_context->stream();
if (predictor_stream_ == nullptr) {
predictor_stream_ = global_stream;
}
InitDeviceContexts();
}
#endif
inference::DisplayMemoryInfo(place_, "Init predictor");
return true;
}
......@@ -492,15 +488,12 @@ void AnalysisPredictor::InitResourceManager(void *stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
predictor_stream_ =
ResourceManager::Instance().InitGPUResource(place_, stream);
#elif defined(PADDLE_WITH_XPU)
predictor_stream_ =
ResourceManager::Instance().InitXPUResource(place_, stream);
#endif
}
void AnalysisPredictor::InitDeviceContexts() {
// Init GPUContext.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Init GPUContext.
if (place_.GetType() == phi::AllocationType::GPU) {
device_contexts_.emplace(
place_, std::async(std::launch::deferred, [=] {
......@@ -512,12 +505,10 @@ void AnalysisPredictor::InitDeviceContexts() {
}));
}
#endif
#if defined(PADDLE_WITH_XPU)
#ifdef PADDLE_WITH_XPU
if (place_.GetType() == phi::AllocationType::XPU) {
device_contexts_.emplace(
place_, std::async(std::launch::deferred, [=] {
auto *xpu_resource =
ResourceManager::Instance().GetXPUResource(predictor_stream_);
auto &instance = memory::allocation::AllocatorFacade::Instance();
auto *xpu_context = new InferXPUContext(place_);
xpu_context->SetAllocator(instance.GetAllocator(place_).get());
......@@ -530,15 +521,11 @@ void AnalysisPredictor::InitDeviceContexts() {
instance.GetZeroAllocator(place_).get());
xpu_context->SetHostZeroAllocator(
instance.GetZeroAllocator(platform::CPUPlace()).get());
xpu_context->SetStream(xpu_resource->GetStream());
xpu_context->SetDriverVersion(xpu_resource->GetDriverVersion());
xpu_context->SetRuntimeVersion(xpu_resource->GetRuntimeVersion());
xpu_context->SetXpuVersion(xpu_resource->GetXpuVersion());
xpu_context->SetStream(predictor_stream_);
return std::unique_ptr<phi::DeviceContext>(xpu_context);
}));
}
#endif
// TODO(Inference): Support other backends.
}
void *AnalysisPredictor::GetExecStream() const {
......@@ -591,6 +578,11 @@ const void *AnalysisPredictor::GetDeviceContexts() const {
bool AnalysisPredictor::PrepareScope(
const std::shared_ptr<framework::Scope> &parent_scope) {
#ifdef PADDLE_WITH_XPU
// Set "XPU_PADDLE_L3_SIZE" to "0" to avoid malloc l3 cache when xpu_context
// init.
setenv("XPU_PADDLE_L3_SIZE", "0", 0);
#endif
if (parent_scope) {
PADDLE_ENFORCE_NOT_NULL(
parent_scope,
......@@ -1513,6 +1505,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetCustomDeviceId(config_.custom_device_id());
}
#endif
#ifdef PADDLE_WITH_XPU
argument_->SetUseXpu(config_.use_xpu_);
argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
......@@ -2153,29 +2146,45 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
}
#endif
bool AnalysisPredictor::ExpRunWithExternalStream(void *stream) {
#if defined(PADDLE_WITH_XPU)
if (!private_context_) {
PADDLE_THROW(platform::errors::Fatal(
"Please use config.SetExecStream to init resources, and then we "
"will bind resources to execution stream."));
}
if (stream != predictor_stream_) {
bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
#ifdef PADDLE_WITH_XPU
PADDLE_ENFORCE(
private_context_,
paddle::platform::errors::Fatal(
"Must use private context if run predictor with external config."));
auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts());
auto *dev_ctx =
static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
auto xpu_runtime_config =
reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
auto *stream = xpu_runtime_config->stream;
if (stream != nullptr && stream != predictor_stream_) {
paddle::platform::XPUStreamSync(
static_cast<paddle::xpuStream>(predictor_stream_));
ResourceManager::Instance().XpuResourceReBindStream(predictor_stream_,
stream);
predictor_stream_ = stream;
auto *dev_ctxs = reinterpret_cast<const std::map<
phi::Place,
std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
this->GetDeviceContexts());
auto *dev_ctx =
static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
dev_ctx->SetStream(stream);
}
return ZeroCopyRun();
size_t l3_size = xpu_runtime_config->l3_size;
void *l3_ptr = xpu_runtime_config->l3_ptr;
size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size;
PADDLE_ENFORCE_LE(
l3_autotune_size,
l3_size,
phi::errors::InvalidArgument(
"l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
l3_autotune_size,
l3_size));
dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size);
bool ret = ZeroCopyRun();
dev_ctx->L3CacheAutotune();
return ret;
#endif
return false;
}
......@@ -2543,10 +2552,6 @@ AnalysisPredictor::~AnalysisPredictor() {
if (predictor_stream_ != nullptr) {
ResourceManager::Instance().DestroyGPUResource(predictor_stream_);
}
#elif defined(PADDLE_WITH_XPU)
if (predictor_stream_ != nullptr) {
ResourceManager::Instance().DestroyXPUResource(predictor_stream_);
}
#endif
if (place_.GetType() != phi::AllocationType::UNDEFINED) {
......@@ -3057,10 +3062,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
#endif
return false;
}
bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
void *stream) {
bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p,
void *config) {
auto pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
return pred->ExpRunWithExternalStream(stream);
return pred->ExpRunWithRuntimeConfig(config);
}
void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
......
......@@ -228,6 +228,9 @@ class AnalysisPredictor : public PaddlePredictor {
// Note: Can only be used under thread_local semantics.
bool ExpRunWithExternalStream(void *stream);
// Note: Can only be used under thread_local semantics.
bool ExpRunWithRuntimeConfig(void *config);
///
/// \brief Get the execution stream on devices with a concept of stream,
/// otherwise returns nullptr.
......
......@@ -13,7 +13,11 @@
// limitations under the License.
#include "paddle/fluid/inference/api/infer_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#ifdef PADDLE_WITH_XPU
#include "xpu/runtime.h"
#endif
#include "glog/logging.h"
namespace paddle {
......@@ -22,9 +26,129 @@ InferGPUContext::InferGPUContext(const phi::Place& place)
: phi::GPUContext(place, false) {}
#endif
#if defined(PADDLE_WITH_XPU)
#ifdef PADDLE_WITH_XPU
InferXPUContext::InferXPUContext(const phi::Place& place)
: phi::XPUContext(place) {}
void* InferXPUContext::Alloc(phi::TensorBase* tensor,
phi::DataType dtype,
size_t requested_size,
bool pinned,
bool fake_alloc) const {
size_t size = tensor->numel() * phi::SizeOf(tensor->dtype());
if (l3_autotune_size_ > 0 && holder_map_.empty()) {
void* data_ptr =
DeviceContext::Alloc(tensor, dtype, requested_size, pinned, fake_alloc);
phi::XPUL3CacheBlock* l3_block = nullptr;
phi::Allocation* holder =
reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
if (holder_l3_blocks_.count(holder) == 0) {
l3_block = new phi::XPUL3CacheBlock();
holder_l3_blocks_[holder] = l3_block;
l3_blocks_.push_back(l3_block);
} else {
l3_block = holder_l3_blocks_[holder];
}
l3_block->Record(size);
return data_ptr;
} else if (l3_autotune_size_ > 0 && !holder_map_.empty()) {
phi::Allocation* holder =
reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
auto holder_iter = holder_map_.find(holder);
if (holder_iter != holder_map_.end()) {
auto& holder_pair = holder_iter->second;
auto* swap_holder = holder_pair.first;
bool& swap_holder_is_l3 = holder_pair.second;
if (swap_holder_is_l3 && swap_holder->size() >= size) {
swap(*holder, *swap_holder);
swap_holder_is_l3 = false;
} else if (!swap_holder_is_l3 && holder->size() < size) {
swap(*holder, *swap_holder);
swap_holder_is_l3 = true;
}
}
return DeviceContext::Alloc(
tensor, dtype, requested_size, pinned, fake_alloc);
} else {
return DeviceContext::Alloc(
tensor, dtype, requested_size, pinned, fake_alloc);
}
}
void InferXPUContext::SetL3Info(size_t l3_size,
void* l3_ptr,
size_t l3_autotune_size) {
if (l3_ptr == nullptr) {
if (l3_size_ != l3_size) {
if (l3_owned_) {
xpu_free(l3_ptr_);
}
if (l3_size > 0) {
xpu_malloc(&l3_ptr_, l3_size, XPU_MEM_L3);
if (l3_ptr_ != nullptr) {
VLOG(3) << "remalloc l3(" << l3_size << ") success.";
l3_size_ = l3_size;
l3_owned_ = true;
l3_autotune_size_ = l3_autotune_size;
} else {
VLOG(3) << "malloc l3(" << l3_size << ") failed. No l3 will be used.";
l3_size_ = 0;
l3_owned_ = false;
l3_autotune_size_ = 0;
}
}
}
} else {
if (l3_owned_) {
xpu_free(l3_ptr_);
}
l3_ptr_ = l3_ptr;
l3_size_ = l3_size;
l3_autotune_size_ = l3_autotune_size;
}
if (l3_autotune_size_ == 0) {
x_context()->_l3_mgr.set(l3_ptr_, l3_size_);
}
}
void InferXPUContext::L3CacheAutotune() {
if (l3_autotune_size_ == 0) return;
if (holder_map_.empty()) {
l3_plan_.RunAutotune(l3_blocks_, l3_size_);
auto* plan = l3_plan_.plan();
int8_t* cur_l3_ptr = reinterpret_cast<int8_t*>(l3_ptr_);
for (size_t i = 0; i < l3_blocks_.size(); i++) {
size_t block_size = plan->at(i);
if (block_size > 0) {
l3_blocks_[i]->Set(cur_l3_ptr, block_size);
cur_l3_ptr += block_size;
}
}
x_context()->_l3_mgr.set(
reinterpret_cast<int8_t*>(l3_ptr_) + l3_size_ - plan->back(),
plan->back());
for (auto holder_l3_block : holder_l3_blocks_) {
auto* l3_block = holder_l3_block.second;
if (l3_block->size() > 0) {
auto* holder = holder_l3_block.first;
auto place = holder->place();
phi::Allocation* l3_holder =
new phi::Allocation(l3_block->data(), l3_block->size(), place);
holder_map_[holder] = std::make_pair(l3_holder, true);
}
}
} else {
for (auto& holders : holder_map_) {
auto* holder = holders.first;
auto& holder_pair = holders.second;
if (!holder_pair.second) {
swap(*holder, *(holder_pair.first));
holder_pair.second = true;
}
}
}
}
#endif
} // namespace paddle
......@@ -15,6 +15,9 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
#endif
namespace paddle {
......@@ -46,14 +49,33 @@ class InferGPUContext : public phi::GPUContext {
};
#endif
#if defined(PADDLE_WITH_XPU)
#ifdef PADDLE_WITH_XPU
class InferXPUContext : public phi::XPUContext {
public:
explicit InferXPUContext(const phi::Place& place);
using phi::XPUContext::SetDriverVersion;
using phi::XPUContext::SetRuntimeVersion;
using phi::XPUContext::SetStream;
using phi::XPUContext::SetXpuVersion;
void* Alloc(phi::TensorBase* tensor,
phi::DataType dtype,
size_t requested_size = 0,
bool pinned = false,
bool fake_alloc = false) const override;
void SetL3Info(size_t l3_size, void* l3_ptr, size_t l3_autotune_size);
void L3CacheAutotune();
private:
size_t l3_size_{0};
void* l3_ptr_{nullptr};
bool l3_owned_{false};
size_t l3_autotune_size_{0};
mutable std::vector<phi::XPUL3CacheBlock*> l3_blocks_;
mutable std::unordered_map<phi::Allocation*, phi::XPUL3CacheBlock*>
holder_l3_blocks_;
mutable std::unordered_map<phi::Allocation*,
std::pair<phi::Allocation*, bool>>
holder_map_;
phi::XPUL3Planner l3_plan_;
};
#endif
} // namespace paddle
......@@ -471,6 +471,13 @@ class Predictor;
class Tensor;
using Config = paddle::AnalysisConfig;
namespace experimental {
struct XpuRuntimeConfig {
void* stream{nullptr};
size_t l3_size{16773120};
void* l3_ptr{nullptr};
size_t l3_autotune_size{0};
};
// Unstable interface, may be modified or deleted in the future.
class PD_INFER_DECL InternalUtils {
public:
......@@ -479,8 +486,8 @@ class PD_INFER_DECL InternalUtils {
cudaStream_t stream);
static bool RunWithExternalStream(paddle_infer::Predictor* pred,
hipStream_t stream);
static bool RunWithExternalStream(paddle_infer::Predictor* pred,
void* stream);
static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config);
static void UpdateConfigInterleaved(paddle_infer::Config* c,
bool with_interleaved);
......
......@@ -41,9 +41,6 @@
#include "paddle/phi/backends/dynload/cusparse.h"
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/xpu_info.h"
#endif
namespace paddle {
namespace internal {
......@@ -451,123 +448,5 @@ int ResourceManager::RefCount(void* stream) const {
if (ref_count_.count(stream) == 0) return 0;
return ref_count_.at(stream);
}
#endif
#if defined(PADDLE_WITH_XPU)
// XPUContextResource
XPUContextResource::XPUContextResource(const phi::Place& place, void* stream)
: place_(place) {
InitXPUResource(stream);
}
XPUContextResource::~XPUContextResource() {}
void XPUContextResource::InitXPUResource(void* stream) {
phi::backends::xpu::XPUDeviceGuard guard(place_.device);
if (stream) {
owned_stream_ = false;
stream_ = stream;
}
InitXpuProperties();
}
void XPUContextResource::InitXpuProperties() {
phi::backends::xpu::XPUDeviceGuard guard(place_.device);
driver_version_ = phi::backends::xpu::GetDriverVersion();
runtime_version_ = phi::backends::xpu::GetRuntimeVersion();
xpu_version_ =
static_cast<int>(phi::backends::xpu::get_xpu_version(place_.device));
}
void* XPUContextResource::GetStream() const { return stream_; }
int XPUContextResource::GetDriverVersion() const { return driver_version_; }
int XPUContextResource::GetRuntimeVersion() const { return runtime_version_; }
int XPUContextResource::GetXpuVersion() const { return xpu_version_; }
void XPUContextResource::ReBindStream(void* stream) {
owned_stream_ = false;
stream_ = stream;
}
// XPUContextResource End.
// Resource Manager
void* ResourceManager::InitXPUResource(const phi::Place& place, void* stream) {
std::lock_guard<std::mutex> lock_gurad(xpu_mutex_);
if (xpu_resources_.count(stream)) {
Increase(stream);
return stream;
} else {
std::unique_ptr<XPUContextResource> resource{
new XPUContextResource(place, stream)};
void* s = resource->GetStream();
ref_count_[s] = 1;
xpu_resources_.emplace(s, std::move(resource));
return s;
}
}
XPUContextResource* ResourceManager::GetXPUResource(void* stream) const {
PADDLE_ENFORCE_EQ(xpu_resources_.count(stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in xpu_resources.", stream));
return xpu_resources_.at(stream).get();
}
void ResourceManager::XpuResourceReBindStream(void* old_stream,
void* new_stream) {
PADDLE_ENFORCE_EQ(
xpu_resources_.count(old_stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in xpu_resources.", old_stream));
auto xpu_resource = std::move(xpu_resources_.at(old_stream));
DestroyXPUResource(old_stream);
PADDLE_ENFORCE_EQ(
ref_count_.count(old_stream),
0,
platform::errors::Fatal("xpu resources rebind stream failed."));
xpu_resource->ReBindStream(new_stream);
ref_count_[new_stream]++;
xpu_resources_.emplace(new_stream, std::move(xpu_resource));
}
void ResourceManager::DestroyXPUResource(void* stream) {
PADDLE_ENFORCE_EQ(xpu_resources_.count(stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in xpu_resources.", stream));
Decrease(stream);
}
void ResourceManager::Decrease(void* stream) {
PADDLE_ENFORCE_EQ(ref_count_.count(stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in ref_count.", stream));
--ref_count_[stream];
if (ref_count_[stream] == 0) {
ref_count_.erase(stream);
xpu_resources_.erase(stream);
}
}
void ResourceManager::Increase(void* stream) {
PADDLE_ENFORCE_EQ(ref_count_.count(stream),
true,
platform::errors::InvalidArgument(
"The stream[%p] not found in ref_count.", stream));
++ref_count_[stream];
}
int ResourceManager::RefCount(void* stream) const {
if (ref_count_.count(stream) == 0) return 0;
return ref_count_.at(stream);
}
// Resource Manager End.
#endif
} // namespace paddle
......@@ -124,33 +124,6 @@ class GPUContextResource {
};
#endif
#if defined(PADDLE_WITH_XPU)
class XPUContextResource {
public:
explicit XPUContextResource(const phi::Place& place, void* stream);
~XPUContextResource();
phi::Place Place() const;
void* GetStream() const;
int GetDriverVersion() const;
int GetRuntimeVersion() const;
int GetXpuVersion() const;
void ReBindStream(void* stream);
private:
void InitXPUResource(void* stream);
void InitXpuProperties();
private:
bool owned_stream_{true};
void* stream_;
phi::Place place_;
int driver_version_;
int runtime_version_;
int xpu_version_;
}; // class XPUContextResource
#endif
class ResourceManager {
public:
ResourceManager() = default;
......@@ -168,9 +141,8 @@ class ResourceManager {
std::mutex cpu_mutex_;
std::unique_ptr<CPUContextResource> cpu_resource_{nullptr};
// GPU Resource
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// GPU Resource
public:
void* InitGPUResource(const phi::Place& place, void* stream);
void DestroyGPUResource(void* stream);
......@@ -190,28 +162,6 @@ class ResourceManager {
gpu_resources_;
#endif
// XPU Resource
#if defined(PADDLE_WITH_XPU)
public:
void* InitXPUResource(const phi::Place& place, void* stream);
void DestroyXPUResource(void* stream);
XPUContextResource* GetXPUResource(void* stream) const;
int RefCount(void* stream) const;
void XpuResourceReBindStream(void* old_stream, void* new_stream);
private:
void Decrease(void* stream);
void Increase(void* stream);
private:
std::mutex xpu_mutex_;
// a stream corresponding to a series of resource.
std::map<void* /*stream*/, std::atomic<int>> ref_count_;
std::map<void* /*stream*/, std::unique_ptr<XPUContextResource>>
xpu_resources_;
#endif
private:
DISABLE_COPY_AND_ASSIGN(ResourceManager);
};
......
......@@ -38,7 +38,7 @@ endif()
if(WITH_XPU)
list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc)
list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
xpu/xpu2_op_list.cc)
xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc)
endif()
if(WITH_MKLDNN)
......
......@@ -42,11 +42,13 @@ struct XPUContext::Impl {
auto selected_xpus = backends::xpu::GetXPUSelectedDevices();
for (unsigned int i = 0; i < selected_xpus.size(); i++) {
if (place_.GetDeviceId() == selected_xpus[i]) {
if (l3ptrs[place_.GetDeviceId()] == nullptr) {
xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
l3_size,
XPU_MEM_L3);
if (l3ptrs[place_.GetDeviceId()] != nullptr) {
xpu_free(l3ptrs[place_.GetDeviceId()]);
l3ptrs[place_.GetDeviceId()] = nullptr;
}
xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
l3_size,
XPU_MEM_L3);
if (l3ptrs[place_.GetDeviceId()] != nullptr) {
context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size);
VLOG(3) << "xpu place " << static_cast<int>(place_.GetDeviceId())
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
#include "glog/logging.h"
namespace phi {
void XPUL3CacheBlock::Set(void* addr, size_t size) {
if (addr == nullptr || size == 0) {
LOG(FATAL) << "Set XPUL3CacheBlock Size as Zero";
}
addr_ = addr;
size_ = size;
}
void XPUL3Planner::RunAutotune(
const std::vector<XPUL3CacheBlock*>& l3_block_dict, size_t l3_size) {
if (l3_block_dict.size() == 0 || l3_size <= 0 || !plan_.empty()) {
return;
}
VLOG(3) << "AutoTune XPU L3 Cache Block Start.";
struct node {
size_t weights = 0;
size_t scores = 0;
std::vector<size_t> choices{0};
};
std::vector<std::vector<node>> records;
std::vector<size_t> record_map;
size_t total_scores = 0;
for (size_t block_idx = 0; block_idx < l3_block_dict.size(); block_idx++) {
XPUL3CacheBlock* cur_block = l3_block_dict[block_idx];
std::vector<size_t>& history = cur_block->history_;
auto history_size = history.size();
size_t score = 0;
VLOG(3) << "Block Idx is " << block_idx;
if (history_size > 1) {
std::vector<node> block_nodes{node()};
std::sort(history.begin(), history.end());
for (size_t i = 0; i < history_size; i++) {
VLOG(3) << "Size History : " << i << " is " << history[i];
if (history[i] > l3_size) {
break;
}
score += history[i];
if (i == history_size - 1 || history[i + 1] != history[i]) {
node cur_node;
cur_node.weights = history[i];
cur_node.choices = {history[i]};
cur_node.scores = score;
block_nodes.push_back(cur_node);
VLOG(3) << "Node Weights is:" << cur_node.weights
<< ", Node Scores is: " << score;
}
}
total_scores += score;
records.push_back(block_nodes);
record_map.push_back(block_idx);
}
}
if (records.size() <= 0) {
return;
}
std::vector<node> res(records[0]);
for (size_t block_idx = 1; block_idx < records.size(); block_idx++) {
std::vector<node> new_nodes;
for (size_t node_idx = 0; node_idx < records[block_idx].size();
node_idx++) {
for (size_t res_idx = 0; res_idx < res.size(); res_idx++) {
node cur_node;
size_t cur_weights =
records[block_idx][node_idx].weights + res[res_idx].weights;
if (cur_weights > l3_size) {
break;
}
cur_node.scores =
records[block_idx][node_idx].scores + res[res_idx].scores;
cur_node.weights = cur_weights;
cur_node.choices = res[res_idx].choices;
cur_node.choices.push_back(records[block_idx][node_idx].choices[0]);
new_nodes.push_back(cur_node);
}
}
struct {
bool operator()(node a, node b) const {
if (a.weights < b.weights) {
return true;
} else if (a.weights == b.weights) {
return a.scores > b.scores;
} else {
return false;
}
}
} customLess;
std::sort(new_nodes.begin(), new_nodes.end(), customLess);
std::vector<bool> stay(new_nodes.size(), true);
for (int i = new_nodes.size() - 1; i >= 0; i--) {
for (int j = i - 1; j >= 0; j--) {
if (new_nodes[j].scores >= new_nodes[i].scores) {
stay[i] = false;
break;
}
}
}
res.clear();
for (size_t i = 0; i < new_nodes.size(); i++) {
if (stay[i] == true) {
res.push_back(new_nodes[i]);
}
}
VLOG(3) << "XPU L3 Block IDX is " << block_idx
<< ", Choices before filter are " << new_nodes.size()
<< ", Choices after filter are " << res.size();
}
// final result: res.back().choices
// std::vector<size_t> record_map;
for (size_t i = 0; i < res.back().choices.size(); i++) {
VLOG(3) << "BLOCK IDX is " << i << ", Acquired L3 Size is "
<< res.back().choices[i];
}
double l3_global_ratio = static_cast<double>(res.back().scores) /
static_cast<double>(total_scores);
VLOG(3) << "Tensor Space in L3 / Tensor Space in Global :"
<< l3_global_ratio * 100 << " %";
size_t block_l3_size =
std::accumulate(res.back().choices.begin(), res.back().choices.end(), 0);
size_t xdnn_ctx_l3_size = (l3_size - block_l3_size) / 64 * 64;
VLOG(3) << "Block L3 Size : " << block_l3_size
<< ", XDNN Ctx L3 Size : " << xdnn_ctx_l3_size;
plan_.resize(l3_block_dict.size() + 1, 0);
for (size_t i = 0; i < res.back().choices.size(); i++) {
plan_[record_map[i]] = res.back().choices[i];
}
plan_[l3_block_dict.size()] = xdnn_ctx_l3_size;
VLOG(3) << "AutoTune XPU L3 Cache Block End.";
}
} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <numeric>
#include <vector>
namespace phi {
struct XPUL3CacheBlock {
public:
void Clear() {
addr_ = nullptr;
size_ = 0;
history_.clear();
}
void Set(void* addr, size_t size);
void Record(size_t size) { history_.push_back(size); }
void* data() { return addr_; }
size_t size() { return size_; }
private:
void* addr_{nullptr};
size_t size_{0};
public:
std::vector<size_t> history_;
};
class XPUL3Planner {
public:
void RunAutotune(const std::vector<XPUL3CacheBlock*>& l3_block_dict,
size_t l3_size);
std::vector<size_t>* plan() { return &plan_; }
private:
std::vector<size_t> plan_;
};
} // namespace phi
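To illustrate how the two classes above cooperate (they are internal phi helpers, driven by `InferXPUContext` earlier in this diff, and only built with `WITH_XPU`), here is a small standalone sketch with made-up sizes:

```cpp
#include <vector>
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"

// Sketch of the intended flow: each tensor holder owns a cache block that
// records the allocation sizes seen during a warm-up run; the planner then
// splits the L3 budget between those blocks and the XDNN context.
void L3AutotuneSketch() {
  phi::XPUL3CacheBlock block0, block1;
  block0.Record(4096);    // sizes observed while running with autotune enabled
  block0.Record(4096);
  block1.Record(8192);
  block1.Record(16384);

  std::vector<phi::XPUL3CacheBlock*> blocks{&block0, &block1};
  phi::XPUL3Planner planner;
  planner.RunAutotune(blocks, /*l3_size=*/63 * 1024);

  // In this example plan()->at(i) is the L3 slice planned for blocks[i]; the
  // final entry is the remainder handed to the XDNN context L3 manager.
  std::vector<size_t>* plan = planner.plan();
  (void)plan;
}
```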
......@@ -393,11 +393,8 @@ template <typename T>
T* DeviceContext::Alloc(TensorBase* tensor,
size_t requested_size,
bool pinned) const {
if (pinned) {
return impl_->Alloc<T>(
tensor, GetPinnedPlace(GetPlace()), requested_size, pinned);
}
return impl_->Alloc<T>(tensor, GetPlace(), requested_size, pinned);
DataType dtype = phi::CppTypeToDataType<T>::Type();
return static_cast<T*>(this->Alloc(tensor, dtype, requested_size, pinned));
}
void* DeviceContext::HostAlloc(TensorBase* tensor,
......
......@@ -145,11 +145,11 @@ class PADDLE_API DeviceContext {
/**
* @brief Allocate device memory for tensor.
*/
void* Alloc(TensorBase*,
DataType dtype,
size_t requested_size = 0,
bool pinned = false,
bool fake_alloc = false) const;
virtual void* Alloc(TensorBase*,
DataType dtype,
size_t requested_size = 0,
bool pinned = false,
bool fake_alloc = false) const;
template <typename T>
T* Alloc(TensorBase* tensor,
......
......@@ -1461,6 +1461,17 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
--repeat=10)
endif()
if(WITH_XPU)
inference_analysis_test(
xpu_runtime_config_resnet50_test
SRCS
xpu_runtime_config_resnet50_test.cc
EXTRA_DEPS
paddle_inference_shared
ARGS
--infer_model=${RESNET50_MODEL_DIR})
endif()
set(inference_deps ${analysis_deps} paddle_inference_api analysis
naive_executor ${GLOB_PASS_LIB})
......
......@@ -17,10 +17,6 @@
#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#endif
#if defined(PADDLE_WITH_XPU)
#include "xpu/runtime.h"
#include "xpu/xdnn.h"
#endif
#include <glog/logging.h>
#include <gtest/gtest.h>
......@@ -671,57 +667,6 @@ TEST(Predictor, Streams) {
}
#endif
#if defined(PADDLE_WITH_XPU)
TEST(Predictor, XPUStreams) {
// external stream
{
auto context = baidu::xpu::api::create_context();
xpu_stream_create(&context->xpu_stream);
Config config;
config.SetModel(FLAGS_dirname);
config.EnableXpu();
config.SetExecStream(static_cast<void*>(context->xpu_stream));
CHECK_EQ(config.external_stream_enabled(), true);
auto predictor = CreatePredictor(config);
auto stream = predictor->GetExecStream();
CHECK_EQ(static_cast<void*>(context->xpu_stream), stream);
CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream));
CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 1);
}
// 2 predictor on 2 stream
{
auto context1 = baidu::xpu::api::create_context();
xpu_stream_create(&context1->xpu_stream);
Config config;
config.SetModel(FLAGS_dirname);
config.EnableXpu();
config.SetExecStream(static_cast<void*>(context1->xpu_stream));
auto predictor = CreatePredictor(config);
auto stream1 = predictor->GetExecStream();
CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream1));
CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream1), 1);
auto context2 = baidu::xpu::api::create_context();
xpu_stream_create(&context2->xpu_stream);
Config config2;
config2.SetModel(FLAGS_dirname);
config2.EnableXpu();
config2.SetExecStream(static_cast<void*>(context2->xpu_stream));
auto predictor2 = CreatePredictor(config2);
auto stream2 = predictor2->GetExecStream();
CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream2));
CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream2), 1);
CHECK_NE(stream1, stream2);
}
}
#endif
TEST(AnalysisPredictor, OutputHookFunc) {
auto hookfunc = [](const std::string& type,
const std::string& var_name,
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/tester_helper.h"
#include "xpu/runtime.h"
#include "xpu/xdnn.h"
namespace paddle_infer {
static const std::vector<float> TRUTH_VALUES = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f,
-633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f,
-242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f,
-133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f,
-316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f,
-447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f,
-8.42969f, 131.365f, -648.051f};
void PrepareInput(std::shared_ptr<Predictor> predictor) {
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({batch, channel, height, width});
input_t->CopyFromCpu(input.data());
}
void CompareOutput(std::shared_ptr<Predictor> predictor) {
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR(
(data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3);
}
}
Config XpuConfig() {
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
return config;
}
TEST(resnet50_xpu, basic) {
Config config = XpuConfig();
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
#define RUN_WITH_RUNTIME_CONFIG(idx_, config_) \
Config config##idx_ = XpuConfig(); \
auto predictor##idx_ = CreatePredictor(config##idx_); \
PrepareInput(predictor##idx_); \
experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
&config_); \
CompareOutput(predictor##idx_); \
CHECK_EQ(predictor##idx_->GetExecStream(), config_.stream);
TEST(runtime_stream, null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
TEST(runtime_stream, new_stream) {
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
}
TEST(runtime_stream, null_and_new_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config0 = {nullptr, 0, nullptr, 0};
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config1 = {
stream, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_new_same_stream) {
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_new_different_stream) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
}
xpu_stream_destroy(stream0);
xpu_stream_destroy(stream1);
}
void RunPredictorWithRuntimeConfig(
std::shared_ptr<Predictor> predictor,
experimental::XpuRuntimeConfig runtime_config) {
PrepareInput(predictor);
experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(),
&runtime_config);
CompareOutput(predictor);
CHECK_EQ(predictor->GetExecStream(), runtime_config.stream);
}
TEST(runtime_stream, 2_thread) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
std::thread t0(
RunPredictorWithRuntimeConfig, predictor0, xpu_runtime_config0);
std::thread t1(
RunPredictorWithRuntimeConfig, predictor1, xpu_runtime_config1);
t0.join();
t1.join();
}
xpu_stream_destroy(stream0);
xpu_stream_destroy(stream1);
}
} // namespace paddle_infer