Unverified commit e135069d, authored by zhupengyang, committed by GitHub

[xpu][infer] support runtime configs (#53595)

Parent: d327d3e1
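In short: this commit removes the XPU path from ResourceManager (previously a predictor bound one external stream at Init) and instead lets every run carry its own runtime configuration: an optional execution stream, an L3 cache budget, an optional caller-provided L3 buffer, and an L3 autotune size. A minimal usage sketch assembled from the API and test added in this diff; model paths are illustrative:

#include <memory>
#include "paddle_inference_api.h"

void RunWithXpuRuntimeConfig() {
  paddle_infer::Config config;
  config.SetModel("model/model", "model/params");  // illustrative paths
  config.EnableXpu();
  auto predictor = paddle_infer::CreatePredictor(config);
  // ... feed inputs via predictor->GetInputHandle(...) ...

  paddle_infer::experimental::XpuRuntimeConfig runtime_config;
  runtime_config.stream = nullptr;      // nullptr: keep the predictor's stream
  runtime_config.l3_size = 16773120;    // L3 budget in bytes (the default)
  runtime_config.l3_ptr = nullptr;      // nullptr: the context allocates/owns L3
  runtime_config.l3_autotune_size = 0;  // 0: disable L3 autotune
  paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(
      predictor.get(), &runtime_config);
  // ... read outputs via predictor->GetOutputHandle(...) ...
}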
@@ -566,12 +566,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
   } else {
     conv_bias.push_back(0);
   }
-  if (conv->Op()->HasAttr("padding_algorithm")) {
-    conv2d_xpu_op_desc.SetAttr(
-        "padding_algorithm",
-        PADDLE_GET_CONST(std::string,
-                         conv->Op()->GetAttr("padding_algorithm")));
-  }
+  conv2d_xpu_op_desc.SetAttr(
+      "padding_algorithm",
+      conv->Op()->GetAttrIfExists<std::string>("padding_algorithm"));
   auto conv_paddings =
       PADDLE_GET_CONST(std::vector<int>, conv->Op()->GetAttr("paddings"));
   if (conv_paddings.size() == 2) {
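A note on the hunk above: GetAttrIfExists<std::string>() yields a default-constructed value (here an empty string) when the attribute is absent, so "padding_algorithm" is now always set on the fused op instead of being skipped. A hypothetical stand-in for that accessor, just to pin down the contract the pass now relies on (not Paddle's actual implementation):

// Guarded attribute read with a T{} fallback, folded into one call.
template <typename T>
T GetAttrIfExists(const framework::OpDesc& op, const std::string& name) {
  return op.HasAttr(name) ? PADDLE_GET_CONST(T, op.GetAttr(name)) : T{};
}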
@@ -389,25 +389,21 @@ bool AnalysisPredictor::Init(
   }
 #endif
 #if defined(PADDLE_WITH_XPU)
-  if (config_.use_xpu_ && config_.use_external_stream_) {
+  if (config_.use_xpu_) {
     private_context_ = true;
-  }
-  if (private_context_) {
-    if (!status_is_cloned_) {
+    if (!status_is_cloned_ && config_.external_stream_enabled()) {
       predictor_stream_ = config_.GetExecStream();
     }
-    // NOTE: If the external_stream equals to global_device_contexts's stream,
-    // then fallback.
-    auto global_stream =
-        static_cast<phi::XPUContext *>(
-            platform::DeviceContextPool::Instance().Get(place_))
-            ->stream();
-    if (predictor_stream_ != global_stream) {
-      InitResourceManager(predictor_stream_);
-      InitDeviceContexts();
+    auto *global_context = static_cast<phi::XPUContext *>(
+        platform::DeviceContextPool::Instance().Get(place_));
+    auto global_stream = global_context->stream();
+    if (predictor_stream_ == nullptr) {
+      predictor_stream_ = global_stream;
     }
+    InitDeviceContexts();
   }
 #endif
   inference::DisplayMemoryInfo(place_, "Init predictor");
   return true;
 }
@@ -492,15 +488,12 @@ void AnalysisPredictor::InitResourceManager(void *stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   predictor_stream_ =
       ResourceManager::Instance().InitGPUResource(place_, stream);
-#elif defined(PADDLE_WITH_XPU)
-  predictor_stream_ =
-      ResourceManager::Instance().InitXPUResource(place_, stream);
 #endif
 }

 void AnalysisPredictor::InitDeviceContexts() {
-// Init GPUContext.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // Init GPUContext.
   if (place_.GetType() == phi::AllocationType::GPU) {
     device_contexts_.emplace(
         place_, std::async(std::launch::deferred, [=] {
@@ -512,12 +505,10 @@ void AnalysisPredictor::InitDeviceContexts() {
         }));
   }
 #endif
-#if defined(PADDLE_WITH_XPU)
+#ifdef PADDLE_WITH_XPU
   if (place_.GetType() == phi::AllocationType::XPU) {
     device_contexts_.emplace(
         place_, std::async(std::launch::deferred, [=] {
-          auto *xpu_resource =
-              ResourceManager::Instance().GetXPUResource(predictor_stream_);
           auto &instance = memory::allocation::AllocatorFacade::Instance();
           auto *xpu_context = new InferXPUContext(place_);
           xpu_context->SetAllocator(instance.GetAllocator(place_).get());
@@ -530,15 +521,11 @@ void AnalysisPredictor::InitDeviceContexts() {
               instance.GetZeroAllocator(place_).get());
           xpu_context->SetHostZeroAllocator(
               instance.GetZeroAllocator(platform::CPUPlace()).get());
-          xpu_context->SetStream(xpu_resource->GetStream());
-          xpu_context->SetDriverVersion(xpu_resource->GetDriverVersion());
-          xpu_context->SetRuntimeVersion(xpu_resource->GetRuntimeVersion());
-          xpu_context->SetXpuVersion(xpu_resource->GetXpuVersion());
+          xpu_context->SetStream(predictor_stream_);
           return std::unique_ptr<phi::DeviceContext>(xpu_context);
         }));
   }
 #endif
-  // TODO(Inference): Support other backends.
 }

 void *AnalysisPredictor::GetExecStream() const {
@@ -591,6 +578,11 @@ const void *AnalysisPredictor::GetDeviceContexts() const {
 bool AnalysisPredictor::PrepareScope(
     const std::shared_ptr<framework::Scope> &parent_scope) {
+#ifdef PADDLE_WITH_XPU
+  // Set "XPU_PADDLE_L3_SIZE" to "0" to avoid malloc l3 cache when xpu_context
+  // init.
+  setenv("XPU_PADDLE_L3_SIZE", "0", 0);
+#endif
   if (parent_scope) {
     PADDLE_ENFORCE_NOT_NULL(
         parent_scope,
@@ -1513,6 +1505,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetCustomDeviceId(config_.custom_device_id());
   }
 #endif
+
 #ifdef PADDLE_WITH_XPU
   argument_->SetUseXpu(config_.use_xpu_);
   argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
@@ -2153,29 +2146,45 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
 }
 #endif

-bool AnalysisPredictor::ExpRunWithExternalStream(void *stream) {
-#if defined(PADDLE_WITH_XPU)
-  if (!private_context_) {
-    PADDLE_THROW(platform::errors::Fatal(
-        "Please use config.SetExecStream to init resources, and then we "
-        "will bind resources to execution stream."));
-  }
-  if (stream != predictor_stream_) {
+bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
+#ifdef PADDLE_WITH_XPU
+  PADDLE_ENFORCE(
+      private_context_,
+      paddle::platform::errors::Fatal(
+          "Must use private context if run predictor with external config."));
+
+  auto *dev_ctxs = reinterpret_cast<const std::map<
+      phi::Place,
+      std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
+      this->GetDeviceContexts());
+  auto *dev_ctx =
+      static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
+
+  auto xpu_runtime_config =
+      reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
+
+  auto *stream = xpu_runtime_config->stream;
+  if (stream != nullptr && stream != predictor_stream_) {
     paddle::platform::XPUStreamSync(
         static_cast<paddle::xpuStream>(predictor_stream_));
-    ResourceManager::Instance().XpuResourceReBindStream(predictor_stream_,
-                                                        stream);
     predictor_stream_ = stream;
-
-    auto *dev_ctxs = reinterpret_cast<const std::map<
-        phi::Place,
-        std::shared_future<std::unique_ptr<phi::DeviceContext>>> *>(
-        this->GetDeviceContexts());
-    auto *dev_ctx =
-        static_cast<InferXPUContext *>(dev_ctxs->at(place_).get().get());
     dev_ctx->SetStream(stream);
   }
-  return ZeroCopyRun();
+
+  size_t l3_size = xpu_runtime_config->l3_size;
+  void *l3_ptr = xpu_runtime_config->l3_ptr;
+  size_t l3_autotune_size = xpu_runtime_config->l3_autotune_size;
+  PADDLE_ENFORCE_LE(
+      l3_autotune_size,
+      l3_size,
+      phi::errors::InvalidArgument(
+          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
+          l3_autotune_size,
+          l3_size));
+  dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size);
+
+  bool ret = ZeroCopyRun();
+  dev_ctx->L3CacheAutotune();
+  return ret;
 #endif
   return false;
 }
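Two behavioral points in the function above: a null stream in the config keeps whatever stream the predictor already uses, and l3_autotune_size must not exceed l3_size or the enforce fires. Since rebinding happens per call, one predictor can hop between streams across runs; a sketch mirroring the 2_new_different_stream test later in this diff (predictor setup and error handling elided):

void* s0 = nullptr;
void* s1 = nullptr;
xpu_stream_create(&s0);
xpu_stream_create(&s1);

// {stream, l3_size, l3_ptr, l3_autotune_size}; each run syncs the old
// stream before the device context switches over.
paddle_infer::experimental::XpuRuntimeConfig cfg0{s0, 0, nullptr, 0};
paddle_infer::experimental::XpuRuntimeConfig cfg1{s1, 0, nullptr, 0};
paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(), &cfg0);
paddle_infer::experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(), &cfg1);

xpu_stream_destroy(s0);
xpu_stream_destroy(s1);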
@@ -2543,10 +2552,6 @@ AnalysisPredictor::~AnalysisPredictor() {
   if (predictor_stream_ != nullptr) {
     ResourceManager::Instance().DestroyGPUResource(predictor_stream_);
   }
-#elif defined(PADDLE_WITH_XPU)
-  if (predictor_stream_ != nullptr) {
-    ResourceManager::Instance().DestroyXPUResource(predictor_stream_);
-  }
 #endif

   if (place_.GetType() != phi::AllocationType::UNDEFINED) {
@@ -3057,10 +3062,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
 #endif
   return false;
 }
-bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p,
-                                          void *stream) {
+
+bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p,
+                                         void *config) {
   auto pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
-  return pred->ExpRunWithExternalStream(stream);
+  return pred->ExpRunWithRuntimeConfig(config);
 }

 void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c,
@@ -228,6 +228,9 @@ class AnalysisPredictor : public PaddlePredictor {
   // Note: Can only be used under thread_local semantics.
   bool ExpRunWithExternalStream(void *stream);

+  // Note: Can only be used under thread_local semantics.
+  bool ExpRunWithRuntimeConfig(void *config);
+
   ///
   /// \brief Get the execution stream on devices with a concept of stream,
   /// otherwise returns nullptr.
@@ -13,7 +13,11 @@
 // limitations under the License.

 #include "paddle/fluid/inference/api/infer_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#ifdef PADDLE_WITH_XPU
+#include "xpu/runtime.h"
+#endif
+#include "glog/logging.h"

 namespace paddle {
@@ -22,9 +26,129 @@ InferGPUContext::InferGPUContext(const phi::Place& place)
     : phi::GPUContext(place, false) {}
 #endif

-#if defined(PADDLE_WITH_XPU)
+#ifdef PADDLE_WITH_XPU
 InferXPUContext::InferXPUContext(const phi::Place& place)
     : phi::XPUContext(place) {}
+
+void* InferXPUContext::Alloc(phi::TensorBase* tensor,
+                             phi::DataType dtype,
+                             size_t requested_size,
+                             bool pinned,
+                             bool fake_alloc) const {
+  size_t size = tensor->numel() * phi::SizeOf(tensor->dtype());
+  if (l3_autotune_size_ > 0 && holder_map_.empty()) {
+    void* data_ptr =
+        DeviceContext::Alloc(tensor, dtype, requested_size, pinned, fake_alloc);
+    phi::XPUL3CacheBlock* l3_block = nullptr;
+    phi::Allocation* holder =
+        reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
+    if (holder_l3_blocks_.count(holder) == 0) {
+      l3_block = new phi::XPUL3CacheBlock();
+      holder_l3_blocks_[holder] = l3_block;
+      l3_blocks_.push_back(l3_block);
+    } else {
+      l3_block = holder_l3_blocks_[holder];
+    }
+    l3_block->Record(size);
+    return data_ptr;
+  } else if (l3_autotune_size_ > 0 && !holder_map_.empty()) {
+    phi::Allocation* holder =
+        reinterpret_cast<phi::DenseTensor*>(tensor)->Holder().get();
+    auto holder_iter = holder_map_.find(holder);
+    if (holder_iter != holder_map_.end()) {
+      auto& holder_pair = holder_iter->second;
+      auto* swap_holder = holder_pair.first;
+      bool& swap_holder_is_l3 = holder_pair.second;
+      if (swap_holder_is_l3 && swap_holder->size() >= size) {
+        swap(*holder, *swap_holder);
+        swap_holder_is_l3 = false;
+      } else if (!swap_holder_is_l3 && holder->size() < size) {
+        swap(*holder, *swap_holder);
+        swap_holder_is_l3 = true;
+      }
+    }
+    return DeviceContext::Alloc(
+        tensor, dtype, requested_size, pinned, fake_alloc);
+  } else {
+    return DeviceContext::Alloc(
+        tensor, dtype, requested_size, pinned, fake_alloc);
+  }
+}
+
+void InferXPUContext::SetL3Info(size_t l3_size,
+                                void* l3_ptr,
+                                size_t l3_autotune_size) {
+  if (l3_ptr == nullptr) {
+    if (l3_size_ != l3_size) {
+      if (l3_owned_) {
+        xpu_free(l3_ptr_);
+      }
+      if (l3_size > 0) {
+        xpu_malloc(&l3_ptr_, l3_size, XPU_MEM_L3);
+        if (l3_ptr_ != nullptr) {
+          VLOG(3) << "remalloc l3(" << l3_size << ") success.";
+          l3_size_ = l3_size;
+          l3_owned_ = true;
+          l3_autotune_size_ = l3_autotune_size;
+        } else {
+          VLOG(3) << "malloc l3(" << l3_size << ") failed. No l3 will be used.";
+          l3_size_ = 0;
+          l3_owned_ = false;
+          l3_autotune_size_ = 0;
+        }
+      }
+    }
+  } else {
+    if (l3_owned_) {
+      xpu_free(l3_ptr_);
+    }
+    l3_ptr_ = l3_ptr;
+    l3_size_ = l3_size;
+    l3_autotune_size_ = l3_autotune_size;
+  }
+  if (l3_autotune_size_ == 0) {
+    x_context()->_l3_mgr.set(l3_ptr_, l3_size_);
+  }
+}
+
+void InferXPUContext::L3CacheAutotune() {
+  if (l3_autotune_size_ == 0) return;
+  if (holder_map_.empty()) {
+    l3_plan_.RunAutotune(l3_blocks_, l3_size_);
+    auto* plan = l3_plan_.plan();
+    int8_t* cur_l3_ptr = reinterpret_cast<int8_t*>(l3_ptr_);
+    for (size_t i = 0; i < l3_blocks_.size(); i++) {
+      size_t block_size = plan->at(i);
+      if (block_size > 0) {
+        l3_blocks_[i]->Set(cur_l3_ptr, block_size);
+        cur_l3_ptr += block_size;
+      }
+    }
+    x_context()->_l3_mgr.set(
+        reinterpret_cast<int8_t*>(l3_ptr_) + l3_size_ - plan->back(),
+        plan->back());
+    for (auto holder_l3_block : holder_l3_blocks_) {
+      auto* l3_block = holder_l3_block.second;
+      if (l3_block->size() > 0) {
+        auto* holder = holder_l3_block.first;
+        auto place = holder->place();
+        phi::Allocation* l3_holder =
+            new phi::Allocation(l3_block->data(), l3_block->size(), place);
+        holder_map_[holder] = std::make_pair(l3_holder, true);
+      }
+    }
+  } else {
+    for (auto& holders : holder_map_) {
+      auto* holder = holders.first;
+      auto& holder_pair = holders.second;
+      if (!holder_pair.second) {
+        swap(*holder, *(holder_pair.first));
+        holder_pair.second = true;
+      }
+    }
+  }
+}
 #endif
 }  // namespace paddle
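The L3 machinery above is two-phase. On the first run with l3_autotune_size > 0, Alloc() only records each tensor holder's size history into an XPUL3CacheBlock; L3CacheAutotune() then runs the planner, carves l3_ptr_ into per-block slices (the tail slice goes to XDNN's own _l3_mgr), and fills holder_map_ so that later Alloc() calls swap holders into or out of L3 depending on whether the planned slice still fits. The per-run driver sequence, as wired up in ExpRunWithRuntimeConfig earlier in this diff:

// Inside the predictor, per run (names from this diff):
dev_ctx->SetL3Info(l3_size, l3_ptr, l3_autotune_size);  // publish the L3 budget
bool ret = ZeroCopyRun();     // first run records sizes; later runs swap holders
dev_ctx->L3CacheAutotune();   // first run plans and builds holder_map_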
@@ -15,6 +15,9 @@
 #include "paddle/phi/backends/all_context.h"
 #include "paddle/phi/common/place.h"
+#ifdef PADDLE_WITH_XPU
+#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
+#endif

 namespace paddle {
@@ -46,14 +49,33 @@ class InferGPUContext : public phi::GPUContext {
 };
 #endif

-#if defined(PADDLE_WITH_XPU)
+#ifdef PADDLE_WITH_XPU
 class InferXPUContext : public phi::XPUContext {
  public:
   explicit InferXPUContext(const phi::Place& place);
-  using phi::XPUContext::SetDriverVersion;
-  using phi::XPUContext::SetRuntimeVersion;
-  using phi::XPUContext::SetStream;
-  using phi::XPUContext::SetXpuVersion;
+
+  void* Alloc(phi::TensorBase* tensor,
+              phi::DataType dtype,
+              size_t requested_size = 0,
+              bool pinned = false,
+              bool fake_alloc = false) const override;
+
+  void SetL3Info(size_t l3_size, void* l3_ptr, size_t l3_autotune_size);
+
+  void L3CacheAutotune();
+
+ private:
+  size_t l3_size_{0};
+  void* l3_ptr_{nullptr};
+  bool l3_owned_{false};
+
+  size_t l3_autotune_size_{0};
+  mutable std::vector<phi::XPUL3CacheBlock*> l3_blocks_;
+  mutable std::unordered_map<phi::Allocation*, phi::XPUL3CacheBlock*>
+      holder_l3_blocks_;
+  mutable std::unordered_map<phi::Allocation*,
+                             std::pair<phi::Allocation*, bool>>
+      holder_map_;
+  phi::XPUL3Planner l3_plan_;
 };
 #endif

 }  // namespace paddle
@@ -471,6 +471,13 @@ class Predictor;
 class Tensor;
 using Config = paddle::AnalysisConfig;
 namespace experimental {
+
+struct XpuRuntimeConfig {
+  void* stream{nullptr};
+  size_t l3_size{16773120};
+  void* l3_ptr{nullptr};
+  size_t l3_autotune_size{0};
+};

 // Unstable interface, may be modified or deleted in the future.
 class PD_INFER_DECL InternalUtils {
  public:
@@ -479,8 +486,8 @@ class PD_INFER_DECL InternalUtils {
                                     cudaStream_t stream);
   static bool RunWithExternalStream(paddle_infer::Predictor* pred,
                                     hipStream_t stream);
-  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
-                                    void* stream);
+  static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config);
+
   static void UpdateConfigInterleaved(paddle_infer::Config* c,
                                       bool with_interleaved);
@@ -41,9 +41,6 @@
 #include "paddle/phi/backends/dynload/cusparse.h"
 #endif  // PADDLE_WITH_CUDA

-#ifdef PADDLE_WITH_XPU
-#include "paddle/phi/backends/xpu/xpu_info.h"
-#endif

 namespace paddle {
 namespace internal {
@@ -451,123 +448,5 @@ int ResourceManager::RefCount(void* stream) const {
   if (ref_count_.count(stream) == 0) return 0;
   return ref_count_.at(stream);
 }
-#endif
-
-#if defined(PADDLE_WITH_XPU)
-// XPUContextResource
-XPUContextResource::XPUContextResource(const phi::Place& place, void* stream)
-    : place_(place) {
-  InitXPUResource(stream);
-}
-
-XPUContextResource::~XPUContextResource() {}
-
-void XPUContextResource::InitXPUResource(void* stream) {
-  phi::backends::xpu::XPUDeviceGuard guard(place_.device);
-  if (stream) {
-    owned_stream_ = false;
-    stream_ = stream;
-  }
-  InitXpuProperties();
-}
-
-void XPUContextResource::InitXpuProperties() {
-  phi::backends::xpu::XPUDeviceGuard guard(place_.device);
-  driver_version_ = phi::backends::xpu::GetDriverVersion();
-  runtime_version_ = phi::backends::xpu::GetRuntimeVersion();
-  xpu_version_ =
-      static_cast<int>(phi::backends::xpu::get_xpu_version(place_.device));
-}
-
-void* XPUContextResource::GetStream() const { return stream_; }
-
-int XPUContextResource::GetDriverVersion() const { return driver_version_; }
-
-int XPUContextResource::GetRuntimeVersion() const { return runtime_version_; }
-
-int XPUContextResource::GetXpuVersion() const { return xpu_version_; }
-
-void XPUContextResource::ReBindStream(void* stream) {
-  owned_stream_ = false;
-  stream_ = stream;
-}
-// XPUContextResource End.
-
-// Resource Manager
-void* ResourceManager::InitXPUResource(const phi::Place& place, void* stream) {
-  std::lock_guard<std::mutex> lock_gurad(xpu_mutex_);
-  if (xpu_resources_.count(stream)) {
-    Increase(stream);
-    return stream;
-  } else {
-    std::unique_ptr<XPUContextResource> resource{
-        new XPUContextResource(place, stream)};
-    void* s = resource->GetStream();
-    ref_count_[s] = 1;
-    xpu_resources_.emplace(s, std::move(resource));
-    return s;
-  }
-}
-
-XPUContextResource* ResourceManager::GetXPUResource(void* stream) const {
-  PADDLE_ENFORCE_EQ(xpu_resources_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in xpu_resources.", stream));
-  return xpu_resources_.at(stream).get();
-}
-
-void ResourceManager::XpuResourceReBindStream(void* old_stream,
-                                              void* new_stream) {
-  PADDLE_ENFORCE_EQ(
-      xpu_resources_.count(old_stream),
-      true,
-      platform::errors::InvalidArgument(
-          "The stream[%p] not found in xpu_resources.", old_stream));
-  auto xpu_resource = std::move(xpu_resources_.at(old_stream));
-  DestroyXPUResource(old_stream);
-  PADDLE_ENFORCE_EQ(
-      ref_count_.count(old_stream),
-      0,
-      platform::errors::Fatal("xpu resources rebind stream failed."));
-  xpu_resource->ReBindStream(new_stream);
-  ref_count_[new_stream]++;
-  xpu_resources_.emplace(new_stream, std::move(xpu_resource));
-}
-
-void ResourceManager::DestroyXPUResource(void* stream) {
-  PADDLE_ENFORCE_EQ(xpu_resources_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in xpu_resources.", stream));
-  Decrease(stream);
-}
-
-void ResourceManager::Decrease(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
-  --ref_count_[stream];
-  if (ref_count_[stream] == 0) {
-    ref_count_.erase(stream);
-    xpu_resources_.erase(stream);
-  }
-}
-
-void ResourceManager::Increase(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
-  ++ref_count_[stream];
-}
-
-int ResourceManager::RefCount(void* stream) const {
-  if (ref_count_.count(stream) == 0) return 0;
-  return ref_count_.at(stream);
-}
-// Resource Manager End.
 #endif
 }  // namespace paddle
@@ -124,33 +124,6 @@ class GPUContextResource {
 };
 #endif

-#if defined(PADDLE_WITH_XPU)
-class XPUContextResource {
- public:
-  explicit XPUContextResource(const phi::Place& place, void* stream);
-  ~XPUContextResource();
-  phi::Place Place() const;
-  void* GetStream() const;
-  int GetDriverVersion() const;
-  int GetRuntimeVersion() const;
-  int GetXpuVersion() const;
-  void ReBindStream(void* stream);
-
- private:
-  void InitXPUResource(void* stream);
-  void InitXpuProperties();
-
- private:
-  bool owned_stream_{true};
-  void* stream_;
-  phi::Place place_;
-  int driver_version_;
-  int runtime_version_;
-  int xpu_version_;
-};  // class XPUContextResource
-#endif
-
 class ResourceManager {
  public:
   ResourceManager() = default;
@@ -168,9 +141,8 @@ class ResourceManager {
   std::mutex cpu_mutex_;
   std::unique_ptr<CPUContextResource> cpu_resource_{nullptr};

-// GPU Resource
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // GPU Resource
  public:
   void* InitGPUResource(const phi::Place& place, void* stream);
   void DestroyGPUResource(void* stream);
@@ -190,28 +162,6 @@ class ResourceManager {
       gpu_resources_;
 #endif

-// XPU Resource
-#if defined(PADDLE_WITH_XPU)
- public:
-  void* InitXPUResource(const phi::Place& place, void* stream);
-  void DestroyXPUResource(void* stream);
-  XPUContextResource* GetXPUResource(void* stream) const;
-  int RefCount(void* stream) const;
-  void XpuResourceReBindStream(void* old_stream, void* new_stream);
-
- private:
-  void Decrease(void* stream);
-  void Increase(void* stream);
-
- private:
-  std::mutex xpu_mutex_;
-  // a stream corresponding to a series of resource.
-  std::map<void* /*stream*/, std::atomic<int>> ref_count_;
-  std::map<void* /*stream*/, std::unique_ptr<XPUContextResource>>
-      xpu_resources_;
-#endif
-
  private:
   DISABLE_COPY_AND_ASSIGN(ResourceManager);
 };
@@ -38,7 +38,7 @@ endif()
 if(WITH_XPU)
   list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc)
   list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
-       xpu/xpu2_op_list.cc)
+       xpu/xpu2_op_list.cc xpu/xpu_l3_strategy.cc)
 endif()

 if(WITH_MKLDNN)
@@ -42,11 +42,13 @@ struct XPUContext::Impl {
     auto selected_xpus = backends::xpu::GetXPUSelectedDevices();
     for (unsigned int i = 0; i < selected_xpus.size(); i++) {
       if (place_.GetDeviceId() == selected_xpus[i]) {
-        if (l3ptrs[place_.GetDeviceId()] == nullptr) {
-          xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
-                     l3_size,
-                     XPU_MEM_L3);
+        if (l3ptrs[place_.GetDeviceId()] != nullptr) {
+          xpu_free(l3ptrs[place_.GetDeviceId()]);
+          l3ptrs[place_.GetDeviceId()] = nullptr;
         }
+        xpu_malloc(static_cast<void**>(&l3ptrs[place_.GetDeviceId()]),
+                   l3_size,
+                   XPU_MEM_L3);
         if (l3ptrs[place_.GetDeviceId()] != nullptr) {
           context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size);
           VLOG(3) << "xpu place " << static_cast<int>(place_.GetDeviceId())
New file: paddle/phi/backends/xpu/xpu_l3_strategy.cc

/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"
#include "glog/logging.h"
namespace phi {
void XPUL3CacheBlock::Set(void* addr, size_t size) {
if (addr == nullptr || size == 0) {
LOG(FATAL) << "Set XPUL3CacheBlock Size as Zero";
}
addr_ = addr;
size_ = size;
}
void XPUL3Planner::RunAutotune(
const std::vector<XPUL3CacheBlock*>& l3_block_dict, size_t l3_size) {
if (l3_block_dict.size() == 0 || l3_size <= 0 || !plan_.empty()) {
return;
}
VLOG(3) << "AutoTune XPU L3 Cache Block Start.";
struct node {
size_t weights = 0;
size_t scores = 0;
std::vector<size_t> choices{0};
};
std::vector<std::vector<node>> records;
std::vector<size_t> record_map;
size_t total_scores = 0;
for (size_t block_idx = 0; block_idx < l3_block_dict.size(); block_idx++) {
XPUL3CacheBlock* cur_block = l3_block_dict[block_idx];
std::vector<size_t>& history = cur_block->history_;
auto history_size = history.size();
size_t score = 0;
VLOG(3) << "Block Idx is " << block_idx;
if (history_size > 1) {
std::vector<node> block_nodes{node()};
std::sort(history.begin(), history.end());
for (size_t i = 0; i < history_size; i++) {
VLOG(3) << "Size History : " << i << " is " << history[i];
if (history[i] > l3_size) {
break;
}
score += history[i];
if (i == history_size - 1 || history[i + 1] != history[i]) {
node cur_node;
cur_node.weights = history[i];
cur_node.choices = {history[i]};
cur_node.scores = score;
block_nodes.push_back(cur_node);
VLOG(3) << "Node Weights is:" << cur_node.weights
<< ", Node Scores is: " << score;
}
}
total_scores += score;
records.push_back(block_nodes);
record_map.push_back(block_idx);
}
}
if (records.size() <= 0) {
return;
}
std::vector<node> res(records[0]);
for (size_t block_idx = 1; block_idx < records.size(); block_idx++) {
std::vector<node> new_nodes;
for (size_t node_idx = 0; node_idx < records[block_idx].size();
node_idx++) {
for (size_t res_idx = 0; res_idx < res.size(); res_idx++) {
node cur_node;
size_t cur_weights =
records[block_idx][node_idx].weights + res[res_idx].weights;
if (cur_weights > l3_size) {
break;
}
cur_node.scores =
records[block_idx][node_idx].scores + res[res_idx].scores;
cur_node.weights = cur_weights;
cur_node.choices = res[res_idx].choices;
cur_node.choices.push_back(records[block_idx][node_idx].choices[0]);
new_nodes.push_back(cur_node);
}
}
struct {
bool operator()(node a, node b) const {
if (a.weights < b.weights) {
return true;
} else if (a.weights == b.weights) {
return a.scores > b.scores;
} else {
return false;
}
}
} customLess;
std::sort(new_nodes.begin(), new_nodes.end(), customLess);
std::vector<bool> stay(new_nodes.size(), true);
for (int i = new_nodes.size() - 1; i >= 0; i--) {
for (int j = i - 1; j >= 0; j--) {
if (new_nodes[j].scores >= new_nodes[i].scores) {
stay[i] = false;
break;
}
}
}
res.clear();
for (size_t i = 0; i < new_nodes.size(); i++) {
if (stay[i] == true) {
res.push_back(new_nodes[i]);
}
}
VLOG(3) << "XPU L3 Block IDX is " << block_idx
<< ", Choices before filter are " << new_nodes.size()
<< ", Choices after filter are " << res.size();
}
// final result: res.back().choices
// std::vector<size_t> record_map;
for (size_t i = 0; i < res.back().choices.size(); i++) {
VLOG(3) << "BLOCK IDX is " << i << ", Acquired L3 Size is "
<< res.back().choices[i];
}
double l3_global_ratio = static_cast<double>(res.back().scores) /
static_cast<double>(total_scores);
VLOG(3) << "Tensor Space in L3 / Tensor Space in Global :"
<< l3_global_ratio * 100 << " %";
size_t block_l3_size =
std::accumulate(res.back().choices.begin(), res.back().choices.end(), 0);
size_t xdnn_ctx_l3_size = (l3_size - block_l3_size) / 64 * 64;
VLOG(3) << "Block L3 Size : " << block_l3_size
<< ", XDNN Ctx L3 Size : " << xdnn_ctx_l3_size;
plan_.resize(l3_block_dict.size() + 1, 0);
for (size_t i = 0; i < res.back().choices.size(); i++) {
plan_[record_map[i]] = res.back().choices[i];
}
plan_[l3_block_dict.size()] = xdnn_ctx_l3_size;
VLOG(3) << "AutoTune XPU L3 Cache Block End.";
}
} // namespace phi
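To make the planner's behavior concrete, here is a self-contained toy run (hypothetical sizes, not from this commit): three cache blocks compete for a 4 KB budget, each with the size history it would have accumulated during the recording pass. plan() returns one slice per block plus a trailing entry reserved for the XDNN context.

#include <iostream>
#include "paddle/phi/backends/xpu/xpu_l3_strategy.h"

int main() {
  phi::XPUL3CacheBlock b0, b1, b2;
  for (size_t s : {1024, 1024, 2048}) b0.Record(s);  // favors a 2048-byte slice
  for (size_t s : {512, 3072}) b1.Record(s);
  for (size_t s : {2048, 2048}) b2.Record(s);

  phi::XPUL3Planner planner;
  planner.RunAutotune({&b0, &b1, &b2}, /*l3_size=*/4096);

  // Four entries: slices for b0, b1, b2, then the leftover given to XDNN.
  for (size_t v : *planner.plan()) std::cout << v << std::endl;
  return 0;
}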
New file: paddle/phi/backends/xpu/xpu_l3_strategy.h

/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <numeric>
#include <vector>
namespace phi {
struct XPUL3CacheBlock {
public:
void Clear() {
addr_ = nullptr;
size_ = 0;
history_.clear();
}
void Set(void* addr, size_t size);
void Record(size_t size) { history_.push_back(size); }
void* data() { return addr_; }
size_t size() { return size_; }
private:
void* addr_{nullptr};
size_t size_{0};
public:
std::vector<size_t> history_;
};
class XPUL3Planner {
public:
void RunAutotune(const std::vector<XPUL3CacheBlock*>& l3_block_dict,
size_t l3_size);
std::vector<size_t>* plan() { return &plan_; }
private:
std::vector<size_t> plan_;
};
} // namespace phi
@@ -393,11 +393,8 @@ template <typename T>
 T* DeviceContext::Alloc(TensorBase* tensor,
                         size_t requested_size,
                         bool pinned) const {
-  if (pinned) {
-    return impl_->Alloc<T>(
-        tensor, GetPinnedPlace(GetPlace()), requested_size, pinned);
-  }
-  return impl_->Alloc<T>(tensor, GetPlace(), requested_size, pinned);
+  DataType dtype = phi::CppTypeToDataType<T>::Type();
+  return static_cast<T*>(this->Alloc(tensor, dtype, requested_size, pinned));
 }

 void* DeviceContext::HostAlloc(TensorBase* tensor,
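This device_context change is what lets InferXPUContext intercept allocations at all: the typed template Alloc<T> now funnels into the type-erased Alloc, which becomes virtual in the header hunk that follows. A minimal standalone sketch of the pattern (illustrative names, not Paddle's classes):

#include <cstdio>
#include <cstdlib>

class DeviceContextBase {
 public:
  virtual ~DeviceContextBase() = default;
  // Type-erased hook; subclasses override this single entry point.
  virtual void* Alloc(size_t bytes) const { return std::malloc(bytes); }
  // Typed helper: T is not deducible from the argument, so the inner call
  // resolves to the virtual overload and dispatches dynamically.
  template <typename T>
  T* Alloc(size_t n) const {
    return static_cast<T*>(Alloc(n * sizeof(T)));
  }
};

class InterceptingContext : public DeviceContextBase {
 public:
  void* Alloc(size_t bytes) const override {
    std::printf("intercepted %zu bytes\n", bytes);  // e.g. record for an L3 plan
    return DeviceContextBase::Alloc(bytes);
  }
};

int main() {
  InterceptingContext ctx;
  DeviceContextBase& base = ctx;
  float* p = base.Alloc<float>(256);  // prints "intercepted 1024 bytes"
  std::free(p);
  return 0;
}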
@@ -145,11 +145,11 @@ class PADDLE_API DeviceContext {
   /**
    * @brief Allocate device memory for tensor.
    */
-  void* Alloc(TensorBase*,
-              DataType dtype,
-              size_t requested_size = 0,
-              bool pinned = false,
-              bool fake_alloc = false) const;
+  virtual void* Alloc(TensorBase*,
+                      DataType dtype,
+                      size_t requested_size = 0,
+                      bool pinned = false,
+                      bool fake_alloc = false) const;

   template <typename T>
   T* Alloc(TensorBase* tensor,
@@ -1461,6 +1461,17 @@ if(WITH_TESTING AND WITH_INFERENCE_API_TEST)
       --repeat=10)
   endif()

+  if(WITH_XPU)
+    inference_analysis_test(
+      xpu_runtime_config_resnet50_test
+      SRCS
+      xpu_runtime_config_resnet50_test.cc
+      EXTRA_DEPS
+      paddle_inference_shared
+      ARGS
+      --infer_model=${RESNET50_MODEL_DIR})
+  endif()
+
   set(inference_deps ${analysis_deps} paddle_inference_api analysis
       naive_executor ${GLOB_PASS_LIB})
@@ -17,10 +17,6 @@
 #if defined(PADDLE_WITH_CUDA)
 #include <cuda_runtime.h>
 #endif
-#if defined(PADDLE_WITH_XPU)
-#include "xpu/runtime.h"
-#include "xpu/xdnn.h"
-#endif

 #include <glog/logging.h>
 #include <gtest/gtest.h>
@@ -671,57 +667,6 @@ TEST(Predictor, Streams) {
 }
 #endif

-#if defined(PADDLE_WITH_XPU)
-TEST(Predictor, XPUStreams) {
-  // external stream
-  {
-    auto context = baidu::xpu::api::create_context();
-    xpu_stream_create(&context->xpu_stream);
-
-    Config config;
-    config.SetModel(FLAGS_dirname);
-    config.EnableXpu();
-    config.SetExecStream(static_cast<void*>(context->xpu_stream));
-    CHECK_EQ(config.external_stream_enabled(), true);
-
-    auto predictor = CreatePredictor(config);
-    auto stream = predictor->GetExecStream();
-    CHECK_EQ(static_cast<void*>(context->xpu_stream), stream);
-    CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream));
-    CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 1);
-  }
-
-  // 2 predictor on 2 stream
-  {
-    auto context1 = baidu::xpu::api::create_context();
-    xpu_stream_create(&context1->xpu_stream);
-
-    Config config;
-    config.SetModel(FLAGS_dirname);
-    config.EnableXpu();
-    config.SetExecStream(static_cast<void*>(context1->xpu_stream));
-    auto predictor = CreatePredictor(config);
-    auto stream1 = predictor->GetExecStream();
-    CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream1));
-    CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream1), 1);
-
-    auto context2 = baidu::xpu::api::create_context();
-    xpu_stream_create(&context2->xpu_stream);
-
-    Config config2;
-    config2.SetModel(FLAGS_dirname);
-    config2.EnableXpu();
-    config2.SetExecStream(static_cast<void*>(context2->xpu_stream));
-    auto predictor2 = CreatePredictor(config2);
-    auto stream2 = predictor2->GetExecStream();
-    CHECK_NOTNULL(paddle::ResourceManager::Instance().GetXPUResource(stream2));
-    CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream2), 1);
-    CHECK_NE(stream1, stream2);
-  }
-}
-#endif
-
 TEST(AnalysisPredictor, OutputHookFunc) {
   auto hookfunc = [](const std::string& type,
                      const std::string& var_name,
New file: xpu_runtime_config_resnet50_test.cc (inference API tests)

/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "gflags/gflags.h"
#include "test/cpp/inference/api/tester_helper.h"
#include "xpu/runtime.h"
#include "xpu/xdnn.h"
namespace paddle_infer {
static const std::vector<float> TRUTH_VALUES = {
127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, 736.222f,
-633.684f, -329.927f, -430.155f, -633.062f, -146.548f, -1324.28f, -1349.36f,
-242.675f, 117.448f, -801.723f, -391.514f, -404.818f, 454.16f, 515.48f,
-133.031f, 69.293f, 590.096f, -1434.69f, -1070.89f, 307.074f, 400.525f,
-316.12f, -587.125f, -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f,
-447.938f, 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f,
551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, 246.019f,
-8.42969f, 131.365f, -648.051f};
void PrepareInput(std::shared_ptr<Predictor> predictor) {
const int batch = 1;
const int channel = 3;
const int height = 318;
const int width = 318;
const int input_num = batch * channel * height * width;
std::vector<float> input(input_num, 1);
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape({batch, channel, height, width});
input_t->CopyFromCpu(input.data());
}
void CompareOutput(std::shared_ptr<Predictor> predictor) {
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
size_t out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
std::vector<float> out_data;
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
float* data_o = out_data.data();
for (size_t j = 0; j < out_num; j += 10) {
EXPECT_NEAR(
(data_o[j] - TRUTH_VALUES[j / 10]) / TRUTH_VALUES[j / 10], 0., 10e-3);
}
}
Config XpuConfig() {
std::string model_dir = FLAGS_infer_model + "/" + "model";
Config config;
config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableXpu();
return config;
}
TEST(resnet50_xpu, basic) {
Config config = XpuConfig();
auto predictor = CreatePredictor(config);
PrepareInput(predictor);
predictor->Run();
CompareOutput(predictor);
}
#define RUN_WITH_RUNTIME_CONFIG(idx_, config_) \
Config config##idx_ = XpuConfig(); \
auto predictor##idx_ = CreatePredictor(config##idx_); \
PrepareInput(predictor##idx_); \
experimental::InternalUtils::RunWithRuntimeConfig(predictor##idx_.get(), \
&config_); \
CompareOutput(predictor##idx_); \
CHECK_EQ(predictor##idx_->GetExecStream(), config_.stream);
TEST(runtime_stream, null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
TEST(runtime_stream, new_stream) {
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_null_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config = {nullptr, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
}
TEST(runtime_stream, null_and_new_stream) {
experimental::XpuRuntimeConfig xpu_runtime_config0 = {nullptr, 0, nullptr, 0};
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
{
experimental::XpuRuntimeConfig xpu_runtime_config1 = {
stream, 0, nullptr, 0};
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_new_same_stream) {
void* stream = nullptr;
xpu_stream_create(&stream);
CHECK_NOTNULL(stream);
experimental::XpuRuntimeConfig xpu_runtime_config = {stream, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config);
}
xpu_stream_destroy(stream);
}
TEST(runtime_stream, 2_new_different_stream) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
}
xpu_stream_destroy(stream0);
xpu_stream_destroy(stream1);
}
void RunPredictorWithRuntimeConfig(
std::shared_ptr<Predictor> predictor,
experimental::XpuRuntimeConfig runtime_config) {
PrepareInput(predictor);
experimental::InternalUtils::RunWithRuntimeConfig(predictor.get(),
&runtime_config);
CompareOutput(predictor);
CHECK_EQ(predictor->GetExecStream(), runtime_config.stream);
}
TEST(runtime_stream, 2_thread) {
void* stream0 = nullptr;
xpu_stream_create(&stream0);
CHECK_NOTNULL(stream0);
experimental::XpuRuntimeConfig xpu_runtime_config0 = {stream0, 0, nullptr, 0};
void* stream1 = nullptr;
xpu_stream_create(&stream1);
CHECK_NOTNULL(stream1);
experimental::XpuRuntimeConfig xpu_runtime_config1 = {stream1, 0, nullptr, 0};
{
RUN_WITH_RUNTIME_CONFIG(0, xpu_runtime_config0);
RUN_WITH_RUNTIME_CONFIG(1, xpu_runtime_config1);
std::thread t0(
RunPredictorWithRuntimeConfig, predictor0, xpu_runtime_config0);
std::thread t1(
RunPredictorWithRuntimeConfig, predictor1, xpu_runtime_config1);
t0.join();
t1.join();
}
xpu_stream_destroy(stream0);
xpu_stream_destroy(stream1);
}
} // namespace paddle_infer