Unverified commit 4e3fb219, authored by Wilber, committed by GitHub

Inference support Ascend910 (#34101)

Parent a4028b4b
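The commit threads NPU (Ascend 910) support through AnalysisConfig, both predictor paths, the tensor copy helpers, and the C/Go/Python bindings. A minimal end-to-end sketch of the new API, assuming a Paddle build configured with -DWITH_ASCEND_CL=ON (which defines PADDLE_WITH_ASCEND_CL) and a hypothetical model directory:

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::AnalysisConfig config;
      config.SetModel("./mobilenet_v1");   // hypothetical model directory
      config.EnableNpu(/*device_id=*/0);   // new API introduced by this commit
      auto predictor = paddle::CreatePaddlePredictor(config);
      // Feed inputs via GetInputTensor(), call ZeroCopyRun(), then read
      // results back via GetOutputTensor(); see the tensor sketch below.
      return 0;
    }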
...@@ -36,6 +36,8 @@ PassStrategy *AnalysisConfig::pass_builder() const {
pass_builder_.reset(new GpuPassStrategy);
} else if (use_xpu_) {
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu_) {
pass_builder_.reset(new NpuPassStrategy);
} else {
LOG(INFO) << "Create CPU IR passes";
pass_builder_.reset(new CpuPassStrategy);
...@@ -110,6 +112,18 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked,
Update();
}
void AnalysisConfig::EnableNpu(int device_id) {
#ifdef PADDLE_WITH_ASCEND_CL
use_npu_ = true;
npu_device_id_ = device_id;
#else
LOG(ERROR) << "Please compile with npu to EnableNpu()";
use_npu_ = false;
#endif
Update();
}
AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
#define CP_MEMBER(member__) member__ = other.member__;
...@@ -127,7 +141,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_gpu_);
CP_MEMBER(use_cudnn_);
CP_MEMBER(gpu_device_id_);
CP_MEMBER(xpu_device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(enable_memory_optim_);
...@@ -167,7 +180,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(lite_ops_filter_);
CP_MEMBER(lite_zero_copy_);
// XPU related.
CP_MEMBER(use_xpu_);
CP_MEMBER(xpu_device_id_);
CP_MEMBER(xpu_l3_workspace_size_);
CP_MEMBER(xpu_locked_);
CP_MEMBER(xpu_autotune_);
...@@ -175,6 +190,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(xpu_precision_);
CP_MEMBER(xpu_adaptive_seqlen_);
// NPU related.
CP_MEMBER(use_npu_);
CP_MEMBER(npu_device_id_);
// profile related.
CP_MEMBER(with_profile_);
...@@ -202,6 +221,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
} else if (use_xpu_) {
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(other.pass_builder())));
} else if (use_npu_) {
pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(other.pass_builder())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(other.pass_builder())));
...@@ -376,7 +398,9 @@ void AnalysisConfig::Update() {
if (info == serialized_info_cache_) return;
// Transfer pass_builder and copy the existing compatible passes.
if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) ||
((use_xpu() ^ pass_builder_->use_xpu())) ||
((use_npu() ^ pass_builder_->use_npu()))) {
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy);
...@@ -390,6 +414,12 @@ void AnalysisConfig::Update() {
platform::errors::InvalidArgument(
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy);
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy);
} else {
pass_builder_.reset(new CpuPassStrategy);
}
...@@ -405,6 +435,13 @@ void AnalysisConfig::Update() {
"Only one choice can be made between CPU and XPU."));
pass_builder_.reset(new XpuPassStrategy(
*static_cast<XpuPassStrategy *>(pass_builder_.get())));
} else if (use_npu()) {
PADDLE_ENFORCE_EQ(
use_gpu(), false,
platform::errors::InvalidArgument(
"Only one choice can be made between GPU and NPU."));
pass_builder_.reset(new NpuPassStrategy(
*static_cast<NpuPassStrategy *>(pass_builder_.get())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(pass_builder_.get())));
...@@ -502,6 +539,19 @@ void AnalysisConfig::Update() {
#endif
}
if (use_npu_) {
#ifdef PADDLE_WITH_ASCEND_CL
PADDLE_ENFORCE_EQ(use_gpu_, false,
platform::errors::Unavailable(
"Currently, NPU and GPU cannot be enabled in the "
"same analysis configuration."));
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use an NPU device, but Paddle was not compiled "
"with NPU-runtime."));
#endif
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
}
...@@ -566,6 +616,9 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << xpu_precision_;
ss << xpu_adaptive_seqlen_;
ss << use_npu_;
ss << npu_device_id_;
ss << thread_local_stream_;
return ss.str();
......
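The CP_MEMBER additions above matter for config cloning: a copied AnalysisConfig now carries the NPU flags along. A small sketch of that guarantee, assuming an NPU-enabled build:

    #include <cassert>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::AnalysisConfig base;
      base.EnableNpu(/*device_id=*/0);
      paddle::AnalysisConfig copied(base);  // runs CP_MEMBER(use_npu_) and
                                            // CP_MEMBER(npu_device_id_)
      assert(copied.use_npu());
      assert(copied.npu_device_id() == 0);
      return 0;
    }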
...@@ -264,6 +264,14 @@ bool AnalysisPredictor::CreateExecutor() {
"with WITH_XPU."));
#endif  // PADDLE_WITH_XPU
}
} else if (config_.use_npu()) {
#ifdef PADDLE_WITH_ASCEND_CL
place_ = paddle::platform::NPUPlace(config_.npu_device_id());
#else
PADDLE_THROW(platform::errors::Unavailable(
"You tried to use NPU forward propagation, but Paddle was not compiled "
"with WITH_ASCEND_CL."));
#endif
} else {
place_ = paddle::platform::CPUPlace();
}
...@@ -847,6 +855,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
...@@ -879,6 +890,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
......
...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_bool(profile, false, "Turn on profiler for fluid"); DEFINE_bool(profile, false, "Turn on profiler for fluid");
...@@ -78,6 +79,8 @@ bool NativePaddlePredictor::Init( ...@@ -78,6 +79,8 @@ bool NativePaddlePredictor::Init(
place_ = paddle::platform::CUDAPlace(config_.device); place_ = paddle::platform::CUDAPlace(config_.device);
} else if (config_.use_xpu) { } else if (config_.use_xpu) {
place_ = paddle::platform::XPUPlace(config_.device); place_ = paddle::platform::XPUPlace(config_.device);
} else if (config_.use_npu) {
place_ = paddle::platform::NPUPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
...@@ -255,7 +258,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with CUDA, should not reach here."));
#endif
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
...@@ -264,6 +267,20 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with XPU, should not reach here."));
#endif
} else {
#ifdef PADDLE_WITH_ASCEND_CL
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream());
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compile with NPU, should not reach here."));
#endif
}
......
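The older NativeConfig path gains a matching use_npu flag; the shared device field selects the card, mirroring the existing use_gpu/use_xpu handling. A sketch, assuming an NPU-enabled build and a hypothetical model path:

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::NativeConfig config;
      config.model_dir = "./word2vec.inference.model";  // hypothetical path
      config.use_npu = true;  // new field added by this commit
      config.device = 0;      // NPU card index
      auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);
      // predictor->Run(...) then copies feeds to NPUPlace via the SetFeed
      // branch shown above.
      return 0;
    }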
...@@ -67,6 +67,7 @@ void MainWord2Vec(const paddle::PaddlePlace& place) {
auto predictor = CreatePaddlePredictor<NativeConfig>(config);
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
config.use_npu = paddle::npu_place_used(place);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
...@@ -119,6 +120,7 @@ void MainImageClassification(const paddle::PaddlePlace& place) {
NativeConfig config = GetConfig();
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
config.use_npu = paddle::npu_place_used(place);
config.model_dir =
FLAGS_book_dirname + "/image_classification_resnet.inference.model";
...@@ -163,6 +165,7 @@ void MainThreadsWord2Vec(const paddle::PaddlePlace& place) {
NativeConfig config = GetConfig();
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
config.use_npu = paddle::npu_place_used(place);
auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
// prepare inputs data and reference results
...@@ -227,6 +230,7 @@ void MainThreadsImageClassification(const paddle::PaddlePlace& place) {
NativeConfig config = GetConfig();
config.use_gpu = paddle::gpu_place_used(place);
config.use_xpu = paddle::xpu_place_used(place);
config.use_npu = paddle::npu_place_used(place);
config.model_dir =
FLAGS_book_dirname + "/image_classification_resnet.inference.model";
...@@ -297,6 +301,15 @@ TEST(inference_api_native, image_classification_xpu) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(inference_api_native, word2vec_npu) {
MainWord2Vec(paddle::PaddlePlace::kNPU);
}
// TEST(inference_api_native, image_classification_npu) {
// MainImageClassification(paddle::PaddlePlace::kNPU);
// }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(inference_api_native, word2vec_gpu) {
MainWord2Vec(paddle::PaddlePlace::kGPU);
......
...@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -150,10 +151,26 @@ void Tensor::CopyFromCpu(const T *data) { ...@@ -150,10 +151,26 @@ void Tensor::CopyFromCpu(const T *data) {
PADDLE_THROW(paddle::platform::errors::Unavailable( PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled " "Can not create tensor with XPU place because paddle is not compiled "
"with XPU.")); "with XPU."));
#endif
} else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::NPUPlace npu_place(device_);
auto *t_data = tensor->mutable_data<T>(npu_place);
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(npu_place, static_cast<void *>(t_data),
paddle::platform::CPUPlace(), data, ele_size,
dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with NPU place because paddle is not compiled "
"with NPU."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
}
}
...@@ -212,10 +229,26 @@ void Tensor::CopyToCpu(T *data) {
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with XPU place because paddle is not compiled "
"with XPU."));
#endif
} else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto npu_place = BOOST_GET_CONST(paddle::platform::NPUPlace, t_place);
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), npu_place, t_data,
ele_num * sizeof(T), dev_ctx->stream());
aclrtSynchronizeStream(dev_ctx->stream());
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not create tensor with NPU place because paddle is not compiled "
"with NPU."));
#endif
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
}
}
template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
......
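With the CopyFromCpu/CopyToCpu branches above, host-to-NPU transfers run on the NPUDeviceContext stream, and CopyToCpu additionally synchronizes via aclrtSynchronizeStream. A usage sketch through the paddle_infer 2.x API (paddle_infer::Config aliases AnalysisConfig, so EnableNpu is available there too), assuming an NPU-enabled build; the model and tensor shapes are hypothetical:

    #include <functional>
    #include <numeric>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle_infer::Config config;
      config.SetModel("./resnet50");  // hypothetical model directory
      config.EnableNpu(/*device_id=*/0);
      auto predictor = paddle_infer::CreatePredictor(config);

      auto input = predictor->GetInputHandle(predictor->GetInputNames()[0]);
      std::vector<float> in(1 * 3 * 224 * 224, 0.f);
      input->Reshape({1, 3, 224, 224});
      input->CopyFromCpu(in.data());  // CPU -> NPU on dev_ctx->stream()

      predictor->Run();

      auto output = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
      auto shape = output->shape();
      int n = std::accumulate(shape.begin(), shape.end(), 1,
                              std::multiplies<int>());
      std::vector<float> out(n);
      output->CopyToCpu(out.data());  // NPU -> CPU, then stream sync
      return 0;
    }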
...@@ -203,6 +203,12 @@ struct PD_INFER_DECL AnalysisConfig {
const std::string& precision = "int16",
bool adaptive_seqlen = false);
///
/// \brief Turn on NPU.
///
/// \param device_id the NPU card to use (default is 0).
///
void EnableNpu(int device_id = 0);
///
/// \brief A boolean state telling whether the GPU is turned on.
///
/// \return bool Whether the GPU is turned on.
...@@ -215,6 +221,12 @@ struct PD_INFER_DECL AnalysisConfig {
///
bool use_xpu() const { return use_xpu_; }
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \return bool Whether the NPU is turned on.
///
bool use_npu() const { return use_npu_; }
///
/// \brief Get the GPU device id.
///
/// \return int The GPU device id.
...@@ -227,6 +239,12 @@ struct PD_INFER_DECL AnalysisConfig {
///
int xpu_device_id() const { return xpu_device_id_; }
///
/// \brief Get the NPU device id.
///
/// \return int The NPU device id.
///
int npu_device_id() const { return npu_device_id_; }
///
/// \brief Get the initial size in MB of the GPU memory pool.
///
/// \return int The initial size in MB of the GPU memory pool.
...@@ -619,11 +637,15 @@ struct PD_INFER_DECL AnalysisConfig {
// GPU related.
bool use_gpu_{false};
int gpu_device_id_{0};
int xpu_device_id_{0};
uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
bool thread_local_stream_{false};
bool use_cudnn_{false};
// NPU related
bool use_npu_{false};
int npu_device_id_{0};
// Padding related
bool use_fc_padding_{true};
...@@ -689,8 +711,9 @@ struct PD_INFER_DECL AnalysisConfig {
Precision lite_precision_mode_;
bool lite_zero_copy_;
// XPU related.
bool use_xpu_{false};
int xpu_device_id_{0};
int xpu_l3_workspace_size_;
bool xpu_locked_;
bool xpu_autotune_;
......
...@@ -303,6 +303,7 @@ struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
/// GPU related fields.
bool use_xpu{false};
bool use_gpu{false};
bool use_npu{false};
int device{0};
float fraction_of_gpu_memory{
-1.f};  ///< Change to a float in (0,1] if needed.
......
...@@ -144,6 +144,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \return A bool variable implying whether we are in xpu mode.
bool use_xpu() const { return use_xpu_; }
/// \brief Check if we are using npu.
/// \return A bool variable implying whether we are in npu mode.
bool use_npu() const { return use_npu_; }
/// \brief Default destructor.
virtual ~PassStrategy() = default;
...@@ -151,6 +155,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// \cond Protected
bool use_xpu_{false};
bool use_gpu_{false};
bool use_npu_{false};
bool use_mkldnn_{false};
/// \endcond
};
...@@ -236,7 +241,22 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
/// mode.
class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
public:
XpuPassStrategy() : PassStrategy({}) { use_xpu_ = true; }
};
/// \class NpuPassStrategy
/// \brief The NPU passes controller, it is used in AnalysisPredictor with NPU
/// mode.
class PD_INFER_DECL NpuPassStrategy final : public PassStrategy {
public:
NpuPassStrategy() : PassStrategy({}) { use_npu_ = true; }
/// \brief Construct by copying another NpuPassStrategy object.
/// \param[in] other The NpuPassStrategy object we want to copy.
explicit NpuPassStrategy(const NpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {
use_npu_ = true;
}
};
/// \brief List of tensorRT subgraph passes.
......
...@@ -28,7 +28,7 @@ enum DataType {
// TODO(Superjomn) support more data types if needed.
};
enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU };
/// \brief Represents an n-dimensional array of values.
/// The Tensor is used to store the input or output of the network.
......
...@@ -135,11 +135,21 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
precision, adaptive_seqlen);
}
void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) {
CHECK_AND_CONVERT_PD_CONFIG;
config->EnableNpu(device_id);
}
PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->use_xpu();
}
PD_Bool PD_ConfigUseNpu(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->use_npu();
}
int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->gpu_device_id();
...@@ -148,6 +158,10 @@ int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->xpu_device_id();
}
int32_t PD_ConfigNpuDeviceId(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->npu_device_id();
}
int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) {
CHECK_AND_CONVERT_PD_CONFIG;
return config->memory_pool_init_size_mb();
......
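The C entry points above are thin wrappers over the C++ config. A sketch of the round trip (valid as C or C++), assuming an NPU-enabled build; the umbrella header name is an assumption and may differ by install layout:

    #include <cassert>
    #include "pd_inference_api.h"  // C API umbrella header (assumed path)

    int main() {
      PD_Config* config = PD_ConfigCreate();
      PD_ConfigEnableNpu(config, /*device_id=*/0);
      assert(PD_ConfigUseNpu(config));            // new query in this commit
      assert(PD_ConfigNpuDeviceId(config) == 0);  // new query in this commit
      PD_ConfigDestroy(config);
      return 0;
    }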
...@@ -177,6 +177,14 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
PD_Bool autotune, const char* autotune_file, const char* precision,
PD_Bool adaptive_seqlen);
///
/// \brief Turn on NPU.
///
/// \param[in] pd_config config
/// \param[in] device_id the NPU card to use.
///
PADDLE_CAPI_EXPORT extern void PD_ConfigEnableNpu(
__pd_keep PD_Config* pd_config, int32_t device_id);
///
/// \brief A boolean state telling whether the XPU is turned on.
///
/// \param[in] pd_config config
...@@ -185,6 +193,14 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu(
__pd_keep PD_Config* pd_config);
/// ///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \param[in] pd_config config
/// \return Whether the NPU is turned on.
///
PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseNpu(
__pd_keep PD_Config* pd_config);
///
/// \brief Get the GPU device id.
///
/// \param[in] pd_config config
...@@ -201,6 +217,14 @@ PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId(
__pd_keep PD_Config* pd_config);
/// ///
/// \brief Get the NPU device id.
///
/// \param[in] pd_config config
/// \return The NPU device id.
///
PADDLE_CAPI_EXPORT extern int32_t PD_ConfigNpuDeviceId(
__pd_keep PD_Config* pd_config);
///
/// \brief Get the initial size in MB of the GPU memory pool.
///
/// \param[in] pd_config config
......
...@@ -181,6 +181,15 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo
cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen))
}
///
/// \brief Turn on NPU.
///
/// \param deviceId the NPU card to use.
///
func (config *Config) EnableNpu(deviceId int32) {
C.PD_ConfigEnableNpu(config.c, C.int32_t(deviceId))
}
///
/// \brief A boolean state telling whether the GPU is turned on.
///
...@@ -199,6 +208,15 @@ func (config *Config) UseXpu() bool {
return cvtPDBoolToGo(C.PD_ConfigUseXpu(config.c))
}
///
/// \brief A boolean state telling whether the NPU is turned on.
///
/// \return bool Whether the NPU is turned on.
///
func (config *Config) UseNpu() bool {
return cvtPDBoolToGo(C.PD_ConfigUseNpu(config.c))
}
///
/// \brief Get the GPU device id.
///
...@@ -217,6 +235,15 @@ func (config *Config) XpuDeviceId() int32 {
return int32(C.PD_ConfigXpuDeviceId(config.c))
}
///
/// \brief Get the NPU device id.
///
/// \return int32 The NPU device id.
///
func (config *Config) NpuDeviceId() int32 {
return int32(C.PD_ConfigNpuDeviceId(config.c))
}
///
/// \brief Get the initial size in MB of the GPU memory pool.
///
......
...@@ -34,6 +34,9 @@ bool gpu_place_used(const paddle::PaddlePlace& place) {
bool xpu_place_used(const paddle::PaddlePlace& place) {
return place == paddle::PaddlePlace::kXPU;
}
bool npu_place_used(const paddle::PaddlePlace& place) {
return place == paddle::PaddlePlace::kNPU;
}
bool cpu_place_used(const paddle::PaddlePlace& place) {
return place == paddle::PaddlePlace::kCPU;
}
......
...@@ -379,7 +379,8 @@ void BindPaddlePlace(py::module *m) {
.value("UNK", PaddlePlace::kUNK)
.value("CPU", PaddlePlace::kCPU)
.value("GPU", PaddlePlace::kGPU)
.value("XPU", PaddlePlace::kXPU)
.value("NPU", PaddlePlace::kNPU);
}
void BindPaddlePredictor(py::module *m) {
...@@ -409,6 +410,7 @@ void BindNativeConfig(py::module *m) {
.def(py::init<>())
.def_readwrite("use_gpu", &NativeConfig::use_gpu)
.def_readwrite("use_xpu", &NativeConfig::use_xpu)
.def_readwrite("use_npu", &NativeConfig::use_npu)
.def_readwrite("device", &NativeConfig::device) .def_readwrite("device", &NativeConfig::device)
.def_readwrite("fraction_of_gpu_memory", .def_readwrite("fraction_of_gpu_memory",
&NativeConfig::fraction_of_gpu_memory) &NativeConfig::fraction_of_gpu_memory)
...@@ -471,11 +473,14 @@ void BindAnalysisConfig(py::module *m) { ...@@ -471,11 +473,14 @@ void BindAnalysisConfig(py::module *m) {
py::arg("locked") = false, py::arg("autotune") = true, py::arg("locked") = false, py::arg("autotune") = true,
py::arg("autotune_file") = "", py::arg("precision") = "int16", py::arg("autotune_file") = "", py::arg("precision") = "int16",
py::arg("adaptive_seqlen") = false) py::arg("adaptive_seqlen") = false)
.def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
.def("disable_gpu", &AnalysisConfig::DisableGpu) .def("disable_gpu", &AnalysisConfig::DisableGpu)
.def("use_gpu", &AnalysisConfig::use_gpu) .def("use_gpu", &AnalysisConfig::use_gpu)
.def("use_xpu", &AnalysisConfig::use_xpu) .def("use_xpu", &AnalysisConfig::use_xpu)
.def("use_npu", &AnalysisConfig::use_npu)
.def("gpu_device_id", &AnalysisConfig::gpu_device_id) .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
.def("xpu_device_id", &AnalysisConfig::xpu_device_id) .def("xpu_device_id", &AnalysisConfig::xpu_device_id)
.def("npu_device_id", &AnalysisConfig::npu_device_id)
.def("memory_pool_init_size_mb", .def("memory_pool_init_size_mb",
&AnalysisConfig::memory_pool_init_size_mb) &AnalysisConfig::memory_pool_init_size_mb)
.def("fraction_of_gpu_memory_for_pool", .def("fraction_of_gpu_memory_for_pool",
......