From 4e3fb2198678e1f206d146e46af3f05d97f6a17e Mon Sep 17 00:00:00 2001
From: Wilber
Date: Wed, 14 Jul 2021 19:03:22 +0800
Subject: [PATCH] Inference support Ascend910 (#34101)

---
 paddle/fluid/inference/api/analysis_config.cc |  57 ++++++++++++++++++-
 .../fluid/inference/api/analysis_predictor.cc |  14 +++++
 paddle/fluid/inference/api/api_impl.cc        |  19 ++++++-
 paddle/fluid/inference/api/api_impl_tester.cc |  13 +++++
 .../inference/api/details/zero_copy_tensor.cc |  37 +++++++++++-
 .../inference/api/paddle_analysis_config.h    |  27 ++++++++-
 paddle/fluid/inference/api/paddle_api.h       |   1 +
 .../fluid/inference/api/paddle_pass_builder.h |  22 ++++++-
 paddle/fluid/inference/api/paddle_tensor.h    |   2 +-
 paddle/fluid/inference/capi_exp/pd_config.cc  |  14 +++++
 paddle/fluid/inference/capi_exp/pd_config.h   |  24 ++++++++
 paddle/fluid/inference/goapi/config.go        |  27 +++++++++
 paddle/fluid/inference/tests/test_helper.h    |   3 +
 paddle/fluid/pybind/inference_api.cc          |   7 ++-
 14 files changed, 257 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index b5ca0ef5924..58b0a3536a4 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -36,6 +36,8 @@ PassStrategy *AnalysisConfig::pass_builder() const {
       pass_builder_.reset(new GpuPassStrategy);
     } else if (use_xpu_) {
       pass_builder_.reset(new XpuPassStrategy);
+    } else if (use_npu_) {
+      pass_builder_.reset(new NpuPassStrategy);
     } else {
       LOG(INFO) << "Create CPU IR passes";
       pass_builder_.reset(new CpuPassStrategy);
@@ -110,6 +112,18 @@ void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked,
   Update();
 }
 
+void AnalysisConfig::EnableNpu(int device_id) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  use_npu_ = true;
+  npu_device_id_ = device_id;
+#else
+  LOG(ERROR) << "Please compile with NPU to use EnableNpu()";
+  use_npu_ = false;
+#endif
+
+  Update();
+}
+
 AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
 
@@ -127,7 +141,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_gpu_);
   CP_MEMBER(use_cudnn_);
   CP_MEMBER(gpu_device_id_);
-  CP_MEMBER(xpu_device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);
 
   CP_MEMBER(enable_memory_optim_);
@@ -167,7 +180,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(lite_ops_filter_);
   CP_MEMBER(lite_zero_copy_);
 
+  // XPU related.
   CP_MEMBER(use_xpu_);
+  CP_MEMBER(xpu_device_id_);
   CP_MEMBER(xpu_l3_workspace_size_);
   CP_MEMBER(xpu_locked_);
   CP_MEMBER(xpu_autotune_);
@@ -175,6 +190,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(xpu_precision_);
   CP_MEMBER(xpu_adaptive_seqlen_);
 
+  // NPU related.
+  CP_MEMBER(use_npu_);
+  CP_MEMBER(npu_device_id_);
+
   // profile related.
   CP_MEMBER(with_profile_);
 
@@ -202,6 +221,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   } else if (use_xpu_) {
     pass_builder_.reset(new XpuPassStrategy(
         *static_cast<XpuPassStrategy *>(other.pass_builder())));
+  } else if (use_npu_) {
+    pass_builder_.reset(new NpuPassStrategy(
+        *static_cast<NpuPassStrategy *>(other.pass_builder())));
   } else {
     pass_builder_.reset(new CpuPassStrategy(
         *static_cast<CpuPassStrategy *>(other.pass_builder())));
@@ -376,7 +398,9 @@ void AnalysisConfig::Update() {
   if (info == serialized_info_cache_) return;
 
   // Transfer pass_builder and copy the existing compatible passes.
-  if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
+  if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) ||
+      ((use_xpu() ^ pass_builder_->use_xpu())) ||
+      ((use_npu() ^ pass_builder_->use_npu()))) {
     if (use_gpu()) {
       pass_builder_.reset(new GpuPassStrategy);
 
@@ -390,6 +414,12 @@ void AnalysisConfig::Update() {
           platform::errors::InvalidArgument(
               "Only one choice can be made between CPU and XPU."));
       pass_builder_.reset(new XpuPassStrategy);
+    } else if (use_npu()) {
+      PADDLE_ENFORCE_EQ(
+          use_gpu(), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between GPU and NPU."));
+      pass_builder_.reset(new NpuPassStrategy);
     } else {
       pass_builder_.reset(new CpuPassStrategy);
     }
@@ -405,6 +435,13 @@ void AnalysisConfig::Update() {
               "Only one choice can be made between CPU and XPU."));
       pass_builder_.reset(new XpuPassStrategy(
           *static_cast<XpuPassStrategy *>(pass_builder_.get())));
+    } else if (use_npu()) {
+      PADDLE_ENFORCE_EQ(
+          use_gpu(), false,
+          platform::errors::InvalidArgument(
+              "Only one choice can be made between GPU and NPU."));
+      pass_builder_.reset(new NpuPassStrategy(
+          *static_cast<NpuPassStrategy *>(pass_builder_.get())));
     } else {
       pass_builder_.reset(new CpuPassStrategy(
           *static_cast<CpuPassStrategy *>(pass_builder_.get())));
@@ -502,6 +539,19 @@ void AnalysisConfig::Update() {
 #endif
   }
 
+  if (use_npu_) {
+#ifdef PADDLE_WITH_ASCEND_CL
+    PADDLE_ENFORCE_EQ(use_gpu_, false,
+                      platform::errors::Unavailable(
+                          "Currently, NPU and GPU cannot be enabled in the "
+                          "same analysis configuration."));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an NPU device, but Paddle was not compiled "
+        "with NPU-runtime."));
+#endif
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -566,6 +616,9 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << xpu_precision_;
   ss << xpu_adaptive_seqlen_;
 
+  ss << use_npu_;
+  ss << npu_device_id_;
+
   ss << thread_local_stream_;
 
   return ss.str();
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 1aa46ab5713..dd3a33130a3 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -264,6 +264,14 @@ bool AnalysisPredictor::CreateExecutor() {
           "with WITH_XPU."));
 #endif  // PADDLE_WITH_XPU
     }
+  } else if (config_.use_npu()) {
+#ifdef PADDLE_WITH_ASCEND_CL
+    place_ = paddle::platform::NPUPlace(config_.npu_device_id());
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use NPU forward propagation, but Paddle was not compiled "
+        "with WITH_ASCEND_CL."));
+#endif
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -847,6 +855,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
       auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
       res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
     }
+  } else if (platform::is_npu_place(place_)) {
+    auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
+    res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
@@ -879,6 +890,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
       auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
       res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
     }
+  } else if (platform::is_npu_place(place_)) {
+    auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
+    res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
   } else {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 6930b3bd2e9..bb104015947 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(profile, false, "Turn on profiler for fluid");
@@ -78,6 +79,8 @@ bool NativePaddlePredictor::Init(
     place_ = paddle::platform::CUDAPlace(config_.device);
   } else if (config_.use_xpu) {
     place_ = paddle::platform::XPUPlace(config_.device);
+  } else if (config_.use_npu) {
+    place_ = paddle::platform::NPUPlace(config_.device);
   } else {
     place_ = paddle::platform::CPUPlace();
   }
@@ -255,7 +258,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
       PADDLE_THROW(platform::errors::Unavailable(
           "Not compile with CUDA, should not reach here."));
 #endif
-    } else {
+    } else if (platform::is_xpu_place(place_)) {
 #ifdef PADDLE_WITH_XPU
       auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
       memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
                    platform::CPUPlace(), inputs[i].data.data(),
                    inputs[i].data.length());
 #else
       PADDLE_THROW(platform::errors::Unavailable(
           "Not compile with XPU, should not reach here."));
+#endif
+    } else {
+#ifdef PADDLE_WITH_ASCEND_CL
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx =
+          static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
+      auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
+      memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
+                   platform::CPUPlace(), inputs[i].data.data(),
+                   inputs[i].data.length(), dev_ctx->stream());
+#else
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Not compiled with NPU, should not reach here."));
 #endif
     }
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index e3fad1fec06..89aec34110b 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -67,6 +67,7 @@ void MainWord2Vec(const paddle::PaddlePlace& place) {
   auto predictor = CreatePaddlePredictor<NativeConfig>(config);
   config.use_gpu = paddle::gpu_place_used(place);
   config.use_xpu = paddle::xpu_place_used(place);
+  config.use_npu = paddle::npu_place_used(place);
 
   framework::LoDTensor first_word, second_word, third_word, fourth_word;
   framework::LoD lod{{0, 1}};
@@ -119,6 +120,7 @@ void MainImageClassification(const paddle::PaddlePlace& place) {
   NativeConfig config = GetConfig();
   config.use_gpu = paddle::gpu_place_used(place);
   config.use_xpu = paddle::xpu_place_used(place);
+  config.use_npu = paddle::npu_place_used(place);
   config.model_dir =
       FLAGS_book_dirname + "/image_classification_resnet.inference.model";
 
@@ -163,6 +165,7 @@ void MainThreadsWord2Vec(const paddle::PaddlePlace& place) {
   NativeConfig config = GetConfig();
   config.use_gpu = paddle::gpu_place_used(place);
   config.use_xpu = paddle::xpu_place_used(place);
+  config.use_npu = paddle::npu_place_used(place);
   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
 
   // prepare inputs data and reference results
@@ -227,6 +230,7 @@ void MainThreadsImageClassification(const paddle::PaddlePlace& place) {
   NativeConfig config = GetConfig();
   config.use_gpu = paddle::gpu_place_used(place);
   config.use_xpu = paddle::xpu_place_used(place);
+  config.use_npu = paddle::npu_place_used(place);
   config.model_dir =
       FLAGS_book_dirname + "/image_classification_resnet.inference.model";
 
@@ -297,6 +301,15 @@ TEST(inference_api_native, image_classification_xpu) {
 }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(inference_api_native, word2vec_npu) {
+  MainWord2Vec(paddle::PaddlePlace::kNPU);
+}
+// TEST(inference_api_native, image_classification_npu) {
+//   MainImageClassification(paddle::PaddlePlace::kNPU);
+// }
+#endif
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 TEST(inference_api_native, word2vec_gpu) {
   MainWord2Vec(paddle::PaddlePlace::kGPU);
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 313cbfb7c78..5ed6691ebb8 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -150,10 +151,26 @@ void Tensor::CopyFromCpu(const T *data) {
     PADDLE_THROW(paddle::platform::errors::Unavailable(
         "Can not create tensor with XPU place because paddle is not compiled "
         "with XPU."));
+#endif
+  } else if (place_ == PlaceType::kNPU) {
+#ifdef PADDLE_WITH_ASCEND_CL
+    paddle::platform::DeviceContextPool &pool =
+        paddle::platform::DeviceContextPool::Instance();
+    paddle::platform::NPUPlace npu_place(device_);
+    auto *t_data = tensor->mutable_data<T>(npu_place);
+    auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
+        pool.Get(npu_place));
+    paddle::memory::Copy(npu_place, static_cast<void *>(t_data),
+                         paddle::platform::CPUPlace(), data, ele_size,
+                         dev_ctx->stream());
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not create tensor with NPU place because paddle is not compiled "
+        "with NPU."));
 #endif
   } else {
     PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "The analysis predictor supports CPU, GPU and XPU now."));
+        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
   }
 }
 
@@ -212,10 +229,26 @@ void Tensor::CopyToCpu(T *data) {
     PADDLE_THROW(paddle::platform::errors::Unavailable(
         "Can not create tensor with XPU place because paddle is not compiled "
         "with XPU."));
+#endif
+  } else if (place_ == PlaceType::kNPU) {
+#ifdef PADDLE_WITH_ASCEND_CL
+    paddle::platform::DeviceContextPool &pool =
+        paddle::platform::DeviceContextPool::Instance();
+    auto npu_place = BOOST_GET_CONST(paddle::platform::NPUPlace, t_place);
+    auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
+        pool.Get(npu_place));
+    paddle::memory::Copy(paddle::platform::CPUPlace(),
+                         static_cast<void *>(data), npu_place, t_data,
+                         ele_num * sizeof(T), dev_ctx->stream());
+    aclrtSynchronizeStream(dev_ctx->stream());
+#else
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "Can not create tensor with NPU place because paddle is not compiled "
+        "with NPU."));
 #endif
   } else {
     PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "The analysis predictor supports CPU, GPU and XPU now."));
+        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
   }
 }
 
 template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 81e742e8a6f..58d02d8d1e7 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -203,6 +203,12 @@ struct PD_INFER_DECL AnalysisConfig {
                  const std::string& precision = "int16",
                  bool adaptive_seqlen = false);
   ///
+  /// \brief Turn on NPU.
+  ///
+  /// \param device_id the NPU card to use (default is 0).
+  ///
+  void EnableNpu(int device_id = 0);
+  ///
   /// \brief A boolean state telling whether the GPU is turned on.
   ///
   /// \return bool Whether the GPU is turned on.
@@ -215,6 +221,12 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool use_xpu() const { return use_xpu_; }
   ///
+  /// \brief A boolean state telling whether the NPU is turned on.
+  ///
+  /// \return bool Whether the NPU is turned on.
+  ///
+  bool use_npu() const { return use_npu_; }
+  ///
   /// \brief Get the GPU device id.
   ///
   /// \return int The GPU device id.
@@ -227,6 +239,12 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   int xpu_device_id() const { return xpu_device_id_; }
   ///
+  /// \brief Get the NPU device id.
+  ///
+  /// \return int The NPU device id.
+  ///
+  int npu_device_id() const { return npu_device_id_; }
+  ///
   /// \brief Get the initial size in MB of the GPU memory pool.
   ///
   /// \return int The initial size in MB of the GPU memory pool.
@@ -619,11 +637,15 @@ struct PD_INFER_DECL AnalysisConfig {
   // GPU related.
   bool use_gpu_{false};
   int gpu_device_id_{0};
-  int xpu_device_id_{0};
   uint64_t memory_pool_init_size_mb_{100};  // initial size is 100MB.
+  bool thread_local_stream_{false};
 
   bool use_cudnn_{false};
 
+  // NPU related
+  bool use_npu_{false};
+  int npu_device_id_{0};
+
   // Padding related
   bool use_fc_padding_{true};
 
@@ -689,8 +711,9 @@ struct PD_INFER_DECL AnalysisConfig {
   Precision lite_precision_mode_;
   bool lite_zero_copy_;
 
-  bool thread_local_stream_{false};
+  // XPU related.
   bool use_xpu_{false};
+  int xpu_device_id_{0};
   int xpu_l3_workspace_size_;
   bool xpu_locked_;
   bool xpu_autotune_;
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 3e92ffaf9dc..de6b28de275 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -303,6 +303,7 @@ struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
   /// GPU related fields.
   bool use_xpu{false};
   bool use_gpu{false};
+  bool use_npu{false};
   int device{0};
   float fraction_of_gpu_memory{
       -1.f};  ///< Change to a float in (0,1] if needed.
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index d7556b50031..f25060cd091 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -144,6 +144,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \return A bool variable implying whether we are in xpu mode.
   bool use_xpu() const { return use_xpu_; }
 
+  /// \brief Check if we are using npu.
+  /// \return A bool variable implying whether we are in npu mode.
+  bool use_npu() const { return use_npu_; }
+
   /// \brief Default destructor.
   virtual ~PassStrategy() = default;
 
@@ -151,6 +155,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
   /// \cond Protected
   bool use_xpu_{false};
   bool use_gpu_{false};
+  bool use_npu_{false};
   bool use_mkldnn_{false};
   /// \endcond
 };
@@ -236,7 +241,22 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy {
 /// mode.
 class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
  public:
-  XpuPassStrategy() : PassStrategy({}) {}
+  XpuPassStrategy() : PassStrategy({}) { use_xpu_ = true; }
+};
+
+/// \class NpuPassStrategy
+/// \brief The NPU passes controller, it is used in AnalysisPredictor with NPU
+/// mode.
+class PD_INFER_DECL NpuPassStrategy final : public PassStrategy {
+ public:
+  NpuPassStrategy() : PassStrategy({}) { use_npu_ = true; }
+
+  /// \brief Construct by copying another NpuPassStrategy object.
+  /// \param[in] other The NpuPassStrategy object we want to copy.
+  explicit NpuPassStrategy(const NpuPassStrategy &other)
+      : PassStrategy(other.AllPasses()) {
+    use_npu_ = true;
+  }
 };
 
 /// \brief List of tensorRT subgraph passes.
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 9c4e5858af3..fa3067b62d6 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -28,7 +28,7 @@ enum DataType {
   // TODO(Superjomn) support more data types if needed.
 };
 
-enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU };
+enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU };
 
 /// \brief Represents an n-dimensional array of values.
 /// The Tensor is used to store the input or output of the network.
diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc
index e9104ef5237..bd96f401233 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.cc
+++ b/paddle/fluid/inference/capi_exp/pd_config.cc
@@ -135,11 +135,21 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config,
                         precision, adaptive_seqlen);
 }
 
+void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableNpu(device_id);
+}
+
 PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->use_xpu();
 }
 
+PD_Bool PD_ConfigUseNpu(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->use_npu();
+}
+
 int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->gpu_device_id();
@@ -148,6 +158,10 @@ int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->xpu_device_id();
 }
+int32_t PD_ConfigNpuDeviceId(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->npu_device_id();
+}
 int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) {
   CHECK_AND_CONVERT_PD_CONFIG;
   return config->memory_pool_init_size_mb();
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
index a47ca5d2768..ac0ed8c8689 100644
--- a/paddle/fluid/inference/capi_exp/pd_config.h
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -177,6 +177,14 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
     PD_Bool autotune, const char* autotune_file, const char* precision,
     PD_Bool adaptive_seqlen);
 ///
+/// \brief Turn on NPU.
+///
+/// \param[in] pd_config config
+/// \param[in] device_id the NPU card to use.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableNpu(
+    __pd_keep PD_Config* pd_config, int32_t device_id);
+///
 /// \brief A boolean state telling whether the XPU is turned on.
 ///
 /// \param[in] pd_onfig config
@@ -185,6 +193,14 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu(
 PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu(
     __pd_keep PD_Config* pd_config);
 ///
+/// \brief A boolean state telling whether the NPU is turned on.
+///
+/// \param[in] pd_config config
+/// \return Whether the NPU is turned on.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseNpu(
+    __pd_keep PD_Config* pd_config);
+///
 /// \brief Get the GPU device id.
 ///
 /// \param[in] pd_onfig config
 /// \return The GPU device id.
 ///
 PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
     __pd_keep PD_Config* pd_config);
@@ -201,6 +217,14 @@ PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
 PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId(
     __pd_keep PD_Config* pd_config);
 ///
+/// \brief Get the NPU device id.
+///
+/// \param[in] pd_config config
+/// \return The NPU device id.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigNpuDeviceId(
+    __pd_keep PD_Config* pd_config);
+///
 /// \brief Get the initial size in MB of the GPU memory pool.
 ///
 /// \param[in] pd_onfig config
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go
index 9200de3d08f..866ae0e38b7 100644
--- a/paddle/fluid/inference/goapi/config.go
+++ b/paddle/fluid/inference/goapi/config.go
@@ -181,6 +181,15 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo
 	cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen))
 }
 
+///
+/// \brief Turn on NPU.
+///
+/// \param deviceId the NPU card to use.
+///
+func (config *Config) EnableNpu(deviceId int32) {
+	C.PD_ConfigEnableNpu(config.c, C.int32_t(deviceId))
+}
+
 ///
 /// \brief A boolean state telling whether the GPU is turned on.
 ///
@@ -199,6 +208,15 @@ func (config *Config) UseXpu() bool {
 	return cvtPDBoolToGo(C.PD_ConfigUseXpu(config.c))
 }
 
+///
+/// \brief A boolean state telling whether the NPU is turned on.
+///
+/// \return bool Whether the NPU is turned on.
+///
+func (config *Config) UseNpu() bool {
+	return cvtPDBoolToGo(C.PD_ConfigUseNpu(config.c))
+}
+
 ///
 /// \brief Get the GPU device id.
 ///
@@ -217,6 +235,15 @@ func (config *Config) XpuDeviceId() int32 {
 	return int32(C.PD_ConfigXpuDeviceId(config.c))
 }
 
+///
+/// \brief Get the NPU device id.
+///
+/// \return int32 The NPU device id.
+///
+func (config *Config) NpuDeviceId() int32 {
+	return int32(C.PD_ConfigNpuDeviceId(config.c))
+}
+
 ///
 /// \brief Get the initial size in MB of the GPU memory pool.
 ///
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index fc2c6a030a6..cf8a32ba94a 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -34,6 +34,9 @@ bool gpu_place_used(const paddle::PaddlePlace& place) {
 bool xpu_place_used(const paddle::PaddlePlace& place) {
   return place == paddle::PaddlePlace::kXPU;
 }
+bool npu_place_used(const paddle::PaddlePlace& place) {
+  return place == paddle::PaddlePlace::kNPU;
+}
 bool cpu_place_used(const paddle::PaddlePlace& place) {
   return place == paddle::PaddlePlace::kCPU;
 }
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index b2572e5aa4b..6a949ba2a60 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -379,7 +379,8 @@ void BindPaddlePlace(py::module *m) {
       .value("UNK", PaddlePlace::kUNK)
       .value("CPU", PaddlePlace::kCPU)
       .value("GPU", PaddlePlace::kGPU)
-      .value("XPU", PaddlePlace::kXPU);
+      .value("XPU", PaddlePlace::kXPU)
+      .value("NPU", PaddlePlace::kNPU);
 }
 
 void BindPaddlePredictor(py::module *m) {
@@ -409,6 +410,7 @@ void BindNativeConfig(py::module *m) {
       .def(py::init<>())
      .def_readwrite("use_gpu", &NativeConfig::use_gpu)
       .def_readwrite("use_xpu", &NativeConfig::use_xpu)
+      .def_readwrite("use_npu", &NativeConfig::use_npu)
       .def_readwrite("device", &NativeConfig::device)
       .def_readwrite("fraction_of_gpu_memory",
                      &NativeConfig::fraction_of_gpu_memory)
@@ -471,11 +473,14 @@ void BindAnalysisConfig(py::module *m) {
           py::arg("locked") = false, py::arg("autotune") = true,
           py::arg("autotune_file") = "", py::arg("precision") = "int16",
           py::arg("adaptive_seqlen") = false)
+      .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0)
      .def("disable_gpu", &AnalysisConfig::DisableGpu)
       .def("use_gpu", &AnalysisConfig::use_gpu)
       .def("use_xpu", &AnalysisConfig::use_xpu)
+      .def("use_npu", &AnalysisConfig::use_npu)
       .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
       .def("xpu_device_id", &AnalysisConfig::xpu_device_id)
+      .def("npu_device_id", &AnalysisConfig::npu_device_id)
       .def("memory_pool_init_size_mb",
            &AnalysisConfig::memory_pool_init_size_mb)
       .def("fraction_of_gpu_memory_for_pool",
-- 
GitLab
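
Usage sketch (not part of the patch above): a minimal example of how the new NPU switches fit together end to end, from AnalysisConfig::EnableNpu through the host/NPU copies added in zero_copy_tensor.cc. It assumes a Paddle build with PADDLE_WITH_ASCEND_CL and uses the standard paddle_infer C++ API that paddle_inference_api.h exposes; the model path and input shape are hypothetical placeholders.

    #include <vector>

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle_infer::Config config;
      // Hypothetical model files; substitute a real inference model.
      config.SetModel("./mobilenet/__model__", "./mobilenet/__params__");
      config.EnableNpu(/*device_id=*/0);  // new in this patch; selects NpuPassStrategy

      auto predictor = paddle_infer::CreatePredictor(config);

      // Host-side input; CopyFromCpu performs the CPU -> NPU copy on the
      // NPUDeviceContext stream (see zero_copy_tensor.cc above).
      std::vector<int> shape{1, 3, 224, 224};  // hypothetical input shape
      std::vector<float> input_data(1 * 3 * 224 * 224, 0.f);
      auto input_names = predictor->GetInputNames();
      auto input = predictor->GetInputHandle(input_names[0]);
      input->Reshape(shape);
      input->CopyFromCpu(input_data.data());

      predictor->Run();

      // CopyToCpu copies NPU -> CPU and waits on aclrtSynchronizeStream.
      auto output_names = predictor->GetOutputNames();
      auto output = predictor->GetOutputHandle(output_names[0]);
      int out_num = 1;
      for (int d : output->shape()) out_num *= d;
      std::vector<float> output_data(out_num);
      output->CopyToCpu(output_data.data());
      return 0;
    }

The same flow is available from the other bindings the patch touches: C via PD_ConfigEnableNpu, Go via Config.EnableNpu, and Python via AnalysisConfig.enable_npu(device_id=0).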