Unverified commit 81138239, authored by Leo Chen, committed by GitHub

[feature] support npu allocator (#30840)

[feature] support npu allocator
Parent ebef6601
......@@ -32,11 +32,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some ACL-related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
......
......@@ -82,6 +82,10 @@ if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
......
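For orientation: enabling WITH_ASCEND_CL makes CMake add the PADDLE_WITH_ASCEND_CL definition shown above, and every NPU code path in the rest of this diff is fenced by that macro. A minimal sketch of the pattern (the function name here is hypothetical, not part of this change):

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"

// Compiled only when CMake is configured with -DWITH_ASCEND_CL=ON.
static void TouchNpuDevice() { paddle::platform::SetNPUDeviceId(0); }
#endif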
......@@ -21,38 +21,58 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
elseif(WITH_ASCEND_CL)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl)
endif()
......@@ -274,10 +274,10 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
list(APPEND third_party_deps extern_ascend)
endif (WITH_ASCEND)
endif ()
if (WITH_PSCORE)
include(external/snappy)
......
......@@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;
......
......@@ -431,6 +431,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(platform::errors::Unimplemented(
"Not supported on place (%s) ",
npu));
//return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
......@@ -633,6 +641,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
//TODO(zhiqiu)
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
......
......@@ -116,6 +116,23 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) {
// TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const platform::NPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
......
......@@ -19,6 +19,10 @@ if (WITH_GPU)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
endif()
if (WITH_ASCEND_CL)
cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
......
......@@ -42,6 +42,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"
DEFINE_int64(
gpu_allocator_retry_time, 10000,
......@@ -76,6 +77,11 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
#endif
break;
}
......@@ -195,6 +201,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
......
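As a usage note on the facade change above: under the naive-best-fit strategy the facade now registers one NaiveBestFitAllocator per NPU, so a facade-level allocation against an NPUPlace is routed to it. A minimal sketch (hypothetical function name; assumes a WITH_ASCEND_CL build with at least one NPU visible):

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"

void FacadeNpuAllocSketch() {
  paddle::platform::NPUPlace place(0);
  // Dispatched to the allocator registered by InitNaiveBestFitNPUAllocator(place).
  auto allocation = paddle::memory::Alloc(place, 1 << 20);
  // The memory is returned to the allocator when the handle goes out of scope.
}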
......@@ -23,6 +23,7 @@
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
......@@ -112,6 +113,7 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
......@@ -216,6 +218,136 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif
}
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
auto npu_num = devices_.size();
allocators_.resize(npu_num);
init_flags_.reserve(npu_num);
for (size_t i = 0; i < npu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static NPUBuddyAllocatorList *CreateNewInstance() {
return new NPUBuddyAllocatorList();
}
public:
static NPUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
#endif
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::NPUDeviceGuard guard(place.device);
size_t avail, total;
platform::NPUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
aclrtMemset(ptr, size, 0xEF, size);
}
}
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetNPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
// For CUDA
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
private:
......
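For reference, the new NPUPlace specializations above can also be driven directly. A rough sketch, written as if from inside the same namespace as those definitions (the exact namespace is assumed to match the surrounding GPU/XPU specializations and may differ):

using paddle::platform::NPUPlace;

void BuddyNpuSketch() {
  NPUPlace place(0);
  // Served by GetNPUBuddyAllocator(0) defined above.
  void *p = Alloc<NPUPlace>(place, 4096);
  size_t used = Used<NPUPlace>(place);   // bytes currently tracked by the buddy allocator
  Free<NPUPlace>(place, p, 4096);
  Release<NPUPlace>(place);              // return idle chunks to the device
  (void)used;
}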
......@@ -14,6 +14,8 @@
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include <unistd.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
......@@ -69,6 +71,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::NPUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::NPUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
place_.device);
delete allocation;
}
Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
void* ptr;
auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
if (LIKELY(result == ACL_ERROR_NONE)) {
return new Allocation(ptr, size, platform::Place(place_));
}
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, place_.device);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(avail), place_.device, err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUAllocator : public Allocator {
public:
explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
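A minimal usage sketch for the new allocation::NPUAllocator (hypothetical function name; calls go through the base Allocator interface, which forwards to the AllocateImpl/FreeImpl shown above):

#include "paddle/fluid/memory/allocation/npu_allocator.h"

void NpuAllocatorSketch() {
  paddle::memory::allocation::NPUAllocator allocator(paddle::platform::NPUPlace(0));
  // AllocateImpl -> RecordedNPUMalloc on device 0.
  auto allocation = allocator.Allocate(1 << 20);
  // Destroying the handle triggers FreeImpl -> RecordedNPUFree.
  allocation.reset();
}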
......@@ -2,11 +2,15 @@ include(ExternalProject)
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
set(system_allocator_DEPS gflags cpu_info place)
if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else(${WITH_GPU})
elseif(${WITH_ASCEND_CL})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
else()
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
endif(${WITH_GPU})
endif()
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
......
......@@ -21,6 +21,10 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
......@@ -235,6 +239,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for the NPU for the first allocation.
allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size; we store the re-allocation size when the
// user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::NPUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
// Allocate a new block
void* p = system_allocator_->Alloc(&index, allocate_bytes);
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
......
......@@ -19,13 +19,15 @@ limitations under the License. */
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
#include <fstream>
#include <string>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include <fstream>
#include <string>
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
......@@ -324,6 +326,33 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.005;
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator, 300 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -35,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -239,6 +240,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
if (result == ACL_ERROR_NONE) {
*index = 0;
npu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, npu_id_);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
}
bool NPUAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUAllocator : public SystemAllocator {
public:
explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_alloc_size_ = 0;
int npu_id_;
};
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -81,3 +81,11 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NPUAllocator, Alloc) {
paddle::memory::detail::NPUAllocator a(0);
TestAllocator(&a, 1<<20);
TestAllocator(&a, 1);
}
#endif
......@@ -148,6 +148,13 @@ void set_constant_with_place<platform::XPUPlace>(
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
......@@ -71,6 +71,10 @@ if(WITH_ASCEND)
cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
endif()
if(WITH_ASCEND_CL)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)
......
......@@ -228,6 +228,33 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
// NOTE(zhiqiu): Usually, no need to create context explicitly,
// ACL creates a default context which contains 1 default stream
// and 1 sync stream after aclrtSetDevice.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
NPUDeviceGuard guard(place_.device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext* NPUDeviceContext::context() const {
return const_cast<aclrtContext*>(&context_);
}
#endif
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice : public Eigen::StreamInterface {
......
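A small sketch of the new NPUDeviceContext (hypothetical function name; in practice contexts are normally obtained from DeviceContextPool rather than constructed directly):

#include "paddle/fluid/platform/device_context.h"

void NpuContextSketch() {
  paddle::platform::NPUPlace place(0);
  // Picks up the default ACL context created by aclrtSetDevice.
  paddle::platform::NPUDeviceContext ctx(place);
  // ... enqueue work on the device ...
  ctx.Wait();                              // aclrtSynchronizeDevice under a device guard
  aclrtContext *acl_ctx = ctx.context();   // raw ACL handle if needed
  (void)acl_ctx;
}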
......@@ -59,6 +59,11 @@ struct GpuDevice;
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#include "paddle/fluid/platform/npu_info.h"
#endif
namespace paddle {
namespace platform {
......@@ -77,11 +82,13 @@ enum DeviceType {
CPU = 0,
CUDA = 1,
XPU = 2,
NPU = 3,
};
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kNPU = DeviceType::NPU;
class DeviceContext {
public:
......@@ -153,6 +160,46 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext : public DeviceContext {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
aclrtContext* context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
#ifdef PADDLE_WITH_ASCEND_HCCL
/*! \brief Return hccl context. */
HCCLContext_t hccl_context() const { return hccl_context_; }
/*! \brief Set hccl context. */
void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
#endif
private:
NPUPlace place_;
aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_HCCL
HCCLContext_t hccl_context_;
#endif
// Needs to be the same as other DeviceContexts,
// even though eigen_device_ is not used on NPU.
// NOTE(zhiqiu): why is it needed?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
#endif
#ifdef PADDLE_WITH_CUDA
class CudnnWorkspaceHandle;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
......@@ -214,3 +215,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
......@@ -38,6 +38,10 @@ limitations under the License. */
#include "paddle/fluid/platform/cuda_error.pb.h"
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#endif // PADDLE_WITH_ASCEND_CL
#include <fstream>
#include <iomanip>
#include <memory>
......@@ -940,7 +944,6 @@ DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
......@@ -996,5 +999,40 @@ inline void retry_sleep(unsigned milliseconds) {
#undef DEFINE_CUDA_STATUS_TYPE
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_ASCEND_CL
namespace details {
template <typename T>
struct NPUStatusType {};
#define DEFINE_NPU_STATUS_TYPE(type, success_value) \
template <> \
struct NPUStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
} // namespace details
inline std::string build_npu_error_msg(aclError stat) {
std::string s = " ACL error, the error code is : " + stat;
return s;
}
#define PADDLE_ENFORCE_NPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __NPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::NPUStatusType< \
__NPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_npu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
#endif // PADDLE_WITH_ASCEND_CL
} // namespace platform
} // namespace paddle
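The new macro is used the same way as PADDLE_ENFORCE_CUDA_SUCCESS: wrap any ACL call that returns aclError, and a result other than ACL_ERROR_NONE becomes a Paddle exception. A minimal sketch (hypothetical function name):

#include "paddle/fluid/platform/enforce.h"

void EnforceNpuSketch() {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(0));

  void *dev_ptr = nullptr;
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclrtMalloc(&dev_ptr, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtFree(dev_ptr));
}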
......@@ -45,7 +45,8 @@ DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
#ifdef PADDLE_WITH_CUDA
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
/**
* CUDA related related FLAG
......@@ -377,7 +378,8 @@ DEFINE_double(
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#ifdef PADDLE_WITH_CUDA
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
/**
* Memory related FLAG
......
......@@ -80,6 +80,7 @@ static int GetCUDADeviceCountImpl() {
}
int GetCUDADeviceCount() {
// cache the count
static auto dev_cnt = GetCUDADeviceCountImpl();
return dev_cnt;
}
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
......@@ -70,6 +71,7 @@ namespace framework {
std::once_flag gflags_init_flag;
std::once_flag glog_init_flag;
std::once_flag npu_init_flag;
bool InitGflags(std::vector<std::string> args) {
bool successed = false;
......@@ -148,6 +150,17 @@ void InitDevices() {
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
platform::AclInstance::Instance(); // NOLINT
try {
// use user specified NPUs in single-node multi-process mode.
devices = platform::GetSelectedNPUDevices();
} catch (const std::exception &exp) {
LOG(WARNING)
<< "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime.";
}
#endif
InitDevices(devices);
}
......
......@@ -35,3 +35,13 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size)
DEFINE_INT_STATUS(STAT_gpu13_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_mem_size)
// For Ascend NPU
DEFINE_INT_STATUS(STAT_npu0_mem_size)
DEFINE_INT_STATUS(STAT_npu1_mem_size)
DEFINE_INT_STATUS(STAT_npu2_mem_size)
DEFINE_INT_STATUS(STAT_npu3_mem_size)
DEFINE_INT_STATUS(STAT_npu4_mem_size)
DEFINE_INT_STATUS(STAT_npu5_mem_size)
DEFINE_INT_STATUS(STAT_npu6_mem_size)
DEFINE_INT_STATUS(STAT_npu7_mem_size)
......@@ -187,3 +187,13 @@ class StatRegistry {
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size)
#define USE_NPU_MEM_STAT \
USE_INT_STAT(STAT_npu0_mem_size); \
USE_INT_STAT(STAT_npu1_mem_size); \
USE_INT_STAT(STAT_npu2_mem_size); \
USE_INT_STAT(STAT_npu3_mem_size); \
USE_INT_STAT(STAT_npu4_mem_size); \
USE_INT_STAT(STAT_npu5_mem_size); \
USE_INT_STAT(STAT_npu6_mem_size); \
USE_INT_STAT(STAT_npu7_mem_size)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/npu_info.h"
#include <algorithm>
#include <cstdlib>
#include <memory>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/string/split.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_NPU_MEM_STAT;
namespace paddle {
namespace platform {
static int GetNPUDeviceCountImpl() {
uint32_t count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDeviceCount(&count));
return count;
}
int GetNPUDeviceCount() {
static auto dev_cnt = GetNPUDeviceCountImpl();
return dev_cnt;
}
// For example, "1.0.1"
std::string GetNPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
int major = 0, minor = 0, patch = 0;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetVersion(&major, &minor, &patch));
return string::Sprintf("%d.%d.%d", major, minor, patch);
}
int GetCurrentNPUDeviceId() {
int device_id;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDevice(&device_id));
return device_id;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices() {
// use user specified NPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetNPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void SetNPUDeviceId(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
// NOTE(zhiqiu): It is recommended to call aclrtSetDevice and aclrtResetDevice
// in pairs.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(id));
}
void ResetNPUDeviceId(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
PADDLE_ENFORCE_NPU_SUCCESS(aclrtResetDevice(id));
}
void NPUMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
RecordedNPUMemGetInfo(available, total, &actual_available, &actual_total,
platform::GetCurrentNPUDeviceId());
}
size_t NPUAvailableMemToAlloc() {
size_t total = 0;
size_t available = 0;
NPUMemoryUsage(&available, &total);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = NPUMinChunkSize();
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
VLOG(10) << "NPU usage " << (available >> 20) << "M/" << (total >> 20)
<< "M, " << (available_to_alloc >> 20) << "M available to allocate";
return available_to_alloc;
}
size_t NPUMaxAllocSize() {
return std::max(NPUInitAllocSize(), NPUReallocSize());
}
static size_t NPUAllocSize(bool realloc) {
size_t available_to_alloc = NPUAvailableMemToAlloc();
PADDLE_ENFORCE_GT(
available_to_alloc, 0,
platform::errors::ResourceExhausted("Not enough available NPU memory."));
// If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
// allocated by fraction
size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
: FLAGS_initial_gpu_memory_in_mb;
size_t alloc_bytes =
(flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(
available_to_alloc, alloc_bytes,
platform::errors::ResourceExhausted("Not enough available NPU memory."));
VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
<< " MiB, is it Re-alloc: " << realloc;
return alloc_bytes;
}
size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
size_t NPUMinChunkSize() {
// The minimum chunk size allowed to be allocated is 256 bytes.
return 1 << 8;
}
size_t NPUMaxChunkSize() {
size_t max_chunk_size = NPUMaxAllocSize();
VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
return max_chunk_size;
}
void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
void NPUMemcpySync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count) {
// NOTE(zhiqiu): The default max_count is count
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count) {
max_count = max_count ? max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemsetAsync(dst, max_count, value, count, stream));
}
void NPUStreamSync(aclrtStream stream) {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
}
static void RaiseNonOutOfMemoryError(aclError *status) {
if (*status == ACL_ERROR_BAD_ALLOC) {
*status = ACL_ERROR_NONE;
}
PADDLE_ENFORCE_NPU_SUCCESS(*status);
}
class RecordedNPUMallocHelper {
private:
explicit RecordedNPUMallocHelper(int dev_id, uint64_t limit_size = 0)
: dev_id_(dev_id), limit_size_(limit_size) {
if (NeedRecord()) {
mtx_.reset(new std::mutex());
}
}
DISABLE_COPY_AND_ASSIGN(RecordedNPUMallocHelper);
public:
static RecordedNPUMallocHelper *Instance(int dev_id) {
std::call_once(once_flag_, [] {
int dev_cnt = GetNPUDeviceCount();
instances_.reserve(dev_cnt);
for (int i = 0; i < dev_cnt; ++i) {
// NOTE(zhiqiu): share the flags with gpu, avoid more flags.
instances_.emplace_back(
new RecordedNPUMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
}
});
PADDLE_ENFORCE_GE(
dev_id, 0,
platform::errors::OutOfRange(
"Device id must be not less than 0, but got %d.", dev_id));
PADDLE_ENFORCE_LT(
dev_id, instances_.size(),
platform::errors::OutOfRange("Device id %d exceeds npu card number %d.",
dev_id, instances_.size()));
return instances_[dev_id].get();
}
/**
* Try to allocate `size` bytes of NPU memory. Only ACL_ERROR_BAD_ALLOC
* or ACL_ERROR_NONE will be returned.
*/
aclError Malloc(void **ptr, size_t size) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
return ACL_ERROR_BAD_ALLOC;
}
NPUDeviceGuard guard(dev_id_);
auto result = aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
if (result == ACL_ERROR_NONE) {
if (NeedRecord()) {
cur_size_ += size;
}
STAT_INT_ADD("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
return result;
} else {
RaiseNonOutOfMemoryError(&result);
// A non out-of-memory error would be raised inside
// RaiseNonOutOfMemoryError. Therefore, we can
// return ACL_ERROR_BAD_ALLOC directly here.
return ACL_ERROR_BAD_ALLOC;
}
}
/**
* Free NPU memory. Usually, free is not allowed to raise an error.
* If it does raise an error, the process should crash.
*/
void Free(void *ptr, size_t size) {
NPUDeviceGuard guard(dev_id_);
auto result = aclrtFree(ptr);
PADDLE_ENFORCE_NPU_SUCCESS(result);
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size;
}
STAT_INT_SUB("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
}
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total) {
{
NPUDeviceGuard guard(dev_id_);
auto result = aclrtGetMemInfo(ACL_HBM_MEM, actual_avail, actual_total);
if (result != ACL_ERROR_NONE) {
*actual_avail = 0;
}
RaiseNonOutOfMemoryError(&result);
}
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
*avail = std::min(*actual_avail, limit_size_ - cur_size_);
*total = std::min(*actual_total, limit_size_);
return *total < *actual_total;
} else {
*avail = *actual_avail;
*total = *actual_total;
return false;
}
}
inline bool NeedRecord() const { return limit_size_ != 0; }
uint64_t RecordedSize() const {
LockGuardPtr<std::mutex> lock(mtx_);
return NeedRecord() ? cur_size_ : 0;
}
uint64_t LimitSize() const { return limit_size_; }
private:
const int dev_id_;
const uint64_t limit_size_;
uint64_t cur_size_{0};
mutable std::unique_ptr<std::mutex> mtx_;
static std::once_flag once_flag_;
static std::vector<std::unique_ptr<RecordedNPUMallocHelper>> instances_;
};
std::once_flag RecordedNPUMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedNPUMallocHelper>>
RecordedNPUMallocHelper::instances_;
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->Malloc(ptr, size);
}
void RecordedNPUFree(void *p, size_t size, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->Free(p, size);
}
bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->GetMemInfo(
avail, total, actual_avail, actual_total);
}
uint64_t RecordedNPUMallocSize(int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->RecordedSize();
}
bool IsNPUMallocRecorded(int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
}
} // namespace platform
} // namespace paddle
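Taken together, the helpers above provide a recorded malloc/free pair plus thin memcpy/memset wrappers. A rough sketch of copying a host buffer to device 0 with them (hypothetical function name; assumes a WITH_ASCEND_CL build):

#include <vector>
#include "paddle/fluid/platform/npu_info.h"

void RecordedCopySketch() {
  namespace plat = paddle::platform;
  std::vector<float> host(1024, 1.0f);
  const size_t bytes = host.size() * sizeof(float);

  void *dev = nullptr;
  PADDLE_ENFORCE_EQ(plat::RecordedNPUMalloc(&dev, bytes, /*dev_id=*/0),
                    ACL_ERROR_NONE,
                    plat::errors::ResourceExhausted("NPU malloc failed"));
  plat::NPUMemcpySync(dev, host.data(), bytes, ACL_MEMCPY_HOST_TO_DEVICE);
  plat::RecordedNPUFree(dev, bytes, /*dev_id=*/0);
}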
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <stddef.h>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
//! Get the total number of NPU devices in system.
int GetNPUDeviceCount();
//! Get the runtime version of the ith NPU
std::string GetNPURuntimeVersion(int id);
//! Get the current NPU device id in system.
int GetCurrentNPUDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices();
//! Set the NPU device id for next execution.
void SetNPUDeviceId(int device_id);
//! Reset the NPU device id for next execution.
void ResetNPUDeviceId(int device_id);
//! Get the memory usage of current NPU device.
void NPUMemoryUsage(size_t *available, size_t *total);
//! Get the available memory to allocate, which is the available NPU memory
//! minus the reserved size.
size_t NPUAvailableMemToAlloc();
//! Get the maximum allocation size of current NPU device.
size_t NPUMaxAllocSize();
//! Get the initial allocation size of current NPU device.
size_t NPUInitAllocSize();
//! Get the re-allocation size of current NPU device.
size_t NPUReallocSize();
//! Get the minimum chunk size for NPU buddy allocator.
size_t NPUMinChunkSize();
//! Get the maximum chunk size for NPU buddy allocator.
size_t NPUMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count = 0);
//! Copy memory from address src to dst synchronously.
void NPUMemcpySync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count = 0);
//! Set memory dst with value count size asynchronously
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count = 0);
//! Blocks until stream has completed all operations.
void NPUStreamSync(aclrtStream stream);
//! aclrtMalloc with recorded info
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id);
//! aclrtFree with recorded info
void RecordedNPUFree(void *p, size_t size, int dev_id);
//! Get available and total NPU memory, taking the memory limit into account
bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id);
//! Get recorded aclrtMalloc size. If record is disabled, return 0.
uint64_t RecordedNPUMallocSize(int dev_id);
bool IsNPUMallocRecorded(int dev_id);
class NPUDeviceGuard {
public:
explicit inline NPUDeviceGuard(int dev_id) {
int prev_id = platform::GetCurrentNPUDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
platform::SetNPUDeviceId(dev_id);
}
}
inline ~NPUDeviceGuard() {
if (prev_id_ != -1) {
platform::SetNPUDeviceId(prev_id_);
}
}
NPUDeviceGuard(const NPUDeviceGuard &o) = delete;
NPUDeviceGuard &operator=(const NPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
class AclInstance {
public:
// NOTE(zhiqiu): Commonly, throwing an exception in a destructor is not
// recommended, so no PADDLE_ENFORCE here; call the ACL API directly.
~AclInstance() {}
AclInstance(const AclInstance &o) = delete;
const AclInstance &operator=(const AclInstance &o) = delete;
static AclInstance &Instance() {
static AclInstance instance;
return instance;
}
void Finalize() {
// NOTE(zhiqiu): DO NOT perform finalize in destructor
// to avoid problems caused by destructor order of static
// object.
for (size_t i = 0; i < devices_.size(); ++i) {
auto status = aclrtResetDevice(devices_[i]);
VLOG(4) << "Call aclrtResetDevice " << devices_[i]
<< " status = " << status;
}
auto status = aclFinalize();
VLOG(4) << "Call aclFinalize, status = " << status;
}
private:
// forbid calling default constructor
AclInstance() {
PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
VLOG(4) << "Call aclrtSetDevice ";
// NOTE(zhiqiu): why set devices here?
// Because ACL creates a default context which contains 2 streams
// when calling aclrtSetDevice, so usually we do not need to
// create contexts explicitly. And, for each device, aclrtSetDevice
// needs to be called in pairs with aclrtResetDevice to destroy the
// default context. Here, we use this singleton and static instance to
// manage the devices and make sure they are reset before program exit.
devices_ = platform::GetSelectedNPUDevices();
for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
SetNPUDeviceId(*it);
VLOG(4) << "Call aclrtSetDevice " << *it;
}
}
std::vector<int> devices_;
};
} // namespace platform
} // namespace paddle
#endif
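The device guard and the ACL singleton are meant to compose as in this minimal lifecycle sketch (hypothetical function name):

#include "paddle/fluid/platform/npu_info.h"

void AclLifecycleSketch() {
  namespace plat = paddle::platform;
  // First call runs aclInit and aclrtSetDevice for every selected NPU.
  plat::AclInstance::Instance();
  {
    // Temporarily switch to device 1; the previous device is restored
    // when the guard is destroyed.
    plat::NPUDeviceGuard guard(1);
    // ... work on NPU 1 ...
  }
  // Explicit teardown; also exposed to Python as core._npu_finalize below.
  plat::AclInstance::Instance().Finalize();
}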
......@@ -33,6 +33,7 @@ class PlacePrinter : public boost::static_visitor<> {
os_ << "CUDAPlace(" << p.device << ")";
}
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -49,6 +50,10 @@ bool is_xpu_place(const Place &p) {
return boost::apply_visitor(IsXPUPlace(), p);
}
bool is_npu_place(const Place &p) {
return boost::apply_visitor(IsNPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -67,6 +72,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return true;
} else if (is_xpu_place(p1)) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else if (is_npu_place(p1)) {
return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -72,16 +72,31 @@ struct XPUPlace {
int device;
};
struct NPUPlace {
NPUPlace() : NPUPlace(0) {}
explicit NPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const NPUPlace &o) const { return device == o.device; }
inline bool operator!=(const NPUPlace &o) const { return !(*this == o); }
inline bool operator<(const NPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &cpu) const { return true; }
bool operator()(const CPUPlace &) const { return true; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -89,27 +104,38 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &xpu) const { return true; }
bool operator()(const XPUPlace &) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place
: public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
struct IsNPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace> {
private:
using PlaceBase =
boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace, CUDAPinnedPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -126,6 +152,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
......@@ -153,6 +180,16 @@ struct PlaceVisitorWrapper
#endif
}
typename Visitor::result_type operator()(const NPUPlace &npu) const {
#ifdef PADDLE_WITH_ASCEND
return visitor_(npu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda);
......
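With NPUPlace added to the Place variant, the existing place helpers extend naturally; a short sketch (hypothetical function name):

#include "paddle/fluid/platform/place.h"

void NpuPlaceSketch() {
  namespace plat = paddle::platform;
  plat::Place place = plat::NPUPlace(0);  // implicit conversion into the variant
  bool on_npu = plat::is_npu_place(place);                    // true
  bool same = plat::is_same_place(place, plat::NPUPlace(0));  // true
  (void)on_npu;
  (void)same;
}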
......@@ -102,6 +102,10 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
......@@ -487,6 +491,11 @@ PYBIND11_MODULE(core_noavx, m) {
make_ddim(x_dim), make_ddim(y_dim), -1));
});
#ifdef PADDLE_WITH_ASCEND_CL
m.def("_npu_finalize",
[]() { platform::AclInstance::Instance().Finalize(); });
#endif
m.def(
"_append_python_callable_object_and_return_id",
[](py::object py_obj) -> size_t {
......@@ -1447,7 +1456,6 @@ All parameter, weight, gradient are variables in Paddle.
.def("__repr__", string::to_string<const platform::CUDAPlace &>)
.def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
**Note**:
Examples:
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/npu_info.h"
int main(int argc, char** argv) {
paddle::memory::allocation::UseAllocatorStrategyGFlag();
......@@ -38,11 +39,12 @@ int main(int argc, char** argv) {
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
envs.push_back("fraction_of_gpu_memory_to_use");
envs.push_back("initial_gpu_memory_in_mb");
envs.push_back("reallocate_gpu_memory_in_mb");
envs.push_back("allocator_strategy");
envs.push_back("selected_gpus");
#elif __clang__
envs.push_back("use_mkldnn");
envs.push_back("initial_cpu_memory_in_mb");
......@@ -92,6 +94,10 @@ int main(int argc, char** argv) {
paddle::framework::InitDevices();
int ret = RUN_ALL_TESTS();
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::AclInstance::Instance().Finalize();
#endif
if (env_str) free(env_str);
if (undefok_str) free(undefok_str);
......