diff --git a/CMakeLists.txt b/CMakeLists.txt
index 043a799b6a17c4cf4e4044fa0b58fe919beccbbe..e712efd67fc1dcc53a0de3726f7fd5696eca8834 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,11 +32,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
 option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
+# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF
+# and WITH_ASCEND_CL=ON to develop ACL-related functionality on x86
+option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
 option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
 if (WITH_GPU AND WITH_XPU)
   message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
 endif()
-if (WITH_GPU AND WITH_ASCEND) 
+if (WITH_GPU AND WITH_ASCEND)
   message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
 # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index fc1e72ba3fccbb2a14f8482502b7c9783ae3a989..c229bdcd643027caad27fec54cf462a5998cc9a9 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -82,6 +82,10 @@ if(WITH_ASCEND)
   add_definitions(-DPADDLE_WITH_ASCEND)
 endif()
 
+if(WITH_ASCEND_CL)
+  add_definitions(-DPADDLE_WITH_ASCEND_CL)
+endif()
+
 if(WITH_XPU)
   message(STATUS "Compile with XPU!")
   add_definitions(-DPADDLE_WITH_XPU)
diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index a0b6f480f95ae70333c2f3dd8d20a8050b045425..c23d30c5b9b26b6d7d8aa09bca64d7c675022254 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -21,38 +21,58 @@ else()
   set(ASCEND_DIR /usr/local/Ascend)
 endif()
 
-set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
-set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
-set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
-set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
-set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
-set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
-set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
-
-set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
-set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
-set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
-set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
-set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
-set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
-set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
-
-set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
-set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
-set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
-INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
-
-if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
-  add_definitions(-DPADDLE_WITH_ASCEND_STRING)
-endif()
+if(WITH_ASCEND)
+  set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
+  set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
+  set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
+  set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
+  set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
+  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
+  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
+
+  set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
+  set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
+  set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+  set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+  set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
+
+  set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
+  set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
+  set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
+  INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
+
+  if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
+    add_definitions(-DPADDLE_WITH_ASCEND_STRING)
+  endif()
 
-ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
+  ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
 
-ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
+  ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
 
-ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
+  ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
 
-add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
+  add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
+
+elseif(WITH_ASCEND_CL)
+  set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
+  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
+  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
+
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+
+  set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
+  set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
+  message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
+  message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
+  INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
+
+  ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
+
+  add_custom_target(extern_ascend DEPENDS atlas_acl)
+endif()
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index d576a299b866c8faf5fcdb25672f6403546207df..327de067be8762a789831f7d5033a1cb83e7aa64 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -274,10 +274,10 @@ if(WITH_BOX_PS)
     list(APPEND third_party_deps extern_box_ps)
 endif(WITH_BOX_PS)
 
-if(WITH_ASCEND)
+if(WITH_ASCEND OR WITH_ASCEND_CL)
     include(external/ascend)
     list(APPEND third_party_deps extern_ascend)
-endif (WITH_ASCEND)
+endif ()
 
 if (WITH_PSCORE)
     include(external/snappy)
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index ac1e39ad2c1af6894d6bbaec563c487a6857f95a..fce930727bcf63a751991539dcf32eea2cd1c9a0 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
         platform::errors::Unimplemented("platform::XPUPlace is not supported"));
   }
 
+  inline ::DLContext operator()(const platform::NPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::NPUPlace is not supported"));
+  }
+
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
     ::DLContext ctx;
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1ad321df216fe16f8731f400026716f1c33b84e3..0de97a62ac0e1e574ccbdfaf4c993366f1a0d77f 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -431,6 +431,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
     return GetResultHelper(out, gpu);
   }
 
+  bool GetResult(const framework::Tensor& out,
+                 const platform::NPUPlace& npu) const {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Not supported on place (%s).", npu));
+    // return GetResultHelper(out, npu);
+  }
+
   bool GetResult(const framework::Tensor& out,
                  const platform::CPUPlace& cpu) const {
     return *out.data<bool>();
@@ -633,6 +641,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
 #endif
   }
 
+  void VisitorImpl(const platform::NPUPlace& npu) const {
+    // TODO(zhiqiu): support NPUPlace
+  }
+
   void VisitorImpl(const platform::CPUPlace& cpu) const {
     int num = in_.numel();
     const bool* in_ptr = in_.data<bool>();
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index ff8494a3888172a26edeeca7dfdde77bcaf0e1f4..08e668c25a06035be3c0ef50c42c0838d69aa20c 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -116,6 +116,23 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+  void operator()(const platform::NPUPlace& place) {
+    // TODO(zhiqiu): support it
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const platform::NPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
   // there is NO blas in CUDAPinnedPlace
   void operator()(const platform::CUDAPinnedPlace& place) {
     PADDLE_THROW(platform::errors::PermissionDenied(
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index c93f637af1a20256f232914b187911e670ba38ce..14d4c983faf0259f9e31848de3fdb76cace3e291 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -19,6 +19,10 @@ if (WITH_GPU)
   cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
 endif()
 
+if (WITH_ASCEND_CL)
+  cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
+endif()
+
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index a124a56ef89c57f2537704be5508cf564dbcb959..100d24c89abdd82b1cc0b6b9aeb59b9ef9c35cd3 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -42,6 +42,7 @@
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu_info.h"
 #endif
+#include "paddle/fluid/platform/npu_info.h"
 
 DEFINE_int64(
     gpu_allocator_retry_time, 10000,
@@ -76,6 +77,11 @@ class AllocatorFacadePrivate {
         InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
       }
       InitNaiveBestFitCUDAPinnedAllocator();
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+      for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
+        InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
+      }
 #endif
       break;
     }
@@ -195,6 +201,12 @@ class AllocatorFacadePrivate {
   }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
+    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
+  }
+#endif
+
   class ZeroSizeAllocator : public Allocator {
    public:
     explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index fcde4cbab42685f4d55892f555941e3f5949e11c..4b41ba8cf0e92e84671e6be0e9b46a32cb007094 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -23,6 +23,7 @@
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"
@@ -112,6 +113,7 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+// For Kunlun XPU
 template <>
 void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
 #ifdef PADDLE_WITH_XPU
@@ -216,6 +218,136 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
 #endif
 }
 
+// For Ascend NPU
+#ifdef PADDLE_WITH_ASCEND_CL
+class NPUBuddyAllocatorList {
+ private:
+  NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
+    auto npu_num = devices_.size();
+    allocators_.resize(npu_num);
+    init_flags_.reserve(npu_num);
+    for (size_t i = 0; i < npu_num; ++i) {
+      init_flags_.emplace_back(new std::once_flag());
+    }
+  }
+
+  static NPUBuddyAllocatorList *CreateNewInstance() {
+    return new NPUBuddyAllocatorList();
+  }
+
+ public:
+  static NPUBuddyAllocatorList *Instance() {
+    static auto *instance = CreateNewInstance();
+    return instance;
+  }
+
+  BuddyAllocator *Get(int npu_id) {
+    auto pos = std::distance(
+        devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
+    PADDLE_ENFORCE_LT(pos, devices_.size(),
+                      platform::errors::OutOfRange(
+                          "The index exceeds the size of devices, the size of "
+                          "devices is %d, the index is %d",
+                          devices_.size(), pos));
+
+    std::call_once(*init_flags_[pos], [this, pos] {
+      platform::SetNPUDeviceId(devices_[pos]);
+      allocators_[pos].reset(new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(
+              new detail::NPUAllocator(devices_[pos])),
+          platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
+      VLOG(10) << "\n\nNOTE:\n"
+               << "You can set GFlags environment variable "
+               << "'FLAGS_fraction_of_gpu_memory_to_use' "
+               << "or 'FLAGS_initial_gpu_memory_in_mb' "
+               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
+               << "to change the memory size for GPU usage.\n"
+               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
+               << FLAGS_fraction_of_gpu_memory_to_use
+               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
+               << FLAGS_initial_gpu_memory_in_mb
+               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
+               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
+    });
+
+    return allocators_[pos].get();
+  }
+
+ private:
+  std::vector<int> devices_;
+  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
+  std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
+};
+
+BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
+  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
+}
+#endif
+
+template <>
+size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
+  auto *ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    platform::NPUDeviceGuard guard(place.device);
+    size_t avail, total;
+    platform::NPUMemoryUsage(&avail, &total);
+    PADDLE_THROW(platform::errors::ResourceExhausted(
+        "Cannot allocate %s in NPU %d, available %s, total %s, NpuMinChunkSize "
+        "%s, NpuMaxChunkSize %s, NPU memory used: %s.",
+        string::HumanReadableSize(size), place.device,
+        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
+        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
+        string::HumanReadableSize(Used<platform::NPUPlace>(place))));
+  } else {
+    if (FLAGS_init_allocated_mem) {
+      aclrtMemset(ptr, size, 0xEF, size);
+    }
+  }
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  return ptr;
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
+                              size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetNPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUBuddyAllocator(place.device)->Release();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+// For CUDA
+
 #ifdef PADDLE_WITH_CUDA
 class GPUBuddyAllocatorList {
  private:
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
index 054c75b11f78c7733c15ac39a44cdc45078af7e7..473239d714d89a70fc1eea88a453cc3f76317d67 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
 
+#include <unistd.h>
+
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <thread>  // NOLINT
@@ -69,6 +71,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
 }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
+  NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
+  {
+    size_t size = (1 << 20);
+    auto allocation = alloc.Allocate(size);
+  }
+  sleep(10);
+  alloc.Release(platform::NPUPlace(0));
+
+  size_t size = (1 << 20);
+  auto allocation = alloc.Allocate(size);
+  alloc.Release(platform::NPUPlace(0));
+}
+#endif
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
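[Editor's note] A minimal usage sketch of the naive-best-fit NPU path the test above exercises. NaiveBestFitAllocator, NPUPlace, Allocate, and Release all come from this diff; Demo is a hypothetical caller and assumes a visible Ascend device 0 in a WITH_ASCEND_CL build:

    #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"

    void Demo() {
      paddle::memory::allocation::NaiveBestFitAllocator alloc{
          paddle::platform::NPUPlace(0)};
      auto allocation = alloc.Allocate(1 << 20);  // 1 MiB from the NPU buddy pool
      allocation.reset();                         // back to the pool, not the device
      alloc.Release(paddle::platform::NPUPlace(0));  // idle chunks back to the device
    }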
diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ecdee9bd03352201060911848647b60d3cc0203
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_allocator.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/npu_allocator.h"
+#include <string>
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/npu_info.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+bool NPUAllocator::IsAllocThreadSafe() const { return true; }
+void NPUAllocator::FreeImpl(Allocation* allocation) {
+  PADDLE_ENFORCE_EQ(
+      BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
+      platform::errors::PermissionDenied(
+          "NPU memory is freed in incorrect device. This may be a bug"));
+  platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
+                            place_.device);
+  delete allocation;
+}
+
+Allocation* NPUAllocator::AllocateImpl(size_t size) {
+  std::call_once(once_flag_,
+                 [this] { platform::SetNPUDeviceId(place_.device); });
+
+  void* ptr;
+  auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
+  if (LIKELY(result == ACL_ERROR_NONE)) {
+    return new Allocation(ptr, size, platform::Place(place_));
+  }
+
+  size_t avail, total, actual_avail, actual_total;
+  bool is_limited = platform::RecordedNPUMemGetInfo(
+      &avail, &total, &actual_avail, &actual_total, place_.device);
+
+  std::string err_msg;
+  if (is_limited) {
+    auto limit_size = (total >> 20);
+    err_msg = string::Sprintf(
+        "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
+        "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
+        "NPU memory usage is limited to %d MB.\n"
+        "   The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+        limit_size, limit_size);
+  }
+
+  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+      "\n\nOut of memory error on NPU %d. "
+      "Cannot allocate %s memory on NPU %d, "
+      "available memory is only %s.\n\n"
+      "Please check whether there is any other process using NPU %d.\n"
+      "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
+      "2. If no, please decrease the batch size of your model. %s\n\n",
+      place_.device, string::HumanReadableSize(size), place_.device,
+      string::HumanReadableSize(avail), place_.device, err_msg));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..738ec5d3ce120f3d08b887d3a84d4d79a1e9e1d6
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_allocator.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <mutex>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class NPUAllocator : public Allocator {
+ public:
+  explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
+
+  bool IsAllocThreadSafe() const override;
+
+ protected:
+  void FreeImpl(Allocation* allocation) override;
+  Allocation* AllocateImpl(size_t size) override;
+
+ private:
+  platform::NPUPlace place_;
+  std::once_flag once_flag_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index 8f0988e871fa5c9990285d7ff80257a6e19046a6..66d8c7e6bba6616dfc066970f3d168b77db2276a 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -2,11 +2,15 @@ include(ExternalProject)
 
 cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
 
+set(system_allocator_DEPS gflags cpu_info place)
+
 if(${WITH_GPU})
   nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
-else(${WITH_GPU})
+elseif(${WITH_ASCEND_CL})
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
+else()
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
-endif(${WITH_GPU})
+endif()
 
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
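[Editor's note] A hedged sketch of the facade-level allocator declared above, used standalone. Allocate and the returned smart pointer come from Paddle's Allocator base class; Demo is a hypothetical caller assuming device 0:

    #include "paddle/fluid/memory/allocation/npu_allocator.h"

    void Demo() {
      paddle::memory::allocation::NPUAllocator allocator(
          paddle::platform::NPUPlace(0));
      auto holder = allocator.Allocate(1 << 20);  // RecordedNPUMalloc under the hood
      holder.reset();                             // FreeImpl -> RecordedNPUFree
    }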
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 37795715361ec3ec633b79824ebcbeee4c3a22e4..0a391539b9831c782e99afece8bc4947df37d51e 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -21,6 +21,10 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
 namespace paddle {
 namespace memory {
@@ -235,6 +239,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     }
   }
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the allocation size for the NPU for the first allocation.
+      allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
+    } else {
+      // Compute the re-allocation size; we store the re-allocation size when
+      // the user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
+      if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
+        realloc_size_ = platform::NPUReallocSize();
+      }
+      allocate_bytes = std::max(realloc_size_, request_bytes);
+    }
+  }
+#endif
 
   // Allocate a new block
   void* p = system_allocator_->Alloc(&index, allocate_bytes);
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index de77108f3404a1ad0cc611dec6a9fdae97865fea..807de9c03adf9fc1a10ffd1e8d019ce815b6d04b 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
index 90f7e33eb3540f6272df80296bba57c3d7d9b596..a0319a2b707eea32e8ba3cd59f10e9355685c288 100644
--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -19,13 +19,15 @@ limitations under the License. */
 #ifdef WITH_GPERFTOOLS
 #include "gperftools/profiler.h"
 #endif
+#include <fstream>
+#include <string>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 
-#ifdef PADDLE_WITH_CUDA
-#include <fstream>
-#include <string>
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
@@ -324,6 +326,33 @@ TEST(BuddyAllocator, Release) {
 }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(BuddyAllocator, NpuFraction) {
+  // In a 16 GB machine, the pool size will be about 160 MB
+  FLAGS_fraction_of_gpu_memory_to_use = 0.005;
+  FLAGS_fraction_of_gpu_memory_to_use = 0.92;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
+      platform::NPUMinChunkSize(), platform::NPUMaxChunkSize());
+
+  // Less than pool size
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  buddy_allocator.Release();
+
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 300 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
+}
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 0fbbf405f0bf166b71a3b447338d9df7ad675f1b..f5cfa5f5f8681f958835f3db762f2db243026497 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -35,6 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -239,6 +240,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+void* NPUAllocator::Alloc(size_t* index, size_t size) {
+  if (size <= 0) return nullptr;
+
+  void* p;
+  auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
+
+  if (result == ACL_ERROR_NONE) {
+    *index = 0;
+    npu_alloc_size_ += size;
+    return p;
+  } else {
+    size_t avail, total, actual_avail, actual_total;
+    bool is_limited = platform::RecordedNPUMemGetInfo(
+        &avail, &total, &actual_avail, &actual_total, npu_id_);
+
+    std::string err_msg;
+    if (is_limited) {
+      auto limit_size = (total >> 20);
+      err_msg = string::Sprintf(
+          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
+          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
+          "maximum NPU memory usage is limited to %d MB.\n"
+          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+          limit_size, limit_size);
+    }
+
+    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+        "\n\nOut of memory error on NPU %d. "
+        "Cannot allocate %s memory on NPU %d, "
+        "available memory is only %s.\n\n"
+        "Please check whether there is any other process using NPU %d.\n"
+        "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
+        "2. If no, please try one of the following suggestions:\n"
+        "   1) Decrease the batch size of your model.\n"
+        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
+        "please set it to a higher value but less than 1.0.\n"
+        "      The command is "
+        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
+        npu_id_, string::HumanReadableSize(size), npu_id_,
+        string::HumanReadableSize(avail), npu_id_,
+        FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+  }
+}
+
+void NPUAllocator::Free(void* p, size_t size, size_t index) {
+  VLOG(4) << "Free " << p << " size " << size;
+  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
+                                  "The index should be 0, index is %d", index));
+  PADDLE_ENFORCE_GE(npu_alloc_size_, size,
+                    platform::errors::InvalidArgument(
+                        "The size of memory (%d) to free exceeds the size of "
+                        "allocated npu memory (%d)",
+                        size, npu_alloc_size_));
+  npu_alloc_size_ -= size;
+
+  platform::RecordedNPUFree(p, size, npu_id_);
+}
+
+// NPU memory behaves like device (GPU) memory for the buddy allocator.
+bool NPUAllocator::UseGpu() const { return true; }
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index 42f0f23ec1d5d48276285dcef547a4d51054538b..7acaaa4d665e4dacde61b04a3c653c36f277ffcc 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator {
 };
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+
+class NPUAllocator : public SystemAllocator {
+ public:
+  explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
+
+  virtual void* Alloc(size_t* index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t npu_alloc_size_ = 0;
+  int npu_id_;
+};
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index ea4897494f72b96e85911e03b651af1b4eac3298..d2ccb9f892f6b3a01a5d6a89ab91d0007bc5d12e 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -81,3 +81,11 @@ TEST(GPUAllocator, AllocFailure) {
   }
 }
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(NPUAllocator, Alloc) {
+  paddle::memory::detail::NPUAllocator a(0);
+  TestAllocator(&a, 1 << 20);
+  TestAllocator(&a, 1);
+}
+#endif
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 5afda787339dbe714ba6c82e3c34d39eb6d75580..f6ba1687980f497c8b6a32ad66798a87ddc35396 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -148,6 +148,13 @@ void set_constant_with_place<platform::XPUPlace>(
   PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
 }
 
+template <>
+void set_constant_with_place<platform::NPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 63ff4852f16de6d77fb385f1eae984403e116722..f3331349fde86b9e2ad00e05d7ca7263373d9a2f 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -71,6 +71,10 @@ if(WITH_ASCEND)
   cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
 endif()
 
+if(WITH_ASCEND_CL)
+  cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
+endif()
+
 add_subdirectory(dynload)
 add_subdirectory(stream)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index fb94768984fcfb4b886e4805f8328fe76a7b3625..24182b837f13cb83b6d087ed1e7410cdf3845d34 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -228,6 +228,33 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
 xpu::Context* XPUDeviceContext::x_context() const { return context_; }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
+  NPUDeviceGuard guard(place_.device);
+  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
+  // NOTE(zhiqiu): Usually, no need to create a context explicitly;
+  // ACL creates a default context which contains 1 default stream
+  // and 1 sync stream after aclrtSetDevice.
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
+}
+
+NPUDeviceContext::~NPUDeviceContext() {
+  // NPUDeviceGuard guard(place_.device);
+  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
+}
+
+void NPUDeviceContext::Wait() const {
+  NPUDeviceGuard guard(place_.device);
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
+}
+
+Place NPUDeviceContext::GetPlace() const { return place_; }
+
+aclrtContext* NPUDeviceContext::context() const {
+  return const_cast<aclrtContext*>(&context_);
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index f058da97b5cfa2358873dea6e3efec997fb40dff..a4e584eeffa21be4dc4d65c89f8257b5ff66d953 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -59,6 +59,11 @@ struct GpuDevice;
 #include "paddle/fluid/platform/xpu_info.h"
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "acl/acl.h"
+#include "paddle/fluid/platform/npu_info.h"
+#endif
+
 namespace paddle {
 namespace platform {
 
@@ -77,11 +82,13 @@ enum DeviceType {
   CPU = 0,
   CUDA = 1,
   XPU = 2,
+  NPU = 3,
 };
 
 constexpr DeviceType kCPU = DeviceType::CPU;
 constexpr DeviceType kCUDA = DeviceType::CUDA;
 constexpr DeviceType kXPU = DeviceType::XPU;
+constexpr DeviceType kNPU = DeviceType::NPU;
 
 class DeviceContext {
  public:
@@ -153,6 +160,46 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
 };
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+class NPUDeviceContext : public DeviceContext {
+ public:
+  explicit NPUDeviceContext(NPUPlace place);
+  virtual ~NPUDeviceContext();
+  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
+  Place GetPlace() const override;
+  aclrtContext* context() const;
+
+  /*! \brief  Wait for all operations completion in the stream. */
+  void Wait() const override;
+
+#ifdef PADDLE_WITH_ASCEND_HCCL
+  /*! \brief  Return hccl context. */
+  HCCLContext_t hccl_context() const { return hccl_context_; }
+
+  /*! \brief  Set hccl context. */
+  void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
+#endif
+
+ private:
+  NPUPlace place_;
+  aclrtContext context_;
+#ifdef PADDLE_WITH_ASCEND_HCCL
+  HCCLContext_t hccl_context_;
+#endif
+
+  // NOTE(zhiqiu): kept to match the layout of other DeviceContexts,
+  // even though eigen_device_ is not used on NPU.
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+
+  DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
+};
+
+template <>
+struct DefaultDeviceContextType<platform::NPUPlace> {
+  using TYPE = NPUDeviceContext;
+};
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class CudnnWorkspaceHandle;
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index db84b8731f9ca467c4521221a3dbe0b1fc61b597..6c9a0cd44442faeb5b0f9a8115e5231c34f8fe02 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
 #include <glog/logging.h>
 #include <mutex>  // NOLINT
@@ -214,3 +215,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 0b8a361abb58889476a437f8d3fe4a932b09cf31..c06616d01d572b23d50fd79c577eb271a71ba754 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -38,6 +38,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_error.pb.h"
 #endif  // PADDLE_WITH_CUDA
 
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "acl/acl.h"
+#endif  // PADDLE_WITH_ASCEND_CL
+
 #include <fstream>
 #include <iomanip>
 #include <memory>
@@ -940,7 +944,6 @@ DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS);
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
 #endif
-
 }  // namespace details
 
 #define PADDLE_ENFORCE_CUDA_SUCCESS(COND)                        \
@@ -996,5 +999,40 @@ inline void retry_sleep(unsigned milliseconds) {
 #undef DEFINE_CUDA_STATUS_TYPE
 #endif  // PADDLE_WITH_CUDA
 
+#ifdef PADDLE_WITH_ASCEND_CL
+namespace details {
+template <typename T>
+struct NPUStatusType {};
+
+#define DEFINE_NPU_STATUS_TYPE(type, success_value) \
+  template <>                                       \
+  struct NPUStatusType<type> {                      \
+    using Type = type;                              \
+    static constexpr Type kSuccess = success_value; \
+  }
+
+DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
+}  // namespace details
+
+inline std::string build_npu_error_msg(aclError stat) {
+  std::string s = " ACL error, the error code is : " + std::to_string(stat);
+  return s;
+}
+
+#define PADDLE_ENFORCE_NPU_SUCCESS(COND)                       \
+  do {                                                         \
+    auto __cond__ = (COND);                                    \
+    using __NPU_STATUS_TYPE__ = decltype(__cond__);            \
+    constexpr auto __success_type__ =                          \
+        ::paddle::platform::details::NPUStatusType<            \
+            __NPU_STATUS_TYPE__>::kSuccess;                    \
+    if (UNLIKELY(__cond__ != __success_type__)) {              \
+      auto __summary__ = ::paddle::platform::errors::External( \
+          ::paddle::platform::build_npu_error_msg(__cond__));  \
+      __THROW_ERROR_INTERNAL__(__summary__);                   \
+    }                                                          \
+  } while (0)
+#endif  // PADDLE_WITH_ASCEND_CL
+
 }  // namespace platform
 }  // namespace paddle
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/string/split.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" @@ -70,6 +71,7 @@ namespace framework { std::once_flag gflags_init_flag; std::once_flag glog_init_flag; +std::once_flag npu_init_flag; bool InitGflags(std::vector args) { bool successed = false; @@ -148,6 +150,17 @@ void InitDevices() { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): use singleton to explicitly init and finalize ACL + platform::AclInstance::Instance(); // NOLINT + try { + // use user specified XPUs in single-node multi-process mode. + devices = platform::GetSelectedNPUDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) + << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime."; + } #endif InitDevices(devices); } diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index 76554012bf51e34fc99db7759404f0e8d6f96cd6..1b44cb196547c2d26cdd5ae72c3331022f834657 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -35,3 +35,13 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size) DEFINE_INT_STATUS(STAT_gpu13_mem_size) DEFINE_INT_STATUS(STAT_gpu14_mem_size) DEFINE_INT_STATUS(STAT_gpu15_mem_size) + +// For Ascend NPU +DEFINE_INT_STATUS(STAT_npu0_mem_size) +DEFINE_INT_STATUS(STAT_npu1_mem_size) +DEFINE_INT_STATUS(STAT_npu2_mem_size) +DEFINE_INT_STATUS(STAT_npu3_mem_size) +DEFINE_INT_STATUS(STAT_npu4_mem_size) +DEFINE_INT_STATUS(STAT_npu5_mem_size) +DEFINE_INT_STATUS(STAT_npu6_mem_size) +DEFINE_INT_STATUS(STAT_npu7_mem_size) diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index b57fae9daac41f37829309c4bc5f58fb2606ca02..0eb9448ce0fad4e1caadb3e08140417294d5d0e7 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -187,3 +187,13 @@ class StatRegistry { USE_INT_STAT(STAT_gpu13_mem_size); \ USE_INT_STAT(STAT_gpu14_mem_size); \ USE_INT_STAT(STAT_gpu15_mem_size) + +#define USE_NPU_MEM_STAT \ + USE_INT_STAT(STAT_npu0_mem_size); \ + USE_INT_STAT(STAT_npu1_mem_size); \ + USE_INT_STAT(STAT_npu2_mem_size); \ + USE_INT_STAT(STAT_npu3_mem_size); \ + USE_INT_STAT(STAT_npu4_mem_size); \ + USE_INT_STAT(STAT_npu5_mem_size); \ + USE_INT_STAT(STAT_npu6_mem_size); \ + USE_INT_STAT(STAT_npu7_mem_size) diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7508f01a1a3b8f575bef17b7172d4dfacef5dfe --- /dev/null +++ b/paddle/fluid/platform/npu_info.cc @@ -0,0 +1,349 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/npu_info.h" +#include +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/string/split.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_bool(enable_cublas_tensor_op_math); +DECLARE_string(selected_gpus); +DECLARE_uint64(gpu_memory_limit_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +USE_NPU_MEM_STAT; + +namespace paddle { +namespace platform { + +static int GetNPUDeviceCountImpl() { + uint32_t count; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDeviceCount(&count)); + return count; +} + +int GetNPUDeviceCount() { + static auto dev_cnt = GetNPUDeviceCountImpl(); + return dev_cnt; +} + +// For example, "1.0.1" +std::string GetNPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. NPU count is: %d.", + id, GetNPUDeviceCount())); + int major = 0, minor = 0, patch = 0; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetVersion(&major, &minor, &patch)); + return string::Sprintf("%d.%d.%d", major, minor, patch); +} + +int GetCurrentNPUDeviceId() { + int device_id; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDevice(&device_id)); + return device_id; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedNPUDevices() { + // use user specified NPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetNPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void SetNPUDeviceId(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. NPU count is: %d.", + id, GetNPUDeviceCount())); + // NOTE(zihqiu): It is recommended to call aclrtSetDevice and aclrtResetDevice + // pairly. + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(id)); +} + +void ResetNPUDeviceId(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. 
NPU count is: %d.", + id, GetNPUDeviceCount())); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtResetDevice(id)); +} + +void NPUMemoryUsage(size_t *available, size_t *total) { + size_t actual_available, actual_total; + RecordedNPUMemGetInfo(available, total, &actual_available, &actual_total, + platform::GetCurrentNPUDeviceId()); +} + +size_t NPUAvailableMemToAlloc() { + size_t total = 0; + size_t available = 0; + NPUMemoryUsage(&available, &total); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = NPUMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + VLOG(10) << "NPU usage " << (available >> 20) << "M/" << (total >> 20) + << "M, " << (available_to_alloc >> 20) << "M available to allocate"; + return available_to_alloc; +} + +size_t NPUMaxAllocSize() { + return std::max(NPUInitAllocSize(), NPUReallocSize()); +} + +static size_t NPUAllocSize(bool realloc) { + size_t available_to_alloc = NPUAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, 0, + platform::errors::ResourceExhausted("Not enough available NPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted("Not enough available NPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + +size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); } + +size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); } + +size_t NPUMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t NPUMaxChunkSize() { + size_t max_chunk_size = NPUMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +void NPUMemcpyASync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, aclrtStream stream, + size_t dst_max_count) { + dst_max_count = dst_max_count ? dst_max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream)); +} + +void NPUMemcpySync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, size_t dst_max_count) { + // NOTE(zhiqiu): The default max_count is count + dst_max_count = dst_max_count ? dst_max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind)); +} + +void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, + size_t max_count) { + max_count = max_count ? 
+void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
+                    size_t max_count) {
+  max_count = max_count ? max_count : count;
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclrtMemsetAsync(dst, max_count, value, count, stream));
+}
+
+void NPUStreamSync(aclrtStream stream) {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
+}
+
+static void RaiseNonOutOfMemoryError(aclError *status) {
+  if (*status == ACL_ERROR_BAD_ALLOC) {
+    *status = ACL_ERROR_NONE;
+  }
+  PADDLE_ENFORCE_NPU_SUCCESS(*status);
+}
+
+class RecordedNPUMallocHelper {
+ private:
+  explicit RecordedNPUMallocHelper(int dev_id, uint64_t limit_size = 0)
+      : dev_id_(dev_id), limit_size_(limit_size) {
+    if (NeedRecord()) {
+      mtx_.reset(new std::mutex());
+    }
+  }
+
+  DISABLE_COPY_AND_ASSIGN(RecordedNPUMallocHelper);
+
+ public:
+  static RecordedNPUMallocHelper *Instance(int dev_id) {
+    std::call_once(once_flag_, [] {
+      int dev_cnt = GetNPUDeviceCount();
+      instances_.reserve(dev_cnt);
+      for (int i = 0; i < dev_cnt; ++i) {
+        // NOTE(zhiqiu): share the flags with gpu, avoid more flags.
+        instances_.emplace_back(
+            new RecordedNPUMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
+      }
+    });
+
+    PADDLE_ENFORCE_GE(
+        dev_id, 0,
+        platform::errors::OutOfRange(
+            "Device id must be not less than 0, but got %d.", dev_id));
+    PADDLE_ENFORCE_LT(
+        dev_id, instances_.size(),
+        platform::errors::OutOfRange("Device id %d exceeds npu card number %d.",
+                                     dev_id, instances_.size()));
+    return instances_[dev_id].get();
+  }
+
+  /**
+   * Try to allocate `size` npu memory. Only ACL_ERROR_BAD_ALLOC
+   * or ACL_ERROR_NONE would be returned.
+   */
+  aclError Malloc(void **ptr, size_t size) {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
+      return ACL_ERROR_BAD_ALLOC;
+    }
+
+    NPUDeviceGuard guard(dev_id_);
+    auto result = aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    if (result == ACL_ERROR_NONE) {
+      if (NeedRecord()) {
+        cur_size_ += size;
+      }
+      STAT_INT_ADD("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
+      return result;
+    } else {
+      RaiseNonOutOfMemoryError(&result);
+      // Non out of memory error would be raised inside
+      // RaiseNonOutOfMemoryError. Therefore, we can
+      // return ACL_ERROR_BAD_ALLOC directly here.
+      return ACL_ERROR_BAD_ALLOC;
+    }
+  }
+
+  /**
+   * Free npu memory. Usually, free is not allowed to raise error.
+   * If it does raise error, the process should be crashed.
+   */
+  void Free(void *ptr, size_t size) {
+    NPUDeviceGuard guard(dev_id_);
+    auto result = aclrtFree(ptr);
+    PADDLE_ENFORCE_NPU_SUCCESS(result);
+    if (NeedRecord()) {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      cur_size_ -= size;
+    }
+    STAT_INT_SUB("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
+  }
+
+  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                  size_t *actual_total) {
+    {
+      NPUDeviceGuard guard(dev_id_);
+      auto result = aclrtGetMemInfo(ACL_HBM_MEM, actual_avail, actual_total);
+      if (result != ACL_ERROR_NONE) {
+        *actual_avail = 0;
+      }
+      RaiseNonOutOfMemoryError(&result);
+    }
+
+    if (NeedRecord()) {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      *avail = std::min(*actual_avail, limit_size_ - cur_size_);
+      *total = std::min(*actual_total, limit_size_);
+      return *total < *actual_total;
+    } else {
+      *avail = *actual_avail;
+      *total = *actual_total;
+      return false;
+    }
+  }
+
+  inline bool NeedRecord() const { return limit_size_ != 0; }
+
+  uint64_t RecordedSize() const {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    return NeedRecord() ? cur_size_ : 0;
+  }
+
+  uint64_t LimitSize() const { return limit_size_; }
+
+ private:
+  const int dev_id_;
+  const uint64_t limit_size_;
+  uint64_t cur_size_{0};
+
+  mutable std::unique_ptr<std::mutex> mtx_;
+
+  static std::once_flag once_flag_;
+  static std::vector<std::unique_ptr<RecordedNPUMallocHelper>> instances_;
+};
+
+std::once_flag RecordedNPUMallocHelper::once_flag_;
+std::vector<std::unique_ptr<RecordedNPUMallocHelper>>
+    RecordedNPUMallocHelper::instances_;
+
+aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->Malloc(ptr, size);
+}
+
+void RecordedNPUFree(void *p, size_t size, int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->Free(p, size);
+}
+
+bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                           size_t *actual_total, int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->GetMemInfo(
+      avail, total, actual_avail, actual_total);
+}
+
+uint64_t RecordedNPUMallocSize(int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->RecordedSize();
+}
+
+bool IsNPUMallocRecorded(int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
+}
+
+}  // namespace platform
+}  // namespace paddle
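[Editor's note] A hedged sketch of the recorded-malloc API defined above; the limit is intentionally shared with the GPU through FLAGS_gpu_memory_limit_mb, and Demo is a hypothetical caller:

    #include "acl/acl.h"
    #include "paddle/fluid/platform/npu_info.h"

    void Demo() {
      void* p = nullptr;
      if (paddle::platform::RecordedNPUMalloc(&p, 1 << 20, /*dev_id=*/0) ==
          ACL_ERROR_NONE) {
        // ... use the 1 MiB device buffer ...
        paddle::platform::RecordedNPUFree(p, 1 << 20, /*dev_id=*/0);
      }
    }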
+void NPUMemcpyAsync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, aclrtStream stream, + size_t dst_max_count = 0); + +//! Copy memory from address src to dst synchronously. +void NPUMemcpySync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, size_t dst_max_count = 0); + +//! Set memory dst with value count size asynchronously +void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, + size_t max_count = 0); + +//! Blocks until stream has completed all operations. +void NPUStreamSync(aclrtStream stream); + +//! aclrtMalloc with recorded info +aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id); + +//! aclrtFree with recorded info +void RecordedNPUFree(void *p, size_t size, int dev_id); + +//! Get available and total gpu memory with considering limitation +bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id); + +//! Get recorded actrtMalloc size. If record is disabled, return 0. +uint64_t RecordedNPUMallocSize(int dev_id); + +bool IsNPUMallocRecorded(int dev_id); + +class NPUDeviceGuard { + public: + explicit inline NPUDeviceGuard(int dev_id) { + int prev_id = platform::GetCurrentNPUDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + platform::SetNPUDeviceId(dev_id); + } + } + + inline ~NPUDeviceGuard() { + if (prev_id_ != -1) { + platform::SetNPUDeviceId(prev_id_); + } + } + + NPUDeviceGuard(const NPUDeviceGuard &o) = delete; + NPUDeviceGuard &operator=(const NPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +class AclInstance { + public: + // NOTE(zhiiu): Commonly, exception in destructor is not recommended, so + // no PADDLE_ENFORCE here, call acl API directly. + ~AclInstance() {} + AclInstance(const AclInstance &o) = delete; + const AclInstance &operator=(const AclInstance &o) = delete; + + static AclInstance &Instance() { + static AclInstance instance; + return instance; + } + + void Finalize() { + // NOTE(zhiqiu): DO NOT perform finalize in destructor + // to avoid problems caused by destructor order of static + // object. + for (size_t i = 0; i < devices_.size(); ++i) { + auto status = aclrtResetDevice(devices_[i]); + VLOG(4) << "Call aclrtResetDevice " << devices_[i] + << " status = " << status; + } + auto status = aclFinalize(); + VLOG(4) << "Call aclFinalize, status = " << status; + } + + private: + // forbid calling default constructor + AclInstance() { + PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr)); + VLOG(4) << "Call aclrtSetDevice "; + // NOTE(zhiqiu): why set devices here? + // Because ACL creates a default context which contains 2 streams + // when calling aclrtSetDeviceId, so usually we do not need to + // create contexts explicitly. And, for each device, aclrtSetDeviceId + // need to call parily with aclrtResetDeviceId to destory the default + // context. Here, we use this singleton and static instance to manage + // the devices to make sure they will be resetted before program exit. 
+    devices_ = platform::GetSelectedNPUDevices();
+    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
+      SetNPUDeviceId(*it);
+      VLOG(4) << "Call aclrtSetDevice " << *it;
+    }
+  }
+  std::vector<int> devices_;
+};
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index b80d2fd1632cd82c231fae724fc4d754b8fed0fc..1cc9fd9fe76341cd495a3580cddbff65f5b0e208 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -33,6 +33,7 @@ class PlacePrinter : public boost::static_visitor<> {
     os_ << "CUDAPlace(" << p.device << ")";
   }
   void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
+  void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
   void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }

 private:
@@ -49,6 +50,10 @@ bool is_xpu_place(const Place &p) {
   return boost::apply_visitor(IsXPUPlace(), p);
 }

+bool is_npu_place(const Place &p) {
+  return boost::apply_visitor(IsNPUPlace(), p);
+}
+
 bool is_cpu_place(const Place &p) {
   return boost::apply_visitor(IsCPUPlace(), p);
 }
@@ -67,6 +72,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
     return true;
   } else if (is_xpu_place(p1)) {
     return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
+  } else if (is_npu_place(p1)) {
+    return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
   } else {
     return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
   }
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index f95f6954a32e771e7413a766afcfea8b85ff1f7e..3f74701319df0bcc1864461d8de76186a8114a3d 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -72,16 +72,31 @@ struct XPUPlace {
   int device;
 };

+struct NPUPlace {
+  NPUPlace() : NPUPlace(0) {}
+  explicit NPUPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const NPUPlace &o) const { return device == o.device; }
+  inline bool operator!=(const NPUPlace &o) const { return !(*this == o); }
+  inline bool operator<(const NPUPlace &o) const { return device < o.device; }
+
+  int device;
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
-  bool operator()(const CUDAPlace &gpu) const { return true; }
+  bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };

 struct IsCPUPlace : public boost::static_visitor<bool> {
-  bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const CPUPlace &) const { return true; }
   bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -89,27 +104,38 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
 struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
 struct IsXPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const XPUPlace &xpu) const { return true; }
+  bool operator()(const XPUPlace &) const { return true; }
+  bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };

-class Place
-    : public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
+struct IsNPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
+                                    CUDAPinnedPlace> {
  private:
   using PlaceBase =
-      boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
+      boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace, CUDAPinnedPlace>;

  public:
   Place() = default;
   Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {}     // NOLINT
   Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {}     // NOLINT
+  Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {}     // NOLINT
   Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
   Place(const CUDAPinnedPlace &cuda_pinned_place)  // NOLINT
       : PlaceBase(cuda_pinned_place) {}
@@ -126,6 +152,7 @@ using PlaceList = std::vector<Place>;

 bool is_gpu_place(const Place &);
 bool is_xpu_place(const Place &);
+bool is_npu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
@@ -153,6 +180,16 @@ struct PlaceVisitorWrapper
 #endif
   }

+  typename Visitor::result_type operator()(const NPUPlace &npu) const {
+#ifdef PADDLE_WITH_ASCEND
+    return visitor_(npu);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Paddle is not compiled with NPU. Cannot visit npu device"));
+    return typename Visitor::result_type();
+#endif
+  }
+
   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
 #ifdef PADDLE_WITH_CUDA
     return visitor_(cuda);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index e1a638adf505d185f5bbb3b8ca0376b8ff1279df..0fa50a8cd362b1ddaece4796bc243d751a60aab3 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -102,6 +102,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif

+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/platform/npu_info.h"
+#endif
+
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu_info.h"
 #endif
@@ -487,6 +491,11 @@ PYBIND11_MODULE(core_noavx, m) {
                                       make_ddim(x_dim), make_ddim(y_dim), -1));
   });

+#ifdef PADDLE_WITH_ASCEND_CL
+  m.def("_npu_finalize",
+        []() { platform::AclInstance::Instance().Finalize(); });
+#endif
+
   m.def(
       "_append_python_callable_object_and_return_id",
       [](py::object py_obj) -> size_t {
@@ -1447,7 +1456,6 @@ All parameter, weight, gradient are variables in Paddle.
       .def("__repr__", string::to_string)
       .def("__str__", string::to_string);

-
   py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
     **Note**:
     Examples:
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 38ed76a87cd3e46145d4a1a5e679174a41a4ee86..2a1af1755991387b6bf417bc1ef1b38a0da632bb 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/npu_info.h"

 int main(int argc, char** argv) {
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
@@ -38,11 +39,12 @@ int main(int argc, char** argv) {
   }
 #endif

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
   envs.push_back("fraction_of_gpu_memory_to_use");
   envs.push_back("initial_gpu_memory_in_mb");
   envs.push_back("reallocate_gpu_memory_in_mb");
   envs.push_back("allocator_strategy");
+  envs.push_back("selected_gpus");
 #elif __clang__
   envs.push_back("use_mkldnn");
   envs.push_back("initial_cpu_memory_in_mb");
@@ -92,6 +94,10 @@ int main(int argc, char** argv) {
   paddle::framework::InitDevices();

   int ret = RUN_ALL_TESTS();
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  paddle::platform::AclInstance::Instance().Finalize();
+#endif

   if (env_str) free(env_str);
   if (undefok_str) free(undefok_str);
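
For context, here is a minimal sketch of how the pieces added above are meant to compose: initialize ACL through the singleton, switch devices with the RAII guard, allocate through the recorded interface, and finalize before exit. This is illustrative only, not part of the patch; it assumes a WITH_ASCEND_CL build and uses ACL's ACL_ERROR_NONE success code.

#include <iostream>

#include "paddle/fluid/platform/npu_info.h"

int main() {
  namespace plat = paddle::platform;

  // Touch the singleton first so aclInit runs and the selected devices
  // are set (its constructor does both).
  plat::AclInstance::Instance();

  // Make device 0 current; the previous device is restored when the
  // guard goes out of scope.
  plat::NPUDeviceGuard guard(0);

  // Allocate/free through the recorded interface so per-device usage
  // accounting (and any configured limit) is applied.
  void *ptr = nullptr;
  if (plat::RecordedNPUMalloc(&ptr, 1 << 20, /*dev_id=*/0) == ACL_ERROR_NONE) {
    std::cout << "recorded: " << plat::RecordedNPUMallocSize(0) << " bytes\n";
    plat::RecordedNPUFree(ptr, 1 << 20, /*dev_id=*/0);
  }

  // Reset devices and finalize ACL exactly once before exit, mirroring
  // what paddle_gtest_main.cc does after RUN_ALL_TESTS().
  plat::AclInstance::Instance().Finalize();
  return 0;
}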