BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 6b3bb796 (unverified), authored May 12, 2021 by liym27; committed by GitHub on May 12, 2021.
[NPU] Support npu pinned allocator and manage Tensor on NPUPinnedPlace (#32840)
Parent: 890f626b
Showing 19 changed files with 528 additions and 5 deletions (+528 -5):
paddle/fluid/framework/dlpack_tensor.cc (+5 -0)
paddle/fluid/framework/tensor_util.cc (+17 -0)
paddle/fluid/imperative/gradient_accumulator.cc (+6 -0)
paddle/fluid/memory/allocation/CMakeLists.txt (+7 -1)
paddle/fluid/memory/allocation/allocator_facade.cc (+15 -0)
paddle/fluid/memory/allocation/allocator_facade.h (+7 -0)
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc (+68 -0)
paddle/fluid/memory/allocation/npu_pinned_allocator.cc (+82 -0)
paddle/fluid/memory/allocation/npu_pinned_allocator.h (+51 -0)
paddle/fluid/memory/detail/system_allocator.cc (+54 -0)
paddle/fluid/memory/detail/system_allocator.h (+10 -0)
paddle/fluid/memory/memcpy.cc (+81 -1)
paddle/fluid/operators/math/math_function.cc (+8 -0)
paddle/fluid/platform/cpu_info.cc (+17 -0)
paddle/fluid/platform/cpu_info.h (+9 -0)
paddle/fluid/platform/device_context.cc (+26 -0)
paddle/fluid/platform/device_context.h (+21 -0)
paddle/fluid/platform/place.cc (+5 -0)
paddle/fluid/platform/place.h (+39 -3)
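Taken together, the commit introduces a new NPUPinnedPlace backed by page-locked host memory for Ascend NPUs, wires it into the allocator stack (a system allocator, a buddy allocator, and the allocator facade), adds memory::Copy specializations between NPUPinnedPlace and CPUPlace/NPUPlace, and threads the new place through device contexts and place visitors. A rough sketch of what this enables follows; the call site and shapes are hypothetical, not part of this diff:

```cpp
// Hypothetical caller of the APIs added in this commit: stage data in an
// NPU-pinned host Tensor, then copy it to the device. The helper, shapes,
// and stream handling are assumptions for illustration.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

void StageAndCopy(const paddle::platform::NPUPlace &npu_place,
                  aclrtStream stream) {
  paddle::framework::Tensor pinned, on_npu;
  pinned.Resize({1024});
  // Host memory served by the new NPU pinned allocator.
  float *src = pinned.mutable_data<float>(paddle::platform::NPUPinnedPlace());
  float *dst = on_npu.mutable_data<float>(pinned.dims(), npu_place);
  // Resolves to the new Copy<NPUPlace, NPUPinnedPlace> specialization:
  // asynchronous when a stream is given, synchronous otherwise.
  paddle::memory::Copy(npu_place, dst, paddle::platform::NPUPinnedPlace(), src,
                       pinned.numel() * sizeof(float), stream);
}
```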
paddle/fluid/framework/dlpack_tensor.cc

@@ -87,6 +87,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
```cpp
        platform::errors::Unimplemented("platform::NPUPlace is not supported"));
  }

  inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const {
    PADDLE_THROW(platform::errors::Unimplemented(
        "platform::NPUPinnedPlace is not supported"));
  }

  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    ::DLContext ctx;
```
paddle/fluid/framework/tensor_util.cc

@@ -503,6 +503,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
```cpp
    // return GetResultHelper(out, npu);
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::NPUPinnedPlace& cpu) const {
    return *out.data<bool>();
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::CPUPlace& cpu) const {
    return *out.data<bool>();
```

@@ -731,6 +736,18 @@ struct BothFalseVisitor : public boost::static_visitor<> {
```cpp
      out_ptr[i] = lhs && rhs;
    }
  }

  void VisitorImpl(const platform::NPUPinnedPlace& cpu /* equals to cpu */) const {
    int num = in_.numel();
    const bool* in_ptr = in_.data<bool>();
    bool* out_ptr = out_->data<bool>();
    for (int i = 0; i < num; ++i) {
      bool lhs = !in_ptr[i];
      bool rhs = !out_ptr[i];
      out_ptr[i] = lhs && rhs;
    }
  }
};

void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
```
paddle/fluid/imperative/gradient_accumulator.cc

@@ -132,6 +132,12 @@ class TensorAddFunctor : public boost::static_visitor<> {
```cpp
  }
#endif

  void operator()(const platform::NPUPinnedPlace& place) {
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Gradient accumulation on place (%s) "
        "is not supported in imperative mode",
        place));
  }

  // there is NO blas in CUDAPinnedPlace
  void operator()(const platform::CUDAPinnedPlace& place) {
    PADDLE_THROW(platform::errors::PermissionDenied(
```
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -29,6 +29,7 @@ endif()
```cmake
if (WITH_ASCEND_CL)
    cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
    cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info)
endif()

cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
```

@@ -73,10 +74,15 @@ endif()
```cmake
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)

if (WITH_ASCEND_CL)
    list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
endif()

cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
```
paddle/fluid/memory/allocation/allocator_facade.cc

@@ -20,6 +20,9 @@
```cpp
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
```

@@ -72,6 +75,7 @@ class AllocatorFacadePrivate {
```cpp
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
        break;
      }
```

@@ -195,6 +199,12 @@ class AllocatorFacadePrivate {
```cpp
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }
#endif

  class ZeroSizeAllocator : public Allocator {
```

@@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
```cpp
      ->Release(place);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
```
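The new GetAllocator accessor is what lets callers reach the concrete pinned allocator, for example to record a completion event on an allocation that an in-flight async copy still reads. A minimal sketch, assuming the usual AllocatorFacade::Instance() singleton; the helper and the downcast are illustrative, not from this diff:

```cpp
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

namespace alloc = paddle::memory::allocation;

// Hypothetical helper: fetch the NPU pinned allocator through the facade and
// defer the buffer's release until `stream` has consumed it.
void DeferUntilStreamDone(alloc::Allocation *allocation, aclrtStream stream) {
  const auto &allocator = alloc::AllocatorFacade::Instance().GetAllocator(
      paddle::platform::NPUPinnedPlace());
  // GetAllocator returns the Allocator base class; RecordEvent lives on the
  // concrete NPUPinnedAllocator, hence the cast.
  static_cast<alloc::NPUPinnedAllocator *>(allocator.get())
      ->RecordEvent(allocation, stream);
}
```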
paddle/fluid/memory/allocation/allocator_facade.h

@@ -15,11 +15,17 @@
```cpp
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {
#ifdef PADDLE_WITH_ASCEND_CL
using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
#endif

// Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should
```

@@ -46,6 +52,7 @@ class AllocatorFacade {
```cpp
  // Release unused memory pool.
  uint64_t Release(const platform::Place& place);

  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

  // TODO(yy): Allocate a Copy-On-Write allocation?
 private:
```
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc

@@ -287,6 +287,21 @@ class NPUBuddyAllocatorList {
```cpp
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}

BuddyAllocator *GetNPUPinnedBuddyAllocator() {
  static std::once_flag init_flag;
  static BuddyAllocator *ba = nullptr;

  std::call_once(init_flag, []() {
    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                new detail::NPUPinnedAllocator),
                            platform::NPUPinnedMinChunkSize(),
                            platform::NPUPinnedMaxChunkSize());
  });

  return ba;
}
#endif
```

@@ -351,6 +366,59 @@ uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
```cpp
#endif
}

template <>
size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                      size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
  void *ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size
                 << " bytes in NPUPinnedPlace";
  }
  if (FLAGS_init_allocated_mem) {
    memset(ptr, 0xEF, size);
  }
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                    void *p, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  GetNPUPinnedBuddyAllocator()->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
uint64_t Release<platform::NPUPinnedPlace>(
    const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

// For CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList {
```
paddle/fluid/memory/allocation/npu_pinned_allocator.cc (new file, mode 100644)

```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

void NPUPinnedAllocator::ProcessEventsAndFree() {
  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
    aclrtEvent event = it->second;
    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
    PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));

    if (status == ACL_EVENT_STATUS_COMPLETE) {
      Allocation *allocation = it->first;
      void *ptr = allocation->ptr();
      free(ptr);
      npu_events_.erase(it++);
      delete allocation;
      PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
    } else {
      ++it;
    }
  }
}

Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
  ProcessEventsAndFree();
  void *ptr;
  int error = posix_memalign(&ptr, kAlignment, size);
  PADDLE_ENFORCE_EQ(
      error, 0,
      platform::errors::ResourceExhausted(
          "Fail to alloc memory of %ld size, error code is %d.", size, error));
  return new Allocation(ptr, size, platform::NPUPinnedPlace());
}

void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
  void *ptr = allocation->ptr();
  auto iter = npu_events_.find(allocation);
  aclrtEvent event = iter->second;
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));

  if (status == ACL_EVENT_STATUS_COMPLETE) {
    free(ptr);
    npu_events_.erase(allocation);
    delete allocation;
    PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
  }
  return;
}

uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
  return static_cast<uint64_t>(0);
}

void NPUPinnedAllocator::RecordEvent(Allocation *allocation,
                                     aclrtStream stream) {
  aclrtEvent event = nullptr;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream));
  npu_events_.insert({allocation, event});
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
```
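The key design point here: a pinned buffer handed to an in-flight async NPU copy must not be freed while the device may still read it, so FreeImpl only releases a buffer whose recorded aclrtEvent has completed, and AllocateImpl opportunistically sweeps the pending list so reclamation piggybacks on allocation calls instead of needing a background thread. ReleaseImpl returning 0 is consistent with that: nothing is cached beyond the pending-event list, so there is no idle pool to shrink. Below is a self-contained sketch of the same bookkeeping with the ACL event swapped for a generic completion predicate so it compiles without the Ascend toolkit; everything in it is illustrative, not Paddle API:

```cpp
#include <cstdlib>
#include <functional>
#include <unordered_map>

// Sketch of the deferred-free pattern used by NPUPinnedAllocator above.
class DeferredFreePool {
 public:
  using DonePredicate = std::function<bool()>;

  // Tie a buffer to the completion signal of the async work reading it
  // (the analogue of RecordEvent).
  void Record(void *ptr, DonePredicate done) {
    pending_.emplace(ptr, std::move(done));
  }

  // Analogue of ProcessEventsAndFree(): release only buffers whose device
  // work has finished; leave the rest pending for a later sweep.
  void Sweep() {
    for (auto it = pending_.begin(); it != pending_.end();) {
      if (it->second()) {
        std::free(it->first);
        it = pending_.erase(it);
      } else {
        ++it;
      }
    }
  }

 private:
  std::unordered_map<void *, DonePredicate> pending_;
};
```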
paddle/fluid/memory/allocation/npu_pinned_allocator.h (new file, mode 100644)

```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_ASCEND_CL
#include <mutex>  // NOLINT
#include <string>
#include <unordered_map>

#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

class NPUPinnedAllocator : public Allocator {
 public:
  bool IsAllocThreadSafe() const override { return true; }
  void ProcessEventsAndFree();
  void RecordEvent(Allocation *allocation, aclrtStream stream);
  constexpr static size_t kAlignment = 4096UL;

 protected:
  Allocation *AllocateImpl(size_t size) override;
  void FreeImpl(Allocation *allocation) override;
  uint64_t ReleaseImpl(const platform::Place &place) override;

 private:
  std::unordered_map<Allocation *, aclrtEvent> npu_events_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
```
paddle/fluid/memory/detail/system_allocator.cc

@@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) {
```cpp
}

bool NPUAllocator::UseGpu() const { return true; }

void *NPUPinnedAllocator::Alloc(size_t *index, size_t size) {
  if (size <= 0) return nullptr;

  size_t usable =
      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
  if (size > usable) {
    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
                 << " MB pinned memory."
                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
    return nullptr;
  }

  void *p;
  // PINNED memory is visible to all NPU contexts.
  auto result = aclrtMallocHost(&p, size);

  if (result == ACL_ERROR_NONE) {
    *index = 1;  // PINNED memory
    npu_pinnd_alloc_size_ += size;
    return p;
  } else {
    LOG(WARNING) << "aclrtMallocHost failed.";
    return nullptr;
  }

  return nullptr;
}

void NPUPinnedAllocator::Free(void *p, size_t size, size_t index) {
  aclError err;
  PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
                                  "The index should be 1, but got %d", index));

  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated npu pinned memory (%d)",
                        size, npu_pinnd_alloc_size_));
  npu_pinnd_alloc_size_ -= size;
  err = aclrtFreeHost(p);

  if (err != ACL_ERROR_NONE) {
    PADDLE_ENFORCE_EQ(
        err, 0,
        platform::errors::Fatal(
            "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d",
            err));
  }
}

bool NPUPinnedAllocator::UseGpu() const { return false; }
#endif

}  // namespace detail
```
paddle/fluid/memory/detail/system_allocator.h

@@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator {
```cpp
  size_t npu_alloc_size_ = 0;
  int npu_id_;
};

class NPUPinnedAllocator : public SystemAllocator {
 public:
  virtual void *Alloc(size_t *index, size_t size);
  virtual void Free(void *p, size_t size, size_t index);
  virtual bool UseGpu() const;

 private:
  size_t npu_pinnd_alloc_size_ = 0;
};
#endif

}  // namespace detail
```
paddle/fluid/memory/memcpy.cc

@@ -245,7 +245,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
```diff
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     static_cast<platform::NPUDeviceContext *>(pool.Get(src_place))->Wait();
-    platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
+    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
   }
 }
```

@@ -294,6 +294,86 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
```cpp
    }
  }
}

template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
    platform::CPUPlace dst_place, void *dst,
    platform::NPUPinnedPlace src_place, const void *src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
    platform::NPUPinnedPlace dst_place, void *dst,
    platform::CPUPlace src_place, const void *src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
    platform::NPUPinnedPlace dst_place, void *dst,
    platform::NPUPinnedPlace src_place, const void *src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
    platform::NPUPinnedPlace dst_place, void *dst,
    platform::NPUPlace src_place, const void *src, size_t num,
    aclrtStream stream) {
  if (UNLIKELY(num == 0)) return;

  platform::SetNPUDeviceId(src_place.device);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << " by stream(" << stream << ")";
  if (stream) {
    platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned");
    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
  } else {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext *>(pool.Get(src_place))->Wait();

    platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
  }
}

template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
    platform::NPUPlace dst_place, void *dst,
    platform::NPUPinnedPlace src_place, const void *src, size_t num,
    aclrtStream stream) {
  if (UNLIKELY(num == 0)) return;

  platform::SetNPUDeviceId(dst_place.device);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << " by stream(" << stream << ")";
  if (stream) {
    platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU");
    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
  } else {
    // On the NPU, an async operation after a sync operation is OK, but a
    // sync operation after an async one is not, since the async operation
    // may not be done yet. So a wait is needed before the sync operation.
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext *>(pool.Get(dst_place))->Wait();

    platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
  }
}
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
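Two details worth noting in the copies above: transfers that touch only host memory (pinned to/from CPU, pinned to pinned) are plain std::memcpy, while pinned to/from NPU goes through ACL with the direction flag chosen by which side is the device; and passing a null stream selects the synchronous branch, which waits on the device context first. A hypothetical call site contrasting the two modes (buffers and sizes are assumptions):

```cpp
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

void CopyBothWays(void *dst_npu, const void *src_pinned, size_t bytes,
                  const paddle::platform::NPUPlace &npu, aclrtStream stream) {
  paddle::platform::NPUPinnedPlace pinned;
  // Async path: the caller must keep src_pinned alive until `stream`
  // finishes, e.g. via NPUPinnedAllocator::RecordEvent.
  paddle::memory::Copy(npu, dst_npu, pinned, src_pinned, bytes, stream);
  // Sync path: a null stream takes the Wait() + NPUMemcpySync branch.
  paddle::memory::Copy(npu, dst_npu, pinned, src_pinned, bytes, nullptr);
}
```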
paddle/fluid/operators/math/math_function.cc

@@ -158,6 +158,14 @@ void set_constant_with_place<platform::NPUPlace>(
```cpp
  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}

template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  PADDLE_THROW(
      platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}

template <>
void set_constant_with_place<platform::CPUPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
```
paddle/fluid/platform/cpu_info.cc

@@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() {
```cpp
  return CUDAPinnedMaxAllocSize() / 256;
}

size_t NPUPinnedMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}

size_t NPUPinnedMinChunkSize() {
  // The minimum chunk size allowed is 64 KB.
  return 1 << 16;
}

size_t NPUPinnedMaxChunkSize() {
  // The maximum chunk size allowed is roughly 1/256 of NPU_PINNED memory.
  return NPUPinnedMaxAllocSize() / 256;
}

#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) {
```
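The sizing mirrors the CUDA pinned pool: the pinned budget is a fraction of host RAM (note it reuses FLAGS_fraction_of_cuda_pinned_memory_to_use rather than an NPU-specific flag), and the buddy allocator's chunks span 64 KB up to 1/256 of that budget. A worked example of the arithmetic under assumed inputs (64 GiB host, fraction 0.5, both hypothetical):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed values standing in for FLAGS_fraction_of_cuda_pinned_memory_to_use
  // and CpuTotalPhysicalMemory().
  const double fraction = 0.5;
  const size_t total_ram = 64ULL << 30;  // 64 GiB
  const size_t max_alloc =
      static_cast<size_t>(fraction * total_ram);  // NPUPinnedMaxAllocSize(): 32 GiB
  const size_t min_chunk = 1ULL << 16;            // NPUPinnedMinChunkSize(): 64 KiB
  const size_t max_chunk = max_alloc / 256;       // NPUPinnedMaxChunkSize(): 128 MiB
  std::printf("pool=%zu MiB, chunks %zu KiB..%zu MiB\n", max_alloc >> 20,
              min_chunk >> 10, max_chunk >> 20);
  return 0;
}
```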
paddle/fluid/platform/cpu_info.h

@@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize();
```cpp
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();

//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();

//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();

//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();

typedef enum {
  isa_any,
  sse42,
```
paddle/fluid/platform/device_context.cc

@@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool(
```cpp
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
#endif
    } else if (platform::is_npu_pinned_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
      EmplaceDeviceContext<NPUPinnedDeviceContext, NPUPinnedPlace>(
          &device_contexts_, p);
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPinnedPlace is not supported. Please re-compile with "
          "WITH_ASCEND_CL option."));
#endif
    }
  }
```

@@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
```cpp
Place NPUDeviceContext::GetPlace() const { return place_; }

aclrtContext NPUDeviceContext::context() const { return context_; }

NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
paddle/fluid/platform/device_context.h

@@ -233,6 +233,27 @@ template <>
```cpp
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
  using TYPE = NPUDeviceContext;
};

// Currently, NPUPinnedDeviceContext is only used for data copying.
class NPUPinnedDeviceContext : public DeviceContext {
 public:
  NPUPinnedDeviceContext();
  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);

  Place GetPlace() const override;

  Eigen::DefaultDevice* eigen_device() const;

 private:
  NPUPinnedPlace place_;
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};

template <>
struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
  using TYPE = NPUPinnedDeviceContext;
};
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
paddle/fluid/platform/place.cc

@@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> {
```cpp
  }
  void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
  void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
  void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }

 private:
```

@@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) {
```cpp
  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
}

bool is_npu_pinned_place(const Place &p) {
  return boost::apply_visitor(IsNPUPinnedPlace(), p);
}

bool places_are_same_class(const Place &p1, const Place &p2) {
  return p1.which() == p2.which();
}
```
paddle/fluid/platform/place.h

@@ -85,10 +85,19 @@ struct NPUPlace {
```cpp
  int device;
};

struct NPUPinnedPlace {
  NPUPinnedPlace() {}

  inline bool operator==(const NPUPinnedPlace &) const { return true; }
  inline bool operator!=(const NPUPinnedPlace &) const { return false; }
  inline bool operator<(const NPUPinnedPlace &) const { return false; }
};

struct IsCUDAPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return true; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
```

@@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return true; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
```

@@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
```

@@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return true; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
```

@@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return true; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return true; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                    CUDAPinnedPlace, NPUPinnedPlace> {
 private:
  using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                   CUDAPinnedPlace, NPUPinnedPlace>;

 public:
  Place() = default;
```

@@ -139,6 +161,8 @@ class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
```cpp
  Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
  Place(const CUDAPinnedPlace &cuda_pinned_place)  // NOLINT
      : PlaceBase(cuda_pinned_place) {}
  Place(const NPUPinnedPlace &npu_pinned_place)  // NOLINT
      : PlaceBase(npu_pinned_place) {}

  bool operator<(const Place &place) const {
    return PlaceBase::operator<(static_cast<const PlaceBase &>(place));
```

@@ -155,6 +179,7 @@ bool is_xpu_place(const Place &);
```cpp
bool is_npu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &);
```

@@ -190,6 +215,17 @@ struct PlaceVisitorWrapper
```cpp
#endif
  }

  typename Visitor::result_type operator()(
      const NPUPinnedPlace &npu_pinned) const {
#ifdef PADDLE_WITH_ASCEND_CL
    return visitor_(npu_pinned);
#else
    PADDLE_THROW(platform::errors::Unavailable(
        "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
    return typename Visitor::result_type();
#endif
  }

  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    return visitor_(cuda);
```
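Because Place is a boost::variant, growing it to include NPUPinnedPlace is what forces the extra overload added to every Is*Place visitor above: a static_visitor must handle all alternatives of the variant. A minimal sketch of a custom visitor over the extended variant (the visitor itself is hypothetical, not part of this commit):

```cpp
#include <boost/variant.hpp>
#include "paddle/fluid/platform/place.h"

namespace plat = paddle::platform;

// Hypothetical visitor: true for any place whose memory the CPU can touch
// directly. The catch-all template covers the device-resident alternatives.
struct IsHostAccessible : public boost::static_visitor<bool> {
  bool operator()(const plat::CPUPlace &) const { return true; }
  bool operator()(const plat::CUDAPinnedPlace &) const { return true; }
  bool operator()(const plat::NPUPinnedPlace &) const { return true; }
  template <typename DevicePlace>
  bool operator()(const DevicePlace &) const { return false; }
};

bool host_accessible(const plat::Place &p) {
  return boost::apply_visitor(IsHostAccessible(), p);
}
```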