diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index b99ab6b5a7ff195ef7d659598df88467bb158c6e..3833b027d2a364d7a46d01540983a1637de25376 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -87,6 +87,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::NPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::NPUPinnedPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 78fd1af09e29458ec84549c55dd99f8c29da29db..105751645bbc5929dc07e524dcc3e8b52ec52034 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -503,6 +503,11 @@ class AnyVisitor : public boost::static_visitor { // return GetResultHelper(out, npu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPinnedPlace& cpu) const { + return *out.data(); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -731,6 +736,18 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl( + const platform::NPUPinnedPlace& cpu /* equals to cpu*/) const { + int num = in_.numel(); + const bool* in_ptr = in_.data(); + bool* out_ptr = out_->data(); + for (int i = 0; i < num; ++i) { + bool lhs = !in_ptr[i]; + bool rhs = !out_ptr[i]; + out_ptr[i] = lhs && rhs; + } + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 43546cf99c69ffa3aa1f1a792e7b344ed0735a31..6b9b411713329ad0f9f663f57a081a7404a5aa7b 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -132,6 +132,12 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif + void operator()(const platform::NPUPinnedPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 2ea047fa13c10596995916234ef67e8a276b6b22..9a0637453f03f08a50bb1af958b1ba5e584869b4 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -29,6 +29,7 @@ endif() if (WITH_ASCEND_CL) cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) + cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info) endif() cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) @@ -73,10 +74,15 @@ endif() list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator) +if (WITH_ASCEND_CL) + list(APPEND AllocatorFacadeDeps npu_pinned_allocator) +endif() + + 
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 730efa5c646885026eee1e472205ce723b0fcb1b..3a156f1fa3c4cfb39d8dd3524353fd0c6a616184 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -20,6 +20,9 @@ #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -72,6 +75,7 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); } + InitNaiveBestFitNPUPinnedAllocator(); #endif break; } @@ -195,6 +199,12 @@ class AllocatorFacadePrivate { void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); } + + void InitNaiveBestFitNPUPinnedAllocator() { + allocators_[platform::NPUPinnedPlace()] = + std::make_shared(); + } + #endif class ZeroSizeAllocator : public Allocator { @@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place) { + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index fa906fbf5ce8fedb7790e19a1e7c257bbce5faac..7f6ad561aa931bd42fe312fe397cc561a64f723f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -15,11 +15,17 @@ #pragma once #include #include "paddle/fluid/memory/allocation/allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_ASCEND_CL +using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; +#endif // Allocator Facade is the interface exposed to other modules. // All the configuration or dirty code under development should @@ -46,6 +52,7 @@ class AllocatorFacade { // Release unused memory pool. uint64_t Release(const platform::Place& place); + const std::shared_ptr& GetAllocator(const platform::Place& place); // TODO(yy): Allocate a Copy-On-Write allocation? 
  private:
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 3e88d61783c9e67053ef065f61fef5cf991a9b25..bc72b4b20d061445932d877417f02917dfd613cf 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -287,6 +287,21 @@ class NPUBuddyAllocatorList {
 BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
   return NPUBuddyAllocatorList::Instance()->Get(npu_id);
 }
+
+BuddyAllocator *GetNPUPinnedBuddyAllocator() {
+  static std::once_flag init_flag;
+  static BuddyAllocator *ba = nullptr;
+
+  std::call_once(init_flag, []() {
+    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                new detail::NPUPinnedAllocator),
+                            platform::NPUPinnedMinChunkSize(),
+                            platform::NPUPinnedMaxChunkSize());
+  });
+
+  return ba;
+}
+
 #endif
 
 template <>
@@ -351,6 +366,59 @@ uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
 #endif
 }
 
+template <>
+size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
+                                      size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
+  void *ptr = buddy_allocator->Alloc(size);
+
+  if (ptr == nullptr) {
+    LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size
+                 << " bytes in NPUPinnedPlace";
+  }
+  if (FLAGS_init_allocated_mem) {
+    memset(ptr, 0xEF, size);
+  }
+  return ptr;
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
+                                    void *p, size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  GetNPUPinnedBuddyAllocator()->Free(p);
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+uint64_t Release<platform::NPUPinnedPlace>(
+    const platform::NPUPinnedPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUPinnedBuddyAllocator()->Release();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
 // For CUDA
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class GPUBuddyAllocatorList {
diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..507a8589d94ddd1adf925aa5e01c787439624c62
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+void NPUPinnedAllocator::ProcessEventsAndFree() {
+  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
+    aclrtEvent event = it->second;
+    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+    PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));
+
+    if (status == ACL_EVENT_STATUS_COMPLETE) {
+      Allocation *allocation = it->first;
+      void *ptr = allocation->ptr();
+      free(ptr);
+      npu_events_.erase(it++);
+      delete allocation;
+      PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
+    } else {
+      ++it;
+    }
+  }
+}
+
+Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
+  ProcessEventsAndFree();
+  void *ptr;
+  int error = posix_memalign(&ptr, kAlignment, size);
+  PADDLE_ENFORCE_EQ(
+      error, 0,
+      platform::errors::ResourceExhausted(
+          "Fail to alloc memory of %ld size, error code is %d.", size, error));
+  return new Allocation(ptr, size, platform::NPUPinnedPlace());
+}
+
+void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
+  void *ptr = allocation->ptr();
+  auto iter = npu_events_.find(allocation);
+  aclrtEvent event = iter->second;
+  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));
+  if (status == ACL_EVENT_STATUS_COMPLETE) {
+    free(ptr);
+    npu_events_.erase(allocation);
+    delete allocation;
+    PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
+  }
+  return;
+}
+
+uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
+  return static_cast<uint64_t>(0);
+}
+
+void NPUPinnedAllocator::RecordEvent(Allocation *allocation,
+                                     aclrtStream stream) {
+  aclrtEvent event = nullptr;
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event));
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream));
+  npu_events_.insert({allocation, event});
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c856b931ee2cf5b5734d90636b4bfd3dad138da
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_map>
+
+#include "acl/acl.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/npu_info.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class NPUPinnedAllocator : public Allocator {
+ public:
+  bool IsAllocThreadSafe() const override { return true; }
+  void ProcessEventsAndFree();
+  void RecordEvent(Allocation *allocation, aclrtStream stream);
+  constexpr static size_t kAlignment = 4096UL;
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override;
+  void FreeImpl(Allocation *allocation) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override;
+
+ private:
+  std::unordered_map<Allocation *, aclrtEvent> npu_events_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 0d7065d8bfba0e4ba6f443a3f9e87ee0e1a825a6..d6dc303ebc789ef447f16a955905e3a837776baa 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) {
 }
 
 bool NPUAllocator::UseGpu() const { return true; }
+
+void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
+  if (size <= 0) return nullptr;
+
+  size_t usable =
+      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
+
+  if (size > usable) {
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB pinned memory."
+                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
+    return nullptr;
+  }
+
+  void* p;
+  // PINNED memory is visible to all NPU contexts.
+  auto result = aclrtMallocHost(&p, size);
+
+  if (result == ACL_ERROR_NONE) {
+    *index = 1;  // PINNED memory
+    npu_pinnd_alloc_size_ += size;
+    return p;
+  } else {
+    LOG(WARNING) << "aclrtMallocHost failed.";
+    return nullptr;
+  }
+
+  return nullptr;
+}
+
+void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  aclError err;
+  PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
+                                  "The index should be 1, but got %d", index));
+
+  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
+                    platform::errors::InvalidArgument(
+                        "The size of memory (%d) to free exceeds the size of "
+                        "allocated npu pinned memory (%d)",
+                        size, npu_pinnd_alloc_size_));
+  npu_pinnd_alloc_size_ -= size;
+  err = aclrtFreeHost(p);
+
+  if (err != ACL_ERROR_NONE) {
+    PADDLE_ENFORCE_EQ(
+        err, 0,
+        platform::errors::Fatal(
+            "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d",
+            err));
+  }
+}
+
+bool NPUPinnedAllocator::UseGpu() const { return false; }
+
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index 26711ae4070f5ed72f77519b196c4c354cb049e1..92042f0bbae9f0d29d15b9ed266f57cfa7594412 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator {
   size_t npu_alloc_size_ = 0;
   int npu_id_;
 };
+
+class NPUPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t* index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t npu_pinnd_alloc_size_ = 0;
+};
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 730d49e8acd93022e6e46f7285b9548ed7a5c6d8..a925957e1af1001d69f7d9a6ef6311d997c0edb6 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -245,7 +245,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
 
-    platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
+    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
   }
 }
@@ -294,6 +294,86 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
     }
   }
 }
+
+template <>
+void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
+    platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
+    const void* src, size_t num) {
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place;
+  if (UNLIKELY(num == 0)) return;
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
+    platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place,
+    const void* src, size_t num) {
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place;
+  if (UNLIKELY(num == 0)) return;
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
+    platform::NPUPinnedPlace dst_place, void* dst,
+    platform::NPUPinnedPlace src_place, const void* src, size_t num) {
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place;
+  if (UNLIKELY(num == 0)) return;
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
+    platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place,
+    const void* src, size_t num, aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  platform::SetNPUDeviceId(src_place.device);
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
"memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); + } +} + +template <> +void Copy( + platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, + const void* src, size_t num, aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(dst_place.device); + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not done. + // So, its needed to do wait before sync operation. + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); + } +} + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 0bdc7b69434221ffd91b0df94287df0eae42d89b..56217b4dc7ef5a2adc96bfa9c27aeba33af57893 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -158,6 +158,14 @@ void set_constant_with_place( PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); } +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW( + platform::errors::Unimplemented("NPUPinnedPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 923c97350e89ea9a3de01120bb7df57766247a38..6405b556217660bc0efb52eef33c83a3aceafc80 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } +size_t NPUPinnedMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t NPUPinnedMinChunkSize() { + // Allow to allocate the minimum chunk size is 64 KB. + return 1 << 16; +} + +size_t NPUPinnedMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED + // memory. 
+  return NPUPinnedMaxAllocSize() / 256;
+}
+
 #ifdef PADDLE_WITH_XBYAK
 static Xbyak::util::Cpu cpu;
 bool MayIUse(const cpu_isa_t cpu_isa) {
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 94527149d4e0b459dee03375d56fb0a9526aa055..29dc0a15aaea11c77f926877ab01abadc5ea3a73 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CUDAPinnedMaxChunkSize();
 
+//! Get the maximum allocation size for a machine.
+size_t NPUPinnedMaxAllocSize();
+
+//! Get the minimum chunk size for buddy allocator.
+size_t NPUPinnedMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t NPUPinnedMaxChunkSize();
+
 typedef enum {
   isa_any,
   sse42,
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 9a47ac45462ed7080d34404891fb8410a71d3938..7e983eb54ae2cdb44cf4ae5a949f0fac40ec4835 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool(
       PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
+#endif
+    } else if (platform::is_npu_pinned_place(p)) {
+#ifdef PADDLE_WITH_ASCEND_CL
+      EmplaceDeviceContext<NPUPinnedDeviceContext, NPUPinnedPlace>(
+          &device_contexts_, p);
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "NPUPinnedPlace is not supported. Please re-compile with "
+          "WITH_ASCEND_CL "
+          "option."));
 #endif
     }
   }
@@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
 Place NPUDeviceContext::GetPlace() const { return place_; }
 
 aclrtContext NPUDeviceContext::context() const { return context_; }
+
+NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
+    : place_(place) {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
+
 #endif
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index d91e14ec3aa923b81976f953d9673175d5217b21..e62f0673e97fadc68de6c7f08591a941e035a4b8 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -233,6 +233,27 @@ template <>
 struct DefaultDeviceContextType<platform::NPUPlace> {
   using TYPE = NPUDeviceContext;
 };
+
+// Currently, NPUPinnedDeviceContext is only used for data copying.
+class NPUPinnedDeviceContext : public DeviceContext {
+ public:
+  NPUPinnedDeviceContext();
+  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
+
+  Place GetPlace() const override;
+
+  Eigen::DefaultDevice* eigen_device() const;
+
+ private:
+  NPUPinnedPlace place_;
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+};
+
+template <>
+struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
+  using TYPE = NPUPinnedDeviceContext;
+};
+
 #endif
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 1cc9fd9fe76341cd495a3580cddbff65f5b0e208..14c772d88897f4fa28e7c37a9452b78b637419a2 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> {
   }
   void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
   void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
+  void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
   void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 
  private:
@@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPinnedPlace(), p);
 }
 
+bool is_npu_pinned_place(const Place &p) {
+  return boost::apply_visitor(IsNPUPinnedPlace(), p);
+}
+
 bool places_are_same_class(const Place &p1, const Place &p2) {
   return p1.which() == p2.which();
 }
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index f20fac477d0ec4ef40a3544476e223b6ad97fffa..62d30ecc5ce2efdc1e87229843ee39685507d771 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -85,10 +85,19 @@ struct NPUPlace {
   int device;
 };
 
+struct NPUPinnedPlace {
+  NPUPinnedPlace() {}
+
+  inline bool operator==(const NPUPinnedPlace &) const { return true; }
+  inline bool operator!=(const NPUPinnedPlace &) const { return false; }
+  inline bool operator<(const NPUPinnedPlace &) const { return false; }
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return true; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
@@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return true; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool
operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return true; } + bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } +}; + +struct IsNPUPinnedPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return true; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; class Place : public boost::variant { + CUDAPinnedPlace, NPUPinnedPlace> { private: - using PlaceBase = - boost::variant; + using PlaceBase = boost::variant; public: Place() = default; @@ -139,6 +161,8 @@ class Place : public boost::variant(place)); @@ -155,6 +179,7 @@ bool is_xpu_place(const Place &); bool is_npu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); +bool is_npu_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); @@ -190,6 +215,17 @@ struct PlaceVisitorWrapper #endif } + typename Visitor::result_type operator()( + const NPUPinnedPlace &npu_pinned) const { +#ifdef PADDLE_WITH_ASCEND_CL + return visitor_(npu_pinned); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. Cannot visit npu_pinned")); + return typename Visitor::result_type(); +#endif + } + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return visitor_(cuda);