diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index b99ab6b5a7ff195ef7d659598df88467bb158c6e..3833b027d2a364d7a46d01540983a1637de25376 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -87,6 +87,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::NPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::NPUPinnedPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 78fd1af09e29458ec84549c55dd99f8c29da29db..105751645bbc5929dc07e524dcc3e8b52ec52034 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -503,6 +503,11 @@ class AnyVisitor : public boost::static_visitor { // return GetResultHelper(out, npu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPinnedPlace& cpu) const { + return *out.data(); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -731,6 +736,18 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl( + const platform::NPUPinnedPlace& cpu /* equals to cpu*/) const { + int num = in_.numel(); + const bool* in_ptr = in_.data(); + bool* out_ptr = out_->data(); + for (int i = 0; i < num; ++i) { + bool lhs = !in_ptr[i]; + bool rhs = !out_ptr[i]; + out_ptr[i] = lhs && rhs; + } + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 43546cf99c69ffa3aa1f1a792e7b344ed0735a31..6b9b411713329ad0f9f663f57a081a7404a5aa7b 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -132,6 +132,12 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif + void operator()(const platform::NPUPinnedPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 2ea047fa13c10596995916234ef67e8a276b6b22..9a0637453f03f08a50bb1af958b1ba5e584869b4 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -29,6 +29,7 @@ endif() if (WITH_ASCEND_CL) cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) + cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info) endif() cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) @@ -73,10 +74,15 @@ endif() list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator) +if (WITH_ASCEND_CL) + list(APPEND AllocatorFacadeDeps npu_pinned_allocator) +endif() + + 
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 730efa5c646885026eee1e472205ce723b0fcb1b..3a156f1fa3c4cfb39d8dd3524353fd0c6a616184 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -20,6 +20,9 @@ #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -72,6 +75,7 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); } + InitNaiveBestFitNPUPinnedAllocator(); #endif break; } @@ -195,6 +199,12 @@ class AllocatorFacadePrivate { void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); } + + void InitNaiveBestFitNPUPinnedAllocator() { + allocators_[platform::NPUPinnedPlace()] = + std::make_shared(); + } + #endif class ZeroSizeAllocator : public Allocator { @@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place) { + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index fa906fbf5ce8fedb7790e19a1e7c257bbce5faac..7f6ad561aa931bd42fe312fe397cc561a64f723f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -15,11 +15,17 @@ #pragma once #include #include "paddle/fluid/memory/allocation/allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_ASCEND_CL +using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; +#endif // Allocator Facade is the interface exposed to other modules. // All the configuration or dirty code under development should @@ -46,6 +52,7 @@ class AllocatorFacade { // Release unused memory pool. uint64_t Release(const platform::Place& place); + const std::shared_ptr& GetAllocator(const platform::Place& place); // TODO(yy): Allocate a Copy-On-Write allocation? 
  private:
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 3e88d61783c9e67053ef065f61fef5cf991a9b25..bc72b4b20d061445932d877417f02917dfd613cf 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -287,6 +287,21 @@ class NPUBuddyAllocatorList {
 BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
   return NPUBuddyAllocatorList::Instance()->Get(npu_id);
 }
+
+BuddyAllocator *GetNPUPinnedBuddyAllocator() {
+  static std::once_flag init_flag;
+  static BuddyAllocator *ba = nullptr;
+
+  std::call_once(init_flag, []() {
+    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                new detail::NPUPinnedAllocator),
+                            platform::NPUPinnedMinChunkSize(),
+                            platform::NPUPinnedMaxChunkSize());
+  });
+
+  return ba;
+}
+
 #endif
 
 template <>
@@ -351,6 +366,59 @@ uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
 #endif
 }
 
+template <>
+size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUPinnedBuddyAllocator()->Used();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
+                                      size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
+  void *ptr = buddy_allocator->Alloc(size);
+
+  if (ptr == nullptr) {
+    LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size
+                 << " bytes in NPUPinnedPlace";
+  }
+  if (FLAGS_init_allocated_mem) {
+    memset(ptr, 0xEF, size);
+  }
+  return ptr;
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
+                                    void *p, size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  GetNPUPinnedBuddyAllocator()->Free(p);
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+uint64_t Release<platform::NPUPinnedPlace>(
+    const platform::NPUPinnedPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUPinnedBuddyAllocator()->Release();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
 // For CUDA
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class GPUBuddyAllocatorList {
diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..507a8589d94ddd1adf925aa5e01c787439624c62
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+void NPUPinnedAllocator::ProcessEventsAndFree() {
+  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
+    aclrtEvent event = it->second;
+    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+    PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));
+
+    if (status == ACL_EVENT_STATUS_COMPLETE) {
+      Allocation *allocation = it->first;
+      void *ptr = allocation->ptr();
+      free(ptr);
+      npu_events_.erase(it++);
+      delete allocation;
+      PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
+    } else {
+      ++it;
+    }
+  }
+}
+
+Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
+  ProcessEventsAndFree();
+  void *ptr;
+  int error = posix_memalign(&ptr, kAlignment, size);
+  PADDLE_ENFORCE_EQ(
+      error, 0,
+      platform::errors::ResourceExhausted(
+          "Fail to alloc memory of %ld size, error code is %d.", size, error));
+  return new Allocation(ptr, size, platform::NPUPinnedPlace());
+}
+
+void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
+  void *ptr = allocation->ptr();
+  auto iter = npu_events_.find(allocation);
+  aclrtEvent event = iter->second;
+  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));
+  if (status == ACL_EVENT_STATUS_COMPLETE) {
+    free(ptr);
+    npu_events_.erase(allocation);
+    delete allocation;
+    PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
+  }
+  return;
+}
+
+uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
+  return static_cast<uint64_t>(0);
+}
+
+void NPUPinnedAllocator::RecordEvent(Allocation *allocation,
+                                     aclrtStream stream) {
+  aclrtEvent event = nullptr;
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event));
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream));
+  npu_events_.insert({allocation, event});
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c856b931ee2cf5b5734d90636b4bfd3dad138da
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include <mutex>  // NOLINT
+#include <string>
+#include <unordered_map>
+
+#include "acl/acl.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/npu_info.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class NPUPinnedAllocator : public Allocator {
+ public:
+  bool IsAllocThreadSafe() const override { return true; }
+  void ProcessEventsAndFree();
+  void RecordEvent(Allocation *allocation, aclrtStream stream);
+  constexpr static size_t kAlignment = 4096UL;
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override;
+  void FreeImpl(Allocation *allocation) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override;
+
+ private:
+  std::unordered_map<Allocation *, aclrtEvent> npu_events_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 0d7065d8bfba0e4ba6f443a3f9e87ee0e1a825a6..d6dc303ebc789ef447f16a955905e3a837776baa 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) {
 }
 
 bool NPUAllocator::UseGpu() const { return true; }
+
+void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
+  if (size <= 0) return nullptr;
+
+  size_t usable =
+      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
+
+  if (size > usable) {
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB pinned memory."
+                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
+    return nullptr;
+  }
+
+  void* p;
+  // PINNED memory is visible to all NPU contexts.
+  auto result = aclrtMallocHost(&p, size);
+
+  if (result == ACL_ERROR_NONE) {
+    *index = 1;  // PINNED memory
+    npu_pinnd_alloc_size_ += size;
+    return p;
+  } else {
+    LOG(WARNING) << "aclrtMallocHost failed.";
+    return nullptr;
+  }
+
+  return nullptr;
+}
+
+void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  aclError err;
+  PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
+                                  "The index should be 1, but got %d", index));
+
+  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
+                    platform::errors::InvalidArgument(
+                        "The size of memory (%d) to free exceeds the size of "
+                        "allocated npu pinned memory (%d)",
+                        size, npu_pinnd_alloc_size_));
+  npu_pinnd_alloc_size_ -= size;
+  err = aclrtFreeHost(p);
+
+  if (err != ACL_ERROR_NONE) {
+    PADDLE_ENFORCE_EQ(
+        err, 0,
+        platform::errors::Fatal(
+            "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d",
+            err));
+  }
+}
+
+bool NPUPinnedAllocator::UseGpu() const { return false; }
+
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index 26711ae4070f5ed72f77519b196c4c354cb049e1..92042f0bbae9f0d29d15b9ed266f57cfa7594412 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator {
   size_t npu_alloc_size_ = 0;
   int npu_id_;
 };
+
+class NPUPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t* index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t npu_pinnd_alloc_size_ = 0;
+};
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 730d49e8acd93022e6e46f7285b9548ed7a5c6d8..a925957e1af1001d69f7d9a6ef6311d997c0edb6 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -245,7 +245,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
 
-    platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
+    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
   }
 }
@@ -294,6 +294,86 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
     }
   }
 }
+
+template <>
+void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
+    platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
+    const void* src, size_t num) {
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place;
+  if (UNLIKELY(num == 0)) return;
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
+    platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place,
+    const void* src, size_t num) {
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place;
+  if (UNLIKELY(num == 0)) return;
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
+    platform::NPUPinnedPlace dst_place, void* dst,
+    platform::NPUPinnedPlace src_place, const void* src, size_t num) {
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place;
+  if (UNLIKELY(num == 0)) return;
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
+    platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place,
+    const void* src, size_t num, aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  platform::SetNPUDeviceId(src_place.device);
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
"memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); + } +} + +template <> +void Copy( + platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, + const void* src, size_t num, aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(dst_place.device); + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not done. + // So, its needed to do wait before sync operation. + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); + } +} + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 0bdc7b69434221ffd91b0df94287df0eae42d89b..56217b4dc7ef5a2adc96bfa9c27aeba33af57893 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -158,6 +158,14 @@ void set_constant_with_place( PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); } +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW( + platform::errors::Unimplemented("NPUPinnedPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 923c97350e89ea9a3de01120bb7df57766247a38..6405b556217660bc0efb52eef33c83a3aceafc80 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } +size_t NPUPinnedMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t NPUPinnedMinChunkSize() { + // Allow to allocate the minimum chunk size is 64 KB. + return 1 << 16; +} + +size_t NPUPinnedMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED + // memory. 
+  return NPUPinnedMaxAllocSize() / 256;
+}
+
 #ifdef PADDLE_WITH_XBYAK
 static Xbyak::util::Cpu cpu;
 bool MayIUse(const cpu_isa_t cpu_isa) {
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 94527149d4e0b459dee03375d56fb0a9526aa055..29dc0a15aaea11c77f926877ab01abadc5ea3a73 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CUDAPinnedMaxChunkSize();
 
+//! Get the maximum allocation size for a machine.
+size_t NPUPinnedMaxAllocSize();
+
+//! Get the minimum chunk size for buddy allocator.
+size_t NPUPinnedMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t NPUPinnedMaxChunkSize();
+
 typedef enum {
   isa_any,
   sse42,
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 9a47ac45462ed7080d34404891fb8410a71d3938..7e983eb54ae2cdb44cf4ae5a949f0fac40ec4835 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool(
       PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
+#endif
+    } else if (platform::is_npu_pinned_place(p)) {
+#ifdef PADDLE_WITH_ASCEND_CL
+      EmplaceDeviceContext<NPUPinnedDeviceContext, NPUPinnedPlace>(
+          &device_contexts_, p);
+#else
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "NPUPinnedPlace is not supported. Please re-compile with "
+          "WITH_ASCEND_CL "
+          "option."));
 #endif
     }
   }
@@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
 Place NPUDeviceContext::GetPlace() const { return place_; }
 
 aclrtContext NPUDeviceContext::context() const { return context_; }
+
+NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
+    : place_(place) {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
+
 #endif
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index d91e14ec3aa923b81976f953d9673175d5217b21..e62f0673e97fadc68de6c7f08591a941e035a4b8 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -233,6 +233,27 @@ template <>
 struct DefaultDeviceContextType<platform::NPUPlace> {
   using TYPE = NPUDeviceContext;
 };
+
+// Currently, NPUPinnedDeviceContext is only used for data copying.
+class NPUPinnedDeviceContext : public DeviceContext {
+ public:
+  NPUPinnedDeviceContext();
+  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
+
+  Place GetPlace() const override;
+
+  Eigen::DefaultDevice* eigen_device() const;
+
+ private:
+  NPUPinnedPlace place_;
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+};
+
+template <>
+struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
+  using TYPE = NPUPinnedDeviceContext;
+};
+
 #endif
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 1cc9fd9fe76341cd495a3580cddbff65f5b0e208..14c772d88897f4fa28e7c37a9452b78b637419a2 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> {
   }
   void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
   void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
+  void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
   void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 
  private:
@@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPinnedPlace(), p);
 }
 
+bool is_npu_pinned_place(const Place &p) {
+  return boost::apply_visitor(IsNPUPinnedPlace(), p);
+}
+
 bool places_are_same_class(const Place &p1, const Place &p2) {
   return p1.which() == p2.which();
 }
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index f20fac477d0ec4ef40a3544476e223b6ad97fffa..62d30ecc5ce2efdc1e87229843ee39685507d771 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -85,10 +85,19 @@ struct NPUPlace {
   int device;
 };
 
+struct NPUPinnedPlace {
+  NPUPinnedPlace() {}
+
+  inline bool operator==(const NPUPinnedPlace &) const { return true; }
+  inline bool operator!=(const NPUPinnedPlace &) const { return false; }
+  inline bool operator<(const NPUPinnedPlace &) const { return false; }
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return true; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
@@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return true; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool
operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return true; } + bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } +}; + +struct IsNPUPinnedPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return true; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; class Place : public boost::variant { + CUDAPinnedPlace, NPUPinnedPlace> { private: - using PlaceBase = - boost::variant; + using PlaceBase = boost::variant; public: Place() = default; @@ -139,6 +161,8 @@ class Place : public boost::variant(place)); @@ -155,6 +179,7 @@ bool is_xpu_place(const Place &); bool is_npu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); +bool is_npu_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); @@ -190,6 +215,17 @@ struct PlaceVisitorWrapper #endif } + typename Visitor::result_type operator()( + const NPUPinnedPlace &npu_pinned) const { +#ifdef PADDLE_WITH_ASCEND_CL + return visitor_(npu_pinned); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. Cannot visit npu_pinned")); + return typename Visitor::result_type(); +#endif + } + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return visitor_(cuda);