Unverified commit 6b3bb796, authored by liym27 and committed by GitHub

[NPU] Support npu pinned allocator and manage Tensor on NPUPinnedPlace (#32840)

Parent 890f626b
@@ -87,6 +87,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
        platform::errors::Unimplemented("platform::NPUPlace is not supported"));
  }
inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::NPUPinnedPlace is not supported"));
}
  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    ::DLContext ctx;
...
@@ -503,6 +503,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
    // return GetResultHelper(out, npu);
  }
bool GetResult(const framework::Tensor& out,
const platform::NPUPinnedPlace& cpu) const {
return *out.data<bool>();
}
  bool GetResult(const framework::Tensor& out,
                 const platform::CPUPlace& cpu) const {
    return *out.data<bool>();
@@ -731,6 +736,18 @@ struct BothFalseVisitor : public boost::static_visitor<> {
      out_ptr[i] = lhs && rhs;
    }
  }
  void VisitorImpl(
      const platform::NPUPinnedPlace& cpu /* behaves like CPUPlace */) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
bool* out_ptr = out_->data<bool>();
for (int i = 0; i < num; ++i) {
bool lhs = !in_ptr[i];
bool rhs = !out_ptr[i];
out_ptr[i] = lhs && rhs;
}
}
};
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
...
@@ -132,6 +132,12 @@ class TensorAddFunctor : public boost::static_visitor<> {
  }
#endif
void operator()(const platform::NPUPinnedPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
  // there is NO blas in CUDAPinnedPlace
  void operator()(const platform::CUDAPinnedPlace& place) {
    PADDLE_THROW(platform::errors::PermissionDenied(
...
@@ -29,6 +29,7 @@ endif()
if (WITH_ASCEND_CL)
  cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
@@ -73,10 +74,15 @@ endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)
if (WITH_ASCEND_CL)
list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
...
@@ -20,6 +20,9 @@
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
...@@ -72,6 +75,7 @@ class AllocatorFacadePrivate { ...@@ -72,6 +75,7 @@ class AllocatorFacadePrivate {
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
} }
InitNaiveBestFitNPUPinnedAllocator();
#endif
      break;
    }
@@ -195,6 +199,12 @@ class AllocatorFacadePrivate {
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
void InitNaiveBestFitNPUPinnedAllocator() {
allocators_[platform::NPUPinnedPlace()] =
std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
}
#endif
class ZeroSizeAllocator : public Allocator {
@@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
      ->Release(place);
}
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place) {
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
}  // namespace allocation
}  // namespace memory
}  // namespace paddle
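For reference, a minimal usage sketch of the new `GetAllocator` entry point (hypothetical driver code, assuming a build with WITH_ASCEND_CL; `AllocatorFacade::Instance()` and `AllocationPtr` are assumed from the existing facade and allocator headers, everything else is from the diff above):

```cpp
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

void PinnedAllocDemo() {
  namespace alloc = paddle::memory::allocation;
  paddle::platform::NPUPinnedPlace place;
  // Fetch the NPUPinnedAllocator registered by
  // InitNaiveBestFitNPUPinnedAllocator() above.
  const std::shared_ptr<alloc::Allocator>& a =
      alloc::AllocatorFacade::Instance().GetAllocator(place);
  // Allocate 1 MiB of host-pinned memory; the allocation is returned to the
  // allocator when the smart pointer is destroyed.
  alloc::AllocationPtr buf = a->Allocate(1 << 20);
}
```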
@@ -15,11 +15,17 @@
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
#ifdef PADDLE_WITH_ASCEND_CL
using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
#endif
// Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should
@@ -46,6 +52,7 @@ class AllocatorFacade {
  // Release unused memory pool.
  uint64_t Release(const platform::Place& place);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
  // TODO(yy): Allocate a Copy-On-Write allocation?
 private:
...
@@ -287,6 +287,21 @@ class NPUBuddyAllocatorList {
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
BuddyAllocator *GetNPUPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator *ba = nullptr;
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUPinnedAllocator),
platform::NPUPinnedMinChunkSize(),
platform::NPUPinnedMaxChunkSize());
});
return ba;
}
#endif
template <>
@@ -351,6 +366,59 @@ uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#endif
}
template <>
size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUPinnedBuddyAllocator()->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
void *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size
<< " bytes in NPUPinnedPlace";
}
if (FLAGS_init_allocated_mem) {
memset(ptr, 0xEF, size);
}
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
void *p, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
GetNPUPinnedBuddyAllocator()->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPinnedPlace>(
const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUPinnedBuddyAllocator()->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}
// For CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList {
...
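As a usage note, the `Used`/`Alloc`/`Free`/`Release` specializations above are called like their CPU and NPU counterparts. A minimal sketch, assuming a build with WITH_ASCEND_CL and assuming these specializations live in `paddle::memory::legacy` like the rest of this file (worth checking against the tree):

```cpp
void LegacyPinnedDemo() {
  paddle::platform::NPUPinnedPlace place;
  const size_t n = 4096;
  // Served from the buddy allocator built over detail::NPUPinnedAllocator.
  void* p = paddle::memory::legacy::Alloc<paddle::platform::NPUPinnedPlace>(
      place, n);
  // p behaves as ordinary host memory and is visible to all NPU contexts.
  paddle::memory::legacy::Free<paddle::platform::NPUPinnedPlace>(place, p, n);
}
```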
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
void NPUPinnedAllocator::ProcessEventsAndFree() {
for (auto it = npu_events_.begin(); it != npu_events_.end();) {
aclrtEvent event = it->second;
aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));
if (status == ACL_EVENT_STATUS_COMPLETE) {
Allocation *allocation = it->first;
void *ptr = allocation->ptr();
free(ptr);
npu_events_.erase(it++);
delete allocation;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
} else {
++it;
}
}
}
Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
ProcessEventsAndFree();
void *ptr;
int error = posix_memalign(&ptr, kAlignment, size);
PADDLE_ENFORCE_EQ(
error, 0,
      platform::errors::ResourceExhausted(
          "Failed to allocate %ld bytes, error code is %d.", size, error));
return new Allocation(ptr, size, platform::NPUPinnedPlace());
}
void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
  void *ptr = allocation->ptr();
  auto iter = npu_events_.find(allocation);
  if (iter == npu_events_.end()) {
    // No event was recorded for this allocation, so it is safe to free now.
    free(ptr);
    delete allocation;
    return;
  }
  aclrtEvent event = iter->second;
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));
  if (status == ACL_EVENT_STATUS_COMPLETE) {
    free(ptr);
    npu_events_.erase(allocation);
    delete allocation;
    PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
  }
  // Otherwise the free is deferred until ProcessEventsAndFree() observes the
  // event as complete.
}
uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
return static_cast<uint64_t>(0);
}
void NPUPinnedAllocator::RecordEvent(Allocation *allocation,
aclrtStream stream) {
aclrtEvent event = nullptr;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event));
PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream));
npu_events_.insert({allocation, event});
}
} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
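The allocator above defers freeing until any recorded event completes. A sketch of the intended call pattern (a hypothetical helper, assuming WITH_ASCEND_CL): after launching an async device-to-host copy into a pinned allocation, record an event so `FreeImpl`/`ProcessEventsAndFree` release the host buffer only once the copy has finished.

```cpp
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#include "paddle/fluid/platform/enforce.h"

void AsyncCopyToPinned(paddle::memory::allocation::NPUPinnedAllocator* alloc,
                       paddle::memory::allocation::Allocation* pinned,
                       const void* npu_src, size_t n, aclrtStream stream) {
  // Async D2H copy into the pinned host buffer.
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpyAsync(
      pinned->ptr(), n, npu_src, n, ACL_MEMCPY_DEVICE_TO_HOST, stream));
  // Tie the allocation's lifetime to the event recorded on this stream.
  alloc->RecordEvent(pinned, stream);
}
```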
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUPinnedAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override { return true; }
void ProcessEventsAndFree();
void RecordEvent(Allocation *allocation, aclrtStream stream);
constexpr static size_t kAlignment = 4096UL;
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
uint64_t ReleaseImpl(const platform::Place &place) override;
private:
std::unordered_map<Allocation *, aclrtEvent> npu_events_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
#endif
@@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) {
}
bool NPUAllocator::UseGpu() const { return true; }
void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  size_t usable =
      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
  if (size > usable) {
    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
                 << " MB pinned memory, only " << usable / 1024.0 / 1024.0
                 << " MB available";
    return nullptr;
  }

  void* p;
  // PINNED memory is visible to all NPU contexts.
  auto result = aclrtMallocHost(&p, size);

  if (result == ACL_ERROR_NONE) {
    *index = 1;  // PINNED memory
    npu_pinnd_alloc_size_ += size;
    return p;
  }

  LOG(WARNING) << "aclrtMallocHost failed.";
  return nullptr;
}
void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
aclError err;
PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated npu pinned memory (%d)",
size, npu_pinnd_alloc_size_));
npu_pinnd_alloc_size_ -= size;
  err = aclrtFreeHost(p);
  PADDLE_ENFORCE_EQ(
      err, ACL_ERROR_NONE,
      platform::errors::Fatal(
          "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d",
          err));
}
bool NPUPinnedAllocator::UseGpu() const { return false; }
#endif
}  // namespace detail
...
@@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator {
  size_t npu_alloc_size_ = 0;
  int npu_id_;
};
class NPUPinnedAllocator : public SystemAllocator {
public:
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_pinnd_alloc_size_ = 0;
};
#endif
}  // namespace detail
...
@@ -245,7 +245,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
-    platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
+    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
  }
}
@@ -294,6 +294,86 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
    }
  }
}
template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
const void* src, size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
platform::NPUPinnedPlace dst_place, void* dst,
platform::NPUPinnedPlace src_place, const void* src, size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place,
const void* src, size_t num, aclrtStream stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
}
template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
const void* src, size_t num, aclrtStream stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
} else {
    // On NPU, an async operation after a sync operation is OK, while a sync
    // operation after an async one is not, because the async operation may
    // not have finished yet. So a wait is needed before the sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
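A minimal sketch of the new `memory::Copy` specializations (hypothetical helper; the buffers and stream are placeholders, and a build with WITH_ASCEND_CL is assumed):

```cpp
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

void StageToHost(void* pinned_dst, const void* npu_src, size_t num_bytes,
                 aclrtStream stream) {
  paddle::platform::NPUPinnedPlace pinned;
  paddle::platform::NPUPlace npu(0);  // assumed device 0
  // Async NPU -> pinned-host copy; passing a null stream instead takes the
  // synchronous fallback branch shown above.
  paddle::memory::Copy(pinned, pinned_dst, npu, npu_src, num_bytes, stream);
}
```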
@@ -158,6 +158,14 @@ void set_constant_with_place<platform::NPUPlace>(
  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
...
@@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() {
  return CUDAPinnedMaxAllocSize() / 256;
}
size_t NPUPinnedMaxAllocSize() {
  // In distributed systems, the fraction of (host) memory to use must be
  // configured and limited. Note that this reuses the CUDA pinned-memory flag.
  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}
size_t NPUPinnedMinChunkSize() {
  // The minimum chunk size allowed to be allocated is 64 KB.
  return 1 << 16;
}
size_t NPUPinnedMaxChunkSize() {
  // The maximum chunk size allowed is roughly 1/256 of the maximum NPU
  // pinned memory size.
  return NPUPinnedMaxAllocSize() / 256;
}
#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) {
...
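A worked example of the chunk-size arithmetic above, under assumed values (FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5 and 64 GiB of host RAM):

```cpp
#include <cstdio>

int main() {
  const double fraction = 0.5;                    // assumed flag value
  const size_t total_ram = 64ULL << 30;           // assumed 64 GiB host RAM
  const size_t max_alloc = fraction * total_ram;  // NPUPinnedMaxAllocSize(): 32 GiB
  const size_t min_chunk = 1 << 16;               // NPUPinnedMinChunkSize(): 64 KiB
  const size_t max_chunk = max_alloc / 256;       // NPUPinnedMaxChunkSize(): 128 MiB
  std::printf("max_alloc=%zu min_chunk=%zu max_chunk=%zu\n", max_alloc,
              min_chunk, max_chunk);
  return 0;
}
```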
@@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
typedef enum {
  isa_any,
  sse42,
...
@@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool(
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
#endif
} else if (platform::is_npu_pinned_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUPinnedDeviceContext, NPUPinnedPlace>(
&device_contexts_, p);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
    }
  }
@@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext NPUDeviceContext::context() const { return context_; }
NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
: place_(place) {
eigen_device_.reset(new Eigen::DefaultDevice());
}
Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
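A minimal sketch of fetching the pinned-place context that `DeviceContextPool` now constructs (assuming WITH_ASCEND_CL; the cast mirrors how other place-specific contexts are retrieved):

```cpp
#include "paddle/fluid/platform/device_context.h"

void PinnedContextDemo() {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* ctx = static_cast<paddle::platform::NPUPinnedDeviceContext*>(
      pool.Get(paddle::platform::NPUPinnedPlace()));
  // Host-side Eigen device, usable for element-wise work on pinned tensors.
  Eigen::DefaultDevice* dev = ctx->eigen_device();
  (void)dev;
}
```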
@@ -233,6 +233,27 @@ template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
  using TYPE = NPUDeviceContext;
};
// Currently, NPUPinnedDeviceContext is used only for data copying.
class NPUPinnedDeviceContext : public DeviceContext {
public:
NPUPinnedDeviceContext();
explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
Place GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
private:
NPUPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
template <>
struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
using TYPE = NPUPinnedDeviceContext;
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
@@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> {
  }
  void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
  void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 private:
@@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) {
  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
}
bool is_npu_pinned_place(const Place &p) {
return boost::apply_visitor(IsNPUPinnedPlace(), p);
}
bool places_are_same_class(const Place &p1, const Place &p2) {
  return p1.which() == p2.which();
}
...
@@ -85,10 +85,19 @@ struct NPUPlace {
  int device;
};
struct NPUPinnedPlace {
NPUPinnedPlace() {}
inline bool operator==(const NPUPinnedPlace &) const { return true; }
inline bool operator!=(const NPUPinnedPlace &) const { return false; }
inline bool operator<(const NPUPinnedPlace &) const { return false; }
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return true; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
@@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return true; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
@@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
@@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return true; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
@@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return true; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return true; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
-class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
-                                    CUDAPinnedPlace> {
+class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
+                                    CUDAPinnedPlace, NPUPinnedPlace> {
 private:
-  using PlaceBase =
-      boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace, CUDAPinnedPlace>;
+  using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
+                                   CUDAPinnedPlace, NPUPinnedPlace>;
 public:
  Place() = default;
@@ -139,6 +161,8 @@ class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
  Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
  Place(const CUDAPinnedPlace &cuda_pinned_place)  // NOLINT
      : PlaceBase(cuda_pinned_place) {}
Place(const NPUPinnedPlace &npu_pinned_place) // NOLINT
: PlaceBase(npu_pinned_place) {}
  bool operator<(const Place &place) const {
    return PlaceBase::operator<(static_cast<const PlaceBase &>(place));
@@ -155,6 +179,7 @@ bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &);
@@ -190,6 +215,17 @@ struct PlaceVisitorWrapper
#endif
  }
typename Visitor::result_type operator()(
const NPUPinnedPlace &npu_pinned) const {
#ifdef PADDLE_WITH_ASCEND_CL
return visitor_(npu_pinned);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu_pinned"));
return typename Visitor::result_type();
#endif
}
  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    return visitor_(cuda);
...
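Finally, a minimal sketch of the extended `Place` variant in action (assuming WITH_ASCEND_CL; `operator<<` is the existing printer backed by `PlacePrinter`):

```cpp
#include <iostream>
#include "paddle/fluid/platform/place.h"

int main() {
  paddle::platform::Place p = paddle::platform::NPUPinnedPlace();
  // Dispatches to the new IsNPUPinnedPlace visitor via boost::apply_visitor.
  std::cout << paddle::platform::is_npu_pinned_place(p) << "\n";  // prints 1
  std::cout << p << "\n";  // prints "NPUPinnedPlace"
  return 0;
}
```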