Unverified commit 81138239, authored by Leo Chen, committed by GitHub

[feature] support npu allocator (#30840)

[feature] support npu allocator
Parent ebef6601
......@@ -32,11 +32,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some ACL-related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
if (WITH_GPU AND WITH_ASCEND)
if (WITH_GPU AND WITH_ASCEND)
message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
endif()
# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
......
......@@ -82,6 +82,10 @@ if(WITH_ASCEND)
add_definitions(-DPADDLE_WITH_ASCEND)
endif()
if(WITH_ASCEND_CL)
add_definitions(-DPADDLE_WITH_ASCEND_CL)
endif()
if(WITH_XPU)
message(STATUS "Compile with XPU!")
add_definitions(-DPADDLE_WITH_XPU)
......
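For orientation: enabling WITH_ASCEND_CL makes CMake add the PADDLE_WITH_ASCEND_CL definition shown above, and every NPU code path in the rest of this diff is fenced by that macro. A minimal sketch of the pattern (the function name here is hypothetical, not part of this change):

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"

// Compiled only when CMake is configured with -DWITH_ASCEND_CL=ON.
static void TouchNpuDevice() { paddle::platform::SetNPUDeviceId(0); }
#endif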
......@@ -21,38 +21,58 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
elseif(WITH_ASCEND_CL)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
add_custom_target(extern_ascend DEPENDS atlas_acl)
endif()
......@@ -274,10 +274,10 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_ASCEND)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
list(APPEND third_party_deps extern_ascend)
endif (WITH_ASCEND)
endif ()
if (WITH_PSCORE)
include(external/snappy)
......
......@@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::NPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::NPUPlace is not supported"));
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
::DLContext ctx;
......
......@@ -431,6 +431,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(platform::errors::Unimplemented(
"Not supported on place (%s) ",
npu));
//return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
......@@ -633,6 +641,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
//TODO(zhiqiu)
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
......
......@@ -116,6 +116,23 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) {
// TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const platform::NPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
......
......@@ -19,6 +19,10 @@ if (WITH_GPU)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
endif()
if (WITH_ASCEND_CL)
cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
......
......@@ -42,6 +42,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#include "paddle/fluid/platform/npu_info.h"
DEFINE_int64(
gpu_allocator_retry_time, 10000,
......@@ -76,6 +77,11 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
}
#endif
break;
}
......@@ -195,6 +201,12 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
......
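As a usage note on the facade change above: under the naive-best-fit strategy the facade now registers one NaiveBestFitAllocator per NPU, so a facade-level allocation against an NPUPlace is routed to it. A minimal sketch (hypothetical function name; assumes a WITH_ASCEND_CL build with at least one NPU visible):

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"

void FacadeNpuAllocSketch() {
  paddle::platform::NPUPlace place(0);
  // Dispatched to the allocator registered by InitNaiveBestFitNPUAllocator(place).
  auto allocation = paddle::memory::Alloc(place, 1 << 20);
  // The memory is returned to the allocator when the handle goes out of scope.
}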
......@@ -23,6 +23,7 @@
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
......@@ -112,6 +113,7 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
......@@ -216,6 +218,136 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif
}
// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
auto npu_num = devices_.size();
allocators_.resize(npu_num);
init_flags_.reserve(npu_num);
for (size_t i = 0; i < npu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static NPUBuddyAllocatorList *CreateNewInstance() {
return new NPUBuddyAllocatorList();
}
public:
static NPUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for GPU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}
#endif
template <>
size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::NPUDeviceGuard guard(place.device);
size_t avail, total;
platform::NPUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
aclrtMemset(ptr, size, 0xEF, size);
}
}
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetNPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
return GetNPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'NPUPlace' is not supported in CPU only device."));
#endif
}
// For CUDA
#ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList {
private:
......
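For reference, the new NPUPlace specializations above can also be driven directly. A rough sketch, written as if from inside the same namespace as those definitions (the exact namespace is assumed to match the surrounding GPU/XPU specializations and may differ):

using paddle::platform::NPUPlace;

void BuddyNpuSketch() {
  NPUPlace place(0);
  // Served by GetNPUBuddyAllocator(0) defined above.
  void *p = Alloc<NPUPlace>(place, 4096);
  size_t used = Used<NPUPlace>(place);   // bytes currently tracked by the buddy allocator
  Free<NPUPlace>(place, p, 4096);
  Release<NPUPlace>(place);              // return idle chunks to the device
  (void)used;
}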
......@@ -14,6 +14,8 @@
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include <unistd.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
......@@ -69,6 +71,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::NPUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::NPUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/npu_allocator.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
place_.device);
delete allocation;
}
Allocation* NPUAllocator::AllocateImpl(size_t size) {
std::call_once(once_flag_,
[this] { platform::SetNPUDeviceId(place_.device); });
void* ptr;
auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
if (LIKELY(result == ACL_ERROR_NONE)) {
return new Allocation(ptr, size, platform::Place(place_));
}
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, place_.device);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
"value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
"GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please decrease the batch size of your model. %s\n\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(avail), place_.device, err_msg));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
class NPUAllocator : public Allocator {
public:
explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
bool IsAllocThreadSafe() const override;
protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
private:
platform::NPUPlace place_;
std::once_flag once_flag_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
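A minimal usage sketch for the new allocation::NPUAllocator (hypothetical function name; calls go through the base Allocator interface, which forwards to the AllocateImpl/FreeImpl shown above):

#include "paddle/fluid/memory/allocation/npu_allocator.h"

void NpuAllocatorSketch() {
  paddle::memory::allocation::NPUAllocator allocator(paddle::platform::NPUPlace(0));
  // AllocateImpl -> RecordedNPUMalloc on device 0.
  auto allocation = allocator.Allocate(1 << 20);
  // Destroying the handle triggers FreeImpl -> RecordedNPUFree.
  allocation.reset();
}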
......@@ -2,11 +2,15 @@ include(ExternalProject)
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
set(system_allocator_DEPS gflags cpu_info place)
if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else(${WITH_GPU})
elseif(${WITH_ASCEND_CL})
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
else()
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
endif(${WITH_GPU})
endif()
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
......
......@@ -21,6 +21,10 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
namespace paddle {
namespace memory {
......@@ -235,6 +239,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for the NPU for the first allocation.
allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size; we store the re-allocation size when the
// user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::NPUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
// Allocate a new block
void* p = system_allocator_->Alloc(&index, allocate_bytes);
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace memory {
......
......@@ -19,13 +19,15 @@ limitations under the License. */
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
#include <fstream>
#include <string>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include <fstream>
#include <string>
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
......@@ -324,6 +326,33 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.005;
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator, 300 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -35,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -239,6 +240,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void* NPUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
if (result == ACL_ERROR_NONE) {
*index = 0;
npu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedNPUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, npu_id_);
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on NPU %d. "
"Cannot allocate %s memory on NPU %d, "
"available memory is only %s.\n\n"
"Please check whether there is any other process using NPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
}
bool NPUAllocator::UseGpu() const { return true; }
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUAllocator : public SystemAllocator {
public:
explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t npu_alloc_size_ = 0;
int npu_id_;
};
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -81,3 +81,11 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
TEST(NPUAllocator, Alloc) {
paddle::memory::detail::NPUAllocator a(0);
TestAllocator(&a, 1<<20);
TestAllocator(&a, 1);
}
#endif
......@@ -148,6 +148,13 @@ void set_constant_with_place<platform::XPUPlace>(
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
......@@ -71,6 +71,10 @@ if(WITH_ASCEND)
cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
endif()
if(WITH_ASCEND_CL)
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
endif()
add_subdirectory(dynload)
add_subdirectory(stream)
......
......@@ -228,6 +228,33 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
// NOTE(zhiqiu): Usually, no need to create context explicitly,
// ACL creates a default context which contains 1 default stream
// and 1 sync stream after aclrtSetDevice.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
}
NPUDeviceContext::~NPUDeviceContext() {
// NPUDeviceGuard guard(place_.device);
// PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}
void NPUDeviceContext::Wait() const {
NPUDeviceGuard guard(place_.device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
Place NPUDeviceContext::GetPlace() const { return place_; }
aclrtContext* NPUDeviceContext::context() const {
return const_cast<aclrtContext*>(&context_);
}
#endif
#ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice : public Eigen::StreamInterface {
......
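A small sketch of the new NPUDeviceContext (hypothetical function name; in practice contexts are normally obtained from DeviceContextPool rather than constructed directly):

#include "paddle/fluid/platform/device_context.h"

void NpuContextSketch() {
  paddle::platform::NPUPlace place(0);
  // Picks up the default ACL context created by aclrtSetDevice.
  paddle::platform::NPUDeviceContext ctx(place);
  // ... enqueue work on the device ...
  ctx.Wait();                              // aclrtSynchronizeDevice under a device guard
  aclrtContext *acl_ctx = ctx.context();   // raw ACL handle if needed
  (void)acl_ctx;
}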
......@@ -59,6 +59,11 @@ struct GpuDevice;
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#include "paddle/fluid/platform/npu_info.h"
#endif
namespace paddle {
namespace platform {
......@@ -77,11 +82,13 @@ enum DeviceType {
CPU = 0,
CUDA = 1,
XPU = 2,
NPU = 3,
};
constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kNPU = DeviceType::NPU;
class DeviceContext {
public:
......@@ -153,6 +160,46 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDeviceContext : public DeviceContext {
public:
explicit NPUDeviceContext(NPUPlace place);
virtual ~NPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
Place GetPlace() const override;
aclrtContext* context() const;
/*! \brief Wait for all operations completion in the stream. */
void Wait() const override;
#ifdef PADDLE_WITH_ASCEND_HCCL
/*! \brief Return hccl context. */
HCCLContext_t hccl_context() const { return hccl_context_; }
/*! \brief Set hccl context. */
void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
#endif
private:
NPUPlace place_;
aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_HCCL
HCCLContext_t hccl_context_;
#endif
// Needs to be the same as other DeviceContexts,
// even though eigen_device_ is not used on NPU.
// NOTE(zhiqiu): why is it needed?
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
#endif
#ifdef PADDLE_WITH_CUDA
class CudnnWorkspaceHandle;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cudnn.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
......@@ -214,3 +215,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif
......@@ -38,6 +38,10 @@ limitations under the License. */
#include "paddle/fluid/platform/cuda_error.pb.h"
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h"
#endif // PADDLE_WITH_ASCEND_CL
#include <fstream>
#include <iomanip>
#include <memory>
......@@ -940,7 +944,6 @@ DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
......@@ -996,5 +999,40 @@ inline void retry_sleep(unsigned milliseconds) {
#undef DEFINE_CUDA_STATUS_TYPE
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_ASCEND_CL
namespace details {
template <typename T>
struct NPUStatusType {};
#define DEFINE_NPU_STATUS_TYPE(type, success_value) \
template <> \
struct NPUStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
} // namespace details
inline std::string build_npu_error_msg(aclError stat) {
std::string s = " ACL error, the error code is : " + stat;
return s;
}
#define PADDLE_ENFORCE_NPU_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __NPU_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::NPUStatusType< \
__NPU_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_npu_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
#endif // PADDLE_WITH_ASCEND_CL
} // namespace platform
} // namespace paddle
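The new macro is used the same way as PADDLE_ENFORCE_CUDA_SUCCESS: wrap any ACL call that returns aclError, and a result other than ACL_ERROR_NONE becomes a Paddle exception. A minimal sketch (hypothetical function name):

#include "paddle/fluid/platform/enforce.h"

void EnforceNpuSketch() {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(0));

  void *dev_ptr = nullptr;
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclrtMalloc(&dev_ptr, 4096, ACL_MEM_MALLOC_HUGE_FIRST));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtFree(dev_ptr));
}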
......@@ -45,7 +45,8 @@ DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
#ifdef PADDLE_WITH_CUDA
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
/**
* CUDA related related FLAG
......@@ -377,7 +378,8 @@ DEFINE_double(
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
#ifdef PADDLE_WITH_CUDA
// NOTE(zhiqiu): better to share the flags, otherwise we will have too many flags.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
/**
* Memory related FLAG
......
......@@ -80,6 +80,7 @@ static int GetCUDADeviceCountImpl() {
}
int GetCUDADeviceCount() {
// cache the count
static auto dev_cnt = GetCUDADeviceCountImpl();
return dev_cnt;
}
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
......@@ -70,6 +71,7 @@ namespace framework {
std::once_flag gflags_init_flag;
std::once_flag glog_init_flag;
std::once_flag npu_init_flag;
bool InitGflags(std::vector<std::string> args) {
bool successed = false;
......@@ -148,6 +150,17 @@ void InitDevices() {
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
platform::AclInstance::Instance(); // NOLINT
try {
// use user specified NPUs in single-node multi-process mode.
devices = platform::GetSelectedNPUDevices();
} catch (const std::exception &exp) {
LOG(WARNING)
<< "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime.";
}
#endif
InitDevices(devices);
}
......
......@@ -35,3 +35,13 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size)
DEFINE_INT_STATUS(STAT_gpu13_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_mem_size)
// For Ascend NPU
DEFINE_INT_STATUS(STAT_npu0_mem_size)
DEFINE_INT_STATUS(STAT_npu1_mem_size)
DEFINE_INT_STATUS(STAT_npu2_mem_size)
DEFINE_INT_STATUS(STAT_npu3_mem_size)
DEFINE_INT_STATUS(STAT_npu4_mem_size)
DEFINE_INT_STATUS(STAT_npu5_mem_size)
DEFINE_INT_STATUS(STAT_npu6_mem_size)
DEFINE_INT_STATUS(STAT_npu7_mem_size)
......@@ -187,3 +187,13 @@ class StatRegistry {
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size)
#define USE_NPU_MEM_STAT \
USE_INT_STAT(STAT_npu0_mem_size); \
USE_INT_STAT(STAT_npu1_mem_size); \
USE_INT_STAT(STAT_npu2_mem_size); \
USE_INT_STAT(STAT_npu3_mem_size); \
USE_INT_STAT(STAT_npu4_mem_size); \
USE_INT_STAT(STAT_npu5_mem_size); \
USE_INT_STAT(STAT_npu6_mem_size); \
USE_INT_STAT(STAT_npu7_mem_size)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/npu_info.h"
#include <algorithm>
#include <cstdlib>
#include <memory>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/string/split.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_string(selected_gpus);
DECLARE_uint64(gpu_memory_limit_mb);
constexpr static float fraction_reserve_gpu_memory = 0.05f;
USE_NPU_MEM_STAT;
namespace paddle {
namespace platform {
static int GetNPUDeviceCountImpl() {
uint32_t count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDeviceCount(&count));
return count;
}
int GetNPUDeviceCount() {
static auto dev_cnt = GetNPUDeviceCountImpl();
return dev_cnt;
}
// For example, "1.0.1"
std::string GetNPURuntimeVersion(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
int major = 0, minor = 0, patch = 0;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetVersion(&major, &minor, &patch));
return string::Sprintf("%d.%d.%d", major, minor, patch);
}
int GetCurrentNPUDeviceId() {
int device_id;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDevice(&device_id));
return device_id;
}
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices() {
// use user specified NPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetNPUDeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void SetNPUDeviceId(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
// NOTE(zhiqiu): It is recommended to call aclrtSetDevice and aclrtResetDevice
// in pairs.
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(id));
}
void ResetNPUDeviceId(int id) {
PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(),
platform::errors::InvalidArgument(
"Device id must be less than NPU count, "
"but received id is: %d. NPU count is: %d.",
id, GetNPUDeviceCount()));
PADDLE_ENFORCE_NPU_SUCCESS(aclrtResetDevice(id));
}
void NPUMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
RecordedNPUMemGetInfo(available, total, &actual_available, &actual_total,
platform::GetCurrentNPUDeviceId());
}
size_t NPUAvailableMemToAlloc() {
size_t total = 0;
size_t available = 0;
NPUMemoryUsage(&available, &total);
size_t reserving =
static_cast<size_t>(fraction_reserve_gpu_memory * available);
// If available size is less than minimum chunk size, no usable memory exists
size_t available_to_alloc = available - reserving;
size_t min_chunk_size = NPUMinChunkSize();
if (available_to_alloc < min_chunk_size) {
available_to_alloc = 0;
}
VLOG(10) << "NPU usage " << (available >> 20) << "M/" << (total >> 20)
<< "M, " << (available_to_alloc >> 20) << "M available to allocate";
return available_to_alloc;
}
size_t NPUMaxAllocSize() {
return std::max(NPUInitAllocSize(), NPUReallocSize());
}
static size_t NPUAllocSize(bool realloc) {
size_t available_to_alloc = NPUAvailableMemToAlloc();
PADDLE_ENFORCE_GT(
available_to_alloc, 0,
platform::errors::ResourceExhausted("Not enough available NPU memory."));
// If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
// allocated by fraction
size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
: FLAGS_initial_gpu_memory_in_mb;
size_t alloc_bytes =
(flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(
available_to_alloc, alloc_bytes,
platform::errors::ResourceExhausted("Not enough available NPU memory."));
VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
<< " MiB, is it Re-alloc: " << realloc;
return alloc_bytes;
}
size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
size_t NPUMinChunkSize() {
// The minimum chunk size allowed to be allocated is 256 bytes.
return 1 << 8;
}
size_t NPUMaxChunkSize() {
size_t max_chunk_size = NPUMaxAllocSize();
VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
return max_chunk_size;
}
void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count) {
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream));
}
void NPUMemcpySync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count) {
// NOTE(zhiqiu): The default max_count is count
dst_max_count = dst_max_count ? dst_max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind));
}
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count) {
max_count = max_count ? max_count : count;
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtMemsetAsync(dst, max_count, value, count, stream));
}
void NPUStreamSync(aclrtStream stream) {
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
}
static void RaiseNonOutOfMemoryError(aclError *status) {
if (*status == ACL_ERROR_BAD_ALLOC) {
*status = ACL_ERROR_NONE;
}
PADDLE_ENFORCE_NPU_SUCCESS(*status);
}
class RecordedNPUMallocHelper {
private:
explicit RecordedNPUMallocHelper(int dev_id, uint64_t limit_size = 0)
: dev_id_(dev_id), limit_size_(limit_size) {
if (NeedRecord()) {
mtx_.reset(new std::mutex());
}
}
DISABLE_COPY_AND_ASSIGN(RecordedNPUMallocHelper);
public:
static RecordedNPUMallocHelper *Instance(int dev_id) {
std::call_once(once_flag_, [] {
int dev_cnt = GetNPUDeviceCount();
instances_.reserve(dev_cnt);
for (int i = 0; i < dev_cnt; ++i) {
// NOTE(zhiqiu): share the flags with gpu, avoid more flags.
instances_.emplace_back(
new RecordedNPUMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
}
});
PADDLE_ENFORCE_GE(
dev_id, 0,
platform::errors::OutOfRange(
"Device id must be not less than 0, but got %d.", dev_id));
PADDLE_ENFORCE_LT(
dev_id, instances_.size(),
platform::errors::OutOfRange("Device id %d exceeds npu card number %d.",
dev_id, instances_.size()));
return instances_[dev_id].get();
}
/**
* Try to allocate `size` bytes of NPU memory. Only ACL_ERROR_BAD_ALLOC
* or ACL_ERROR_NONE will be returned.
*/
aclError Malloc(void **ptr, size_t size) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
return ACL_ERROR_BAD_ALLOC;
}
NPUDeviceGuard guard(dev_id_);
auto result = aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
if (result == ACL_ERROR_NONE) {
if (NeedRecord()) {
cur_size_ += size;
}
STAT_INT_ADD("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
return result;
} else {
RaiseNonOutOfMemoryError(&result);
// A non out-of-memory error would be raised inside
// RaiseNonOutOfMemoryError. Therefore, we can
// return ACL_ERROR_BAD_ALLOC directly here.
return ACL_ERROR_BAD_ALLOC;
}
}
/**
* Free NPU memory. Usually, free is not allowed to raise an error.
* If it does raise an error, the process should crash.
*/
void Free(void *ptr, size_t size) {
NPUDeviceGuard guard(dev_id_);
auto result = aclrtFree(ptr);
PADDLE_ENFORCE_NPU_SUCCESS(result);
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size;
}
STAT_INT_SUB("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
}
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total) {
{
NPUDeviceGuard guard(dev_id_);
auto result = aclrtGetMemInfo(ACL_HBM_MEM, actual_avail, actual_total);
if (result != ACL_ERROR_NONE) {
*actual_avail = 0;
}
RaiseNonOutOfMemoryError(&result);
}
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
*avail = std::min(*actual_avail, limit_size_ - cur_size_);
*total = std::min(*actual_total, limit_size_);
return *total < *actual_total;
} else {
*avail = *actual_avail;
*total = *actual_total;
return false;
}
}
inline bool NeedRecord() const { return limit_size_ != 0; }
uint64_t RecordedSize() const {
LockGuardPtr<std::mutex> lock(mtx_);
return NeedRecord() ? cur_size_ : 0;
}
uint64_t LimitSize() const { return limit_size_; }
private:
const int dev_id_;
const uint64_t limit_size_;
uint64_t cur_size_{0};
mutable std::unique_ptr<std::mutex> mtx_;
static std::once_flag once_flag_;
static std::vector<std::unique_ptr<RecordedNPUMallocHelper>> instances_;
};
std::once_flag RecordedNPUMallocHelper::once_flag_;
std::vector<std::unique_ptr<RecordedNPUMallocHelper>>
RecordedNPUMallocHelper::instances_;
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->Malloc(ptr, size);
}
void RecordedNPUFree(void *p, size_t size, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->Free(p, size);
}
bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->GetMemInfo(
avail, total, actual_avail, actual_total);
}
uint64_t RecordedNPUMallocSize(int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->RecordedSize();
}
bool IsNPUMallocRecorded(int dev_id) {
return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
}
} // namespace platform
} // namespace paddle
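Taken together, the helpers above provide a recorded malloc/free pair plus thin memcpy/memset wrappers. A rough sketch of copying a host buffer to device 0 with them (hypothetical function name; assumes a WITH_ASCEND_CL build):

#include <vector>
#include "paddle/fluid/platform/npu_info.h"

void RecordedCopySketch() {
  namespace plat = paddle::platform;
  std::vector<float> host(1024, 1.0f);
  const size_t bytes = host.size() * sizeof(float);

  void *dev = nullptr;
  PADDLE_ENFORCE_EQ(plat::RecordedNPUMalloc(&dev, bytes, /*dev_id=*/0),
                    ACL_ERROR_NONE,
                    plat::errors::ResourceExhausted("NPU malloc failed"));
  plat::NPUMemcpySync(dev, host.data(), bytes, ACL_MEMCPY_HOST_TO_DEVICE);
  plat::RecordedNPUFree(dev, bytes, /*dev_id=*/0);
}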
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <stddef.h>
#include <string>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
//! Get the total number of NPU devices in system.
int GetNPUDeviceCount();
//! Get the runtime version of the ith NPU
std::string GetNPURuntimeVersion(int id);
//! Get the current NPU device id in system.
int GetCurrentNPUDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedNPUDevices();
//! Set the NPU device id for next execution.
void SetNPUDeviceId(int device_id);
//! Reset the NPU device id for next execution.
void ResetNPUDeviceId(int device_id);
//! Get the memory usage of current NPU device.
void NPUMemoryUsage(size_t *available, size_t *total);
//! Get the available memory to allocate, which is the available NPU memory
//! minus the reserved size.
size_t NPUAvailableMemToAlloc();
//! Get the maximum allocation size of current NPU device.
size_t NPUMaxAllocSize();
//! Get the initial allocation size of current NPU device.
size_t NPUInitAllocSize();
//! Get the re-allocation size of current NPU device.
size_t NPUReallocSize();
//! Get the minimum chunk size for NPU buddy allocator.
size_t NPUMinChunkSize();
//! Get the maximum chunk size for NPU buddy allocator.
size_t NPUMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void NPUMemcpyAsync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, aclrtStream stream,
size_t dst_max_count = 0);
//! Copy memory from address src to dst synchronously.
void NPUMemcpySync(void *dst, const void *src, size_t count,
enum aclrtMemcpyKind kind, size_t dst_max_count = 0);
//! Set memory dst with value count size asynchronously
void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
size_t max_count = 0);
//! Blocks until stream has completed all operations.
void NPUStreamSync(aclrtStream stream);
//! aclrtMalloc with recorded info
aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id);
//! aclrtFree with recorded info
void RecordedNPUFree(void *p, size_t size, int dev_id);
//! Get available and total NPU memory, taking the memory limit into account
bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id);
//! Get recorded aclrtMalloc size. If record is disabled, return 0.
uint64_t RecordedNPUMallocSize(int dev_id);
bool IsNPUMallocRecorded(int dev_id);
class NPUDeviceGuard {
public:
explicit inline NPUDeviceGuard(int dev_id) {
int prev_id = platform::GetCurrentNPUDeviceId();
if (prev_id != dev_id) {
prev_id_ = prev_id;
platform::SetNPUDeviceId(dev_id);
}
}
inline ~NPUDeviceGuard() {
if (prev_id_ != -1) {
platform::SetNPUDeviceId(prev_id_);
}
}
NPUDeviceGuard(const NPUDeviceGuard &o) = delete;
NPUDeviceGuard &operator=(const NPUDeviceGuard &o) = delete;
private:
int prev_id_{-1};
};
class AclInstance {
public:
// NOTE(zhiqiu): Commonly, throwing an exception in a destructor is not
// recommended, so no PADDLE_ENFORCE here; call the ACL API directly.
~AclInstance() {}
AclInstance(const AclInstance &o) = delete;
const AclInstance &operator=(const AclInstance &o) = delete;
static AclInstance &Instance() {
static AclInstance instance;
return instance;
}
void Finalize() {
// NOTE(zhiqiu): DO NOT perform finalize in destructor
// to avoid problems caused by destructor order of static
// object.
for (size_t i = 0; i < devices_.size(); ++i) {
auto status = aclrtResetDevice(devices_[i]);
VLOG(4) << "Call aclrtResetDevice " << devices_[i]
<< " status = " << status;
}
auto status = aclFinalize();
VLOG(4) << "Call aclFinalize, status = " << status;
}
private:
// forbid calling default constructor
AclInstance() {
PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr));
VLOG(4) << "Call aclrtSetDevice ";
// NOTE(zhiqiu): why set devices here?
// Because ACL creates a default context which contains 2 streams
// when calling aclrtSetDevice, so usually we do not need to
// create contexts explicitly. And, for each device, aclrtSetDevice
// needs to be called in pairs with aclrtResetDevice to destroy the
// default context. Here, we use this singleton and static instance to
// manage the devices and make sure they are reset before program exit.
devices_ = platform::GetSelectedNPUDevices();
for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
SetNPUDeviceId(*it);
VLOG(4) << "Call aclrtSetDevice " << *it;
}
}
std::vector<int> devices_;
};
} // namespace platform
} // namespace paddle
#endif
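The device guard and the ACL singleton are meant to compose as in this minimal lifecycle sketch (hypothetical function name):

#include "paddle/fluid/platform/npu_info.h"

void AclLifecycleSketch() {
  namespace plat = paddle::platform;
  // First call runs aclInit and aclrtSetDevice for every selected NPU.
  plat::AclInstance::Instance();
  {
    // Temporarily switch to device 1; the previous device is restored
    // when the guard is destroyed.
    plat::NPUDeviceGuard guard(1);
    // ... work on NPU 1 ...
  }
  // Explicit teardown; also exposed to Python as core._npu_finalize below.
  plat::AclInstance::Instance().Finalize();
}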
......@@ -33,6 +33,7 @@ class PlacePrinter : public boost::static_visitor<> {
os_ << "CUDAPlace(" << p.device << ")";
}
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -49,6 +50,10 @@ bool is_xpu_place(const Place &p) {
return boost::apply_visitor(IsXPUPlace(), p);
}
bool is_npu_place(const Place &p) {
return boost::apply_visitor(IsNPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -67,6 +72,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return true;
} else if (is_xpu_place(p1)) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else if (is_npu_place(p1)) {
return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -72,16 +72,31 @@ struct XPUPlace {
int device;
};
struct NPUPlace {
NPUPlace() : NPUPlace(0) {}
explicit NPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const NPUPlace &o) const { return device == o.device; }
inline bool operator!=(const NPUPlace &o) const { return !(*this == o); }
inline bool operator<(const NPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &cpu) const { return true; }
bool operator()(const CPUPlace &) const { return true; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -89,27 +104,38 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &xpu) const { return true; }
bool operator()(const XPUPlace &) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place
: public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
struct IsNPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace> {
private:
using PlaceBase =
boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace, CUDAPinnedPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -126,6 +152,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
......@@ -153,6 +180,16 @@ struct PlaceVisitorWrapper
#endif
}
typename Visitor::result_type operator()(const NPUPlace &npu) const {
#ifdef PADDLE_WITH_ASCEND
return visitor_(npu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with NPU. Cannot visit npu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda);
......
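With NPUPlace added to the Place variant, the existing place helpers extend naturally; a short sketch (hypothetical function name):

#include "paddle/fluid/platform/place.h"

void NpuPlaceSketch() {
  namespace plat = paddle::platform;
  plat::Place place = plat::NPUPlace(0);  // implicit conversion into the variant
  bool on_npu = plat::is_npu_place(place);                    // true
  bool same = plat::is_same_place(place, plat::NPUPlace(0));  // true
  (void)on_npu;
  (void)same;
}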
......@@ -102,6 +102,10 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
......@@ -487,6 +491,11 @@ PYBIND11_MODULE(core_noavx, m) {
make_ddim(x_dim), make_ddim(y_dim), -1));
});
#ifdef PADDLE_WITH_ASCEND_CL
m.def("_npu_finalize",
[]() { platform::AclInstance::Instance().Finalize(); });
#endif
m.def(
"_append_python_callable_object_and_return_id",
[](py::object py_obj) -> size_t {
......@@ -1447,7 +1456,6 @@ All parameter, weight, gradient are variables in Paddle.
.def("__repr__", string::to_string<const platform::CUDAPlace &>)
.def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
**Note**:
Examples:
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/npu_info.h"
int main(int argc, char** argv) {
paddle::memory::allocation::UseAllocatorStrategyGFlag();
......@@ -38,11 +39,12 @@ int main(int argc, char** argv) {
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
envs.push_back("fraction_of_gpu_memory_to_use");
envs.push_back("initial_gpu_memory_in_mb");
envs.push_back("reallocate_gpu_memory_in_mb");
envs.push_back("allocator_strategy");
envs.push_back("selected_gpus");
#elif __clang__
envs.push_back("use_mkldnn");
envs.push_back("initial_cpu_memory_in_mb");
......@@ -92,6 +94,10 @@ int main(int argc, char** argv) {
paddle::framework::InitDevices();
int ret = RUN_ALL_TESTS();
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::AclInstance::Instance().Finalize();
#endif
if (env_str) free(env_str);
if (undefok_str) free(undefok_str);
......