diff --git a/CMakeLists.txt b/CMakeLists.txt
index 043a799b6a17c4cf4e4044fa0b58fe919beccbbe..e712efd67fc1dcc53a0de3726f7fd5696eca8834 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,11 +32,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
 option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
+# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF
+# and WITH_ASCEND_CL=ON to develop ACL-related functionality on x86
+option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
 option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
 if (WITH_GPU AND WITH_XPU)
   message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
 endif()
-if (WITH_GPU AND WITH_ASCEND) 
+if (WITH_GPU AND WITH_ASCEND)
   message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time")
 endif()
 # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index fc1e72ba3fccbb2a14f8482502b7c9783ae3a989..c229bdcd643027caad27fec54cf462a5998cc9a9 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -82,6 +82,10 @@ if(WITH_ASCEND)
   add_definitions(-DPADDLE_WITH_ASCEND)
 endif()
 
+if(WITH_ASCEND_CL)
+  add_definitions(-DPADDLE_WITH_ASCEND_CL)
+endif()
+
 if(WITH_XPU)
   message(STATUS "Compile with XPU!")
   add_definitions(-DPADDLE_WITH_XPU)
diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index a0b6f480f95ae70333c2f3dd8d20a8050b045425..c23d30c5b9b26b6d7d8aa09bca64d7c675022254 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -21,38 +21,58 @@ else()
   set(ASCEND_DIR /usr/local/Ascend)
 endif()
 
-set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
-set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
-set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
-set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
-set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
-set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
-set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
-
-set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
-set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
-set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
-set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
-set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
-set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
-set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
-
-set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
-set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
-set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
-INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
-
-if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
-  add_definitions(-DPADDLE_WITH_ASCEND_STRING)
-endif()
+if(WITH_ASCEND)
+  set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
+  set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
+  set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
+  set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
+  set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
+  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
+  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
+
+  set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
+  set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
+  set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+  set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+  set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
+
+  set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
+  set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
+  set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
+  INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
+
+  if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
+    add_definitions(-DPADDLE_WITH_ASCEND_STRING)
+  endif()
 
-ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
+  ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
 
-ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
+  ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
 
-ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
+  ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
 
-add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
+  add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
+
+elseif(WITH_ASCEND_CL)
+  set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
+  set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
+  set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
+
+  set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+  set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+
+  set(atlas_acl_lib ${ATLAS_ACL_DIR}/libascendcl.so)
+  set(ATLAS_ACL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
+  message(STATUS "ATLAS_ACL_INC_DIR ${ATLAS_ACL_INC_DIR}")
+  message(STATUS "ATLAS_ACL_LIB_DIR ${ATLAS_ACL_DIR}")
+  INCLUDE_DIRECTORIES(${ATLAS_ACL_INC_DIR})
+
+  ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
+  SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
+
+  add_custom_target(extern_ascend DEPENDS atlas_acl)
+endif()
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index d576a299b866c8faf5fcdb25672f6403546207df..327de067be8762a789831f7d5033a1cb83e7aa64 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -274,10 +274,10 @@ if(WITH_BOX_PS)
     list(APPEND third_party_deps extern_box_ps)
 endif(WITH_BOX_PS)
 
-if(WITH_ASCEND)
+if(WITH_ASCEND OR WITH_ASCEND_CL)
     include(external/ascend)
     list(APPEND third_party_deps extern_ascend)
-endif (WITH_ASCEND)
+endif ()
 
 if (WITH_PSCORE)
     include(external/snappy)
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index ac1e39ad2c1af6894d6bbaec563c487a6857f95a..fce930727bcf63a751991539dcf32eea2cd1c9a0 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -83,6 +83,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
         platform::errors::Unimplemented("platform::XPUPlace is not supported"));
   }
 
+  inline ::DLContext operator()(const platform::NPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::NPUPlace is not supported"));
+  }
+
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
     ::DLContext ctx;
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1ad321df216fe16f8731f400026716f1c33b84e3..0de97a62ac0e1e574ccbdfaf4c993366f1a0d77f 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -431,6 +431,14 @@ class AnyVisitor : public boost::static_visitor<bool> {
     return GetResultHelper(out, gpu);
   }
 
+  bool GetResult(const framework::Tensor& out,
+                 const platform::NPUPlace& npu) const {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Not supported on place (%s).", npu));
+    // return GetResultHelper(out, npu);
+  }
+
   bool GetResult(const framework::Tensor& out,
                  const platform::CPUPlace& cpu) const {
     return *out.data<bool>();
@@ -633,6 +641,10 @@ struct BothFalseVisitor : public boost::static_visitor<> {
 #endif
   }
 
+  void VisitorImpl(const platform::NPUPlace& npu) const {
+    // TODO(zhiqiu): support NPUPlace
+  }
+
   void VisitorImpl(const platform::CPUPlace& cpu) const {
     int num = in_.numel();
     const bool* in_ptr = in_.data<bool>();
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index ff8494a3888172a26edeeca7dfdde77bcaf0e1f4..08e668c25a06035be3c0ef50c42c0838d69aa20c 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -116,6 +116,23 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+  void operator()(const platform::NPUPlace& place) {
+    // TODO(zhiqiu): support it
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const platform::NPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
   // there is NO blas in CUDAPinnedPlace
   void operator()(const platform::CUDAPinnedPlace& place) {
     PADDLE_THROW(platform::errors::PermissionDenied(
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index c93f637af1a20256f232914b187911e670ba38ce..14d4c983faf0259f9e31848de3fdb76cace3e291 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -19,6 +19,10 @@ if (WITH_GPU)
   cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
 endif()
 
+if (WITH_ASCEND_CL)
+  cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
+endif()
+
 cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
 nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index a124a56ef89c57f2537704be5508cf564dbcb959..100d24c89abdd82b1cc0b6b9aeb59b9ef9c35cd3 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -42,6 +42,7 @@
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu_info.h"
 #endif
+#include "paddle/fluid/platform/npu_info.h"
 
 DEFINE_int64(
     gpu_allocator_retry_time, 10000,
@@ -76,6 +77,11 @@ class AllocatorFacadePrivate {
         InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
       }
       InitNaiveBestFitCUDAPinnedAllocator();
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+      for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
+        InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
+      }
 #endif
       break;
     }
@@ -195,6 +201,12 @@ class AllocatorFacadePrivate {
   }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
+    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
+  }
+#endif
+
   class ZeroSizeAllocator : public Allocator {
    public:
     explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index fcde4cbab42685f4d55892f555941e3f5949e11c..4b41ba8cf0e92e84671e6be0e9b46a32cb007094 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -23,6 +23,7 @@
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"
@@ -112,6 +113,7 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+// For Kunlun XPU
 template <>
 void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
 #ifdef PADDLE_WITH_XPU
@@ -216,6 +218,136 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
 #endif
 }
 
+// For Ascend NPU
+#ifdef PADDLE_WITH_ASCEND_CL
+class NPUBuddyAllocatorList {
+ private:
+  NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
+    auto npu_num = devices_.size();
+    allocators_.resize(npu_num);
+    init_flags_.reserve(npu_num);
+    for (size_t i = 0; i < npu_num; ++i) {
+      init_flags_.emplace_back(new std::once_flag());
+    }
+  }
+
+  static NPUBuddyAllocatorList *CreateNewInstance() {
+    return new NPUBuddyAllocatorList();
+  }
+
+ public:
+  static NPUBuddyAllocatorList *Instance() {
+    static auto *instance = CreateNewInstance();
+    return instance;
+  }
+
+  BuddyAllocator *Get(int npu_id) {
+    auto pos = std::distance(
+        devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
+    PADDLE_ENFORCE_LT(pos, devices_.size(),
+                      platform::errors::OutOfRange(
+                          "The index exceeds the size of devices, the size of "
+                          "devices is %d, the index is %d",
+                          devices_.size(), pos));
+
+    std::call_once(*init_flags_[pos], [this, pos] {
+      platform::SetNPUDeviceId(devices_[pos]);
+      allocators_[pos].reset(new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(
+              new detail::NPUAllocator(devices_[pos])),
+          platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
+      VLOG(10) << "\n\nNOTE:\n"
+               << "You can set GFlags environment variable "
+               << "'FLAGS_fraction_of_gpu_memory_to_use' "
+               << "or 'FLAGS_initial_gpu_memory_in_mb' "
+               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
+               << "to change the memory size for GPU usage.\n"
+               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
+               << FLAGS_fraction_of_gpu_memory_to_use
+               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
+               << FLAGS_initial_gpu_memory_in_mb
+               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
+               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
+    });
+
+    return allocators_[pos].get();
+  }
+
+ private:
+  std::vector<int> devices_;
+  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
+  std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
+};
+
+BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
+  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
+}
+#endif
+
+template <>
+size_t Used<platform::NPUPlace>(const platform::NPUPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUBuddyAllocator(place.device)->Used();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  auto *buddy_allocator = GetNPUBuddyAllocator(place.device);
+  auto *ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    platform::NPUDeviceGuard guard(place.device);
+    size_t avail, total;
+    platform::NPUMemoryUsage(&avail, &total);
+    PADDLE_THROW(platform::errors::ResourceExhausted(
+        "Cannot allocate %s in NPU %d, available %s, total %s, NpuMinChunkSize "
+        "%s, NpuMaxChunkSize %s, NPU memory used: %s.",
+        string::HumanReadableSize(size), place.device,
+        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
+        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
+        string::HumanReadableSize(Used<platform::NPUPlace>(place))));
+  } else {
+    if (FLAGS_init_allocated_mem) {
+      aclrtMemset(ptr, size, 0xEF, size);
+    }
+  }
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  return ptr;
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
+                              size_t size) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetNPUBuddyAllocator(place.device)->Free(p);
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+template <>
+uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
+#ifdef PADDLE_WITH_ASCEND_CL
+  return GetNPUBuddyAllocator(place.device)->Release();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'NPUPlace' is not supported in CPU only device."));
+#endif
+}
+
+// For CUDA
+
 #ifdef PADDLE_WITH_CUDA
 class GPUBuddyAllocatorList {
  private:
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
index 054c75b11f78c7733c15ac39a44cdc45078af7e7..473239d714d89a70fc1eea88a453cc3f76317d67 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
 
+#include <unistd.h>
+
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <thread>  // NOLINT
@@ -69,6 +71,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
 }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(NaiveBestFitAllocatorTest, NpuAlloc) {
+  NaiveBestFitAllocator alloc{platform::NPUPlace(0)};
+  {
+    size_t size = (1 << 20);
+    auto allocation = alloc.Allocate(size);
+  }
+  sleep(10);
+  alloc.Release(platform::NPUPlace(0));
+
+  size_t size = (1 << 20);
+  auto allocation = alloc.Allocate(size);
+  alloc.Release(platform::NPUPlace(0));
+}
+#endif
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
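[Editor's note] A minimal usage sketch of the naive-best-fit NPU path the test above exercises. NaiveBestFitAllocator, NPUPlace, Allocate, and Release all come from this diff; Demo is a hypothetical caller and assumes a visible Ascend device 0 in a WITH_ASCEND_CL build:

    #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"

    void Demo() {
      paddle::memory::allocation::NaiveBestFitAllocator alloc{
          paddle::platform::NPUPlace(0)};
      auto allocation = alloc.Allocate(1 << 20);  // 1 MiB from the NPU buddy pool
      allocation.reset();                         // back to the pool, not the device
      alloc.Release(paddle::platform::NPUPlace(0));  // idle chunks back to the device
    }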
diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ecdee9bd03352201060911848647b60d3cc0203
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_allocator.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/npu_allocator.h"
+#include <string>
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/npu_info.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+bool NPUAllocator::IsAllocThreadSafe() const { return true; }
+void NPUAllocator::FreeImpl(Allocation* allocation) {
+  PADDLE_ENFORCE_EQ(
+      BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
+      platform::errors::PermissionDenied(
+          "NPU memory is freed in incorrect device. This may be a bug"));
+  platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
+                            place_.device);
+  delete allocation;
+}
+
+Allocation* NPUAllocator::AllocateImpl(size_t size) {
+  std::call_once(once_flag_,
+                 [this] { platform::SetNPUDeviceId(place_.device); });
+
+  void* ptr;
+  auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device);
+  if (LIKELY(result == ACL_ERROR_NONE)) {
+    return new Allocation(ptr, size, platform::Place(place_));
+  }
+
+  size_t avail, total, actual_avail, actual_total;
+  bool is_limited = platform::RecordedNPUMemGetInfo(
+      &avail, &total, &actual_avail, &actual_total, place_.device);
+
+  std::string err_msg;
+  if (is_limited) {
+    auto limit_size = (total >> 20);
+    err_msg = string::Sprintf(
+        "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger "
+        "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum "
+        "NPU memory usage is limited to %d MB.\n"
+        "   The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+        limit_size, limit_size);
+  }
+
+  PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+      "\n\nOut of memory error on NPU %d. "
+      "Cannot allocate %s memory on NPU %d, "
+      "available memory is only %s.\n\n"
+      "Please check whether there is any other process using NPU %d.\n"
+      "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
+      "2. If no, please decrease the batch size of your model. %s\n\n",
+      place_.device, string::HumanReadableSize(size), place_.device,
+      string::HumanReadableSize(avail), place_.device, err_msg));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..738ec5d3ce120f3d08b887d3a84d4d79a1e9e1d6
--- /dev/null
+++ b/paddle/fluid/memory/allocation/npu_allocator.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <mutex>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+class NPUAllocator : public Allocator {
+ public:
+  explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {}
+
+  bool IsAllocThreadSafe() const override;
+
+ protected:
+  void FreeImpl(Allocation* allocation) override;
+  Allocation* AllocateImpl(size_t size) override;
+
+ private:
+  platform::NPUPlace place_;
+  std::once_flag once_flag_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index 8f0988e871fa5c9990285d7ff80257a6e19046a6..66d8c7e6bba6616dfc066970f3d168b77db2276a 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -2,11 +2,15 @@ include(ExternalProject)
 
 cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
 
+set(system_allocator_DEPS gflags cpu_info place)
+
 if(${WITH_GPU})
   nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
-else(${WITH_GPU})
+elseif(${WITH_ASCEND_CL})
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place)
+else()
   cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
-endif(${WITH_GPU})
+endif()
 
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
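[Editor's note] A hedged sketch of the facade-level allocator declared above, used standalone. Allocate and the returned smart pointer come from Paddle's Allocator base class; Demo is a hypothetical caller assuming device 0:

    #include "paddle/fluid/memory/allocation/npu_allocator.h"

    void Demo() {
      paddle::memory::allocation::NPUAllocator allocator(
          paddle::platform::NPUPlace(0));
      auto holder = allocator.Allocate(1 << 20);  // RecordedNPUMalloc under the hood
      holder.reset();                             // FreeImpl -> RecordedNPUFree
    }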
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 37795715361ec3ec633b79824ebcbeee4c3a22e4..0a391539b9831c782e99afece8bc4947df37d51e 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -21,6 +21,10 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
 namespace paddle {
 namespace memory {
@@ -235,6 +239,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     }
   }
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the allocation size for the NPU for the first allocation.
+      allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
+    } else {
+      // Compute the re-allocation size; we store the re-allocation size when
+      // the user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
+      if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
+        realloc_size_ = platform::NPUReallocSize();
+      }
+      allocate_bytes = std::max(realloc_size_, request_bytes);
+    }
+  }
+#endif
 
   // Allocate a new block
   void* p = system_allocator_->Alloc(&index, allocate_bytes);
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index de77108f3404a1ad0cc611dec6a9fdae97865fea..807de9c03adf9fc1a10ffd1e8d019ce815b6d04b 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
index 90f7e33eb3540f6272df80296bba57c3d7d9b596..a0319a2b707eea32e8ba3cd59f10e9355685c288 100644
--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -19,13 +19,15 @@ limitations under the License. */
 #ifdef WITH_GPERFTOOLS
 #include "gperftools/profiler.h"
 #endif
+#include <fstream>
+#include <string>
+
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 
-#ifdef PADDLE_WITH_CUDA
-#include <fstream>
-#include <string>
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ASCEND_CL)
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
@@ -324,6 +326,33 @@ TEST(BuddyAllocator, Release) {
 }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(BuddyAllocator, NpuFraction) {
+  // In a 16 GB machine, the pool size will be about 160 MB
+  FLAGS_fraction_of_gpu_memory_to_use = 0.005;
+  FLAGS_fraction_of_gpu_memory_to_use = 0.92;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new NPUAllocator(0)),
+      platform::NPUMinChunkSize(), platform::NPUMaxChunkSize());
+
+  // Less than pool size
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  buddy_allocator.Release();
+
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 300 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
+}
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 0fbbf405f0bf166b71a3b447338d9df7ad675f1b..f5cfa5f5f8681f958835f3db762f2db243026497 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -35,6 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/npu_info.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -239,6 +240,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+void* NPUAllocator::Alloc(size_t* index, size_t size) {
+  if (size <= 0) return nullptr;
+
+  void* p;
+  auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
+
+  if (result == ACL_ERROR_NONE) {
+    *index = 0;
+    npu_alloc_size_ += size;
+    return p;
+  } else {
+    size_t avail, total, actual_avail, actual_total;
+    bool is_limited = platform::RecordedNPUMemGetInfo(
+        &avail, &total, &actual_avail, &actual_total, npu_id_);
+
+    std::string err_msg;
+    if (is_limited) {
+      auto limit_size = (total >> 20);
+      err_msg = string::Sprintf(
+          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
+          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
+          "maximum NPU memory usage is limited to %d MB.\n"
+          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
+          limit_size, limit_size);
+    }
+
+    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+        "\n\nOut of memory error on NPU %d. "
+        "Cannot allocate %s memory on NPU %d, "
+        "available memory is only %s.\n\n"
+        "Please check whether there is any other process using NPU %d.\n"
+        "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
+        "2. If no, please try one of the following suggestions:\n"
+        "   1) Decrease the batch size of your model.\n"
+        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
+        "please set it to a higher value but less than 1.0.\n"
+        "      The command is "
+        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
+        npu_id_, string::HumanReadableSize(size), npu_id_,
+        string::HumanReadableSize(avail), npu_id_,
+        FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+  }
+}
+
+void NPUAllocator::Free(void* p, size_t size, size_t index) {
+  VLOG(4) << "Free " << p << " size " << size;
+  PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
+                                  "The index should be 0, index is %d", index));
+  PADDLE_ENFORCE_GE(npu_alloc_size_, size,
+                    platform::errors::InvalidArgument(
+                        "The size of memory (%d) to free exceeds the size of "
+                        "allocated npu memory (%d)",
+                        size, npu_alloc_size_));
+  npu_alloc_size_ -= size;
+
+  platform::RecordedNPUFree(p, size, npu_id_);
+}
+
+// NPU memory behaves like device (GPU) memory for the buddy allocator.
+bool NPUAllocator::UseGpu() const { return true; }
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index 42f0f23ec1d5d48276285dcef547a4d51054538b..7acaaa4d665e4dacde61b04a3c653c36f277ffcc 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator {
 };
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+
+class NPUAllocator : public SystemAllocator {
+ public:
+  explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
+
+  virtual void* Alloc(size_t* index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t npu_alloc_size_ = 0;
+  int npu_id_;
+};
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index ea4897494f72b96e85911e03b651af1b4eac3298..d2ccb9f892f6b3a01a5d6a89ab91d0007bc5d12e 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -81,3 +81,11 @@ TEST(GPUAllocator, AllocFailure) {
   }
 }
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(NPUAllocator, Alloc) {
+  paddle::memory::detail::NPUAllocator a(0);
+  TestAllocator(&a, 1 << 20);
+  TestAllocator(&a, 1);
+}
+#endif
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 5afda787339dbe714ba6c82e3c34d39eb6d75580..f6ba1687980f497c8b6a32ad66798a87ddc35396 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -148,6 +148,13 @@ void set_constant_with_place<platform::XPUPlace>(
   PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
 }
 
+template <>
+void set_constant_with_place<platform::NPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 63ff4852f16de6d77fb385f1eae984403e116722..f3331349fde86b9e2ad00e05d7ca7263373d9a2f 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -71,6 +71,10 @@ if(WITH_ASCEND)
   cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
 endif()
 
+if(WITH_ASCEND_CL)
+  cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor atlas_acl)
+endif()
+
 add_subdirectory(dynload)
 add_subdirectory(stream)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index fb94768984fcfb4b886e4805f8328fe76a7b3625..24182b837f13cb83b6d087ed1e7410cdf3845d34 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -228,6 +228,33 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
 xpu::Context* XPUDeviceContext::x_context() const { return context_; }
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
+  NPUDeviceGuard guard(place_.device);
+  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
+  // NOTE(zhiqiu): Usually, no need to create a context explicitly;
+  // ACL creates a default context which contains 1 default stream
+  // and 1 sync stream after aclrtSetDevice.
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_));
+}
+
+NPUDeviceContext::~NPUDeviceContext() {
+  // NPUDeviceGuard guard(place_.device);
+  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
+}
+
+void NPUDeviceContext::Wait() const {
+  NPUDeviceGuard guard(place_.device);
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
+}
+
+Place NPUDeviceContext::GetPlace() const { return place_; }
+
+aclrtContext* NPUDeviceContext::context() const {
+  return const_cast<aclrtContext*>(&context_);
+}
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index f058da97b5cfa2358873dea6e3efec997fb40dff..a4e584eeffa21be4dc4d65c89f8257b5ff66d953 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -59,6 +59,11 @@ struct GpuDevice;
 #include "paddle/fluid/platform/xpu_info.h"
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "acl/acl.h"
+#include "paddle/fluid/platform/npu_info.h"
+#endif
+
 namespace paddle {
 namespace platform {
 
@@ -77,11 +82,13 @@ enum DeviceType {
   CPU = 0,
   CUDA = 1,
   XPU = 2,
+  NPU = 3,
 };
 
 constexpr DeviceType kCPU = DeviceType::CPU;
 constexpr DeviceType kCUDA = DeviceType::CUDA;
 constexpr DeviceType kXPU = DeviceType::XPU;
+constexpr DeviceType kNPU = DeviceType::NPU;
 
 class DeviceContext {
  public:
@@ -153,6 +160,46 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
 };
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+class NPUDeviceContext : public DeviceContext {
+ public:
+  explicit NPUDeviceContext(NPUPlace place);
+  virtual ~NPUDeviceContext();
+  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
+  Place GetPlace() const override;
+  aclrtContext* context() const;
+
+  /*! \brief  Wait for all operations completion in the stream. */
+  void Wait() const override;
+
+#ifdef PADDLE_WITH_ASCEND_HCCL
+  /*! \brief  Return hccl context. */
+  HCCLContext_t hccl_context() const { return hccl_context_; }
+
+  /*! \brief  Set hccl context. */
+  void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
+#endif
+
+ private:
+  NPUPlace place_;
+  aclrtContext context_;
+#ifdef PADDLE_WITH_ASCEND_HCCL
+  HCCLContext_t hccl_context_;
+#endif
+
+  // NOTE(zhiqiu): kept to match the layout of other DeviceContexts,
+  // even though eigen_device_ is not used on NPU.
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+
+  DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
+};
+
+template <>
+struct DefaultDeviceContextType<platform::NPUPlace> {
+  using TYPE = NPUDeviceContext;
+};
+#endif
+
 #ifdef PADDLE_WITH_CUDA
 
 class CudnnWorkspaceHandle;
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index db84b8731f9ca467c4521221a3dbe0b1fc61b597..6c9a0cd44442faeb5b0f9a8115e5231c34f8fe02 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
 #include <glog/logging.h>
 #include <mutex>  // NOLINT
@@ -214,3 +215,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 0b8a361abb58889476a437f8d3fe4a932b09cf31..c06616d01d572b23d50fd79c577eb271a71ba754 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -38,6 +38,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/cuda_error.pb.h"
 #endif  // PADDLE_WITH_CUDA
 
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "acl/acl.h"
+#endif  // PADDLE_WITH_ASCEND_CL
+
 #include <fstream>
 #include <iomanip>
 #include <memory>
@@ -940,7 +944,6 @@ DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS);
 #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
 DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
 #endif
-
 }  // namespace details
 
 #define PADDLE_ENFORCE_CUDA_SUCCESS(COND)                        \
@@ -996,5 +999,40 @@ inline void retry_sleep(unsigned milliseconds) {
 #undef DEFINE_CUDA_STATUS_TYPE
 #endif  // PADDLE_WITH_CUDA
 
+#ifdef PADDLE_WITH_ASCEND_CL
+namespace details {
+template <typename T>
+struct NPUStatusType {};
+
+#define DEFINE_NPU_STATUS_TYPE(type, success_value) \
+  template <>                                       \
+  struct NPUStatusType<type> {                      \
+    using Type = type;                              \
+    static constexpr Type kSuccess = success_value; \
+  }
+
+DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
+}  // namespace details
+
+inline std::string build_npu_error_msg(aclError stat) {
+  std::string s = " ACL error, the error code is : " + std::to_string(stat);
+  return s;
+}
+
+#define PADDLE_ENFORCE_NPU_SUCCESS(COND)                       \
+  do {                                                         \
+    auto __cond__ = (COND);                                    \
+    using __NPU_STATUS_TYPE__ = decltype(__cond__);            \
+    constexpr auto __success_type__ =                          \
+        ::paddle::platform::details::NPUStatusType<            \
+            __NPU_STATUS_TYPE__>::kSuccess;                    \
+    if (UNLIKELY(__cond__ != __success_type__)) {              \
+      auto __summary__ = ::paddle::platform::errors::External( \
+          ::paddle::platform::build_npu_error_msg(__cond__));  \
+      __THROW_ERROR_INTERNAL__(__summary__);                   \
+    }                                                          \
+  } while (0)
+#endif  // PADDLE_WITH_ASCEND_CL
+
 }  // namespace platform
 }  // namespace paddle
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/string/split.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" @@ -70,6 +71,7 @@ namespace framework { std::once_flag gflags_init_flag; std::once_flag glog_init_flag; +std::once_flag npu_init_flag; bool InitGflags(std::vector args) { bool successed = false; @@ -148,6 +150,17 @@ void InitDevices() { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): use singleton to explicitly init and finalize ACL + platform::AclInstance::Instance(); // NOLINT + try { + // use user specified XPUs in single-node multi-process mode. + devices = platform::GetSelectedNPUDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) + << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime."; + } #endif InitDevices(devices); } diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index 76554012bf51e34fc99db7759404f0e8d6f96cd6..1b44cb196547c2d26cdd5ae72c3331022f834657 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -35,3 +35,13 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size) DEFINE_INT_STATUS(STAT_gpu13_mem_size) DEFINE_INT_STATUS(STAT_gpu14_mem_size) DEFINE_INT_STATUS(STAT_gpu15_mem_size) + +// For Ascend NPU +DEFINE_INT_STATUS(STAT_npu0_mem_size) +DEFINE_INT_STATUS(STAT_npu1_mem_size) +DEFINE_INT_STATUS(STAT_npu2_mem_size) +DEFINE_INT_STATUS(STAT_npu3_mem_size) +DEFINE_INT_STATUS(STAT_npu4_mem_size) +DEFINE_INT_STATUS(STAT_npu5_mem_size) +DEFINE_INT_STATUS(STAT_npu6_mem_size) +DEFINE_INT_STATUS(STAT_npu7_mem_size) diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index b57fae9daac41f37829309c4bc5f58fb2606ca02..0eb9448ce0fad4e1caadb3e08140417294d5d0e7 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -187,3 +187,13 @@ class StatRegistry { USE_INT_STAT(STAT_gpu13_mem_size); \ USE_INT_STAT(STAT_gpu14_mem_size); \ USE_INT_STAT(STAT_gpu15_mem_size) + +#define USE_NPU_MEM_STAT \ + USE_INT_STAT(STAT_npu0_mem_size); \ + USE_INT_STAT(STAT_npu1_mem_size); \ + USE_INT_STAT(STAT_npu2_mem_size); \ + USE_INT_STAT(STAT_npu3_mem_size); \ + USE_INT_STAT(STAT_npu4_mem_size); \ + USE_INT_STAT(STAT_npu5_mem_size); \ + USE_INT_STAT(STAT_npu6_mem_size); \ + USE_INT_STAT(STAT_npu7_mem_size) diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7508f01a1a3b8f575bef17b7172d4dfacef5dfe --- /dev/null +++ b/paddle/fluid/platform/npu_info.cc @@ -0,0 +1,349 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/npu_info.h" +#include +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/string/split.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_bool(enable_cublas_tensor_op_math); +DECLARE_string(selected_gpus); +DECLARE_uint64(gpu_memory_limit_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +USE_NPU_MEM_STAT; + +namespace paddle { +namespace platform { + +static int GetNPUDeviceCountImpl() { + uint32_t count; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDeviceCount(&count)); + return count; +} + +int GetNPUDeviceCount() { + static auto dev_cnt = GetNPUDeviceCountImpl(); + return dev_cnt; +} + +// For example, "1.0.1" +std::string GetNPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. NPU count is: %d.", + id, GetNPUDeviceCount())); + int major = 0, minor = 0, patch = 0; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetVersion(&major, &minor, &patch)); + return string::Sprintf("%d.%d.%d", major, minor, patch); +} + +int GetCurrentNPUDeviceId() { + int device_id; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDevice(&device_id)); + return device_id; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedNPUDevices() { + // use user specified NPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetNPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void SetNPUDeviceId(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. NPU count is: %d.", + id, GetNPUDeviceCount())); + // NOTE(zihqiu): It is recommended to call aclrtSetDevice and aclrtResetDevice + // pairly. + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(id)); +} + +void ResetNPUDeviceId(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. 
NPU count is: %d.", + id, GetNPUDeviceCount())); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtResetDevice(id)); +} + +void NPUMemoryUsage(size_t *available, size_t *total) { + size_t actual_available, actual_total; + RecordedNPUMemGetInfo(available, total, &actual_available, &actual_total, + platform::GetCurrentNPUDeviceId()); +} + +size_t NPUAvailableMemToAlloc() { + size_t total = 0; + size_t available = 0; + NPUMemoryUsage(&available, &total); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = NPUMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + VLOG(10) << "NPU usage " << (available >> 20) << "M/" << (total >> 20) + << "M, " << (available_to_alloc >> 20) << "M available to allocate"; + return available_to_alloc; +} + +size_t NPUMaxAllocSize() { + return std::max(NPUInitAllocSize(), NPUReallocSize()); +} + +static size_t NPUAllocSize(bool realloc) { + size_t available_to_alloc = NPUAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, 0, + platform::errors::ResourceExhausted("Not enough available NPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted("Not enough available NPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + +size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); } + +size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); } + +size_t NPUMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t NPUMaxChunkSize() { + size_t max_chunk_size = NPUMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +void NPUMemcpyASync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, aclrtStream stream, + size_t dst_max_count) { + dst_max_count = dst_max_count ? dst_max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream)); +} + +void NPUMemcpySync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, size_t dst_max_count) { + // NOTE(zhiqiu): The default max_count is count + dst_max_count = dst_max_count ? dst_max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind)); +} + +void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, + size_t max_count) { + max_count = max_count ? 
+void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream,
+                    size_t max_count) {
+  max_count = max_count ? max_count : count;
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclrtMemsetAsync(dst, max_count, value, count, stream));
+}
+
+void NPUStreamSync(aclrtStream stream) {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
+}
+
+static void RaiseNonOutOfMemoryError(aclError *status) {
+  if (*status == ACL_ERROR_BAD_ALLOC) {
+    *status = ACL_ERROR_NONE;
+  }
+  PADDLE_ENFORCE_NPU_SUCCESS(*status);
+}
+
+class RecordedNPUMallocHelper {
+ private:
+  explicit RecordedNPUMallocHelper(int dev_id, uint64_t limit_size = 0)
+      : dev_id_(dev_id), limit_size_(limit_size) {
+    if (NeedRecord()) {
+      mtx_.reset(new std::mutex());
+    }
+  }
+
+  DISABLE_COPY_AND_ASSIGN(RecordedNPUMallocHelper);
+
+ public:
+  static RecordedNPUMallocHelper *Instance(int dev_id) {
+    std::call_once(once_flag_, [] {
+      int dev_cnt = GetNPUDeviceCount();
+      instances_.reserve(dev_cnt);
+      for (int i = 0; i < dev_cnt; ++i) {
+        // NOTE(zhiqiu): share the flags with gpu, avoid more flags.
+        instances_.emplace_back(
+            new RecordedNPUMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20));
+      }
+    });
+
+    PADDLE_ENFORCE_GE(
+        dev_id, 0,
+        platform::errors::OutOfRange(
+            "Device id must be not less than 0, but got %d.", dev_id));
+    PADDLE_ENFORCE_LT(
+        dev_id, instances_.size(),
+        platform::errors::OutOfRange("Device id %d exceeds npu card number %d.",
+                                     dev_id, instances_.size()));
+    return instances_[dev_id].get();
+  }
+
+  /**
+   * Try to allocate `size` npu memory. Only ACL_ERROR_BAD_ALLOC
+   * or ACL_ERROR_NONE would be returned.
+   */
+  aclError Malloc(void **ptr, size_t size) {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
+      return ACL_ERROR_BAD_ALLOC;
+    }
+
+    NPUDeviceGuard guard(dev_id_);
+    auto result = aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    if (result == ACL_ERROR_NONE) {
+      if (NeedRecord()) {
+        cur_size_ += size;
+      }
+      STAT_INT_ADD("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
+      return result;
+    } else {
+      RaiseNonOutOfMemoryError(&result);
+      // Non out of memory error would be raised inside
+      // RaiseNonOutOfMemoryError. Therefore, we can
+      // return ACL_ERROR_BAD_ALLOC directly here.
+      return ACL_ERROR_BAD_ALLOC;
+    }
+  }
+
+  /**
+   * Free npu memory. Usually, free is not allowed to raise error.
+   * If it does raise error, the process should be crashed.
+   */
+  void Free(void *ptr, size_t size) {
+    NPUDeviceGuard guard(dev_id_);
+    auto result = aclrtFree(ptr);
+    PADDLE_ENFORCE_NPU_SUCCESS(result);
+    if (NeedRecord()) {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      cur_size_ -= size;
+    }
+    STAT_INT_SUB("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size);
+  }
+
+  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                  size_t *actual_total) {
+    {
+      NPUDeviceGuard guard(dev_id_);
+      auto result = aclrtGetMemInfo(ACL_HBM_MEM, actual_avail, actual_total);
+      if (result != ACL_ERROR_NONE) {
+        *actual_avail = 0;
+      }
+      RaiseNonOutOfMemoryError(&result);
+    }
+
+    if (NeedRecord()) {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      *avail = std::min(*actual_avail, limit_size_ - cur_size_);
+      *total = std::min(*actual_total, limit_size_);
+      return *total < *actual_total;
+    } else {
+      *avail = *actual_avail;
+      *total = *actual_total;
+      return false;
+    }
+  }
+
+  inline bool NeedRecord() const { return limit_size_ != 0; }
+
+  uint64_t RecordedSize() const {
+    LockGuardPtr<std::mutex> lock(mtx_);
+    return NeedRecord() ? cur_size_ : 0;
+  }
+
+  uint64_t LimitSize() const { return limit_size_; }
+
+ private:
+  const int dev_id_;
+  const uint64_t limit_size_;
+  uint64_t cur_size_{0};
+
+  mutable std::unique_ptr<std::mutex> mtx_;
+
+  static std::once_flag once_flag_;
+  static std::vector<std::unique_ptr<RecordedNPUMallocHelper>> instances_;
+};
+
+std::once_flag RecordedNPUMallocHelper::once_flag_;
+std::vector<std::unique_ptr<RecordedNPUMallocHelper>>
+    RecordedNPUMallocHelper::instances_;
+
+aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->Malloc(ptr, size);
+}
+
+void RecordedNPUFree(void *p, size_t size, int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->Free(p, size);
+}
+
+bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
+                           size_t *actual_total, int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->GetMemInfo(
+      avail, total, actual_avail, actual_total);
+}
+
+uint64_t RecordedNPUMallocSize(int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->RecordedSize();
+}
+
+bool IsNPUMallocRecorded(int dev_id) {
+  return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord();
+}
+
+}  // namespace platform
+}  // namespace paddle
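[Editor's note] A hedged sketch of the recorded-malloc API defined above; the limit is intentionally shared with the GPU through FLAGS_gpu_memory_limit_mb, and Demo is a hypothetical caller:

    #include "acl/acl.h"
    #include "paddle/fluid/platform/npu_info.h"

    void Demo() {
      void* p = nullptr;
      if (paddle::platform::RecordedNPUMalloc(&p, 1 << 20, /*dev_id=*/0) ==
          ACL_ERROR_NONE) {
        // ... use the 1 MiB device buffer ...
        paddle::platform::RecordedNPUFree(p, 1 << 20, /*dev_id=*/0);
      }
    }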
+void NPUMemcpyAsync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, aclrtStream stream, + size_t dst_max_count = 0); + +//! Copy memory from address src to dst synchronously. +void NPUMemcpySync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, size_t dst_max_count = 0); + +//! Set memory dst with value count size asynchronously +void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, + size_t max_count = 0); + +//! Blocks until stream has completed all operations. +void NPUStreamSync(aclrtStream stream); + +//! aclrtMalloc with recorded info +aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id); + +//! aclrtFree with recorded info +void RecordedNPUFree(void *p, size_t size, int dev_id); + +//! Get available and total gpu memory with considering limitation +bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id); + +//! Get recorded actrtMalloc size. If record is disabled, return 0. +uint64_t RecordedNPUMallocSize(int dev_id); + +bool IsNPUMallocRecorded(int dev_id); + +class NPUDeviceGuard { + public: + explicit inline NPUDeviceGuard(int dev_id) { + int prev_id = platform::GetCurrentNPUDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + platform::SetNPUDeviceId(dev_id); + } + } + + inline ~NPUDeviceGuard() { + if (prev_id_ != -1) { + platform::SetNPUDeviceId(prev_id_); + } + } + + NPUDeviceGuard(const NPUDeviceGuard &o) = delete; + NPUDeviceGuard &operator=(const NPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +class AclInstance { + public: + // NOTE(zhiiu): Commonly, exception in destructor is not recommended, so + // no PADDLE_ENFORCE here, call acl API directly. + ~AclInstance() {} + AclInstance(const AclInstance &o) = delete; + const AclInstance &operator=(const AclInstance &o) = delete; + + static AclInstance &Instance() { + static AclInstance instance; + return instance; + } + + void Finalize() { + // NOTE(zhiqiu): DO NOT perform finalize in destructor + // to avoid problems caused by destructor order of static + // object. + for (size_t i = 0; i < devices_.size(); ++i) { + auto status = aclrtResetDevice(devices_[i]); + VLOG(4) << "Call aclrtResetDevice " << devices_[i] + << " status = " << status; + } + auto status = aclFinalize(); + VLOG(4) << "Call aclFinalize, status = " << status; + } + + private: + // forbid calling default constructor + AclInstance() { + PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr)); + VLOG(4) << "Call aclrtSetDevice "; + // NOTE(zhiqiu): why set devices here? + // Because ACL creates a default context which contains 2 streams + // when calling aclrtSetDeviceId, so usually we do not need to + // create contexts explicitly. And, for each device, aclrtSetDeviceId + // need to call parily with aclrtResetDeviceId to destory the default + // context. Here, we use this singleton and static instance to manage + // the devices to make sure they will be resetted before program exit. 
+    devices_ = platform::GetSelectedNPUDevices();
+    for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) {
+      SetNPUDeviceId(*it);
+      VLOG(4) << "Call aclrtSetDevice " << *it;
+    }
+  }
+  std::vector<int> devices_;
+};
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index b80d2fd1632cd82c231fae724fc4d754b8fed0fc..1cc9fd9fe76341cd495a3580cddbff65f5b0e208 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -33,6 +33,7 @@ class PlacePrinter : public boost::static_visitor<> {
     os_ << "CUDAPlace(" << p.device << ")";
   }
   void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
+  void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
   void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }

 private:
@@ -49,6 +50,10 @@ bool is_xpu_place(const Place &p) {
   return boost::apply_visitor(IsXPUPlace(), p);
 }

+bool is_npu_place(const Place &p) {
+  return boost::apply_visitor(IsNPUPlace(), p);
+}
+
 bool is_cpu_place(const Place &p) {
   return boost::apply_visitor(IsCPUPlace(), p);
 }
@@ -67,6 +72,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
     return true;
   } else if (is_xpu_place(p1)) {
     return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
+  } else if (is_npu_place(p1)) {
+    return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
   } else {
     return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
   }
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index f95f6954a32e771e7413a766afcfea8b85ff1f7e..3f74701319df0bcc1864461d8de76186a8114a3d 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -72,16 +72,31 @@ struct XPUPlace {
   int device;
 };

+struct NPUPlace {
+  NPUPlace() : NPUPlace(0) {}
+  explicit NPUPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const NPUPlace &o) const { return device == o.device; }
+  inline bool operator!=(const NPUPlace &o) const { return !(*this == o); }
+  inline bool operator<(const NPUPlace &o) const { return device < o.device; }
+
+  int device;
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
-  bool operator()(const CUDAPlace &gpu) const { return true; }
+  bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };

 struct IsCPUPlace : public boost::static_visitor<bool> {
-  bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const CPUPlace &) const { return true; }
   bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -89,27 +104,38 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
 struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
 struct IsXPUPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
-  bool operator()(const XPUPlace &xpu) const { return true; }
+  bool operator()(const XPUPlace &) const { return true; }
+  bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };

-class Place
-    : public boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace> {
+struct IsNPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
+                                    CUDAPinnedPlace> {
  private:
   using PlaceBase =
-      boost::variant<CUDAPlace, XPUPlace, CPUPlace, CUDAPinnedPlace>;
+      boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace, CUDAPinnedPlace>;

  public:
   Place() = default;
   Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {}     // NOLINT
   Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {}     // NOLINT
+  Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {}     // NOLINT
   Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
   Place(const CUDAPinnedPlace &cuda_pinned_place)  // NOLINT
       : PlaceBase(cuda_pinned_place) {}
@@ -126,6 +152,7 @@ using PlaceList = std::vector<Place>;

 bool is_gpu_place(const Place &);
 bool is_xpu_place(const Place &);
+bool is_npu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
@@ -153,6 +180,16 @@ struct PlaceVisitorWrapper
 #endif
   }

+  typename Visitor::result_type operator()(const NPUPlace &npu) const {
+#ifdef PADDLE_WITH_ASCEND
+    return visitor_(npu);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Paddle is not compiled with NPU. Cannot visit npu device"));
+    return typename Visitor::result_type();
+#endif
+  }
+
   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
 #ifdef PADDLE_WITH_CUDA
     return visitor_(cuda);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index e1a638adf505d185f5bbb3b8ca0376b8ff1279df..0fa50a8cd362b1ddaece4796bc243d751a60aab3 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -102,6 +102,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif

+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/platform/npu_info.h"
+#endif
+
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu_info.h"
 #endif
@@ -487,6 +491,11 @@ PYBIND11_MODULE(core_noavx, m) {
                                       make_ddim(x_dim), make_ddim(y_dim), -1));
   });

+#ifdef PADDLE_WITH_ASCEND_CL
+  m.def("_npu_finalize",
+        []() { platform::AclInstance::Instance().Finalize(); });
+#endif
+
   m.def(
       "_append_python_callable_object_and_return_id",
       [](py::object py_obj) -> size_t {
@@ -1447,7 +1456,6 @@ All parameter, weight, gradient are variables in Paddle.
       .def("__repr__", string::to_string)
       .def("__str__", string::to_string);

-
   py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
     **Note**:
     Examples:
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 38ed76a87cd3e46145d4a1a5e679174a41a4ee86..2a1af1755991387b6bf417bc1ef1b38a0da632bb 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/npu_info.h"

 int main(int argc, char** argv) {
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
@@ -38,11 +39,12 @@ int main(int argc, char** argv) {
   }
 #endif

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_ASCEND_CL)
   envs.push_back("fraction_of_gpu_memory_to_use");
   envs.push_back("initial_gpu_memory_in_mb");
   envs.push_back("reallocate_gpu_memory_in_mb");
   envs.push_back("allocator_strategy");
+  envs.push_back("selected_gpus");
 #elif __clang__
   envs.push_back("use_mkldnn");
   envs.push_back("initial_cpu_memory_in_mb");
@@ -92,6 +94,10 @@ int main(int argc, char** argv) {
   paddle::framework::InitDevices();

   int ret = RUN_ALL_TESTS();
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  paddle::platform::AclInstance::Instance().Finalize();
+#endif

   if (env_str) free(env_str);
   if (undefok_str) free(undefok_str);
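
For context, here is a minimal sketch of how the pieces added above are meant to compose: initialize ACL through the singleton, switch devices with the RAII guard, allocate through the recorded interface, and finalize before exit. This is illustrative only, not part of the patch; it assumes a WITH_ASCEND_CL build and uses ACL's ACL_ERROR_NONE success code.

#include <iostream>

#include "paddle/fluid/platform/npu_info.h"

int main() {
  namespace plat = paddle::platform;

  // Touch the singleton first so aclInit runs and the selected devices
  // are set (its constructor does both).
  plat::AclInstance::Instance();

  // Make device 0 current; the previous device is restored when the
  // guard goes out of scope.
  plat::NPUDeviceGuard guard(0);

  // Allocate/free through the recorded interface so per-device usage
  // accounting (and any configured limit) is applied.
  void *ptr = nullptr;
  if (plat::RecordedNPUMalloc(&ptr, 1 << 20, /*dev_id=*/0) == ACL_ERROR_NONE) {
    std::cout << "recorded: " << plat::RecordedNPUMallocSize(0) << " bytes\n";
    plat::RecordedNPUFree(ptr, 1 << 20, /*dev_id=*/0);
  }

  // Reset devices and finalize ACL exactly once before exit, mirroring
  // what paddle_gtest_main.cc does after RUN_ALL_TESTS().
  plat::AclInstance::Instance().Finalize();
  return 0;
}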