From afe02e9dca89292b97be91cf1cdb0b500dca64a4 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 30 Mar 2022 15:46:33 +0800 Subject: [PATCH] Add new APIs for GPU memory monitoring (max_memory_allocated, max_memory_reserved, memory_allocated, memory_reserved) (#38657) * Add new API memory_reserved * Add memory_allocated, max_memory_reserved and max_memory_allocater * Fix CI error * Fix CI error * Enhance UT * Add FLAGS_memory_stats_opt * Add STATS macro functions * Add StatAllocator * Fix CI errors * Add UT * Fix CI errors --- paddle/fluid/memory/CMakeLists.txt | 4 +- paddle/fluid/memory/allocation/CMakeLists.txt | 8 +- .../memory/allocation/allocator_facade.cc | 18 ++ .../fluid/memory/allocation/stat_allocator.h | 56 ++++++ paddle/fluid/memory/memory.h | 1 + paddle/fluid/memory/stats.cc | 118 ++++++++++++ paddle/fluid/memory/stats.h | 172 +++++++++++++++++ paddle/fluid/memory/stats_test.cc | 107 +++++++++++ paddle/fluid/platform/device/gpu/gpu_info.cc | 23 ++- paddle/fluid/pybind/pybind.cc | 3 + python/paddle/device/cuda/__init__.py | 175 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../test_cuda_max_memory_allocated.py | 61 ++++++ .../test_cuda_max_memory_reserved.py | 61 ++++++ .../unittests/test_cuda_memory_allocated.py | 53 ++++++ .../unittests/test_cuda_memory_reserved.py | 53 ++++++ 16 files changed, 899 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/memory/allocation/stat_allocator.h create mode 100644 paddle/fluid/memory/stats.cc create mode 100644 paddle/fluid/memory/stats.h create mode 100644 paddle/fluid/memory/stats_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py diff --git a/paddle/fluid/memory/CMakeLists.txt 
b/paddle/fluid/memory/CMakeLists.txt index 4492615d23..76bb8993cb 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -10,8 +10,10 @@ endif() cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS}) cc_library(memcpy SRCS memcpy.cc DEPS place device_context) +cc_library(stats SRCS stats.cc DEPS enforce) +cc_library(memory DEPS malloc memcpy stats) -cc_library(memory DEPS malloc memcpy) +cc_test(stats_test SRCS stats_test.cc DEPS stats) if (WITH_GPU) nv_test(malloc_test diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index f296ce96d4..5af13f76b3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(allocator SRCS allocator.cc DEPS place) +cc_library(allocator SRCS allocator.cc DEPS place stats) cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) @@ -14,7 +14,7 @@ else () endif() if (WITH_GPU) - nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) + nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) @@ -27,7 +27,7 @@ if (WITH_GPU) endif() if (WITH_ROCM) - hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) + hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) 
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) @@ -101,7 +101,7 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats) if (WITH_GPU) target_link_libraries(allocator_facade cuda_graph) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 88bbe339f8..7619767123 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" +#include "paddle/fluid/memory/allocation/stat_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -315,6 +316,8 @@ class AllocatorFacadePrivate { WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time); } + WrapStatAllocator(); + CheckAllocThreadSafe(); #ifdef PADDLE_WITH_CUDA @@ -521,6 +524,7 @@ class AllocatorFacadePrivate { InitAutoGrowthCUDAAllocator(p, stream); WrapStreamSafeCUDAAllocator(p, stream); WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); + WrapStatAllocator(p, stream); } } @@ -715,6 +719,11 @@ class AllocatorFacadePrivate { allocator = std::make_shared(allocator, retry_time); } + void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) { + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared(allocator); + } + #ifdef 
PADDLE_WITH_CUDA void WrapCUDAGraphAllocator() { for (auto& item : allocators_) { @@ -895,6 +904,15 @@ class AllocatorFacadePrivate { } } + void WrapStatAllocator() { + for (auto& pair : allocators_) { + // Now memory stats is only supported for GPU + if (platform::is_gpu_place(pair.first)) { + pair.second = std::make_shared(pair.second); + } + } + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h new file mode 100644 index 0000000000..71569366c2 --- /dev/null +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/stats.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StatAllocator : public Allocator { + public: + explicit StatAllocator(std::shared_ptr underlying_allocator) + : underlying_allocator_(std::move(underlying_allocator)) {} + + bool IsAllocThreadSafe() const override { return true; } + + protected: + void FreeImpl(phi::Allocation* allocation) override { + MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + underlying_allocator_->Free(allocation); + } + + phi::Allocation* AllocateImpl(size_t size) override { + phi::Allocator::AllocationPtr allocation = + underlying_allocator_->Allocate(size); + MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + return allocation.release(); + } + + uint64_t ReleaseImpl(const platform::Place& place) override { + return underlying_allocator_->Release(place); + } + + private: + std::shared_ptr underlying_allocator_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h index 8d904e3be5..db4950fdf5 100644 --- a/paddle/fluid/memory/memory.h +++ b/paddle/fluid/memory/memory.h @@ -16,3 +16,4 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/stats.h" diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc new file mode 100644 index 0000000000..31d776de40 --- /dev/null +++ b/paddle/fluid/memory/stats.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/stats.h" + +#include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace memory { + +class StatRegistry { + public: + static StatRegistry* GetInstance() { + static StatRegistry instance; + return &instance; + } + + StatBase* GetStat(const std::string& stat_type, int dev_id) { + auto it = stat_map_.find(GetStatKey(stat_type, dev_id)); + if (it == stat_map_.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The STAT type \"%s\" for device %d has not been registered.", + stat_type.c_str(), dev_id)); + } + return it->second; + } + + std::string GetStatKey(const std::string& stat_type, int dev_id) { + return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type; + } + + int64_t GetCurrentValue(const std::string& stat_type, int dev_id) { + return GetStat(stat_type, dev_id)->GetCurrentValue(); + } + + int64_t GetPeakValue(const std::string& stat_type, int dev_id) { + return GetStat(stat_type, dev_id)->GetPeakValue(); + } + + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { + std::lock_guard lock_guard(stat_map_lock_); + stat_map_[GetStatKey(stat_type, dev_id)] = stat; + } + + void Unregister(const std::string& stat_type, int dev_id) { + std::lock_guard lock_guard(stat_map_lock_); + stat_map_.erase(GetStatKey(stat_type, dev_id)); + } + + void Update(const std::string& stat_type, int dev_id, int64_t increment) { + stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment); + } + + private: + StatRegistry() = default; + 
DISABLE_COPY_AND_ASSIGN(StatRegistry); + + std::unordered_map stat_map_; + SpinLock stat_map_lock_; +}; + +int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id); +} + +int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id); +} + +void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) { + StatRegistry::GetInstance()->Update(stat_type, dev_id, increment); +} + +#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \ + StatRegistry::GetInstance()->Register( \ + #item, id, Stat::GetInstance()); + +#define MEMORY_STAT_REGISTER(item) \ + MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 15) + +int RegisterAllStats() { + MEMORY_STAT_REGISTER(Allocated); + MEMORY_STAT_REGISTER(Reserved); + return 0; +} + +UNUSED static int register_all_stats = RegisterAllStats(); + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h new file mode 100644 index 0000000000..f644d2f587 --- /dev/null +++ b/paddle/fluid/memory/stats.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace memory { + +using framework::ThreadDataRegistry; + +struct ThreadLocalStatBase { + int64_t current{0}; + int64_t peak{0}; +}; + +class StatBase { + public: + StatBase() = default; + virtual ~StatBase() = default; + + virtual int64_t GetCurrentValue() = 0; + virtual int64_t GetPeakValue() = 0; + virtual void Update(int64_t) = 0; + + private: + DISABLE_COPY_AND_ASSIGN(StatBase); +}; + +template +class Stat : public StatBase { + public: + static Stat* GetInstance() { + static Stat instance; + return &instance; + } + + int64_t GetCurrentValue() override { + std::unordered_map thread_local_stats = + ThreadDataRegistry::GetInstance() + .GetAllThreadDataByValue(); + int64_t current_value = 0; + for (auto pair : thread_local_stats) { + current_value += pair.second.current; + } + return current_value; + } + + int64_t GetPeakValue() override { return peak_value_; } + + void Update(int64_t increment) override { + auto& thread_data_registry = + ThreadDataRegistry::GetInstance(); + ThreadLocalStatType* thread_local_stat = + thread_data_registry.GetMutableCurrentThreadData(); + thread_local_stat->current += increment; + + if (thread_local_stat->current > 
thread_local_stat->peak) { + thread_local_stat->peak = thread_local_stat->current; + int64_t current_value = GetCurrentValue(); + int64_t prev_value = peak_value_; + while (prev_value < current_value && + !peak_value_.compare_exchange_weak(prev_value, current_value)) { + } + VLOG(8) << "Update peak_value, after update, peak_value = " << peak_value_ + << " , current value = " << current_value; + } + } + + private: + Stat() {} + ~Stat() {} + std::atomic peak_value_{0}; +}; + +// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT +// values by a string, however, they has worse performance than the macro +// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and +// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low +// performance overhead is required. +int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id); +int64_t StatGetPeakValue(const std::string& stat_type, int dev_id); +void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); + +#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ + case id: \ + stat = paddle::memory::Stat< \ + paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \ + break + +#define MEMORY_STAT_FUNC(item, id, func, ...) 
\ + do { \ + paddle::memory::StatBase* stat = nullptr; \ + switch (id) { \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ + default: \ + PADDLE_THROW(paddle::platform::errors::OutOfRange( \ + "Only support device id between [0, 15] in memory stats," \ + "not support device id: %d", \ + id)); \ + break; \ + } \ + stat->func(__VA_ARGS__); \ + } while (0) + +#define MEMORY_STAT_CURRENT_VALUE(item, id) \ + MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define MEMORY_STAT_PEAK_VALUE(item, id) \ + MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define MEMORY_STAT_UPDATE(item, id, increment) \ + MEMORY_STAT_FUNC(item, id, Update, increment) + +#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \ + struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {}; + +#define MEMORY_STAT_DECLARE(item) \ + MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ + 
MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 15) + +// To add a new STAT type, declare here and register in stats.cc +MEMORY_STAT_DECLARE(Allocated); +MEMORY_STAT_DECLARE(Reserved); + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc new file mode 100644 index 0000000000..436c737916 --- /dev/null +++ b/paddle/fluid/memory/stats_test.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/memory/stats.h" +#include +#include +#include +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { + +TEST(stats_test, MultiThreadReadWriteTest) { + std::string stat_type = "Allocated"; + size_t thread_num = 3; + size_t data_num = 10; + + std::condition_variable cv; + std::mutex mutex; + std::vector threads; + size_t ready_thread_num = 0; + + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back( + [&stat_type, data_num, &cv, &mutex, &ready_thread_num]() { + for (size_t data = 0; data < data_num; ++data) { + StatUpdate(stat_type, 0, data); + } + /* lock guard*/ { + std::lock_guard lock_guard{mutex}; + ++ready_thread_num; + cv.notify_one(); + } + // Sleep here to not exit before the main thread checking stat + // results, because the thread-local stat data will be destroyed when + // the thread exit + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); + } + + std::unique_lock unique_lock(mutex); + cv.wait(unique_lock, [&ready_thread_num, thread_num]() { + return ready_thread_num == thread_num; + }); + + EXPECT_EQ(StatGetCurrentValue(stat_type, 0), + int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + + for (size_t i = 0; i < thread_num; ++i) { + threads[i].join(); + } +} + +TEST(stats_test, PeakValueTest) { + std::string stat_type = "Allocated"; + std::vector datas = { + 543149808935355, 634698327471328, 706215795436611, 577939367795333, + 419479490054362, 21975227714595, 812939817942250, 984428837942082, + 537304104446806, 685008544452453, 563352858161268, 690143831596330, + 964829938186077, 476984078018245, 804403365180177, -57918691189304, + 947611269236893, 752188963801927, 710946451346683, -49226452527666, + -59049377393968, 14128239868858, 463298869064035, 71954818131880, + 894087341752481, 971337322257029, 202325222441382, 128423535063606, + -89146949094815, 756429151957759, 444400180007032, 937040878834954, + 303916192293233, 16628488962638, 544031750807065, 
392396591234910, + 686663859558365, 941126625484539, 120719755546781, 938838399629825, + 364952832531949, 237865770815218, -64409925441421, 130095171433100, + 140906146174023, 635514857321759, -65954585142544, 505369882354612, + 939334896592688, 591590196329715, 424834428510773, 316569328289240, + 44932622352645, 464924685290752, 396541464249293, 937169087747437, + 437992536948503, 44395833829426, 968496835801562, 80493658180301, + 836093264717766, 3339912102452, -32067753603273, 77353521424986, + 290980283590981, 496135147814915, 335112580987207, 571094882208895, + 776581672377954, -83075504255716, -93690563747742, 115144063088100, + 422629490055299, 917988755593299, 283511671626409, 715179006446336, + 760708617230450, 183628659314298, 899792829140365, 214949068928854, + 848851506468080, 791041814082114, 801591030978388, 526551272394511, + 781034506085043, 279998089943681, 907197980150568, 974365521595836, + 282127262539024, 272870474932399, 346617645597508, 409964014011113, + 746465732805300, -74049761897414, -65640372433924, 852009039806484, + 305079802044257, -48409757869238, 266031781660228, 327287322379820}; + + int64_t peak_value = ((int64_t)1) << 63; + int64_t sum = 0; + for (int64_t data : datas) { + StatUpdate(stat_type, 0, data); + sum += data; + peak_value = std::max(peak_value, sum); + } + EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 9beb70f3d2..a671381d07 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -13,34 +13,35 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" + #include #include #include #include #include - #include "gflags/gflags.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" +#include "paddle/phi/backends/gpu/gpu_info.h" + #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #include "paddle/fluid/platform/dynload/cudnn.h" #endif -#include "paddle/fluid/memory/malloc.h" + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif #endif -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/phi/backends/gpu/gpu_info.h" DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); @@ -193,6 +194,7 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + MEMORY_STAT_UPDATE(Reserved, dev_id_, size); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); @@ -229,6 +231,7 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 
84c711f9b8..96d569d47c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2939,6 +2939,8 @@ All parameter, weight, gradient are variables in Paddle. } return stats_map; }); + m.def("memory_stat_get_current", memory::StatGetCurrentValue); + m.def("memory_stat_get_peak", memory::StatGetPeakValue); m.def("run_cmd", [](const std::string &cmd, int time_out = -1, int sleep_inter = -1) -> const std::string { @@ -3143,6 +3145,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); + m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { for (int dev_id : platform::GetSelectedDevices()) { auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 970fb35bfa..b33dc1aaeb 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -26,6 +26,10 @@ __all__ = [ 'synchronize', 'device_count', 'empty_cache', + 'max_memory_allocated', + 'max_memory_reserved', + 'memory_allocated', + 'memory_reserved', 'stream_guard', 'get_device_properties', 'get_device_name', @@ -149,6 +153,177 @@ def empty_cache(): core.cuda_empty_cache() +def extract_cuda_device_id(device, op_name): + ''' + Return the id of the given cuda device. It is just a utility that will not be exposed to users. + + Args: + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. + Default: None. + + Return: + int: The id of the given device. If device is None, return the id of current device. 
+ ''' + if (device is None): + return core.get_cuda_current_device_id() + + if isinstance(device, int): + device_id = device + elif isinstance(device, core.CUDAPlace): + device_id = device.get_device_id() + elif isinstance(device, str): + if device.startswith('gpu:'): + device_id = int(device[4:]) + else: + raise ValueError( + "The current string {} is not expected. Because {} only support string which is like 'gpu:x'. " + "Please input appropriate string again!".format(device, + op_name)) + else: + raise ValueError( + "The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. " + "Please input appropriate device again!".format(device, op_name)) + + assert device_id >= 0, f"The device id must be not less than 0, but got id = {device_id}." + assert device_id < device_count( + ), f"The device id {device_id} exceeds gpu card number {device_count()}" + + return device_id + + +def max_memory_allocated(device=None): + ''' + Return the peak size of gpu memory that is allocated to tensor of the given device. + + .. note:: + The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. + For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. + + Args: + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Return: + int: The peak size of gpu memory that is allocated to tensor of the given device, in bytes. + + Examples: + .. 
def _check_cuda_and_get_device_id(device, name):
    """Shared validation for the GPU memory-stat APIs.

    Ensures this build of PaddlePaddle has CUDA support, then resolves
    ``device`` to a concrete integer GPU id.

    Args:
        device(paddle.CUDAPlace|int|str|None): The device, the id of the device
            or the string name of device like 'gpu:x'. None means the current
            device.
        name(str): Fully-qualified name of the public API, used in the error
            message.

    Returns:
        int: The resolved GPU device id.

    Raises:
        ValueError: If PaddlePaddle is compiled without CUDA support, or the
            device spec is invalid (raised by extract_cuda_device_id).
    """
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    return extract_cuda_device_id(device, op_name=name)


def max_memory_allocated(device=None):
    '''
    Return the peak size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually needs.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The peak size of gpu memory that is allocated to tensor of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            max_memory_allocated_size = paddle.device.cuda.max_memory_allocated(paddle.CUDAPlace(0))
            max_memory_allocated_size = paddle.device.cuda.max_memory_allocated(0)
            max_memory_allocated_size = paddle.device.cuda.max_memory_allocated("gpu:0")
    '''
    name = "paddle.device.cuda.max_memory_allocated"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_peak("Allocated", device_id)


def max_memory_reserved(device=None):
    '''
    Return the peak size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The peak size of GPU memory that is held by the allocator of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            max_memory_reserved_size = paddle.device.cuda.max_memory_reserved(paddle.CUDAPlace(0))
            max_memory_reserved_size = paddle.device.cuda.max_memory_reserved(0)
            max_memory_reserved_size = paddle.device.cuda.max_memory_reserved("gpu:0")
    '''
    name = "paddle.device.cuda.max_memory_reserved"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_peak("Reserved", device_id)


def memory_allocated(device=None):
    '''
    Return the current size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually needs.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The current size of gpu memory that is allocated to tensor of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            memory_allocated_size = paddle.device.cuda.memory_allocated(paddle.CUDAPlace(0))
            memory_allocated_size = paddle.device.cuda.memory_allocated(0)
            memory_allocated_size = paddle.device.cuda.memory_allocated("gpu:0")
    '''
    name = "paddle.device.cuda.memory_allocated"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_current("Allocated", device_id)


def memory_reserved(device=None):
    '''
    Return the current size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The current size of GPU memory that is held by the allocator of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            memory_reserved_size = paddle.device.cuda.memory_reserved(paddle.CUDAPlace(0))
            memory_reserved_size = paddle.device.cuda.memory_reserved(0)
            memory_reserved_size = paddle.device.cuda.memory_reserved("gpu:0")
    '''
    name = "paddle.device.cuda.memory_reserved"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_current("Reserved", device_id)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_allocated, max_memory_allocated


class TestMaxMemoryAllocated(unittest.TestCase):
    """Tests for paddle.device.cuda.max_memory_allocated."""

    def test_max_memory_allocated(self, device=None):
        # Repeatedly allocate tensors of random size and track the largest
        # value memory_allocated() ever reports; it must equal the peak
        # reported by max_memory_allocated().
        if not core.is_compiled_with_cuda():
            return
        num_allocs = 100
        size_bound = 10000
        observed_peak = max_memory_allocated(device)
        for _ in range(num_allocs):
            shape = paddle.randint(size_bound)
            tensor = paddle.zeros(shape)
            observed_peak = max(observed_peak, memory_allocated(device))
            del shape
            del tensor

        self.assertEqual(observed_peak, max_memory_allocated(device))

    def test_max_memory_allocated_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_max_memory_allocated(core.CUDAPlace(dev_id))
                self.test_max_memory_allocated(dev_id)
                self.test_max_memory_allocated(f"gpu:{dev_id}")

    def test_max_memory_allocated_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    max_memory_allocated(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                max_memory_allocated()


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_reserved, max_memory_reserved


class TestMaxMemoryreserved(unittest.TestCase):
    """Tests for paddle.device.cuda.max_memory_reserved."""

    def test_max_memory_reserved(self, device=None):
        # Track the largest value memory_reserved() reports across many
        # allocations; it must match the peak from max_memory_reserved().
        if not core.is_compiled_with_cuda():
            return
        num_allocs = 100
        size_bound = 10000
        observed_peak = max_memory_reserved(device)
        for _ in range(num_allocs):
            shape = paddle.randint(size_bound)
            tensor = paddle.zeros(shape)
            observed_peak = max(observed_peak, memory_reserved(device))
            del shape
            del tensor

        self.assertEqual(observed_peak, max_memory_reserved(device))

    def test_max_memory_reserved_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_max_memory_reserved(core.CUDAPlace(dev_id))
                self.test_max_memory_reserved(dev_id)
                self.test_max_memory_reserved(f"gpu:{dev_id}")

    def test_max_memory_reserved_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    max_memory_reserved(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                max_memory_reserved()


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
import numpy as np
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_allocated


class TestMemoryAllocated(unittest.TestCase):
    """Tests for paddle.device.cuda.memory_allocated."""

    def test_memory_allocated(self, device=None):
        if not core.is_compiled_with_cuda():
            return
        # A [256] float32 tensor is 256 * 4 bytes, which is already
        # 256-byte aligned, so the reported size is exact.
        tensor = paddle.zeros(shape=[256])
        expected_size = 4 * 256  # 256 float32 data, with 4 bytes for each one
        self.assertEqual(memory_allocated(device), expected_size)

    def test_memory_allocated_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_memory_allocated(core.CUDAPlace(dev_id))
                self.test_memory_allocated(dev_id)
                self.test_memory_allocated(f"gpu:{dev_id}")

    def test_memory_allocated_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    memory_allocated(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                memory_allocated()


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
import numpy as np
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_reserved


class TestMemoryreserved(unittest.TestCase):
    """Tests for paddle.device.cuda.memory_reserved."""

    def test_memory_reserved(self, device=None):
        if not core.is_compiled_with_cuda():
            return
        # NOTE(review): equality with the allocated size presumably holds
        # under the auto_growth allocator (the CMake rule sets
        # FLAGS_allocator_strategy=auto_growth for this test) — verify if the
        # strategy changes.
        tensor = paddle.zeros(shape=[256])
        expected_size = 4 * 256  # 256 float32 data, with 4 bytes for each one
        self.assertEqual(memory_reserved(device), expected_size)

    def test_memory_reserved_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_memory_reserved(core.CUDAPlace(dev_id))
                self.test_memory_reserved(dev_id)
                self.test_memory_reserved(f"gpu:{dev_id}")

    def test_memory_reserved_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    memory_reserved(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                memory_reserved()


if __name__ == "__main__":
    unittest.main()