Unverified commit afe02e9d, authored by From00, committed by GitHub

Add new APIs for GPU memory monitoring (max_memory_allocated, max_memory_reserved, memory_allocated, memory_reserved) (#38657)

* Add new API memory_reserved

* Add memory_allocated, max_memory_reserved and max_memory_allocated

* Fix CI error

* Fix CI error

* Enhance UT

* Add FLAGS_memory_stats_opt

* Add STATS macro functions

* Add StatAllocator

* Fix CI errors

* Add UT

* Fix CI errors
Parent 3d39f5c7
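For context, a minimal usage sketch (not part of the diff) of the four new Python APIs, assuming a CUDA build of Paddle with at least one visible GPU:

import paddle

# Allocate a tensor on GPU 0 so the counters have something to report.
paddle.device.set_device("gpu:0")
x = paddle.zeros([1024, 1024], dtype="float32")  # roughly 4 MB of tensor data

# Current and peak bytes allocated to tensors on device 0.
print(paddle.device.cuda.memory_allocated(0))
print(paddle.device.cuda.max_memory_allocated(0))

# Current and peak bytes held (reserved) by the allocator on device 0;
# the device argument may be an int, a "gpu:x" string, or a paddle.CUDAPlace.
print(paddle.device.cuda.memory_reserved("gpu:0"))
print(paddle.device.cuda.max_memory_reserved(paddle.CUDAPlace(0)))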
......@@ -10,8 +10,10 @@ endif()
cc_library(malloc SRCS malloc.cc DEPS
place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
cc_library(stats SRCS stats.cc DEPS enforce)
cc_library(memory DEPS malloc memcpy stats)
cc_library(memory DEPS malloc memcpy)
cc_test(stats_test SRCS stats_test.cc DEPS stats)
if (WITH_GPU)
nv_test(malloc_test
......
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(allocator SRCS allocator.cc DEPS place stats)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
......@@ -14,7 +14,7 @@ else ()
endif()
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
......@@ -27,7 +27,7 @@ if (WITH_GPU)
endif()
if (WITH_ROCM)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
......@@ -101,7 +101,7 @@ endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats)
if (WITH_GPU)
target_link_libraries(allocator_facade cuda_graph)
......
......@@ -22,6 +22,7 @@
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/stat_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
......@@ -315,6 +316,8 @@ class AllocatorFacadePrivate {
WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
}
WrapStatAllocator();
CheckAllocThreadSafe();
#ifdef PADDLE_WITH_CUDA
......@@ -521,6 +524,7 @@ class AllocatorFacadePrivate {
InitAutoGrowthCUDAAllocator(p, stream);
WrapStreamSafeCUDAAllocator(p, stream);
WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
WrapStatAllocator(p, stream);
}
}
......@@ -715,6 +719,11 @@ class AllocatorFacadePrivate {
allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
}
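  // Wrap the per-stream CUDA allocator so every allocation and free on this
  // (place, stream) pair updates the device's "Allocated" memory stat.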
void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) {
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
allocator = std::make_shared<StatAllocator>(allocator);
}
#ifdef PADDLE_WITH_CUDA
void WrapCUDAGraphAllocator() {
for (auto& item : allocators_) {
......@@ -895,6 +904,15 @@ class AllocatorFacadePrivate {
}
}
void WrapStatAllocator() {
for (auto& pair : allocators_) {
      // Currently, memory stats are only supported for GPU places
if (platform::is_gpu_place(pair.first)) {
pair.second = std::make_shared<StatAllocator>(pair.second);
}
}
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// a standalone CUDA allocator to support multi-stream GC in new executor
std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"
namespace paddle {
namespace memory {
namespace allocation {
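// A thin decorator over an existing allocator: it forwards Allocate/Free to
// the underlying allocator and adjusts the per-device "Allocated" memory stat
// by the size of each allocation and deallocation.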
class StatAllocator : public Allocator {
public:
explicit StatAllocator(std::shared_ptr<Allocator> underlying_allocator)
: underlying_allocator_(std::move(underlying_allocator)) {}
bool IsAllocThreadSafe() const override { return true; }
protected:
void FreeImpl(phi::Allocation* allocation) override {
MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
underlying_allocator_->Free(allocation);
}
phi::Allocation* AllocateImpl(size_t size) override {
phi::Allocator::AllocationPtr allocation =
underlying_allocator_->Allocate(size);
MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
allocation->size());
return allocation.release();
}
uint64_t ReleaseImpl(const platform::Place& place) override {
return underlying_allocator_->Release(place);
}
private:
std::shared_ptr<Allocator> underlying_allocator_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -16,3 +16,4 @@ limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/stats.h"
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/memory/allocation/spin_lock.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace memory {
class StatRegistry {
public:
static StatRegistry* GetInstance() {
static StatRegistry instance;
return &instance;
}
StatBase* GetStat(const std::string& stat_type, int dev_id) {
auto it = stat_map_.find(GetStatKey(stat_type, dev_id));
if (it == stat_map_.end()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The STAT type \"%s\" for device %d has not been regeistered.",
stat_type.c_str(), dev_id));
}
return it->second;
}
std::string GetStatKey(const std::string& stat_type, int dev_id) {
return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type;
}
int64_t GetCurrentValue(const std::string& stat_type, int dev_id) {
return GetStat(stat_type, dev_id)->GetCurrentValue();
}
int64_t GetPeakValue(const std::string& stat_type, int dev_id) {
return GetStat(stat_type, dev_id)->GetPeakValue();
}
void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
stat_map_[GetStatKey(stat_type, dev_id)] = stat;
}
void Unregister(const std::string& stat_type, int dev_id) {
std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
stat_map_.erase(GetStatKey(stat_type, dev_id));
}
void Update(const std::string& stat_type, int dev_id, int64_t increment) {
stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment);
}
private:
StatRegistry() = default;
DISABLE_COPY_AND_ASSIGN(StatRegistry);
std::unordered_map<std::string, StatBase*> stat_map_;
SpinLock stat_map_lock_;
};
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id);
}
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id);
}
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) {
StatRegistry::GetInstance()->Update(stat_type, dev_id, increment);
}
#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
#item, id, Stat<ThreadLocalStatDevice##id##item>::GetInstance());
#define MEMORY_STAT_REGISTER(item) \
MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
MEMORY_STAT_REGISTER_WITH_ID(item, 15)
int RegisterAllStats() {
MEMORY_STAT_REGISTER(Allocated);
MEMORY_STAT_REGISTER(Reserved);
return 0;
}
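// A static initializer triggers registration at load time, so the string-keyed
// StatRegistry lookups work without any explicit setup call.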
UNUSED static int register_all_stats = RegisterAllStats();
} // namespace memory
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <map>
#include <string>
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace memory {
using framework::ThreadDataRegistry;
struct ThreadLocalStatBase {
int64_t current{0};
int64_t peak{0};
};
class StatBase {
public:
StatBase() = default;
virtual ~StatBase() = default;
virtual int64_t GetCurrentValue() = 0;
virtual int64_t GetPeakValue() = 0;
virtual void Update(int64_t) = 0;
private:
DISABLE_COPY_AND_ASSIGN(StatBase);
};
template <typename ThreadLocalStatType>
class Stat : public StatBase {
public:
static Stat* GetInstance() {
static Stat instance;
return &instance;
}
int64_t GetCurrentValue() override {
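    // The stat is kept in per-thread counters; take a by-value snapshot of
    // every thread's data and sum the "current" fields for the global value.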
std::unordered_map<uint64_t, ThreadLocalStatType> thread_local_stats =
ThreadDataRegistry<ThreadLocalStatType>::GetInstance()
.GetAllThreadDataByValue();
int64_t current_value = 0;
for (auto pair : thread_local_stats) {
current_value += pair.second.current;
}
return current_value;
}
int64_t GetPeakValue() override { return peak_value_; }
void Update(int64_t increment) override {
auto& thread_data_registry =
ThreadDataRegistry<ThreadLocalStatType>::GetInstance();
ThreadLocalStatType* thread_local_stat =
thread_data_registry.GetMutableCurrentThreadData();
thread_local_stat->current += increment;
if (thread_local_stat->current > thread_local_stat->peak) {
thread_local_stat->peak = thread_local_stat->current;
int64_t current_value = GetCurrentValue();
int64_t prev_value = peak_value_;
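      // Atomically raise peak_value_ to current_value: compare_exchange_weak
      // refreshes prev_value on failure, and the loop stops once the stored
      // peak is already >= current_value.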
while (prev_value < current_value &&
!peak_value_.compare_exchange_weak(prev_value, current_value)) {
}
VLOG(8) << "Update peak_value, after update, peak_value = " << peak_value_
<< " , current value = " << current_value;
}
}
private:
Stat() {}
~Stat() {}
std::atomic<int64_t> peak_value_{0};
};
// StatGetCurrentValue, StatGetPeakValue and StatUpdate support operating on
// STAT values by a string key. However, they have worse performance than the
// macro functions MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE and
// MEMORY_STAT_UPDATE. Prefer the macro functions where ultra-low performance
// overhead is required.
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id);
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id);
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
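// The macros below dispatch on the runtime device id directly to the
// per-device Stat singleton, avoiding the string-keyed registry lookup that
// the functions above perform.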
#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \
case id: \
stat = paddle::memory::Stat< \
paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \
break
#define MEMORY_STAT_FUNC(item, id, func, ...) \
do { \
paddle::memory::StatBase* stat = nullptr; \
switch (id) { \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \
default: \
PADDLE_THROW(paddle::platform::errors::OutOfRange( \
"Only support device id between [0, 15] in memory stats," \
"not support device id: %d", \
id)); \
break; \
} \
stat->func(__VA_ARGS__); \
} while (0)
#define MEMORY_STAT_CURRENT_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define MEMORY_STAT_PEAK_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define MEMORY_STAT_UPDATE(item, id, increment) \
MEMORY_STAT_FUNC(item, id, Update, increment)
#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {};
#define MEMORY_STAT_DECLARE(item) \
MEMORY_STAT_DECLARE_WITH_ID(item, 0); \
MEMORY_STAT_DECLARE_WITH_ID(item, 1); \
MEMORY_STAT_DECLARE_WITH_ID(item, 2); \
MEMORY_STAT_DECLARE_WITH_ID(item, 3); \
MEMORY_STAT_DECLARE_WITH_ID(item, 4); \
MEMORY_STAT_DECLARE_WITH_ID(item, 5); \
MEMORY_STAT_DECLARE_WITH_ID(item, 6); \
MEMORY_STAT_DECLARE_WITH_ID(item, 7); \
MEMORY_STAT_DECLARE_WITH_ID(item, 8); \
MEMORY_STAT_DECLARE_WITH_ID(item, 9); \
MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
MEMORY_STAT_DECLARE_WITH_ID(item, 15)
// To add a new STAT type, declare here and register in stats.cc
MEMORY_STAT_DECLARE(Allocated);
MEMORY_STAT_DECLARE(Reserved);
} // namespace memory
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/stats.h"
#include <condition_variable>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
namespace memory {
TEST(stats_test, MultiThreadReadWriteTest) {
std::string stat_type = "Allocated";
size_t thread_num = 3;
size_t data_num = 10;
std::condition_variable cv;
std::mutex mutex;
std::vector<std::thread> threads;
size_t ready_thread_num = 0;
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back(
[&stat_type, data_num, &cv, &mutex, &ready_thread_num]() {
for (size_t data = 0; data < data_num; ++data) {
StatUpdate(stat_type, 0, data);
}
/* lock guard*/ {
std::lock_guard<std::mutex> lock_guard{mutex};
++ready_thread_num;
cv.notify_one();
}
          // Sleep here so this thread does not exit before the main thread has
          // checked the stat results, because the thread-local stat data is
          // destroyed when the thread exits.
std::this_thread::sleep_for(std::chrono::seconds(1));
});
}
std::unique_lock<std::mutex> unique_lock(mutex);
cv.wait(unique_lock, [&ready_thread_num, thread_num]() {
return ready_thread_num == thread_num;
});
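  // Each thread added 0 + 1 + ... + (data_num - 1) = data_num * (data_num - 1) / 2,
  // so the aggregated current value is thread_num times that partial sum.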
EXPECT_EQ(StatGetCurrentValue(stat_type, 0),
int64_t((thread_num * data_num * (data_num - 1)) >> 1));
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
}
TEST(stats_test, PeakValueTest) {
std::string stat_type = "Allocated";
std::vector<int64_t> datas = {
543149808935355, 634698327471328, 706215795436611, 577939367795333,
419479490054362, 21975227714595, 812939817942250, 984428837942082,
537304104446806, 685008544452453, 563352858161268, 690143831596330,
964829938186077, 476984078018245, 804403365180177, -57918691189304,
947611269236893, 752188963801927, 710946451346683, -49226452527666,
-59049377393968, 14128239868858, 463298869064035, 71954818131880,
894087341752481, 971337322257029, 202325222441382, 128423535063606,
-89146949094815, 756429151957759, 444400180007032, 937040878834954,
303916192293233, 16628488962638, 544031750807065, 392396591234910,
686663859558365, 941126625484539, 120719755546781, 938838399629825,
364952832531949, 237865770815218, -64409925441421, 130095171433100,
140906146174023, 635514857321759, -65954585142544, 505369882354612,
939334896592688, 591590196329715, 424834428510773, 316569328289240,
44932622352645, 464924685290752, 396541464249293, 937169087747437,
437992536948503, 44395833829426, 968496835801562, 80493658180301,
836093264717766, 3339912102452, -32067753603273, 77353521424986,
290980283590981, 496135147814915, 335112580987207, 571094882208895,
776581672377954, -83075504255716, -93690563747742, 115144063088100,
422629490055299, 917988755593299, 283511671626409, 715179006446336,
760708617230450, 183628659314298, 899792829140365, 214949068928854,
848851506468080, 791041814082114, 801591030978388, 526551272394511,
781034506085043, 279998089943681, 907197980150568, 974365521595836,
282127262539024, 272870474932399, 346617645597508, 409964014011113,
746465732805300, -74049761897414, -65640372433924, 852009039806484,
305079802044257, -48409757869238, 266031781660228, 327287322379820};
  int64_t peak_value = ((int64_t)1) << 63;  // i.e. INT64_MIN
int64_t sum = 0;
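  // Replay the running sum on the host and track its maximum; the recorded
  // peak of the stat must match this reference value.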
for (int64_t data : datas) {
StatUpdate(stat_type, 0, data);
sum += data;
peak_value = std::max(peak_value, sum);
}
EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value);
}
} // namespace memory
} // namespace paddle
......@@ -13,34 +13,35 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include <array>
#include <cstdlib>
#include <mutex>
#include <set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/miopen.h"
#else
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
......@@ -193,6 +194,7 @@ class RecordedGpuMallocHelper {
if (result == gpuSuccess) {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
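      // The "Reserved" stat tracks the raw GPU memory obtained from the device
      // here; the tensor-level "Allocated" stat is updated in StatAllocator.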
MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
......@@ -229,6 +231,7 @@ class RecordedGpuMallocHelper {
PADDLE_ENFORCE_GPU_SUCCESS(err);
cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
} else {
platform::GpuGetLastError(); // clear the error flag when
// cudaErrorCudartUnloading /
......
......@@ -2939,6 +2939,8 @@ All parameter, weight, gradient are variables in Paddle.
}
return stats_map;
});
m.def("memory_stat_get_current", memory::StatGetCurrentValue);
m.def("memory_stat_get_peak", memory::StatGetPeakValue);
m.def("run_cmd",
[](const std::string &cmd, int time_out = -1,
int sleep_inter = -1) -> const std::string {
......@@ -3143,6 +3145,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetGPUDeviceCount);
m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId);
m.def("cuda_empty_cache", [] {
for (int dev_id : platform::GetSelectedDevices()) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(
......
......@@ -26,6 +26,10 @@ __all__ = [
'synchronize',
'device_count',
'empty_cache',
'max_memory_allocated',
'max_memory_reserved',
'memory_allocated',
'memory_reserved',
'stream_guard',
'get_device_properties',
'get_device_name',
......@@ -149,6 +153,177 @@ def empty_cache():
core.cuda_empty_cache()
def extract_cuda_device_id(device, op_name):
'''
    Return the id of the given CUDA device. This is an internal utility that is not exposed to users.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'.
Default: None.
Return:
        int: The id of the given device. If device is None, return the id of the current device.
'''
if (device is None):
return core.get_cuda_current_device_id()
if isinstance(device, int):
device_id = device
elif isinstance(device, core.CUDAPlace):
device_id = device.get_device_id()
elif isinstance(device, str):
if device.startswith('gpu:'):
device_id = int(device[4:])
else:
raise ValueError(
"The current string {} is not expected. Because {} only support string which is like 'gpu:x'. "
"Please input appropriate string again!".format(device,
op_name))
else:
raise ValueError(
"The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. "
"Please input appropriate device again!".format(device, op_name))
    assert device_id >= 0, f"The device id must not be less than 0, but got id = {device_id}."
    assert device_id < device_count(
    ), f"The device id {device_id} exceeds the number of available GPUs {device_count()}."
return device_id
def max_memory_allocated(device=None):
'''
    Return the peak size of GPU memory that is allocated to tensors of the given device.
.. note::
        The size of GPU memory allocated to a tensor is 256-byte aligned in Paddle, which may be larger than the memory size that the tensor actually needs.
        For instance, a float32 tensor with shape [1] on GPU takes up 256 bytes of memory, even though storing a float32 value requires only 4 bytes.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
        int: The peak size of GPU memory that is allocated to tensors of the given device, in bytes.
Examples:
.. code-block:: python
# required: gpu
import paddle
max_memory_allocated_size = paddle.device.cuda.max_memory_allocated(paddle.CUDAPlace(0))
max_memory_allocated_size = paddle.device.cuda.max_memory_allocated(0)
max_memory_allocated_size = paddle.device.cuda.max_memory_allocated("gpu:0")
'''
name = "paddle.device.cuda.max_memory_allocated"
if not core.is_compiled_with_cuda():
raise ValueError(
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_peak("Allocated", device_id)
def max_memory_reserved(device=None):
'''
Return the peak size of GPU memory that is held by the allocator of the given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
int: The peak size of GPU memory that is held by the allocator of the given device, in bytes.
Examples:
.. code-block:: python
# required: gpu
import paddle
max_memory_reserved_size = paddle.device.cuda.max_memory_reserved(paddle.CUDAPlace(0))
max_memory_reserved_size = paddle.device.cuda.max_memory_reserved(0)
max_memory_reserved_size = paddle.device.cuda.max_memory_reserved("gpu:0")
'''
name = "paddle.device.cuda.max_memory_reserved"
if not core.is_compiled_with_cuda():
raise ValueError(
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_peak("Reserved", device_id)
def memory_allocated(device=None):
'''
    Return the current size of GPU memory that is allocated to tensors of the given device.
.. note::
        The size of GPU memory allocated to a tensor is 256-byte aligned in Paddle, which may be larger than the memory size that the tensor actually needs.
        For instance, a float32 tensor with shape [1] on GPU takes up 256 bytes of memory, even though storing a float32 value requires only 4 bytes.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
        int: The current size of GPU memory that is allocated to tensors of the given device, in bytes.
Examples:
.. code-block:: python
# required: gpu
import paddle
memory_allocated_size = paddle.device.cuda.memory_allocated(paddle.CUDAPlace(0))
memory_allocated_size = paddle.device.cuda.memory_allocated(0)
memory_allocated_size = paddle.device.cuda.memory_allocated("gpu:0")
'''
name = "paddle.device.cuda.memory_allocated"
if not core.is_compiled_with_cuda():
raise ValueError(
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_current("Allocated", device_id)
def memory_reserved(device=None):
'''
Return the current size of GPU memory that is held by the allocator of the given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
int: The current size of GPU memory that is held by the allocator of the given device, in bytes.
Examples:
.. code-block:: python
# required: gpu
import paddle
memory_reserved_size = paddle.device.cuda.memory_reserved(paddle.CUDAPlace(0))
memory_reserved_size = paddle.device.cuda.memory_reserved(0)
memory_reserved_size = paddle.device.cuda.memory_reserved("gpu:0")
'''
name = "paddle.device.cuda.memory_reserved"
if not core.is_compiled_with_cuda():
raise ValueError(
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_current("Reserved", device_id)
def _set_current_stream(stream):
'''
Set the current stream.
......
......@@ -1219,6 +1219,7 @@ set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120)
set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400)
set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000)
set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY")
set_tests_properties(test_cuda_memory_reserved PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth")
if (WITH_GLOO)
set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly PROPERTIES TIMEOUT 30)
set_tests_properties(test_parallel_dygraph_unused_variables_gloo PROPERTIES TIMEOUT 120)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import unittest
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_allocated, max_memory_allocated
class TestMaxMemoryAllocated(unittest.TestCase):
def test_max_memory_allocated(self, device=None):
if core.is_compiled_with_cuda():
alloc_time = 100
max_alloc_size = 10000
peak_memory_allocated_size = max_memory_allocated(device)
for i in range(alloc_time):
shape = paddle.randint(max_alloc_size)
tensor = paddle.zeros(shape)
peak_memory_allocated_size = max(peak_memory_allocated_size,
memory_allocated(device))
del shape
del tensor
self.assertEqual(peak_memory_allocated_size,
max_memory_allocated(device))
def test_max_memory_allocated_for_all_places(self):
if core.is_compiled_with_cuda():
gpu_num = device_count()
for i in range(gpu_num):
paddle.device.set_device("gpu:" + str(i))
self.test_max_memory_allocated(core.CUDAPlace(i))
self.test_max_memory_allocated(i)
self.test_max_memory_allocated("gpu:" + str(i))
def test_max_memory_allocated_exception(self):
if core.is_compiled_with_cuda():
wrong_device = [
core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
]
for device in wrong_device:
with self.assertRaises(BaseException):
max_memory_allocated(device)
else:
with self.assertRaises(BaseException):
max_memory_allocated()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import unittest
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_reserved, max_memory_reserved
class TestMaxMemoryReserved(unittest.TestCase):
def test_max_memory_reserved(self, device=None):
if core.is_compiled_with_cuda():
alloc_time = 100
max_alloc_size = 10000
peak_memory_reserved_size = max_memory_reserved(device)
for i in range(alloc_time):
shape = paddle.randint(max_alloc_size)
tensor = paddle.zeros(shape)
peak_memory_reserved_size = max(peak_memory_reserved_size,
memory_reserved(device))
del shape
del tensor
self.assertEqual(peak_memory_reserved_size,
max_memory_reserved(device))
def test_max_memory_reserved_for_all_places(self):
if core.is_compiled_with_cuda():
gpu_num = device_count()
for i in range(gpu_num):
paddle.device.set_device("gpu:" + str(i))
self.test_max_memory_reserved(core.CUDAPlace(i))
self.test_max_memory_reserved(i)
self.test_max_memory_reserved("gpu:" + str(i))
def test_max_memory_reserved_exception(self):
if core.is_compiled_with_cuda():
wrong_device = [
core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
]
for device in wrong_device:
with self.assertRaises(BaseException):
max_memory_reserved(device)
else:
with self.assertRaises(BaseException):
max_memory_reserved()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import unittest
import numpy as np
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_allocated
class TestMemoryAllocated(unittest.TestCase):
def test_memory_allocated(self, device=None):
if core.is_compiled_with_cuda():
tensor = paddle.zeros(shape=[256])
            alloc_size = 4 * 256  # 256 float32 values, 4 bytes each
memory_allocated_size = memory_allocated(device)
self.assertEqual(memory_allocated_size, alloc_size)
def test_memory_allocated_for_all_places(self):
if core.is_compiled_with_cuda():
gpu_num = device_count()
for i in range(gpu_num):
paddle.device.set_device("gpu:" + str(i))
self.test_memory_allocated(core.CUDAPlace(i))
self.test_memory_allocated(i)
self.test_memory_allocated("gpu:" + str(i))
def test_memory_allocated_exception(self):
if core.is_compiled_with_cuda():
wrong_device = [
core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
]
for device in wrong_device:
with self.assertRaises(BaseException):
memory_allocated(device)
else:
with self.assertRaises(BaseException):
memory_allocated()
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import unittest
import numpy as np
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_reserved
class TestMemoryReserved(unittest.TestCase):
def test_memory_reserved(self, device=None):
if core.is_compiled_with_cuda():
tensor = paddle.zeros(shape=[256])
            alloc_size = 4 * 256  # 256 float32 values, 4 bytes each
memory_reserved_size = memory_reserved(device)
self.assertEqual(memory_reserved_size, alloc_size)
def test_memory_reserved_for_all_places(self):
if core.is_compiled_with_cuda():
gpu_num = device_count()
for i in range(gpu_num):
paddle.device.set_device("gpu:" + str(i))
self.test_memory_reserved(core.CUDAPlace(i))
self.test_memory_reserved(i)
self.test_memory_reserved("gpu:" + str(i))
def test_memory_reserved_exception(self):
if core.is_compiled_with_cuda():
wrong_device = [
core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
]
for device in wrong_device:
with self.assertRaises(BaseException):
memory_reserved(device)
else:
with self.assertRaises(BaseException):
memory_reserved()
if __name__ == "__main__":
unittest.main()