Unverified commit 21f11d35, authored by Ruibiao Chen, committed by GitHub

Support memory stats for CPU (#42945)

* Support memory stats for CPU

* Add UTs

* Fix typos

* Fix typos
Parent b2b78cd4
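
At a glance: the Allocated/Reserved counters that previously existed only for GPU ("device") memory are extended to CPU ("host") allocations, and the stats API is split into DeviceMemoryStat*/HostMemoryStat* variants. A minimal usage sketch against the new host-side API introduced in the diff below; the helper function and the 1 KB size are illustrative, not part of the commit:

#include <cstdint>

#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/memory/stats.h"

// Illustrative helper, not part of the commit.
void InspectHostMemoryStats() {
  // Allocations on CPUPlace are now routed through StatAllocator, which
  // records their sizes under the host statistics.
  paddle::memory::AllocationPtr buf =
      paddle::memory::Alloc(paddle::platform::CPUPlace(), 1024);

  // String-based queries; host memory stats only use device id 0.
  int64_t current =
      paddle::memory::HostMemoryStatCurrentValue("Allocated", 0);
  int64_t peak = paddle::memory::HostMemoryStatPeakValue("Allocated", 0);

  // Macro-based query: same counter, lower lookup overhead.
  int64_t current_fast = HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, 0);

  (void)buf;
  (void)current;
  (void)peak;
  (void)current_fast;
}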
......@@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
cc_library(stats SRCS stats.cc DEPS enforce)
cc_library(memory DEPS malloc memcpy stats)
cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory)
cc_test(stats_test SRCS stats_test.cc DEPS stats)
if (WITH_GPU)
......
......@@ -931,10 +931,7 @@ class AllocatorFacadePrivate {
void WrapStatAllocator() {
for (auto& pair : allocators_) {
// Now memory stats is only supported for GPU
if (platform::is_gpu_place(pair.first)) {
pair.second = std::make_shared<StatAllocator>(pair.second);
}
pair.second = std::make_shared<StatAllocator>(pair.second);
}
}
......
......@@ -30,16 +30,28 @@ class StatAllocator : public Allocator {
protected:
void FreeImpl(phi::Allocation* allocation) override {
MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
}
underlying_allocator_->Free(allocation);
}
phi::Allocation* AllocateImpl(size_t size) override {
phi::Allocator::AllocationPtr allocation =
underlying_allocator_->Allocate(size);
MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
allocation->size());
if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
allocation->size());
}
return allocation.release();
}
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/memory/stats.h"
#ifdef _WIN32
#include <malloc.h>
#ifndef NOMINMAX
......@@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
}
}
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
return p;
}
......@@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
#else
free(p);
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
}
bool CPUAllocator::UseGpu() const { return false; }
......
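
The two hunks above keep the counters distinct: CPUAllocator bumps "Reserved" when raw memory is obtained from or returned to the system, while the StatAllocator wrapper earlier in this diff bumps "Allocated" for the sizes handed out to callers. A sketch of how another host-side buffer could feed the same "Reserved" counter; ReserveHostBuffer/ReleaseHostBuffer are hypothetical helpers, not part of the commit:

#include <cstddef>
#include <cstdint>
#include <cstdlib>

#include "paddle/fluid/memory/stats.h"

// Hypothetical helpers mirroring CPUAllocator::Alloc/Free above.
void* ReserveHostBuffer(size_t size) {
  void* p = std::malloc(size);  // raw memory obtained from the system
  if (p != nullptr) {
    HOST_MEMORY_STAT_UPDATE(Reserved, 0, static_cast<int64_t>(size));
  }
  return p;
}

void ReleaseHostBuffer(void* p, size_t size) {
  std::free(p);
  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -static_cast<int64_t>(size));
}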
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/memory.h"
#include <algorithm>
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
namespace memory {
TEST(stat_allocator_test, host_memory_stat_test) {
std::vector<int64_t> alloc_sizes{
5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685,
1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935,
528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
221, 309, 3617, 3793, 3334, 7281, 1302};
int64_t max_alloc_size = 0;
for (int64_t size : alloc_sizes) {
AllocationPtr allocation = Alloc(platform::CPUPlace(), size);
int64_t alloc_size = static_cast<int64_t>(allocation->size());
max_alloc_size = std::max(max_alloc_size, alloc_size);
EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), alloc_size);
}
EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(stat_allocator_test, device_memory_stat_test) {
std::vector<int64_t> alloc_sizes{
5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685,
1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935,
528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
221, 309, 3617, 3793, 3334, 7281, 1302};
int64_t max_alloc_size = 0;
for (int64_t size : alloc_sizes) {
AllocationPtr allocation = Alloc(platform::CUDAPlace(), size);
int64_t alloc_size = static_cast<int64_t>(allocation->size());
max_alloc_size = std::max(max_alloc_size, alloc_size);
EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), alloc_size);
}
EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), max_alloc_size);
}
#endif
} // namespace memory
} // namespace paddle
......@@ -38,7 +38,7 @@ class StatRegistry {
}
std::string GetStatKey(const std::string& stat_type, int dev_id) {
return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type;
return stat_type + std::to_string(dev_id);
}
int64_t GetCurrentValue(const std::string& stat_type, int dev_id) {
......@@ -49,6 +49,10 @@ class StatRegistry {
return GetStat(stat_type, dev_id)->GetPeakValue();
}
void Update(const std::string& stat_type, int dev_id, int64_t increment) {
GetStat(stat_type, dev_id)->Update(increment);
}
void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
stat_map_[GetStatKey(stat_type, dev_id)] = stat;
......@@ -59,10 +63,6 @@ class StatRegistry {
stat_map_.erase(GetStatKey(stat_type, dev_id));
}
void Update(const std::string& stat_type, int dev_id, int64_t increment) {
stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment);
}
private:
StatRegistry() = default;
......@@ -72,43 +72,67 @@ class StatRegistry {
SpinLock stat_map_lock_;
};
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id);
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue("Device" + stat_type,
dev_id);
}
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id);
int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetPeakValue("Device" + stat_type,
dev_id);
}
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) {
StatRegistry::GetInstance()->Update(stat_type, dev_id, increment);
void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment) {
StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment);
}
#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
#item, id, Stat<ThreadLocalStatDevice##id##item>::GetInstance());
#define MEMORY_STAT_REGISTER(item) \
MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
MEMORY_STAT_REGISTER_WITH_ID(item, 15)
int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type,
dev_id);
}
int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetPeakValue("Host" + stat_type, dev_id);
}
void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment) {
StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment);
}
#define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
"Device" #item, id, Stat<DeviceMemoryStat##item##id>::GetInstance());
#define DEVICE_MEMORY_STAT_REGISTER(item) \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15)
#define HOST_MEMORY_STAT_REGISTER(item) \
StatRegistry::GetInstance()->Register( \
"Host" #item, 0, Stat<HostMemoryStat##item##0>::GetInstance());
int RegisterAllStats() {
MEMORY_STAT_REGISTER(Allocated);
MEMORY_STAT_REGISTER(Reserved);
DEVICE_MEMORY_STAT_REGISTER(Allocated);
DEVICE_MEMORY_STAT_REGISTER(Reserved);
HOST_MEMORY_STAT_REGISTER(Allocated);
HOST_MEMORY_STAT_REGISTER(Reserved);
return 0;
}
......
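
Note the key scheme change in GetStatKey above: the hard-coded "STAT_Device" prefix is gone, and the key is now just the stat type (which the wrapper functions prefix with "Device" or "Host") concatenated with the device id, e.g. "DeviceAllocated3" or "HostReserved0". A small illustration, assuming the API exactly as defined in this file; the helper functions are illustrative only:

#include <cstdint>

#include "paddle/fluid/memory/stats.h"

// Resolves through registry key "DeviceAllocated3".
int64_t CurrentGpu3Allocated() {
  return paddle::memory::DeviceMemoryStatCurrentValue("Allocated", 3);
}

// Resolves through registry key "HostReserved0".
int64_t CurrentHostReserved() {
  return paddle::memory::HostMemoryStatCurrentValue("Reserved", 0);
}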
......@@ -91,82 +91,113 @@ class Stat : public StatBase {
std::atomic<int64_t> peak_value_{0};
};
// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT
// values by a string, however, they has worse performance than the macro
// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and
// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low
// performance overhead is required.
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id);
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id);
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \
case id: \
stat = paddle::memory::Stat< \
paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \
// xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate
// operate on STAT values through a string key; however, they have worse
// performance than the macro functions xxx_MEMORY_STAT_CURRENT_VALUE,
// xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Prefer the macro
// functions where ultra-low performance overhead is required.
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment);
int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment);
#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \
case id: \
stat = paddle::memory::Stat< \
paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \
break
#define MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \
paddle::memory::StatBase* stat = nullptr; \
switch (id) { \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \
default: \
PADDLE_THROW(paddle::platform::errors::OutOfRange( \
"Only support device id between [0, 15] in memory stats," \
"not support device id: %d", \
id)); \
break; \
} \
return stat->func(__VA_ARGS__); \
#define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \
paddle::memory::StatBase* stat = nullptr; \
switch (id) { \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \
DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \
default: \
PADDLE_THROW(paddle::platform::errors::OutOfRange( \
"Only support device id between [0, 15] for device memory stats," \
"not support device id: %d", \
id)); \
break; \
} \
return stat->func(__VA_ARGS__); \
}()
#define MEMORY_STAT_CURRENT_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define MEMORY_STAT_PEAK_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define MEMORY_STAT_UPDATE(item, id, increment) \
MEMORY_STAT_FUNC(item, id, Update, increment)
#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {};
#define MEMORY_STAT_DECLARE(item) \
MEMORY_STAT_DECLARE_WITH_ID(item, 0); \
MEMORY_STAT_DECLARE_WITH_ID(item, 1); \
MEMORY_STAT_DECLARE_WITH_ID(item, 2); \
MEMORY_STAT_DECLARE_WITH_ID(item, 3); \
MEMORY_STAT_DECLARE_WITH_ID(item, 4); \
MEMORY_STAT_DECLARE_WITH_ID(item, 5); \
MEMORY_STAT_DECLARE_WITH_ID(item, 6); \
MEMORY_STAT_DECLARE_WITH_ID(item, 7); \
MEMORY_STAT_DECLARE_WITH_ID(item, 8); \
MEMORY_STAT_DECLARE_WITH_ID(item, 9); \
MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
MEMORY_STAT_DECLARE_WITH_ID(item, 15)
#define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \
DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \
DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \
PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \
"Only support device id 0 for host memory " \
"stats, not support device id: %d", \
id)); \
return paddle::memory::Stat< \
paddle::memory::HostMemoryStat##item##0>::GetInstance() \
->func(__VA_ARGS__); \
}()
#define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \
HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \
HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \
HOST_MEMORY_STAT_FUNC(item, id, Update, increment)
#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {}
#define DEVICE_MEMORY_STAT_DECLARE(item) \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15)
// Only device id 0 is supported for host memory stats
#define HOST_MEMORY_STAT_DECLARE(item) \
struct HostMemoryStat##item##0 : public ThreadLocalStatBase{};
// To add a new STAT type, declare here and register in stats.cc
MEMORY_STAT_DECLARE(Allocated);
MEMORY_STAT_DECLARE(Reserved);
DEVICE_MEMORY_STAT_DECLARE(Allocated);
DEVICE_MEMORY_STAT_DECLARE(Reserved);
HOST_MEMORY_STAT_DECLARE(Allocated);
HOST_MEMORY_STAT_DECLARE(Reserved);
} // namespace memory
} // namespace paddle
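
As the comment near the top of this header notes, the string-based functions and the macros read the same counters but differ in cost: the former go through StatRegistry's map lookup, while the macros expand to a switch over device ids and access Stat<DeviceMemoryStat...>::GetInstance() directly. A short sketch contrasting the two paths for GPU 0; the function names are illustrative only:

#include <cstdint>

#include "paddle/fluid/memory/stats.h"

// Map lookup via StatRegistry (string key "DeviceAllocated0").
int64_t ReadGpu0AllocatedViaString() {
  return paddle::memory::DeviceMemoryStatCurrentValue("Allocated", 0);
}

// Direct access to the singleton stat instance; no string key involved.
int64_t ReadGpu0AllocatedViaMacro() {
  return DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, 0);
}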
......@@ -23,50 +23,77 @@
namespace paddle {
namespace memory {
TEST(stats_test, MultiThreadReadWriteTest) {
std::string stat_type = "Allocated";
size_t thread_num = 3;
size_t data_num = 10;
std::condition_variable cv;
std::mutex mutex;
std::vector<std::thread> threads;
size_t ready_thread_num = 0;
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back(
[&stat_type, data_num, &cv, &mutex, &ready_thread_num]() {
for (size_t data = 0; data < data_num; ++data) {
StatUpdate(stat_type, 0, data);
}
/* lock guard*/ {
std::lock_guard<std::mutex> lock_guard{mutex};
++ready_thread_num;
cv.notify_one();
}
// Sleep here to not exit before the main thread checking stat
// results, because the thread-local stat data will be destroyed when
// the thread exit
std::this_thread::sleep_for(std::chrono::seconds(1));
});
class StatsTest : public ::testing::Test {
protected:
void SetStatType(const std::string& stat_type) { stat_type_ = stat_type; }
void SetFunc(
std::function<void(const std::string, int, int64_t)> update_func,
std::function<int64_t(const std::string, int)> current_value_func,
std::function<int64_t(const std::string, int)> peak_value_func) {
update_func_ = update_func;
current_value_func_ = current_value_func;
peak_value_func_ = peak_value_func;
}
void RunTests() {
MultiThreadReadWriteTest();
PeakValueTest();
}
std::unique_lock<std::mutex> unique_lock(mutex);
cv.wait(unique_lock, [&ready_thread_num, thread_num]() {
return ready_thread_num == thread_num;
});
private:
void MultiThreadReadWriteTest() {
size_t thread_num = 3;
size_t data_num = 10;
std::condition_variable cv;
std::mutex mutex;
std::vector<std::thread> threads;
size_t ready_thread_num = 0;
for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back([&]() {
for (size_t data = 0; data < data_num; ++data) {
update_func_(stat_type_, 0, data);
}
/* lock guard*/ {
std::lock_guard<std::mutex> lock_guard{mutex};
++ready_thread_num;
cv.notify_one();
}
// Sleep here so this thread does not exit before the main thread
// checks the stat results, because the thread-local stat data is
// destroyed when the thread exits
std::this_thread::sleep_for(std::chrono::seconds(1));
});
}
EXPECT_EQ(StatGetCurrentValue(stat_type, 0),
int64_t((thread_num * data_num * (data_num - 1)) >> 1));
std::unique_lock<std::mutex> unique_lock(mutex);
cv.wait(unique_lock, [&ready_thread_num, thread_num]() {
return ready_thread_num == thread_num;
});
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
EXPECT_EQ(current_value_func_(stat_type_, 0),
int64_t((thread_num * data_num * (data_num - 1)) >> 1));
for (size_t i = 0; i < thread_num; ++i) {
threads[i].join();
}
}
void PeakValueTest() {
int64_t peak_value = ((int64_t)1) << 63;
int64_t sum = 0;
for (int64_t data : datas_) {
update_func_(stat_type_, 0, data);
sum += data;
peak_value = std::max(peak_value, sum);
}
EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value);
}
}
TEST(stats_test, PeakValueTest) {
std::string stat_type = "Allocated";
std::vector<int64_t> datas = {
std::string stat_type_;
std::vector<int64_t> datas_{
543149808935355, 634698327471328, 706215795436611, 577939367795333,
419479490054362, 21975227714595, 812939817942250, 984428837942082,
537304104446806, 685008544452453, 563352858161268, 690143831596330,
......@@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) {
746465732805300, -74049761897414, -65640372433924, 852009039806484,
305079802044257, -48409757869238, 266031781660228, 327287322379820};
int64_t peak_value = ((int64_t)1) << 63;
int64_t sum = 0;
for (int64_t data : datas) {
StatUpdate(stat_type, 0, data);
sum += data;
peak_value = std::max(peak_value, sum);
}
EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value);
std::function<void(const std::string, int, int64_t)> update_func_;
std::function<int64_t(const std::string, int)> current_value_func_;
std::function<int64_t(const std::string, int)> peak_value_func_;
};
TEST_F(StatsTest, DeviceAllocatedTest) {
SetStatType("Allocated");
SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue,
DeviceMemoryStatPeakValue);
RunTests();
}
TEST_F(StatsTest, DeviceReservedMacroTest) {
SetStatType("Reserved");
SetFunc(
[](const std::string stat_type, int id, int64_t increment) {
return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment);
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id);
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id);
});
RunTests();
}
TEST_F(StatsTest, HostAllocatedMacroTest) {
SetStatType("Allocated");
SetFunc(
[](const std::string stat_type, int id, int64_t increment) {
return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment);
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id);
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id);
});
RunTests();
}
TEST_F(StatsTest, HostReservedTest) {
SetStatType("Reserved");
SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue,
HostMemoryStatPeakValue);
RunTests();
}
} // namespace memory
......
......@@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() {
static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) {
int device_id = platform::GetCurrentDeviceId();
int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id);
int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id);
int64_t allocated =
memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved)
......
......@@ -149,8 +149,8 @@ class RecordedGpuMallocHelper {
if (FLAGS_enable_gpu_memory_usage_log) {
// A fake UPDATE to trigger the construction of memory stat instances,
// make sure that they are destructed after RecordedGpuMallocHelper.
MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
MEMORY_STAT_UPDATE(Allocated, dev_id, 0);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0);
}
}
......@@ -161,15 +161,18 @@ class RecordedGpuMallocHelper {
if (FLAGS_enable_gpu_memory_usage_log) {
if (FLAGS_enable_gpu_memory_usage_log_mb) {
std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = "
<< MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0
<< DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) /
1048576.0
<< ", Allocated = "
<< MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0
<< DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) /
1048576.0
<< std::endl;
} else {
std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = "
<< MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_)
<< DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_)
<< ", Allocated = "
<< MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl;
<< DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_)
<< std::endl;
}
}
}
......@@ -230,7 +233,7 @@ class RecordedGpuMallocHelper {
if (result == gpuSuccess) {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
......@@ -269,7 +272,7 @@ class RecordedGpuMallocHelper {
PADDLE_ENFORCE_GPU_SUCCESS(err);
cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
} else {
platform::GpuGetLastError(); // clear the error flag when
// cudaErrorCudartUnloading /
......
......@@ -168,8 +168,10 @@ void PrintMemProfiler(
if (num_gpus > 0) {
std::cout << "GPU Memory Usage (MB):\n";
for (int dev_id = 0; dev_id < num_gpus; ++dev_id) {
int64_t allocated = memory::StatGetCurrentValue("Allocated", dev_id);
int64_t reserved = memory::StatGetCurrentValue("Reserved", dev_id);
int64_t allocated =
memory::DeviceMemoryStatCurrentValue("Allocated", dev_id);
int64_t reserved =
memory::DeviceMemoryStatCurrentValue("Reserved", dev_id);
size_t available = 0, total = 0, actual_available = 0, actual_total = 0;
RecordedGpuMemGetInfo(&available, &total, &actual_available,
&actual_total, dev_id);
......
......@@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle.
}
return stats_map;
});
m.def("memory_stat_get_current", memory::StatGetCurrentValue);
m.def("memory_stat_get_peak", memory::StatGetPeakValue);
m.def("device_memory_stat_current_value",
memory::DeviceMemoryStatCurrentValue);
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def("run_cmd",
[](const std::string &cmd, int time_out = -1,
int sleep_inter = -1) -> const std::string {
......
......@@ -224,7 +224,7 @@ def max_memory_allocated(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_peak("Allocated", device_id)
return core.device_memory_stat_peak_value("Allocated", device_id)
def max_memory_reserved(device=None):
......@@ -255,7 +255,7 @@ def max_memory_reserved(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_peak("Reserved", device_id)
return core.device_memory_stat_peak_value("Reserved", device_id)
def memory_allocated(device=None):
......@@ -290,7 +290,7 @@ def memory_allocated(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_current("Allocated", device_id)
return core.device_memory_stat_current_value("Allocated", device_id)
def memory_reserved(device=None):
......@@ -321,7 +321,7 @@ def memory_reserved(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_current("Reserved", device_id)
return core.device_memory_stat_current_value("Reserved", device_id)
def _set_current_stream(stream):
......