未验证 提交 21f11d35 编写于 作者: R Ruibiao Chen 提交者: GitHub

Support memory stats for CPU (#42945)

* Support memory stats for CPU

* Add UTs

* Fix typos

* Fix typos
上级 b2b78cd4
...@@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context) ...@@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
cc_library(stats SRCS stats.cc DEPS enforce) cc_library(stats SRCS stats.cc DEPS enforce)
cc_library(memory DEPS malloc memcpy stats) cc_library(memory DEPS malloc memcpy stats)
cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory)
cc_test(stats_test SRCS stats_test.cc DEPS stats) cc_test(stats_test SRCS stats_test.cc DEPS stats)
if (WITH_GPU) if (WITH_GPU)
......
...@@ -931,12 +931,9 @@ class AllocatorFacadePrivate { ...@@ -931,12 +931,9 @@ class AllocatorFacadePrivate {
void WrapStatAllocator() { void WrapStatAllocator() {
for (auto& pair : allocators_) { for (auto& pair : allocators_) {
// Now memory stats is only supported for GPU
if (platform::is_gpu_place(pair.first)) {
pair.second = std::make_shared<StatAllocator>(pair.second); pair.second = std::make_shared<StatAllocator>(pair.second);
} }
} }
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// a standalone CUDA allocator to support multi-stream GC in new executor // a standalone CUDA allocator to support multi-stream GC in new executor
......
...@@ -30,16 +30,28 @@ class StatAllocator : public Allocator { ...@@ -30,16 +30,28 @@ class StatAllocator : public Allocator {
protected: protected:
void FreeImpl(phi::Allocation* allocation) override { void FreeImpl(phi::Allocation* allocation) override {
MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size()); -allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
}
underlying_allocator_->Free(allocation); underlying_allocator_->Free(allocation);
} }
phi::Allocation* AllocateImpl(size_t size) override { phi::Allocation* AllocateImpl(size_t size) override {
phi::Allocator::AllocationPtr allocation = phi::Allocator::AllocationPtr allocation =
underlying_allocator_->Allocate(size); underlying_allocator_->Allocate(size);
MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
allocation->size()); allocation->size());
}
return allocation.release(); return allocation.release();
} }
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/memory/stats.h"
#ifdef _WIN32 #ifdef _WIN32
#include <malloc.h> #include <malloc.h>
#ifndef NOMINMAX #ifndef NOMINMAX
...@@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
} }
} }
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
return p; return p;
} }
...@@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
#else #else
free(p); free(p);
#endif #endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
} }
bool CPUAllocator::UseGpu() const { return false; } bool CPUAllocator::UseGpu() const { return false; }
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/memory.h"
#include <algorithm>
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
namespace memory {
// Checks the host-side "Allocated" statistic for device id 0 against a series
// of CPU allocations made one at a time.
//
// Inside the loop the current value is expected to equal the size of the
// allocation made in that iteration alone, which only holds if the previous
// iteration's allocation has been returned by the time `buffer` leaves scope.
// After the loop, the peak value is expected to equal the largest single
// allocation size observed.
TEST(stat_allocator_test, host_memory_stat_test) {
  const std::vector<int64_t> request_sizes{
      5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
      6235, 0,    7810, 940,  1239, 1945, 789,  2891, 7553, 8046, 2685,
      1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
      4,    1185, 2186, 357,  9774, 6743, 6136, 7073, 7674, 5640, 3935,
      528,  6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
      221,  309,  3617, 3793, 3334, 7281, 1302};

  int64_t largest_seen = 0;
  for (const int64_t request : request_sizes) {
    // Keep the allocation scoped to this iteration; do not hoist it.
    AllocationPtr buffer = Alloc(platform::CPUPlace(), request);
    // Compare against the size reported by the allocation itself rather than
    // the requested size.
    const int64_t actual = static_cast<int64_t>(buffer->size());
    if (largest_seen < actual) {
      largest_seen = actual;
    }
    EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), actual);
  }

  EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), largest_seen);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Checks the device-side "Allocated" statistic for device id 0 against a
// series of allocations made one at a time on a default-constructed CUDAPlace.
//
// Inside the loop the current value is expected to equal the size of the
// allocation made in that iteration alone, which only holds if the previous
// iteration's allocation has been returned by the time `buffer` leaves scope.
// After the loop, the peak value is expected to equal the largest single
// allocation size observed.
TEST(stat_allocator_test, device_memory_stat_test) {
  const std::vector<int64_t> request_sizes{
      5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
      6235, 0,    7810, 940,  1239, 1945, 789,  2891, 7553, 8046, 2685,
      1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
      4,    1185, 2186, 357,  9774, 6743, 6136, 7073, 7674, 5640, 3935,
      528,  6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
      221,  309,  3617, 3793, 3334, 7281, 1302};

  int64_t largest_seen = 0;
  for (const int64_t request : request_sizes) {
    // Keep the allocation scoped to this iteration; do not hoist it.
    AllocationPtr buffer = Alloc(platform::CUDAPlace(), request);
    // Compare against the size reported by the allocation itself rather than
    // the requested size.
    const int64_t actual = static_cast<int64_t>(buffer->size());
    if (largest_seen < actual) {
      largest_seen = actual;
    }
    EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), actual);
  }

  EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), largest_seen);
}
#endif
} // namespace memory
} // namespace paddle
...@@ -38,7 +38,7 @@ class StatRegistry { ...@@ -38,7 +38,7 @@ class StatRegistry {
} }
std::string GetStatKey(const std::string& stat_type, int dev_id) { std::string GetStatKey(const std::string& stat_type, int dev_id) {
return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type; return stat_type + std::to_string(dev_id);
} }
int64_t GetCurrentValue(const std::string& stat_type, int dev_id) { int64_t GetCurrentValue(const std::string& stat_type, int dev_id) {
...@@ -49,6 +49,10 @@ class StatRegistry { ...@@ -49,6 +49,10 @@ class StatRegistry {
return GetStat(stat_type, dev_id)->GetPeakValue(); return GetStat(stat_type, dev_id)->GetPeakValue();
} }
void Update(const std::string& stat_type, int dev_id, int64_t increment) {
GetStat(stat_type, dev_id)->Update(increment);
}
void Register(const std::string& stat_type, int dev_id, StatBase* stat) { void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
std::lock_guard<SpinLock> lock_guard(stat_map_lock_); std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
stat_map_[GetStatKey(stat_type, dev_id)] = stat; stat_map_[GetStatKey(stat_type, dev_id)] = stat;
...@@ -59,10 +63,6 @@ class StatRegistry { ...@@ -59,10 +63,6 @@ class StatRegistry {
stat_map_.erase(GetStatKey(stat_type, dev_id)); stat_map_.erase(GetStatKey(stat_type, dev_id));
} }
void Update(const std::string& stat_type, int dev_id, int64_t increment) {
stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment);
}
private: private:
StatRegistry() = default; StatRegistry() = default;
...@@ -72,43 +72,67 @@ class StatRegistry { ...@@ -72,43 +72,67 @@ class StatRegistry {
SpinLock stat_map_lock_; SpinLock stat_map_lock_;
}; };
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) { int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id); return StatRegistry::GetInstance()->GetCurrentValue("Device" + stat_type,
dev_id);
}
int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetPeakValue("Device" + stat_type,
dev_id);
}
void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment) {
StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment);
} }
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) { int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id); return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type,
dev_id);
} }
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) { int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) {
StatRegistry::GetInstance()->Update(stat_type, dev_id, increment); return StatRegistry::GetInstance()->GetPeakValue("Host" + stat_type, dev_id);
} }
#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \ void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment) {
StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment);
}
#define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
"Device" #item, id, Stat<DeviceMemoryStat##item##id>::GetInstance());
#define DEVICE_MEMORY_STAT_REGISTER(item) \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15)
#define HOST_MEMORY_STAT_REGISTER(item) \
StatRegistry::GetInstance()->Register( \ StatRegistry::GetInstance()->Register( \
#item, id, Stat<ThreadLocalStatDevice##id##item>::GetInstance()); "Host" #item, 0, Stat<HostMemoryStat##item##0>::GetInstance());
#define MEMORY_STAT_REGISTER(item) \
MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
MEMORY_STAT_REGISTER_WITH_ID(item, 15)
int RegisterAllStats() { int RegisterAllStats() {
MEMORY_STAT_REGISTER(Allocated); DEVICE_MEMORY_STAT_REGISTER(Allocated);
MEMORY_STAT_REGISTER(Reserved); DEVICE_MEMORY_STAT_REGISTER(Reserved);
HOST_MEMORY_STAT_REGISTER(Allocated);
HOST_MEMORY_STAT_REGISTER(Reserved);
return 0; return 0;
} }
......
...@@ -91,44 +91,50 @@ class Stat : public StatBase { ...@@ -91,44 +91,50 @@ class Stat : public StatBase {
std::atomic<int64_t> peak_value_{0}; std::atomic<int64_t> peak_value_{0};
}; };
// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT // xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate
// values by a string, however, they has worse performance than the macro // support to operate STAT values by a string, however, they has worse
// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and // performance than the macro function xxx_MEMORY_STAT_CURRENT_VALUE,
// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low // xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Try to use the macro
// performance overhead is required. // functions where ultra-low performance overhead is required.
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id); int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id); int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment);
#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \
int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id);
void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
int64_t increment);
#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \
case id: \ case id: \
stat = paddle::memory::Stat< \ stat = paddle::memory::Stat< \
paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \ paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \
break break
#define MEMORY_STAT_FUNC(item, id, func, ...) \ #define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \ [&] { \
paddle::memory::StatBase* stat = nullptr; \ paddle::memory::StatBase* stat = nullptr; \
switch (id) { \ switch (id) { \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \
default: \ default: \
PADDLE_THROW(paddle::platform::errors::OutOfRange( \ PADDLE_THROW(paddle::platform::errors::OutOfRange( \
"Only support device id between [0, 15] in memory stats," \ "Only support device id between [0, 15] for device memory stats," \
"not support device id: %d", \ "not support device id: %d", \
id)); \ id)); \
break; \ break; \
...@@ -136,37 +142,62 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); ...@@ -136,37 +142,62 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
return stat->func(__VA_ARGS__); \ return stat->func(__VA_ARGS__); \
}() }()
#define MEMORY_STAT_CURRENT_VALUE(item, id) \ #define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetCurrentValue) DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define MEMORY_STAT_PEAK_VALUE(item, id) \ #define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetPeakValue) DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define MEMORY_STAT_UPDATE(item, id, increment) \ #define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
MEMORY_STAT_FUNC(item, id, Update, increment) DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \ #define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \
struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {}; [&] { \
PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \
#define MEMORY_STAT_DECLARE(item) \ "Only support device id 0 for host memory " \
MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ "stats, not support device id: %d", \
MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ id)); \
MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ return paddle::memory::Stat< \
MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ paddle::memory::HostMemoryStat##item##0>::GetInstance() \
MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ ->func(__VA_ARGS__); \
MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ }()
MEMORY_STAT_DECLARE_WITH_ID(item, 6); \
MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ #define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \
MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ #define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \
MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue)
MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ #define HOST_MEMORY_STAT_UPDATE(item, id, increment) \
MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ HOST_MEMORY_STAT_FUNC(item, id, Update, increment)
MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ #define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \
MEMORY_STAT_DECLARE_WITH_ID(item, 15) struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {}
#define DEVICE_MEMORY_STAT_DECLARE(item) \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15)
// Only support id 0 for host memory stat
#define HOST_MEMORY_STAT_DECLARE(item) \
struct HostMemoryStat##item##0 : public ThreadLocalStatBase{};
// To add a new STAT type, declare here and register in stats.cc // To add a new STAT type, declare here and register in stats.cc
MEMORY_STAT_DECLARE(Allocated); DEVICE_MEMORY_STAT_DECLARE(Allocated);
MEMORY_STAT_DECLARE(Reserved); DEVICE_MEMORY_STAT_DECLARE(Reserved);
HOST_MEMORY_STAT_DECLARE(Allocated);
HOST_MEMORY_STAT_DECLARE(Reserved);
} // namespace memory } // namespace memory
} // namespace paddle } // namespace paddle
...@@ -23,8 +23,26 @@ ...@@ -23,8 +23,26 @@
namespace paddle { namespace paddle {
namespace memory { namespace memory {
TEST(stats_test, MultiThreadReadWriteTest) { class StatsTest : public ::testing::Test {
std::string stat_type = "Allocated"; protected:
void SetStatType(const std::string& stat_type) { stat_type_ = stat_type; }
void SetFunc(
std::function<void(const std::string, int, int64_t)> update_func,
std::function<int64_t(const std::string, int)> current_value_func,
std::function<int64_t(const std::string, int)> peak_value_func) {
update_func_ = update_func;
current_value_func_ = current_value_func;
peak_value_func_ = peak_value_func;
}
void RunTests() {
MultiThreadReadWriteTest();
PeakValueTest();
}
private:
void MultiThreadReadWriteTest() {
size_t thread_num = 3; size_t thread_num = 3;
size_t data_num = 10; size_t data_num = 10;
...@@ -34,10 +52,9 @@ TEST(stats_test, MultiThreadReadWriteTest) { ...@@ -34,10 +52,9 @@ TEST(stats_test, MultiThreadReadWriteTest) {
size_t ready_thread_num = 0; size_t ready_thread_num = 0;
for (size_t i = 0; i < thread_num; ++i) { for (size_t i = 0; i < thread_num; ++i) {
threads.emplace_back( threads.emplace_back([&]() {
[&stat_type, data_num, &cv, &mutex, &ready_thread_num]() {
for (size_t data = 0; data < data_num; ++data) { for (size_t data = 0; data < data_num; ++data) {
StatUpdate(stat_type, 0, data); update_func_(stat_type_, 0, data);
} }
/* lock guard*/ { /* lock guard*/ {
std::lock_guard<std::mutex> lock_guard{mutex}; std::lock_guard<std::mutex> lock_guard{mutex};
...@@ -56,17 +73,27 @@ TEST(stats_test, MultiThreadReadWriteTest) { ...@@ -56,17 +73,27 @@ TEST(stats_test, MultiThreadReadWriteTest) {
return ready_thread_num == thread_num; return ready_thread_num == thread_num;
}); });
EXPECT_EQ(StatGetCurrentValue(stat_type, 0), EXPECT_EQ(current_value_func_(stat_type_, 0),
int64_t((thread_num * data_num * (data_num - 1)) >> 1)); int64_t((thread_num * data_num * (data_num - 1)) >> 1));
for (size_t i = 0; i < thread_num; ++i) { for (size_t i = 0; i < thread_num; ++i) {
threads[i].join(); threads[i].join();
} }
} }
void PeakValueTest() {
int64_t peak_value = ((int64_t)1) << 63;
int64_t sum = 0;
for (int64_t data : datas_) {
update_func_(stat_type_, 0, data);
sum += data;
peak_value = std::max(peak_value, sum);
}
EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value);
}
TEST(stats_test, PeakValueTest) { std::string stat_type_;
std::string stat_type = "Allocated"; std::vector<int64_t> datas_{
std::vector<int64_t> datas = {
543149808935355, 634698327471328, 706215795436611, 577939367795333, 543149808935355, 634698327471328, 706215795436611, 577939367795333,
419479490054362, 21975227714595, 812939817942250, 984428837942082, 419479490054362, 21975227714595, 812939817942250, 984428837942082,
537304104446806, 685008544452453, 563352858161268, 690143831596330, 537304104446806, 685008544452453, 563352858161268, 690143831596330,
...@@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) { ...@@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) {
746465732805300, -74049761897414, -65640372433924, 852009039806484, 746465732805300, -74049761897414, -65640372433924, 852009039806484,
305079802044257, -48409757869238, 266031781660228, 327287322379820}; 305079802044257, -48409757869238, 266031781660228, 327287322379820};
int64_t peak_value = ((int64_t)1) << 63; std::function<void(const std::string, int, int64_t)> update_func_;
int64_t sum = 0; std::function<int64_t(const std::string, int)> current_value_func_;
for (int64_t data : datas) { std::function<int64_t(const std::string, int)> peak_value_func_;
StatUpdate(stat_type, 0, data); };
sum += data;
peak_value = std::max(peak_value, sum); TEST_F(StatsTest, DeviceAllocatedTest) {
} SetStatType("Allocated");
EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value); SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue,
DeviceMemoryStatPeakValue);
RunTests();
}
TEST_F(StatsTest, DeviceReservedMacroTest) {
SetStatType("Reserved");
SetFunc(
[](const std::string stat_type, int id, int64_t increment) {
return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment);
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id);
},
[](const std::string stat_type, int id) {
return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id);
});
RunTests();
}
TEST_F(StatsTest, HostAllocatedMacroTest) {
SetStatType("Allocated");
SetFunc(
[](const std::string stat_type, int id, int64_t increment) {
return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment);
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id);
},
[](const std::string stat_type, int id) {
return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id);
});
RunTests();
}
TEST_F(StatsTest, HostReservedTest) {
SetStatType("Reserved");
SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue,
HostMemoryStatPeakValue);
RunTests();
} }
} // namespace memory } // namespace memory
......
...@@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() { ...@@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() {
static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) { if (!use_fixed_workspace) {
int device_id = platform::GetCurrentDeviceId(); int device_id = platform::GetCurrentDeviceId();
int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); int64_t allocated =
int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = platform::GpuAvailableMemToAlloc(); int64_t availble = platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved) << " MB, reserved=" << ToMegaBytes(reserved)
......
...@@ -149,8 +149,8 @@ class RecordedGpuMallocHelper { ...@@ -149,8 +149,8 @@ class RecordedGpuMallocHelper {
if (FLAGS_enable_gpu_memory_usage_log) { if (FLAGS_enable_gpu_memory_usage_log) {
// A fake UPDATE to trigger the construction of memory stat instances, // A fake UPDATE to trigger the construction of memory stat instances,
// make sure that they are destructed after RecordedGpuMallocHelper. // make sure that they are destructed after RecordedGpuMallocHelper.
MEMORY_STAT_UPDATE(Reserved, dev_id, 0); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
MEMORY_STAT_UPDATE(Allocated, dev_id, 0); DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0);
} }
} }
...@@ -161,15 +161,18 @@ class RecordedGpuMallocHelper { ...@@ -161,15 +161,18 @@ class RecordedGpuMallocHelper {
if (FLAGS_enable_gpu_memory_usage_log) { if (FLAGS_enable_gpu_memory_usage_log) {
if (FLAGS_enable_gpu_memory_usage_log_mb) { if (FLAGS_enable_gpu_memory_usage_log_mb) {
std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = "
<< MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0 << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) /
1048576.0
<< ", Allocated = " << ", Allocated = "
<< MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0 << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) /
1048576.0
<< std::endl; << std::endl;
} else { } else {
std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = "
<< MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_)
<< ", Allocated = " << ", Allocated = "
<< MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl; << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_)
<< std::endl;
} }
} }
} }
...@@ -230,7 +233,7 @@ class RecordedGpuMallocHelper { ...@@ -230,7 +233,7 @@ class RecordedGpuMallocHelper {
if (result == gpuSuccess) { if (result == gpuSuccess) {
cur_size_.fetch_add(size); cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
MEMORY_STAT_UPDATE(Reserved, dev_id_, size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
#ifdef PADDLE_WITH_TESTING #ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr); gpu_ptrs.insert(*ptr);
...@@ -269,7 +272,7 @@ class RecordedGpuMallocHelper { ...@@ -269,7 +272,7 @@ class RecordedGpuMallocHelper {
PADDLE_ENFORCE_GPU_SUCCESS(err); PADDLE_ENFORCE_GPU_SUCCESS(err);
cur_size_.fetch_sub(size); cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
} else { } else {
platform::GpuGetLastError(); // clear the error flag when platform::GpuGetLastError(); // clear the error flag when
// cudaErrorCudartUnloading / // cudaErrorCudartUnloading /
......
...@@ -168,8 +168,10 @@ void PrintMemProfiler( ...@@ -168,8 +168,10 @@ void PrintMemProfiler(
if (num_gpus > 0) { if (num_gpus > 0) {
std::cout << "GPU Memory Usage (MB):\n"; std::cout << "GPU Memory Usage (MB):\n";
for (int dev_id = 0; dev_id < num_gpus; ++dev_id) { for (int dev_id = 0; dev_id < num_gpus; ++dev_id) {
int64_t allocated = memory::StatGetCurrentValue("Allocated", dev_id); int64_t allocated =
int64_t reserved = memory::StatGetCurrentValue("Reserved", dev_id); memory::DeviceMemoryStatCurrentValue("Allocated", dev_id);
int64_t reserved =
memory::DeviceMemoryStatCurrentValue("Reserved", dev_id);
size_t available = 0, total = 0, actual_available = 0, actual_total = 0; size_t available = 0, total = 0, actual_available = 0, actual_total = 0;
RecordedGpuMemGetInfo(&available, &total, &actual_available, RecordedGpuMemGetInfo(&available, &total, &actual_available,
&actual_total, dev_id); &actual_total, dev_id);
......
...@@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle.
} }
return stats_map; return stats_map;
}); });
m.def("memory_stat_get_current", memory::StatGetCurrentValue); m.def("device_memory_stat_current_value",
m.def("memory_stat_get_peak", memory::StatGetPeakValue); memory::DeviceMemoryStatCurrentValue);
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def("run_cmd", m.def("run_cmd",
[](const std::string &cmd, int time_out = -1, [](const std::string &cmd, int time_out = -1,
int sleep_inter = -1) -> const std::string { int sleep_inter = -1) -> const std::string {
......
...@@ -224,7 +224,7 @@ def max_memory_allocated(device=None): ...@@ -224,7 +224,7 @@ def max_memory_allocated(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
) )
device_id = extract_cuda_device_id(device, op_name=name) device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_peak("Allocated", device_id) return core.device_memory_stat_peak_value("Allocated", device_id)
def max_memory_reserved(device=None): def max_memory_reserved(device=None):
...@@ -255,7 +255,7 @@ def max_memory_reserved(device=None): ...@@ -255,7 +255,7 @@ def max_memory_reserved(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
) )
device_id = extract_cuda_device_id(device, op_name=name) device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_peak("Reserved", device_id) return core.device_memory_stat_peak_value("Reserved", device_id)
def memory_allocated(device=None): def memory_allocated(device=None):
...@@ -290,7 +290,7 @@ def memory_allocated(device=None): ...@@ -290,7 +290,7 @@ def memory_allocated(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
) )
device_id = extract_cuda_device_id(device, op_name=name) device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_current("Allocated", device_id) return core.device_memory_stat_current_value("Allocated", device_id)
def memory_reserved(device=None): def memory_reserved(device=None):
...@@ -321,7 +321,7 @@ def memory_reserved(device=None): ...@@ -321,7 +321,7 @@ def memory_reserved(device=None):
f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
) )
device_id = extract_cuda_device_id(device, op_name=name) device_id = extract_cuda_device_id(device, op_name=name)
return core.memory_stat_get_current("Reserved", device_id) return core.device_memory_stat_current_value("Reserved", device_id)
def _set_current_stream(stream): def _set_current_stream(stream):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册