diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 76bb8993cbefa2adea8f773712995544b228b14d..53e7993945586be15c3d9fac342cf2f07f8a99b5 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context) cc_library(stats SRCS stats.cc DEPS enforce) cc_library(memory DEPS malloc memcpy stats) +cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory) cc_test(stats_test SRCS stats_test.cc DEPS stats) if (WITH_GPU) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35ad27f4c62b5a01715156ad875e0c7e98468215..99152607158eb436a70ae37b07655000fbc6c35e 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -931,10 +931,7 @@ class AllocatorFacadePrivate { void WrapStatAllocator() { for (auto& pair : allocators_) { - // Now memory stats is only supported for GPU - if (platform::is_gpu_place(pair.first)) { - pair.second = std::make_shared(pair.second); - } + pair.second = std::make_shared(pair.second); } } diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 71569366c2446330115eba4baf93011e86bbfeeb..68209bbaabecad81ceb2609a6e0347c5f0be4f09 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -30,16 +30,28 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } + 
underlying_allocator_->Free(allocation); } phi::Allocation* AllocateImpl(size_t size) override { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - allocation->size()); + + if (platform::is_cpu_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + } return allocation.release(); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 37ac0b4483291c8c3a3eeb31883c55c7eda24dc8..06038804e6efeae18370374b3d7dbe9413b02153 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/stats.h" + #ifdef _WIN32 #include #ifndef NOMINMAX @@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } } + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + return p; } @@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #else free(p); #endif + + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); } bool CPUAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2fc602e401edcbe8ee8209158a14399758f6018 --- /dev/null +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/memory.h" +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { + +TEST(stat_allocator_test, host_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + AllocationPtr allocation = Alloc(platform::CPUPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(stat_allocator_test, device_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + 
AllocationPtr allocation = Alloc(platform::CUDAPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index 31d776de407027f6bc690dbf44f0e3c94878477b..97197b495f5fcf560e8b8dba4fbccd2f6ec773d5 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -38,7 +38,7 @@ class StatRegistry { } std::string GetStatKey(const std::string& stat_type, int dev_id) { - return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type; + return stat_type + std::to_string(dev_id); } int64_t GetCurrentValue(const std::string& stat_type, int dev_id) { @@ -49,6 +49,10 @@ class StatRegistry { return GetStat(stat_type, dev_id)->GetPeakValue(); } + void Update(const std::string& stat_type, int dev_id, int64_t increment) { + GetStat(stat_type, dev_id)->Update(increment); + } + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { std::lock_guard lock_guard(stat_map_lock_); stat_map_[GetStatKey(stat_type, dev_id)] = stat; @@ -59,10 +63,6 @@ class StatRegistry { stat_map_.erase(GetStatKey(stat_type, dev_id)); } - void Update(const std::string& stat_type, int dev_id, int64_t increment) { - stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment); - } - private: StatRegistry() = default; @@ -72,43 +72,67 @@ class StatRegistry { SpinLock stat_map_lock_; }; -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) { - return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id); +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Device" + stat_type, + dev_id); } -int64_t StatGetPeakValue(const 
std::string& stat_type, int dev_id) { - return StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Device" + stat_type, + dev_id); } -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) { - StatRegistry::GetInstance()->Update(stat_type, dev_id, increment); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } -#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \ - StatRegistry::GetInstance()->Register( \ - #item, id, Stat::GetInstance()); - -#define MEMORY_STAT_REGISTER(item) \ - MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 15) +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type, + dev_id); +} + +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Host" + stat_type, dev_id); +} + +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); +} + +#define 
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \ + StatRegistry::GetInstance()->Register( \ + "Device" #item, id, Stat::GetInstance()); + +#define DEVICE_MEMORY_STAT_REGISTER(item) \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15) + +#define HOST_MEMORY_STAT_REGISTER(item) \ + StatRegistry::GetInstance()->Register( \ + "Host" #item, 0, Stat::GetInstance()); int RegisterAllStats() { - MEMORY_STAT_REGISTER(Allocated); - MEMORY_STAT_REGISTER(Reserved); + DEVICE_MEMORY_STAT_REGISTER(Allocated); + DEVICE_MEMORY_STAT_REGISTER(Reserved); + + HOST_MEMORY_STAT_REGISTER(Allocated); + HOST_MEMORY_STAT_REGISTER(Reserved); return 0; } diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index b4850a8e9e919b88a5284bfbcfd54631a14911ec..bb6a3cca6644c1d47cb0dfc469b5ad734971b8e1 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -91,82 +91,113 @@ class Stat : public StatBase { std::atomic peak_value_{0}; }; -// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT -// values by a string, however, they has worse performance than the macro -// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and -// MEMORY_STAT_UPDATE. 
Try to use the macro functions where ultra-low -// performance overhead is required. -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id); -int64_t StatGetPeakValue(const std::string& stat_type, int dev_id); -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); - -#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ - case id: \ - stat = paddle::memory::Stat< \ - paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \ +// xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate +// support operating on STAT values by a string; however, they have worse +// performance than the macro functions xxx_MEMORY_STAT_CURRENT_VALUE, +// xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Try to use the macro +// functions where ultra-low performance overhead is required. +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ + case id: \ + stat = paddle::memory::Stat< \ + paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \ break -#define MEMORY_STAT_FUNC(item, id, func, ...)
\ - [&] { \ - paddle::memory::StatBase* stat = nullptr; \ - switch (id) { \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ - default: \ - PADDLE_THROW(paddle::platform::errors::OutOfRange( \ - "Only support device id between [0, 15] in memory stats," \ - "not support device id: %d", \ - id)); \ - break; \ - } \ - return stat->func(__VA_ARGS__); \ +#define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...) 
\ + [&] { \ + paddle::memory::StatBase* stat = nullptr; \ + switch (id) { \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ + default: \ + PADDLE_THROW(paddle::platform::errors::OutOfRange( \ + "Only support device id between [0, 15] for device memory stats," \ + "not support device id: %d", \ + id)); \ + break; \ + } \ + return stat->func(__VA_ARGS__); \ }() -#define MEMORY_STAT_CURRENT_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetCurrentValue) -#define MEMORY_STAT_PEAK_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetPeakValue) -#define MEMORY_STAT_UPDATE(item, id, increment) \ - MEMORY_STAT_FUNC(item, id, Update, increment) - -#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \ - struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {}; - -#define MEMORY_STAT_DECLARE(item) \ - MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ 
- MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 15) +#define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ + DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \ + [&] { \ + PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \ + "Only support device id 0 for host memory " \ + "stats, not support device id: %d", \ + id)); \ + return paddle::memory::Stat< \ + paddle::memory::HostMemoryStat##item##0>::GetInstance() \ + ->func(__VA_ARGS__); \ + }() + +#define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \ + HOST_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \ + struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {} + +#define DEVICE_MEMORY_STAT_DECLARE(item) \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 
11); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15) + +// Only support id 0 for host memory stat +#define HOST_MEMORY_STAT_DECLARE(item) \ + struct HostMemoryStat##item##0 : public ThreadLocalStatBase{}; // To add a new STAT type, declare here and register in stats.cc -MEMORY_STAT_DECLARE(Allocated); -MEMORY_STAT_DECLARE(Reserved); +DEVICE_MEMORY_STAT_DECLARE(Allocated); +DEVICE_MEMORY_STAT_DECLARE(Reserved); + +HOST_MEMORY_STAT_DECLARE(Allocated); +HOST_MEMORY_STAT_DECLARE(Reserved); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc index 436c737916d9fb4e00fbad25d8b09c489ca11632..bcaba8e91080f7643b893232d2dee813ef3ffa77 100644 --- a/paddle/fluid/memory/stats_test.cc +++ b/paddle/fluid/memory/stats_test.cc @@ -23,50 +23,77 @@ namespace paddle { namespace memory { -TEST(stats_test, MultiThreadReadWriteTest) { - std::string stat_type = "Allocated"; - size_t thread_num = 3; - size_t data_num = 10; - - std::condition_variable cv; - std::mutex mutex; - std::vector threads; - size_t ready_thread_num = 0; - - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back( - [&stat_type, data_num, &cv, &mutex, &ready_thread_num]() { - for (size_t data = 0; data < data_num; ++data) { - StatUpdate(stat_type, 0, data); - } - /* lock guard*/ { - std::lock_guard lock_guard{mutex}; - ++ready_thread_num; - cv.notify_one(); - } - // Sleep here to not exit before the main thread checking stat - // results, because the thread-local stat data will be destroyed when - // the thread exit - std::this_thread::sleep_for(std::chrono::seconds(1)); - }); +class StatsTest : public ::testing::Test { + protected: + void SetStatType(const std::string& stat_type) { stat_type_ = stat_type; } + + void SetFunc( + std::function update_func, + std::function 
current_value_func, + std::function peak_value_func) { + update_func_ = update_func; + current_value_func_ = current_value_func; + peak_value_func_ = peak_value_func; + } + + void RunTests() { + MultiThreadReadWriteTest(); + PeakValueTest(); } - std::unique_lock unique_lock(mutex); - cv.wait(unique_lock, [&ready_thread_num, thread_num]() { - return ready_thread_num == thread_num; - }); + private: + void MultiThreadReadWriteTest() { + size_t thread_num = 3; + size_t data_num = 10; + + std::condition_variable cv; + std::mutex mutex; + std::vector threads; + size_t ready_thread_num = 0; + + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&]() { + for (size_t data = 0; data < data_num; ++data) { + update_func_(stat_type_, 0, data); + } + /* lock guard*/ { + std::lock_guard lock_guard{mutex}; + ++ready_thread_num; + cv.notify_one(); + } + // Sleep here to not exit before the main thread checking stat + // results, because the thread-local stat data will be destroyed when + // the thread exits + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); + } - EXPECT_EQ(StatGetCurrentValue(stat_type, 0), - int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + std::unique_lock unique_lock(mutex); + cv.wait(unique_lock, [&ready_thread_num, thread_num]() { + return ready_thread_num == thread_num; + }); - for (size_t i = 0; i < thread_num; ++i) { - threads[i].join(); + EXPECT_EQ(current_value_func_(stat_type_, 0), + int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + + for (size_t i = 0; i < thread_num; ++i) { + threads[i].join(); + } + } + + void PeakValueTest() { + int64_t peak_value = ((int64_t)1) << 63; + int64_t sum = 0; + for (int64_t data : datas_) { + update_func_(stat_type_, 0, data); + sum += data; + peak_value = std::max(peak_value, sum); + } + EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value); } -} -TEST(stats_test, PeakValueTest) { - std::string stat_type = "Allocated"; - std::vector datas = { + std::string stat_type_; +
std::vector datas_{ 543149808935355, 634698327471328, 706215795436611, 577939367795333, 419479490054362, 21975227714595, 812939817942250, 984428837942082, 537304104446806, 685008544452453, 563352858161268, 690143831596330, @@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) { 746465732805300, -74049761897414, -65640372433924, 852009039806484, 305079802044257, -48409757869238, 266031781660228, 327287322379820}; - int64_t peak_value = ((int64_t)1) << 63; - int64_t sum = 0; - for (int64_t data : datas) { - StatUpdate(stat_type, 0, data); - sum += data; - peak_value = std::max(peak_value, sum); - } - EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value); + std::function update_func_; + std::function current_value_func_; + std::function peak_value_func_; +}; + +TEST_F(StatsTest, DeviceAllocatedTest) { + SetStatType("Allocated"); + SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue, + DeviceMemoryStatPeakValue); + RunTests(); +} + +TEST_F(StatsTest, DeviceReservedMacroTest) { + SetStatType("Reserved"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id); + }); + RunTests(); +} + +TEST_F(StatsTest, HostAllocatedMacroTest) { + SetStatType("Allocated"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id); + }); + RunTests(); +} + +TEST_F(StatsTest, HostReservedTest) { + SetStatType("Reserved"); + SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue, + HostMemoryStatPeakValue); + RunTests(); 
} } // namespace memory diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 419fb8a4ca7030b0b45e058403e45c55f6c0f3ba..3044aa6cf6c5a274c36599cbf0cea433275f4140 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() { static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { if (!use_fixed_workspace) { int device_id = platform::GetCurrentDeviceId(); - int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", device_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", device_id); int64_t availble = platform::GpuAvailableMemToAlloc(); VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) << " MB, reserved=" << ToMegaBytes(reserved) diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 6da5d1244fbed922406a497d31d7a6a48f067987..5410638ceb39ab388c78f2d6a1645447d814553e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -149,8 +149,8 @@ class RecordedGpuMallocHelper { if (FLAGS_enable_gpu_memory_usage_log) { // A fake UPDATE to trigger the construction of memory stat instances, // make sure that they are destructed after RecordedGpuMallocHelper. 
- MEMORY_STAT_UPDATE(Reserved, dev_id, 0); - MEMORY_STAT_UPDATE(Allocated, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0); } } @@ -161,15 +161,18 @@ class RecordedGpuMallocHelper { if (FLAGS_enable_gpu_memory_usage_log) { if (FLAGS_enable_gpu_memory_usage_log_mb) { std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0 + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / + 1048576.0 << ", Allocated = " - << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0 + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / + 1048576.0 << std::endl; } else { std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << ", Allocated = " - << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl; + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) + << std::endl; } } } @@ -230,7 +233,7 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); @@ -269,7 +272,7 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 24c515f5b495682da998caf152bfdec4f38a4ad6..f64e05504aa3f9f53089e62a14348ab6c1535eee 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ 
b/paddle/fluid/platform/profiler_helper.h @@ -168,8 +168,10 @@ void PrintMemProfiler( if (num_gpus > 0) { std::cout << "GPU Memory Usage (MB):\n"; for (int dev_id = 0; dev_id < num_gpus; ++dev_id) { - int64_t allocated = memory::StatGetCurrentValue("Allocated", dev_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", dev_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); size_t available = 0, total = 0, actual_available = 0, actual_total = 0; RecordedGpuMemGetInfo(&available, &total, &actual_available, &actual_total, dev_id); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f6be9b66d5dbd1b5d4e66650ff3dc83c7be27148..0e1271c1fe07f5791e106217c1b7b5a659fe019b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle. } return stats_map; }); - m.def("memory_stat_get_current", memory::StatGetCurrentValue); - m.def("memory_stat_get_peak", memory::StatGetPeakValue); + m.def("device_memory_stat_current_value", + memory::DeviceMemoryStatCurrentValue); + m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def("run_cmd", [](const std::string &cmd, int time_out = -1, int sleep_inter = -1) -> const std::string { diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index b33dc1aaeb08621f9308421713148872aa3c5c13..8cb4f5f765611401629f7dce7f4ad38e3be42763 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -224,7 +224,7 @@ def max_memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." 
) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Allocated", device_id) + return core.device_memory_stat_peak_value("Allocated", device_id) def max_memory_reserved(device=None): @@ -255,7 +255,7 @@ def max_memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Reserved", device_id) + return core.device_memory_stat_peak_value("Reserved", device_id) def memory_allocated(device=None): @@ -290,7 +290,7 @@ def memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Allocated", device_id) + return core.device_memory_stat_current_value("Allocated", device_id) def memory_reserved(device=None): @@ -321,7 +321,7 @@ def memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Reserved", device_id) + return core.device_memory_stat_current_value("Reserved", device_id) def _set_current_stream(stream):