diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 64aa63ffe9705d75e70c8d9d9cbc433dd6358596..5d8684f083bda8499000c9fd0a7617cf129db13b 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include <string>
+#include <utility>
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
@@ -37,7 +38,7 @@ template <typename Place>
 void *Alloc(const Place &place, size_t size);
 
 template <typename Place>
-void Free(const Place &place, void *p);
+void Free(const Place &place, void *p, size_t size);
 
 template <typename Place>
 size_t Used(const Place &place);
@@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p);
 
 using BuddyAllocator = detail::BuddyAllocator;
 
+std::unordered_map<int /* device id */,
+                   std::pair<size_t /* current memory */,
+                             size_t /* peak memory */>>
+    gpu_mem_info;
+
 BuddyAllocator *GetCPUBuddyAllocator() {
   // We tried thread_local for inference::RNN1 model, but that not works much
   // for multi-thread test.
@@ -98,7 +104,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
 }
 
 template <>
-void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
+void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
+                              size_t size) {
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
@@ -177,9 +184,16 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     LOG(WARNING) << "GPU memory used: "
                  << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
-  }
-  if (FLAGS_init_allocated_mem) {
-    cudaMemset(ptr, 0xEF, size);
+  } else {
+    gpu_mem_info[place.device].first += size;
+    if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
+      gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
+      VLOG(3) << "device: " << place.device << " peak memory usage : "
+              << (gpu_mem_info[place.device].second >> 20) << " MiB";
+    }
+    if (FLAGS_init_allocated_mem) {
+      cudaMemset(ptr, 0xEF, size);
+    }
   }
   return ptr;
 #else
@@ -188,9 +202,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 }
 
 template <>
-void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p) {
+void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
+                               size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
+  gpu_mem_info[place.device].first -= size;
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -243,7 +259,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
 
 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                     void *p) {
+                                     void *p, size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
@@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor<void *> {
 };
 
 struct FreeVisitor : public boost::static_visitor<void> {
-  inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {}
+  inline explicit FreeVisitor(void *ptr, size_t size)
+      : ptr_(ptr), size_(size) {}
 
   template <typename Place>
   inline void operator()(const Place &place) const {
-    Free(place, ptr_);
+    Free(place, ptr_, size_);
   }
 
  private:
   void *ptr_;
+  size_t size_;
 };
 
 size_t Usage::operator()(const platform::CPUPlace &cpu) const {
@@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
 }
 
 void LegacyAllocator::Free(Allocation *allocation) {
-  boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()),
-                       allocation->place());
+  boost::apply_visitor(
+      legacy::FreeVisitor(allocation->ptr(), allocation->size()),
+      allocation->place());
   delete allocation;
 }
 }  // namespace allocation
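
Note on the pattern: the diff threads the allocation size through every Free() overload and through FreeVisitor so that gpu_mem_info, a per-device (current, peak) pair, can be decremented on free as well as incremented on alloc, with the peak updated whenever current usage exceeds it. Below is a minimal standalone sketch of that accounting scheme, not Paddle code: MemoryTracker, OnAlloc, and OnFree are hypothetical names, and unlike the real allocator path this sketch does no synchronization, so concurrent callers would need a mutex around the map.

#include <cstddef>
#include <cstdio>
#include <unordered_map>
#include <utility>

class MemoryTracker {
 public:
  // Record an allocation of `size` bytes on device `dev`; bump the peak
  // (and report it) whenever current usage exceeds the previous peak.
  void OnAlloc(int dev, std::size_t size) {
    auto &info = mem_info_[dev];  // default-constructed {0, 0} on first use
    info.first += size;
    if (info.first > info.second) {
      info.second = info.first;
      std::printf("device: %d peak memory usage: %zu MiB\n", dev,
                  info.second >> 20);  // bytes -> MiB, as in the diff
    }
  }

  // Record a free. The caller must supply the original allocation size,
  // which is why the diff adds a size parameter to every Free() overload.
  void OnFree(int dev, std::size_t size) { mem_info_[dev].first -= size; }

 private:
  // device id -> (current bytes in use, peak bytes in use)
  std::unordered_map<int, std::pair<std::size_t, std::size_t>> mem_info_;
};

int main() {
  MemoryTracker tracker;
  tracker.OnAlloc(0, 64 << 20);  // current 64 MiB, new peak printed
  tracker.OnAlloc(0, 96 << 20);  // current 160 MiB, new peak printed
  tracker.OnFree(0, 64 << 20);   // current 96 MiB, peak stays 160 MiB
  tracker.OnAlloc(0, 32 << 20);  // current 128 MiB < peak, nothing printed
  return 0;
}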