From 6e84eb131fcd7d548e1f04b74e1750611d237c6b Mon Sep 17 00:00:00 2001
From: liuwei1031 <46661762+liuwei1031@users.noreply.github.com>
Date: Thu, 31 Jan 2019 12:57:39 +0800
Subject: [PATCH] expose peak gpu memory API to python test=develop (#15529)

* expose peak gpu memory API to python test=develop

* add unittest for peak gpu memory monitoring test=develop

* add pybind change test=develop

* add mutex to gpu mem usage monitor test=develop

* update benchmark flag definition file test=develop

* tweak unittest for memory monitoring test=develop
---
 paddle/fluid/framework/scope.cc               |  6 +-
 .../memory/allocation/legacy_allocator.cc     | 76 ++++++++++++++++---
 .../memory/allocation/legacy_allocator.h      | 47 ++++++++++++
 paddle/fluid/platform/place.cc                |  6 ++
 paddle/fluid/pybind/pybind.cc                 |  8 ++
 .../unittests/test_peak_gpumem_monitor.py     | 59 ++++++++++++++
 6 files changed, 185 insertions(+), 17 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py

diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 953618560..87f0f307d 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -22,11 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
+DECLARE_bool(benchmark);
 
 DEFINE_bool(
     eager_delete_scope, true,
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 8759ec809..ef62f758e 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -35,6 +35,7 @@ DEFINE_bool(init_allocated_mem, false,
             "To find this error in time, we use init_allocated_mem to indicate "
             "that initializing the allocated memory with a small value "
             "during unit testing.");
+DECLARE_bool(benchmark);
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
@@ -59,11 +60,6 @@ size_t memory_usage(const platform::Place &p);
 
 using BuddyAllocator = detail::BuddyAllocator;
 
-std::unordered_map<int /*device id*/,
-                   std::pair<size_t /*current memory usage*/,
-                             size_t /*peak memory usage*/>>
-    gpu_mem_info;
-
 BuddyAllocator *GetCPUBuddyAllocator() {
   // We tried thread_local for inference::RNN1 model, but that not works much
   // for multi-thread test.
@@ -144,6 +140,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
     devices = platform::GetSelectedDevices();
     int gpu_num = devices.size();
 
+    allocation::GPUMemMonitor.Initialize(devices.size());
+
     a_arr = new BuddyAllocator *[gpu_num];
     for (size_t i = 0; i < devices.size(); ++i) {
       int dev_id = devices[i];
@@ -204,12 +202,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
                << string::HumanReadableSize(Used(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    gpu_mem_info[place.device].first += size;
-    if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) {
-      gpu_mem_info[place.device].second = gpu_mem_info[place.device].first;
-      VLOG(3) << "device: " << place.device << " peak memory usage : "
-              << (gpu_mem_info[place.device].second >> 20) << " MiB";
-    }
+    if (FLAGS_benchmark) allocation::GPUMemMonitor.Add(place.device, size);
     if (FLAGS_init_allocated_mem) {
       cudaMemset(ptr, 0xEF, size);
     }
@@ -225,7 +218,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  gpu_mem_info[place.device].first -= size;
+  if (FLAGS_benchmark) allocation::GPUMemMonitor.Minus(place.device, size);
 #else
   PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
@@ -335,6 +328,8 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
 
 namespace allocation {
 
+LegacyMemMonitor GPUMemMonitor;
+
 Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
   void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
   return new Allocation(ptr, size, place_);
@@ -346,6 +341,63 @@ void LegacyAllocator::Free(Allocation *allocation) {
                  allocation->place());
   delete allocation;
 }
+
+bool MemInfo::Add(const size_t &size) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  usage_ += size;
+  bool peak_point = usage_ > peak_usage_;
+  if (peak_point) peak_usage_ = usage_;
+  return peak_point;
+}
+
+void MemInfo::Minus(const size_t &size) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  usage_ -= size;
+}
+
+uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
+
+LegacyMemMonitor::~LegacyMemMonitor() {
+  for (auto &item : gpu_mem_info_) delete item.second;
+}
+
+void LegacyMemMonitor::Initialize(const int &device_num) {
+  for (auto i = 0; i < device_num; ++i) {
+    gpu_mem_info_[i] = new MemInfo();
+  }
+}
+
+void LegacyMemMonitor::Add(const int &device, const size_t &size) {
+  if (gpu_mem_info_[device]->Add(size)) {
+    VLOG(3) << "#LegacyMemMonitor# device: " << device
+            << " peak memory usage : "
+            << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB";
+  }
+}
+
+void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
+  gpu_mem_info_[device]->Minus(size);
+}
+
+uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
+  return gpu_mem_info_.find(device) == gpu_mem_info_.end()
+             ? 0
+             : gpu_mem_info_[device]->GetPeakUsage();
+}
+
+void LegacyMemMonitor::PrintMemUsage() {
+  std::vector<int> devices;
+  for (const auto &item : gpu_mem_info_) {
+    devices.emplace_back(item.first);
+  }
+  std::sort(devices.begin(), devices.end());
+  for (const auto &device : devices) {
+    std::cout << "Device : " << device << " Peak Memory Usage : "
+              << (gpu_mem_info_[device]->GetPeakUsage() >> 20) << " MiB"
+              << std::endl;
+  }
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index 503a7a685..ccbc8c70d 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -13,12 +13,59 @@
 // limitations under the License.
 
 #pragma once
+#include <algorithm>
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace memory {
 namespace allocation {
+class MemInfo {
+ public:
+  MemInfo() : usage_(0), peak_usage_(0) {}
+  MemInfo(const MemInfo &) = delete;
+  MemInfo &operator=(const MemInfo &) = delete;
+
+  // return a flag to indicate current operation will create a peak point or not
+  bool Add(const size_t &);
+  void Minus(const size_t &);
+
+  uint64_t GetPeakUsage();
+
+ private:
+  /* current memory usage*/
+  uint64_t usage_;
+  uint64_t peak_usage_;
+  std::mutex mutex_;
+};
+
+class LegacyMemMonitor {
+ public:
+  // used to store the GPU memory usage of each devices
+  using MemUsage = std::unordered_map<int /*device id*/,
+                                      MemInfo * /*mem usage*/>;
+
+  MemUsage GetMemUsageInfo() { return gpu_mem_info_; }
+  ~LegacyMemMonitor();
+
+  void Initialize(const int &);
+  void Add(const int &, const size_t &);
+  void Minus(const int &, const size_t &);
+
+  uint64_t GetMemUsage(const int &);
+
+  void PrintMemUsage();
+
+ protected:
+  MemUsage gpu_mem_info_;
+};
+
+extern LegacyMemMonitor GPUMemMonitor;
+
 class LegacyAllocatorPrivate;
 class LegacyAllocator : public Allocator {
  public:
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 655ce8485..60b2d83f1 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/place.h"
 
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 97e5bbaac..4dcec2195 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/version.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -127,6 +128,13 @@ PYBIND11_MODULE(core, m) {
   m.add_object("_cleanup",
                py::capsule([]() { ScopePool::Instance().Clear(); }));
 
+  m.def("get_mem_usage", [](int device) {
+    return memory::allocation::GPUMemMonitor.GetMemUsage(device);
+  });
+
+  m.def("print_mem_usage",
+        []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
+
   py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
       // .def(py::init<>())
      .def(py::init<bool>(), py::arg("stop_gradient") = false)
diff --git a/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
new file mode 100644
index 000000000..3673fd10c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import os
+os.environ['FLAGS_benchmark'] = 'True'
+
+import numpy
+import paddle.fluid.core as core
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import mul, data
+
+
+class TestPeakMemoryMonitoring(unittest.TestCase):
+    def test_mul(self):
+
+        a = data(name='a', shape=[784], dtype='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            dtype='float32',
+            append_batch_size=False)
+        out = mul(x=a, y=b)
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+
+            a_np = numpy.random.random((100, 784)).astype('float32')
+            b_np = numpy.random.random((784, 100)).astype('float32')
+            self.assertEqual(0, core.get_mem_usage(0))
+            exe = Executor(place)
+            outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
+            out = outs[0]
+            #disable this assert since ctest will ignore the os.environ setting
+            #self.assertGreater(core.get_mem_usage(0), 0)
+
+            raised = False
+            try:
+                core.print_mem_usage()
+            except:
+                raised = True
+            self.assertFalse(raised, 'Exception raised')
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab
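
Usage sketch: a minimal example of driving the API this patch exposes, mirroring the
unit test above. It is a sketch, not part of the patch; it assumes a CUDA-enabled
build of this revision, a visible GPU 0, and that FLAGS_benchmark is set in the
environment before paddle.fluid.core is imported, since the monitor only records
allocations when the benchmark flag is on (the test itself notes that ctest may
ignore the os.environ setting).

    import os
    os.environ['FLAGS_benchmark'] = 'True'  # must be set before core is imported

    import numpy
    import paddle.fluid.core as core
    from paddle.fluid.executor import Executor
    from paddle.fluid.layers import data, mul

    # Build a small program whose forward pass allocates GPU memory.
    a = data(name='a', shape=[784], dtype='float32')
    b = data(name='b', shape=[784, 100], dtype='float32', append_batch_size=False)
    out = mul(x=a, y=b)

    if core.is_compiled_with_cuda():
        exe = Executor(core.CUDAPlace(0))
        a_np = numpy.random.random((100, 784)).astype('float32')
        b_np = numpy.random.random((784, 100)).astype('float32')
        exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])

        # Peak GPU memory recorded so far for device 0, in bytes.
        print(core.get_mem_usage(0))
        # Per-device peak usage, printed to stdout in MiB.
        core.print_mem_usage()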