[cherry-pick 54036] add host memory stats (#54037)

* add host memory stats * add ut

[cherry-pick 54036] add host memory stats (#54037)
* add host memory stats * add ut
1ebac643 · Leo Chen · GitHub · 343c2862 · 1ebac643 · 1ebac643
5 changed files
--- a/paddle/fluid/memory/allocation/buddy_allocator.cc
+++ b/paddle/fluid/memory/allocation/buddy_allocator.cc
@@ -59,6 +59,8 @@ BuddyAllocator::BuddyAllocator(
 #endif
  }
 #endif
+  VLOG(1) << "min_chunk_size_: " << min_chunk_size_
+          << ", max_chunk_size_:" << max_chunk_size_;
 }
 BuddyAllocator::~BuddyAllocator() {
@@ -228,7 +230,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
  size_t index = 0;
  void* p = system_allocator_->Alloc(&index, size);
-  VLOG(10) << "Allocated " << p << " from system allocator.";
+  VLOG(8) << "Allocated " << p << " size " << size << " from system allocator.";
  if (p == nullptr) return nullptr;
@@ -258,8 +260,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
  if (p == nullptr) return pool_.end();
-  VLOG(10) << "Creating and inserting new block " << p
+  VLOG(8) << "Creating and inserting new block " << p << " size "
-           << " from system allocator";
+          << allocate_bytes << " from system allocator";
  static_cast<MemoryBlock*>(p)->Init(&cache_,
                                     MemoryBlock::FREE_CHUNK,

--- a/paddle/fluid/memory/allocation/cpu_allocator.cc
+++ b/paddle/fluid/memory/allocation/cpu_allocator.cc
@@ -16,6 +16,7 @@
 #include <stdlib.h>
+#include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -25,12 +26,14 @@ namespace allocation {
 bool CPUAllocator::IsAllocThreadSafe() const { return true; }
 void CPUAllocator::FreeImpl(phi::Allocation *allocation) {
+  // Frees the buffer held by `allocation`, lowers the host `Reserved` stat by
+  // the allocation's size, and finally deletes the allocation object. The size
+  // is read up front, before anything is released.
+  auto size = allocation->size();
   void *p = allocation->ptr();
 #ifdef _WIN32
   _aligned_free(p);
 #else
   free(p);
 #endif
+  // NOTE(review): `size` is deduced from allocation->size(); if that type is
+  // unsigned, `-size` wraps around and depends on HOST_MEMORY_STAT_UPDATE
+  // converting it back to a signed delta -- confirm against the macro.
+  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
   delete allocation;
 }
@@ -46,6 +49,7 @@ phi::Allocation *CPUAllocator::AllocateImpl(size_t size) {
      platform::errors::ResourceExhausted(
          "Fail to alloc memory of %ld size, error code is %d.", size, error));
 #endif
+  HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
  return new Allocation(p, size, platform::CPUPlace());
 }
 }  // namespace allocation

--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -380,6 +380,7 @@ template <>
 void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
                                       size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
  void *ptr = buddy_allocator->Alloc(size);
@@ -402,6 +403,7 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
                                     void *p,
                                     size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  VLOG(10) << "Free " << size << " bytes on " << platform::Place(place);
  GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
  PADDLE_THROW(platform::errors::PermissionDenied(
@@ -413,6 +415,7 @@ template <>
 uint64_t Release<platform::CUDAPinnedPlace>(
    const platform::CUDAPinnedPlace &place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  VLOG(10) << "Release on " << platform::Place(place);
  return GetCUDAPinnedBuddyAllocator()->Release();
 #else
  PADDLE_THROW(platform::errors::PermissionDenied(

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1935,6 +1935,8 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("device_memory_stat_current_value",
        memory::DeviceMemoryStatCurrentValue);
  m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
+  m.def("host_memory_stat_current_value", memory::HostMemoryStatCurrentValue);
+  m.def("host_memory_stat_peak_value", memory::HostMemoryStatPeakValue);
  m.def(
      "run_cmd",
      [](const std::string &cmd,

--- a/python/paddle/fluid/tests/unittests/test_host_memory_stats.py
+++ b/python/paddle/fluid/tests/unittests/test_host_memory_stats.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import paddle
+from paddle.fluid import core
+paddle.set_device('cpu')
+class TestHostMemoryStats(unittest.TestCase):
+    """Checks the "Allocated" host memory stat exposed through `core`."""
+    def test_memory_allocated_with_pinned(self, device=None):
+        """Track current and peak "Allocated" host bytes around pinned tensors.
+        `device` is not used by the body. The assertions only run when Paddle
+        is compiled with CUDA; otherwise the test passes without checking.
+        """
+        if core.is_compiled_with_cuda():
+            # Both tensors stay bound to locals for the whole test; the
+            # assertions below expect their buffers to remain counted.
+            tensor = paddle.zeros(shape=[256])
+            tensor_pinned = tensor.pin_memory()
+            alloc_size = 4 * 256  # 256 float32 data, with 4 bytes for each one
+            memory_allocated_size = core.host_memory_stat_current_value(
+                "Allocated", 0
+            )
+            # Expected: one buffer for `tensor` plus one for its pinned copy.
+            self.assertEqual(memory_allocated_size, alloc_size * 2)
+            def foo():
+                # A second tensor/pinned pair that only lives inside foo().
+                tensor = paddle.zeros(shape=[256])
+                tensor_pinned = tensor.pin_memory()
+                memory_allocated_size = core.host_memory_stat_current_value(
+                    "Allocated", 0
+                )
+                self.assertEqual(memory_allocated_size, alloc_size * 4)
+                max_allocated_size = core.host_memory_stat_peak_value(
+                    "Allocated", 0
+                )
+                # Check the peak value itself; the original re-asserted the
+                # current value here and never verified the peak.
+                self.assertEqual(max_allocated_size, alloc_size * 4)
+            foo()
+            # After foo() returns, the test expects the current value to fall
+            # back to the two outer buffers while the peak stays at four.
+            memory_allocated_size = core.host_memory_stat_current_value(
+                "Allocated", 0
+            )
+            self.assertEqual(memory_allocated_size, alloc_size * 2)
+            max_allocated_size = core.host_memory_stat_peak_value(
+                "Allocated", 0
+            )
+            self.assertEqual(max_allocated_size, alloc_size * 4)
+if __name__ == "__main__":
+    unittest.main()