Support memory stats for CPU (#42945)

* Support memory stats for CPU
* Add UTs
* Fix typos
* Fix typos

Support memory stats for CPU (#42945)
* Support memory stats for CPU
* Add UTs
* Fix typos
* Fix typos
21f11d35 · Ruibiao Chen · GitHub · b2b78cd4 · 21f11d35 · 21f11d35
13 changed files
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
 cc_library(stats SRCS stats.cc DEPS enforce)
 cc_library(memory DEPS malloc memcpy stats)

+cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory)
 cc_test(stats_test SRCS stats_test.cc DEPS stats)

 if (WITH_GPU)

--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -931,10 +931,7 @@ class AllocatorFacadePrivate {

  void WrapStatAllocator() {
    for (auto& pair : allocators_) {
-      // Now memory stats is only supported for GPU
-      if (platform::is_gpu_place(pair.first)) {
-        pair.second = std::make_shared<StatAllocator>(pair.second);
-      }
+      pair.second = std::make_shared<StatAllocator>(pair.second);
    }
  }


--- a/paddle/fluid/memory/allocation/stat_allocator.h
+++ b/paddle/fluid/memory/allocation/stat_allocator.h
@@ -30,16 +30,28 @@ class StatAllocator : public Allocator {

 protected:
  void FreeImpl(phi::Allocation* allocation) override {
-    MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-                       -allocation->size());
+    if (platform::is_cpu_place(allocation->place())) {
+      HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
+                              -allocation->size());
+    } else {
+      DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
+                                -allocation->size());
+    }
+
    underlying_allocator_->Free(allocation);
  }

  phi::Allocation* AllocateImpl(size_t size) override {
    phi::Allocator::AllocationPtr allocation =
        underlying_allocator_->Allocate(size);
-    MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-                       allocation->size());
+
+    if (platform::is_cpu_place(allocation->place())) {
+      HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
+                              allocation->size());
+    } else {
+      DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
+                                allocation->size());
+    }
    return allocation.release();
  }


--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -15,6 +15,8 @@ limitations under the License. */

 #include "paddle/fluid/memory/detail/system_allocator.h"

+#include "paddle/fluid/memory/stats.h"
+
 #ifdef _WIN32
 #include <malloc.h>
 #ifndef NOMINMAX
@@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
    }
  }

+  HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
+
  return p;
 }

@@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
 #else
  free(p);
 #endif
+
+  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
 }

 bool CPUAllocator::UseGpu() const { return false; }

--- a/paddle/fluid/memory/memory_stats_test.cc
+++ b/paddle/fluid/memory/memory_stats_test.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/memory.h"
+#include <algorithm>
+#include <vector>
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace memory {
+
+TEST(stat_allocator_test, host_memory_stat_test) {
+  std::vector<int64_t> alloc_sizes{
+      5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
+      6235, 0,    7810, 940,  1239, 1945, 789,  2891, 7553, 8046, 2685,
+      1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
+      4,    1185, 2186, 357,  9774, 6743, 6136, 7073, 7674, 5640, 3935,
+      528,  6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
+      221,  309,  3617, 3793, 3334, 7281, 1302};
+
+  int64_t max_alloc_size = 0;
+  for (int64_t size : alloc_sizes) {
+    AllocationPtr allocation = Alloc(platform::CPUPlace(), size);
+    int64_t alloc_size = static_cast<int64_t>(allocation->size());
+    max_alloc_size = std::max(max_alloc_size, alloc_size);
+    EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), alloc_size);
+  }
+  EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size);
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+TEST(stat_allocator_test, device_memory_stat_test) {
+  std::vector<int64_t> alloc_sizes{
+      5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
+      6235, 0,    7810, 940,  1239, 1945, 789,  2891, 7553, 8046, 2685,
+      1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
+      4,    1185, 2186, 357,  9774, 6743, 6136, 7073, 7674, 5640, 3935,
+      528,  6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
+      221,  309,  3617, 3793, 3334, 7281, 1302};
+
+  int64_t max_alloc_size = 0;
+  for (int64_t size : alloc_sizes) {
+    AllocationPtr allocation = Alloc(platform::CUDAPlace(), size);
+    int64_t alloc_size = static_cast<int64_t>(allocation->size());
+    max_alloc_size = std::max(max_alloc_size, alloc_size);
+    EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), alloc_size);
+  }
+  EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), max_alloc_size);
+}
+#endif
+
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/stats.cc
+++ b/paddle/fluid/memory/stats.cc
@@ -38,7 +38,7 @@ class StatRegistry {
  }

  std::string GetStatKey(const std::string& stat_type, int dev_id) {
-    return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type;
+    return stat_type + std::to_string(dev_id);
  }

  int64_t GetCurrentValue(const std::string& stat_type, int dev_id) {
@@ -49,6 +49,10 @@ class StatRegistry {
    return GetStat(stat_type, dev_id)->GetPeakValue();
  }

+  void Update(const std::string& stat_type, int dev_id, int64_t increment) {
+    GetStat(stat_type, dev_id)->Update(increment);
+  }
+
  void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
    std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
    stat_map_[GetStatKey(stat_type, dev_id)] = stat;
@@ -59,10 +63,6 @@ class StatRegistry {
    stat_map_.erase(GetStatKey(stat_type, dev_id));
  }

-  void Update(const std::string& stat_type, int dev_id, int64_t increment) {
-    stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment);
-  }
-
 private:
  StatRegistry() = default;

@@ -72,43 +72,67 @@ class StatRegistry {
  SpinLock stat_map_lock_;
 };

-int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) {
-  return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id);
+int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
+  return StatRegistry::GetInstance()->GetCurrentValue("Device" + stat_type,
+                                                      dev_id);
 }

-int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) {
-  return StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id);
+int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id) {
+  return StatRegistry::GetInstance()->GetPeakValue("Device" + stat_type,
+                                                   dev_id);
 }

-void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) {
-  StatRegistry::GetInstance()->Update(stat_type, dev_id, increment);
+void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id,
+                            int64_t increment) {
+  StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment);
 }

-#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \
-  StatRegistry::GetInstance()->Register(       \
-      #item, id, Stat<ThreadLocalStatDevice##id##item>::GetInstance());
-
-#define MEMORY_STAT_REGISTER(item)        \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 0);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 1);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 2);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 3);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 4);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 5);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 6);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 7);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 8);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 9);  \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
-  MEMORY_STAT_REGISTER_WITH_ID(item, 15)
+int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
+  return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type,
+                                                      dev_id);
+}
+
+int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) {
+  return StatRegistry::GetInstance()->GetPeakValue("Host" + stat_type, dev_id);
+}
+
+void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
+                          int64_t increment) {
+  StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment);
+}
+
+#define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \
+  StatRegistry::GetInstance()->Register(              \
+      "Device" #item, id, Stat<DeviceMemoryStat##item##id>::GetInstance());
+
+#define DEVICE_MEMORY_STAT_REGISTER(item)        \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9);  \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
+  DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15)
+
+#define HOST_MEMORY_STAT_REGISTER(item)  \
+  StatRegistry::GetInstance()->Register( \
+      "Host" #item, 0, Stat<HostMemoryStat##item##0>::GetInstance());

 int RegisterAllStats() {
-  MEMORY_STAT_REGISTER(Allocated);
-  MEMORY_STAT_REGISTER(Reserved);
+  DEVICE_MEMORY_STAT_REGISTER(Allocated);
+  DEVICE_MEMORY_STAT_REGISTER(Reserved);
+
+  HOST_MEMORY_STAT_REGISTER(Allocated);
+  HOST_MEMORY_STAT_REGISTER(Reserved);
  return 0;
 }


--- a/paddle/fluid/memory/stats.h
+++ b/paddle/fluid/memory/stats.h
@@ -91,82 +91,113 @@ class Stat : public StatBase {
  std::atomic<int64_t> peak_value_{0};
 };

-// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT
-// values by a string, however, they has worse performance than the macro
-// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and
-// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low
-// performance overhead is required.
-int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id);
-int64_t StatGetPeakValue(const std::string& stat_type, int dev_id);
-void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment);
-
-#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id)                          \
-  case id:                                                               \
-    stat = paddle::memory::Stat<                                         \
-        paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \
+// xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate
+// operate on STAT values looked up by a string key; however, they have worse
+// performance than the macro functions xxx_MEMORY_STAT_CURRENT_VALUE,
+// xxx_MEMORY_STAT_PEAK_VALUE and xxx_MEMORY_STAT_UPDATE. Prefer the macro
+// functions where ultra-low performance overhead is required.
+int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
+int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id);
+void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id,
+                            int64_t increment);
+
+int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
+int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id);
+void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
+                          int64_t increment);
+
+#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id)              \
+  case id:                                                          \
+    stat = paddle::memory::Stat<                                    \
+        paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \
    break

-#define MEMORY_STAT_FUNC(item, id, func, ...)                         \
-  [&] {                                                               \
-    paddle::memory::StatBase* stat = nullptr;                         \
-    switch (id) {                                                     \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9);                         \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10);                        \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11);                        \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12);                        \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13);                        \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14);                        \
-      MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15);                        \
-      default:                                                        \
-        PADDLE_THROW(paddle::platform::errors::OutOfRange(            \
-            "Only support device id between [0, 15] in memory stats," \
-            "not support device id: %d",                              \
-            id));                                                     \
-        break;                                                        \
-    }                                                                 \
-    return stat->func(__VA_ARGS__);                                   \
+#define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...)                          \
+  [&] {                                                                       \
+    paddle::memory::StatBase* stat = nullptr;                                 \
+    switch (id) {                                                             \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9);                          \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10);                         \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11);                         \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12);                         \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13);                         \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14);                         \
+      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15);                         \
+      default:                                                                \
+        PADDLE_THROW(paddle::platform::errors::OutOfRange(                    \
+            "Only support device id between [0, 15] for device memory stats," \
+            "not support device id: %d",                                      \
+            id));                                                             \
+        break;                                                                \
+    }                                                                         \
+    return stat->func(__VA_ARGS__);                                           \
  }()

-#define MEMORY_STAT_CURRENT_VALUE(item, id) \
-  MEMORY_STAT_FUNC(item, id, GetCurrentValue)
-#define MEMORY_STAT_PEAK_VALUE(item, id) \
-  MEMORY_STAT_FUNC(item, id, GetPeakValue)
-#define MEMORY_STAT_UPDATE(item, id, increment) \
-  MEMORY_STAT_FUNC(item, id, Update, increment)
-
-#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \
-  struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {};
-
-#define MEMORY_STAT_DECLARE(item)        \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 0);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 1);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 2);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 3);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 4);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 5);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 6);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 7);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 8);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 9);  \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
-  MEMORY_STAT_DECLARE_WITH_ID(item, 15)
+#define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \
+  DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
+#define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \
+  DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue)
+#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
+  DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
+
+#define HOST_MEMORY_STAT_FUNC(item, id, func, ...)                           \
+  [&] {                                                                      \
+    PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange(           \
+                                 "Only support device id 0 for host memory " \
+                                 "stats, not support device id: %d",         \
+                                 id));                                       \
+    return paddle::memory::Stat<                                             \
+               paddle::memory::HostMemoryStat##item##0>::GetInstance()       \
+        ->func(__VA_ARGS__);                                                 \
+  }()
+
+#define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \
+  HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
+#define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \
+  HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue)
+#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \
+  HOST_MEMORY_STAT_FUNC(item, id, Update, increment)
+
+#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \
+  struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {}
+
+#define DEVICE_MEMORY_STAT_DECLARE(item)        \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9);  \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
+  DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15)
+
+// Only id 0 is supported for host memory stats
+#define HOST_MEMORY_STAT_DECLARE(item) \
+  struct HostMemoryStat##item##0 : public ThreadLocalStatBase{};

 // To add a new STAT type, declare here and register in stats.cc
-MEMORY_STAT_DECLARE(Allocated);
-MEMORY_STAT_DECLARE(Reserved);
+DEVICE_MEMORY_STAT_DECLARE(Allocated);
+DEVICE_MEMORY_STAT_DECLARE(Reserved);
+
+HOST_MEMORY_STAT_DECLARE(Allocated);
+HOST_MEMORY_STAT_DECLARE(Reserved);

 }  // namespace memory
 }  // namespace paddle
--- a/paddle/fluid/memory/stats_test.cc
+++ b/paddle/fluid/memory/stats_test.cc
@@ -23,50 +23,77 @@
 namespace paddle {
 namespace memory {

-TEST(stats_test, MultiThreadReadWriteTest) {
-  std::string stat_type = "Allocated";
-  size_t thread_num = 3;
-  size_t data_num = 10;
-
-  std::condition_variable cv;
-  std::mutex mutex;
-  std::vector<std::thread> threads;
-  size_t ready_thread_num = 0;
-
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads.emplace_back(
-        [&stat_type, data_num, &cv, &mutex, &ready_thread_num]() {
-          for (size_t data = 0; data < data_num; ++data) {
-            StatUpdate(stat_type, 0, data);
-          }
-          /* lock guard*/ {
-            std::lock_guard<std::mutex> lock_guard{mutex};
-            ++ready_thread_num;
-            cv.notify_one();
-          }
-          // Sleep here to not exit before the main thread checking stat
-          // results, because the thread-local stat data will be destroyed when
-          // the thread exit
-          std::this_thread::sleep_for(std::chrono::seconds(1));
-        });
+class StatsTest : public ::testing::Test {
+ protected:
+  void SetStatType(const std::string& stat_type) { stat_type_ = stat_type; }
+
+  void SetFunc(
+      std::function<void(const std::string, int, int64_t)> update_func,
+      std::function<int64_t(const std::string, int)> current_value_func,
+      std::function<int64_t(const std::string, int)> peak_value_func) {
+    update_func_ = update_func;
+    current_value_func_ = current_value_func;
+    peak_value_func_ = peak_value_func;
+  }
+
+  void RunTests() {
+    MultiThreadReadWriteTest();
+    PeakValueTest();
  }

-  std::unique_lock<std::mutex> unique_lock(mutex);
-  cv.wait(unique_lock, [&ready_thread_num, thread_num]() {
-    return ready_thread_num == thread_num;
-  });
+ private:
+  void MultiThreadReadWriteTest() {
+    size_t thread_num = 3;
+    size_t data_num = 10;
+
+    std::condition_variable cv;
+    std::mutex mutex;
+    std::vector<std::thread> threads;
+    size_t ready_thread_num = 0;
+
+    for (size_t i = 0; i < thread_num; ++i) {
+      threads.emplace_back([&]() {
+        for (size_t data = 0; data < data_num; ++data) {
+          update_func_(stat_type_, 0, data);
+        }
+        /* lock guard*/ {
+          std::lock_guard<std::mutex> lock_guard{mutex};
+          ++ready_thread_num;
+          cv.notify_one();
+        }
+        // Sleep here so this thread does not exit before the main thread has
+        // checked the stat results, because the thread-local stat data is
+        // destroyed when the thread exits
+        std::this_thread::sleep_for(std::chrono::seconds(1));
+      });
+    }

-  EXPECT_EQ(StatGetCurrentValue(stat_type, 0),
-            int64_t((thread_num * data_num * (data_num - 1)) >> 1));
+    std::unique_lock<std::mutex> unique_lock(mutex);
+    cv.wait(unique_lock, [&ready_thread_num, thread_num]() {
+      return ready_thread_num == thread_num;
+    });

-  for (size_t i = 0; i < thread_num; ++i) {
-    threads[i].join();
+    EXPECT_EQ(current_value_func_(stat_type_, 0),
+              int64_t((thread_num * data_num * (data_num - 1)) >> 1));
+
+    for (size_t i = 0; i < thread_num; ++i) {
+      threads[i].join();
+    }
+  }
+
+  void PeakValueTest() {
+    int64_t peak_value = ((int64_t)1) << 63;
+    int64_t sum = 0;
+    for (int64_t data : datas_) {
+      update_func_(stat_type_, 0, data);
+      sum += data;
+      peak_value = std::max(peak_value, sum);
+    }
+    EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value);
  }
-}

-TEST(stats_test, PeakValueTest) {
-  std::string stat_type = "Allocated";
-  std::vector<int64_t> datas = {
+  std::string stat_type_;
+  std::vector<int64_t> datas_{
      543149808935355, 634698327471328, 706215795436611, 577939367795333,
      419479490054362, 21975227714595,  812939817942250, 984428837942082,
      537304104446806, 685008544452453, 563352858161268, 690143831596330,
@@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) {
      746465732805300, -74049761897414, -65640372433924, 852009039806484,
      305079802044257, -48409757869238, 266031781660228, 327287322379820};

-  int64_t peak_value = ((int64_t)1) << 63;
-  int64_t sum = 0;
-  for (int64_t data : datas) {
-    StatUpdate(stat_type, 0, data);
-    sum += data;
-    peak_value = std::max(peak_value, sum);
-  }
-  EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value);
+  std::function<void(const std::string, int, int64_t)> update_func_;
+  std::function<int64_t(const std::string, int)> current_value_func_;
+  std::function<int64_t(const std::string, int)> peak_value_func_;
+};
+
+TEST_F(StatsTest, DeviceAllocatedTest) {
+  SetStatType("Allocated");
+  SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue,
+          DeviceMemoryStatPeakValue);
+  RunTests();
+}
+
+TEST_F(StatsTest, DeviceReservedMacroTest) {
+  SetStatType("Reserved");
+  SetFunc(
+      [](const std::string stat_type, int id, int64_t increment) {
+        return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment);
+      },
+      [](const std::string stat_type, int id) {
+        return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id);
+      },
+      [](const std::string stat_type, int id) {
+        return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id);
+      });
+  RunTests();
+}
+
+TEST_F(StatsTest, HostAllocatedMacroTest) {
+  SetStatType("Allocated");
+  SetFunc(
+      [](const std::string stat_type, int id, int64_t increment) {
+        return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment);
+      },
+      [](const std::string stat_type, int id) {
+        return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id);
+      },
+      [](const std::string stat_type, int id) {
+        return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id);
+      });
+  RunTests();
+}
+
+TEST_F(StatsTest, HostReservedTest) {
+  SetStatType("Reserved");
+  SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue,
+          HostMemoryStatPeakValue);
+  RunTests();
 }

 }  // namespace memory

--- a/paddle/fluid/operators/conv_cudnn_helper.h
+++ b/paddle/fluid/operators/conv_cudnn_helper.h
@@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() {
 static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
  if (!use_fixed_workspace) {
    int device_id = platform::GetCurrentDeviceId();
-    int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id);
-    int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id);
+    int64_t allocated =
+        memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
+    int64_t reserved =
+        memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
    int64_t availble = platform::GpuAvailableMemToAlloc();
    VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
            << " MB, reserved=" << ToMegaBytes(reserved)

--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -149,8 +149,8 @@ class RecordedGpuMallocHelper {
    if (FLAGS_enable_gpu_memory_usage_log) {
      // A fake UPDATE to trigger the construction of memory stat instances,
      // make sure that they are destructed after RecordedGpuMallocHelper.
-      MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
-      MEMORY_STAT_UPDATE(Allocated, dev_id, 0);
+      DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0);
+      DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0);
    }
  }

@@ -161,15 +161,18 @@ class RecordedGpuMallocHelper {
    if (FLAGS_enable_gpu_memory_usage_log) {
      if (FLAGS_enable_gpu_memory_usage_log_mb) {
        std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = "
-                  << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0
+                  << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) /
+                         1048576.0
                  << ", Allocated = "
-                  << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0
+                  << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) /
+                         1048576.0
                  << std::endl;
      } else {
        std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = "
-                  << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_)
+                  << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_)
                  << ", Allocated = "
-                  << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl;
+                  << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_)
+                  << std::endl;
      }
    }
  }
@@ -230,7 +233,7 @@ class RecordedGpuMallocHelper {
    if (result == gpuSuccess) {
      cur_size_.fetch_add(size);
      STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
-      MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
+      DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);

 #ifdef PADDLE_WITH_TESTING
      gpu_ptrs.insert(*ptr);
@@ -269,7 +272,7 @@ class RecordedGpuMallocHelper {
      PADDLE_ENFORCE_GPU_SUCCESS(err);
      cur_size_.fetch_sub(size);
      STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
-      MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
+      DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
    } else {
      platform::GpuGetLastError();  // clear the error flag when
                                    // cudaErrorCudartUnloading /

--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -168,8 +168,10 @@ void PrintMemProfiler(
  if (num_gpus > 0) {
    std::cout << "GPU Memory Usage (MB):\n";
    for (int dev_id = 0; dev_id < num_gpus; ++dev_id) {
-      int64_t allocated = memory::StatGetCurrentValue("Allocated", dev_id);
-      int64_t reserved = memory::StatGetCurrentValue("Reserved", dev_id);
+      int64_t allocated =
+          memory::DeviceMemoryStatCurrentValue("Allocated", dev_id);
+      int64_t reserved =
+          memory::DeviceMemoryStatCurrentValue("Reserved", dev_id);
      size_t available = 0, total = 0, actual_available = 0, actual_total = 0;
      RecordedGpuMemGetInfo(&available, &total, &actual_available,
                            &actual_total, dev_id);

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle.
    }
    return stats_map;
  });
-  m.def("memory_stat_get_current", memory::StatGetCurrentValue);
-  m.def("memory_stat_get_peak", memory::StatGetPeakValue);
+  m.def("device_memory_stat_current_value",
+        memory::DeviceMemoryStatCurrentValue);
+  m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
  m.def("run_cmd",
        [](const std::string &cmd, int time_out = -1,
           int sleep_inter = -1) -> const std::string {

--- a/python/paddle/device/cuda/__init__.py
+++ b/python/paddle/device/cuda/__init__.py
@@ -224,7 +224,7 @@ def max_memory_allocated(device=None):
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    device_id = extract_cuda_device_id(device, op_name=name)
-    return core.memory_stat_get_peak("Allocated", device_id)
+    return core.device_memory_stat_peak_value("Allocated", device_id)


 def max_memory_reserved(device=None):
@@ -255,7 +255,7 @@ def max_memory_reserved(device=None):
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    device_id = extract_cuda_device_id(device, op_name=name)
-    return core.memory_stat_get_peak("Reserved", device_id)
+    return core.device_memory_stat_peak_value("Reserved", device_id)


 def memory_allocated(device=None):
@@ -290,7 +290,7 @@ def memory_allocated(device=None):
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    device_id = extract_cuda_device_id(device, op_name=name)
-    return core.memory_stat_get_current("Allocated", device_id)
+    return core.device_memory_stat_current_value("Allocated", device_id)


 def memory_reserved(device=None):
@@ -321,7 +321,7 @@ def memory_reserved(device=None):
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    device_id = extract_cuda_device_id(device, op_name=name)
-    return core.memory_stat_get_current("Reserved", device_id)
+    return core.device_memory_stat_current_value("Reserved", device_id)


 def _set_current_stream(stream):