From afe02e9dca89292b97be91cf1cdb0b500dca64a4 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 30 Mar 2022 15:46:33 +0800 Subject: [PATCH] Add new APIs for GPU memory monitoring (max_memory_allocated, max_memory_reserved, memory_allocated, memory_reserved) (#38657) * Add new API memory_reserved * Add memory_allocated, max_memory_reserved and max_memory_allocater * Fix CI error * Fix CI error * Enhance UT * Add FLAGS_memory_stats_opt * Add STATS macro functions * Add StatAllocator * Fix CI errors * Add UT * Fix CI errors --- paddle/fluid/memory/CMakeLists.txt | 4 +- paddle/fluid/memory/allocation/CMakeLists.txt | 8 +- .../memory/allocation/allocator_facade.cc | 18 ++ .../fluid/memory/allocation/stat_allocator.h | 56 ++++++ paddle/fluid/memory/memory.h | 1 + paddle/fluid/memory/stats.cc | 118 ++++++++++++ paddle/fluid/memory/stats.h | 172 +++++++++++++++++ paddle/fluid/memory/stats_test.cc | 107 +++++++++++ paddle/fluid/platform/device/gpu/gpu_info.cc | 23 ++- paddle/fluid/pybind/pybind.cc | 3 + python/paddle/device/cuda/__init__.py | 175 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../test_cuda_max_memory_allocated.py | 61 ++++++ .../test_cuda_max_memory_reserved.py | 61 ++++++ .../unittests/test_cuda_memory_allocated.py | 53 ++++++ .../unittests/test_cuda_memory_reserved.py | 53 ++++++ 16 files changed, 899 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/memory/allocation/stat_allocator.h create mode 100644 paddle/fluid/memory/stats.cc create mode 100644 paddle/fluid/memory/stats.h create mode 100644 paddle/fluid/memory/stats_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py diff --git a/paddle/fluid/memory/CMakeLists.txt 
b/paddle/fluid/memory/CMakeLists.txt index 4492615d23..76bb8993cb 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -10,8 +10,10 @@ endif() cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS}) cc_library(memcpy SRCS memcpy.cc DEPS place device_context) +cc_library(stats SRCS stats.cc DEPS enforce) +cc_library(memory DEPS malloc memcpy stats) -cc_library(memory DEPS malloc memcpy) +cc_test(stats_test SRCS stats_test.cc DEPS stats) if (WITH_GPU) nv_test(malloc_test diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index f296ce96d4..5af13f76b3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(allocator SRCS allocator.cc DEPS place) +cc_library(allocator SRCS allocator.cc DEPS place stats) cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) @@ -14,7 +14,7 @@ else () endif() if (WITH_GPU) - nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) + nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) @@ -27,7 +27,7 @@ if (WITH_GPU) endif() if (WITH_ROCM) - hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) + hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) 
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) @@ -101,7 +101,7 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats) if (WITH_GPU) target_link_libraries(allocator_facade cuda_graph) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 88bbe339f8..7619767123 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" +#include "paddle/fluid/memory/allocation/stat_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -315,6 +316,8 @@ class AllocatorFacadePrivate { WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time); } + WrapStatAllocator(); + CheckAllocThreadSafe(); #ifdef PADDLE_WITH_CUDA @@ -521,6 +524,7 @@ class AllocatorFacadePrivate { InitAutoGrowthCUDAAllocator(p, stream); WrapStreamSafeCUDAAllocator(p, stream); WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); + WrapStatAllocator(p, stream); } } @@ -715,6 +719,11 @@ class AllocatorFacadePrivate { allocator = std::make_shared(allocator, retry_time); } + void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) { + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared(allocator); + } + #ifdef 
PADDLE_WITH_CUDA void WrapCUDAGraphAllocator() { for (auto& item : allocators_) { @@ -895,6 +904,15 @@ class AllocatorFacadePrivate { } } + void WrapStatAllocator() { + for (auto& pair : allocators_) { + // Now memory stats is only supported for GPU + if (platform::is_gpu_place(pair.first)) { + pair.second = std::make_shared(pair.second); + } + } + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h new file mode 100644 index 0000000000..71569366c2 --- /dev/null +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/stats.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StatAllocator : public Allocator { + public: + explicit StatAllocator(std::shared_ptr underlying_allocator) + : underlying_allocator_(std::move(underlying_allocator)) {} + + bool IsAllocThreadSafe() const override { return true; } + + protected: + void FreeImpl(phi::Allocation* allocation) override { + MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + underlying_allocator_->Free(allocation); + } + + phi::Allocation* AllocateImpl(size_t size) override { + phi::Allocator::AllocationPtr allocation = + underlying_allocator_->Allocate(size); + MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + return allocation.release(); + } + + uint64_t ReleaseImpl(const platform::Place& place) override { + return underlying_allocator_->Release(place); + } + + private: + std::shared_ptr underlying_allocator_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h index 8d904e3be5..db4950fdf5 100644 --- a/paddle/fluid/memory/memory.h +++ b/paddle/fluid/memory/memory.h @@ -16,3 +16,4 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/stats.h" diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc new file mode 100644 index 0000000000..31d776de40 --- /dev/null +++ b/paddle/fluid/memory/stats.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/stats.h" + +#include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace memory { + +class StatRegistry { + public: + static StatRegistry* GetInstance() { + static StatRegistry instance; + return &instance; + } + + StatBase* GetStat(const std::string& stat_type, int dev_id) { + auto it = stat_map_.find(GetStatKey(stat_type, dev_id)); + if (it == stat_map_.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The STAT type \"%s\" for device %d has not been registered.", + stat_type.c_str(), dev_id)); + } + return it->second; + } + + std::string GetStatKey(const std::string& stat_type, int dev_id) { + return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type; + } + + int64_t GetCurrentValue(const std::string& stat_type, int dev_id) { + return GetStat(stat_type, dev_id)->GetCurrentValue(); + } + + int64_t GetPeakValue(const std::string& stat_type, int dev_id) { + return GetStat(stat_type, dev_id)->GetPeakValue(); + } + + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { + std::lock_guard lock_guard(stat_map_lock_); + stat_map_[GetStatKey(stat_type, dev_id)] = stat; + } + + void Unregister(const std::string& stat_type, int dev_id) { + std::lock_guard lock_guard(stat_map_lock_); + stat_map_.erase(GetStatKey(stat_type, dev_id)); + } + + void Update(const std::string& stat_type, int dev_id, int64_t increment) { + stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment); + } + + private: + StatRegistry() = default; + 
DISABLE_COPY_AND_ASSIGN(StatRegistry); + + std::unordered_map stat_map_; + SpinLock stat_map_lock_; +}; + +int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id); +} + +int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id); +} + +void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) { + StatRegistry::GetInstance()->Update(stat_type, dev_id, increment); +} + +#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \ + StatRegistry::GetInstance()->Register( \ + #item, id, Stat::GetInstance()); + +#define MEMORY_STAT_REGISTER(item) \ + MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ + MEMORY_STAT_REGISTER_WITH_ID(item, 15) + +int RegisterAllStats() { + MEMORY_STAT_REGISTER(Allocated); + MEMORY_STAT_REGISTER(Reserved); + return 0; +} + +UNUSED static int register_all_stats = RegisterAllStats(); + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h new file mode 100644 index 0000000000..f644d2f587 --- /dev/null +++ b/paddle/fluid/memory/stats.h @@ -0,0 +1,172 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace memory { + +using framework::ThreadDataRegistry; + +struct ThreadLocalStatBase { + int64_t current{0}; + int64_t peak{0}; +}; + +class StatBase { + public: + StatBase() = default; + virtual ~StatBase() = default; + + virtual int64_t GetCurrentValue() = 0; + virtual int64_t GetPeakValue() = 0; + virtual void Update(int64_t) = 0; + + private: + DISABLE_COPY_AND_ASSIGN(StatBase); +}; + +template +class Stat : public StatBase { + public: + static Stat* GetInstance() { + static Stat instance; + return &instance; + } + + int64_t GetCurrentValue() override { + std::unordered_map thread_local_stats = + ThreadDataRegistry::GetInstance() + .GetAllThreadDataByValue(); + int64_t current_value = 0; + for (auto pair : thread_local_stats) { + current_value += pair.second.current; + } + return current_value; + } + + int64_t GetPeakValue() override { return peak_value_; } + + void Update(int64_t increment) override { + auto& thread_data_registry = + ThreadDataRegistry::GetInstance(); + ThreadLocalStatType* thread_local_stat = + thread_data_registry.GetMutableCurrentThreadData(); + thread_local_stat->current += increment; + + if (thread_local_stat->current > 
thread_local_stat->peak) { + thread_local_stat->peak = thread_local_stat->current; + int64_t current_value = GetCurrentValue(); + int64_t prev_value = peak_value_; + while (prev_value < current_value && + !peak_value_.compare_exchange_weak(prev_value, current_value)) { + } + VLOG(8) << "Update peak_value, after update, peak_value = " << peak_value_ + << " , current value = " << current_value; + } + } + + private: + Stat() {} + ~Stat() {} + std::atomic peak_value_{0}; +}; + +// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT +// values by a string, however, they has worse performance than the macro +// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and +// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low +// performance overhead is required. +int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id); +int64_t StatGetPeakValue(const std::string& stat_type, int dev_id); +void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); + +#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ + case id: \ + stat = paddle::memory::Stat< \ + paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \ + break + +#define MEMORY_STAT_FUNC(item, id, func, ...) 
\ + do { \ + paddle::memory::StatBase* stat = nullptr; \ + switch (id) { \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ + MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ + default: \ + PADDLE_THROW(paddle::platform::errors::OutOfRange( \ + "Only support device id between [0, 15] in memory stats," \ + "not support device id: %d", \ + id)); \ + break; \ + } \ + stat->func(__VA_ARGS__); \ + } while (0) + +#define MEMORY_STAT_CURRENT_VALUE(item, id) \ + MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define MEMORY_STAT_PEAK_VALUE(item, id) \ + MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define MEMORY_STAT_UPDATE(item, id, increment) \ + MEMORY_STAT_FUNC(item, id, Update, increment) + +#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \ + struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {}; + +#define MEMORY_STAT_DECLARE(item) \ + MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ + 
MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ + MEMORY_STAT_DECLARE_WITH_ID(item, 15) + +// To add a new STAT type, declare here and register in stats.cc +MEMORY_STAT_DECLARE(Allocated); +MEMORY_STAT_DECLARE(Reserved); + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc new file mode 100644 index 0000000000..436c737916 --- /dev/null +++ b/paddle/fluid/memory/stats_test.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/memory/stats.h" +#include +#include +#include +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { + +TEST(stats_test, MultiThreadReadWriteTest) { + std::string stat_type = "Allocated"; + size_t thread_num = 3; + size_t data_num = 10; + + std::condition_variable cv; + std::mutex mutex; + std::vector threads; + size_t ready_thread_num = 0; + + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back( + [&stat_type, data_num, &cv, &mutex, &ready_thread_num]() { + for (size_t data = 0; data < data_num; ++data) { + StatUpdate(stat_type, 0, data); + } + /* lock guard*/ { + std::lock_guard lock_guard{mutex}; + ++ready_thread_num; + cv.notify_one(); + } + // Sleep here to not exit before the main thread checking stat + // results, because the thread-local stat data will be destroyed when + // the thread exit + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); + } + + std::unique_lock unique_lock(mutex); + cv.wait(unique_lock, [&ready_thread_num, thread_num]() { + return ready_thread_num == thread_num; + }); + + EXPECT_EQ(StatGetCurrentValue(stat_type, 0), + int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + + for (size_t i = 0; i < thread_num; ++i) { + threads[i].join(); + } +} + +TEST(stats_test, PeakValueTest) { + std::string stat_type = "Allocated"; + std::vector datas = { + 543149808935355, 634698327471328, 706215795436611, 577939367795333, + 419479490054362, 21975227714595, 812939817942250, 984428837942082, + 537304104446806, 685008544452453, 563352858161268, 690143831596330, + 964829938186077, 476984078018245, 804403365180177, -57918691189304, + 947611269236893, 752188963801927, 710946451346683, -49226452527666, + -59049377393968, 14128239868858, 463298869064035, 71954818131880, + 894087341752481, 971337322257029, 202325222441382, 128423535063606, + -89146949094815, 756429151957759, 444400180007032, 937040878834954, + 303916192293233, 16628488962638, 544031750807065, 
392396591234910, + 686663859558365, 941126625484539, 120719755546781, 938838399629825, + 364952832531949, 237865770815218, -64409925441421, 130095171433100, + 140906146174023, 635514857321759, -65954585142544, 505369882354612, + 939334896592688, 591590196329715, 424834428510773, 316569328289240, + 44932622352645, 464924685290752, 396541464249293, 937169087747437, + 437992536948503, 44395833829426, 968496835801562, 80493658180301, + 836093264717766, 3339912102452, -32067753603273, 77353521424986, + 290980283590981, 496135147814915, 335112580987207, 571094882208895, + 776581672377954, -83075504255716, -93690563747742, 115144063088100, + 422629490055299, 917988755593299, 283511671626409, 715179006446336, + 760708617230450, 183628659314298, 899792829140365, 214949068928854, + 848851506468080, 791041814082114, 801591030978388, 526551272394511, + 781034506085043, 279998089943681, 907197980150568, 974365521595836, + 282127262539024, 272870474932399, 346617645597508, 409964014011113, + 746465732805300, -74049761897414, -65640372433924, 852009039806484, + 305079802044257, -48409757869238, 266031781660228, 327287322379820}; + + int64_t peak_value = ((int64_t)1) << 63; + int64_t sum = 0; + for (int64_t data : datas) { + StatUpdate(stat_type, 0, data); + sum += data; + peak_value = std::max(peak_value, sum); + } + EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 9beb70f3d2..a671381d07 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -13,34 +13,35 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" + #include #include #include #include #include - #include "gflags/gflags.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" +#include "paddle/phi/backends/gpu/gpu_info.h" + #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #include "paddle/fluid/platform/dynload/cudnn.h" #endif -#include "paddle/fluid/memory/malloc.h" + #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif #endif -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/phi/backends/gpu/gpu_info.h" DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); @@ -193,6 +194,7 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + MEMORY_STAT_UPDATE(Reserved, dev_id_, size); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); @@ -229,6 +231,7 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 
84c711f9b8..96d569d47c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2939,6 +2939,8 @@ All parameter, weight, gradient are variables in Paddle. } return stats_map; }); + m.def("memory_stat_get_current", memory::StatGetCurrentValue); + m.def("memory_stat_get_peak", memory::StatGetPeakValue); m.def("run_cmd", [](const std::string &cmd, int time_out = -1, int sleep_inter = -1) -> const std::string { @@ -3143,6 +3145,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); + m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { for (int dev_id : platform::GetSelectedDevices()) { auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 970fb35bfa..b33dc1aaeb 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -26,6 +26,10 @@ __all__ = [ 'synchronize', 'device_count', 'empty_cache', + 'max_memory_allocated', + 'max_memory_reserved', + 'memory_allocated', + 'memory_reserved', 'stream_guard', 'get_device_properties', 'get_device_name', @@ -149,6 +153,177 @@ def empty_cache(): core.cuda_empty_cache() +def extract_cuda_device_id(device, op_name): + ''' + Return the id of the given cuda device. It is just a utility that will not be exposed to users. + + Args: + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. + Default: None. + + Return: + int: The id of the given device. If device is None, return the id of current device. 
+ ''' + if (device is None): + return core.get_cuda_current_device_id() + + if isinstance(device, int): + device_id = device + elif isinstance(device, core.CUDAPlace): + device_id = device.get_device_id() + elif isinstance(device, str): + if device.startswith('gpu:'): + device_id = int(device[4:]) + else: + raise ValueError( + "The current string {} is not expected. Because {} only support string which is like 'gpu:x'. " + "Please input appropriate string again!".format(device, + op_name)) + else: + raise ValueError( + "The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. " + "Please input appropriate device again!".format(device, op_name)) + + assert device_id >= 0, f"The device id must be not less than 0, but got id = {device_id}." + assert device_id < device_count( + ), f"The device id {device_id} exceeds gpu card number {device_count()}" + + return device_id + + +def max_memory_allocated(device=None): + ''' + Return the peak size of gpu memory that is allocated to tensor of the given device. + + .. note:: + The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. + For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. + + Args: + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. + Default: None. + + Return: + int: The peak size of gpu memory that is allocated to tensor of the given device, in bytes. + + Examples: + .. 
def _check_cuda_and_get_device_id(device, name):
    """Shared validation for the GPU memory-stat APIs.

    Ensures this build of PaddlePaddle has CUDA support, then resolves
    ``device`` to a concrete integer GPU id.

    Args:
        device(paddle.CUDAPlace|int|str|None): The device, the id of the device
            or the string name of device like 'gpu:x'. None means the current
            device.
        name(str): Fully-qualified name of the public API, used in the error
            message.

    Returns:
        int: The resolved GPU device id.

    Raises:
        ValueError: If PaddlePaddle is compiled without CUDA support, or the
            device spec is invalid (raised by extract_cuda_device_id).
    """
    if not core.is_compiled_with_cuda():
        raise ValueError(
            f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
        )
    return extract_cuda_device_id(device, op_name=name)


def max_memory_allocated(device=None):
    '''
    Return the peak size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually needs.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The peak size of gpu memory that is allocated to tensor of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            max_memory_allocated_size = paddle.device.cuda.max_memory_allocated(paddle.CUDAPlace(0))
            max_memory_allocated_size = paddle.device.cuda.max_memory_allocated(0)
            max_memory_allocated_size = paddle.device.cuda.max_memory_allocated("gpu:0")
    '''
    name = "paddle.device.cuda.max_memory_allocated"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_peak("Allocated", device_id)


def max_memory_reserved(device=None):
    '''
    Return the peak size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The peak size of GPU memory that is held by the allocator of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            max_memory_reserved_size = paddle.device.cuda.max_memory_reserved(paddle.CUDAPlace(0))
            max_memory_reserved_size = paddle.device.cuda.max_memory_reserved(0)
            max_memory_reserved_size = paddle.device.cuda.max_memory_reserved("gpu:0")
    '''
    name = "paddle.device.cuda.max_memory_reserved"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_peak("Reserved", device_id)


def memory_allocated(device=None):
    '''
    Return the current size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually needs.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The current size of gpu memory that is allocated to tensor of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            memory_allocated_size = paddle.device.cuda.memory_allocated(paddle.CUDAPlace(0))
            memory_allocated_size = paddle.device.cuda.memory_allocated(0)
            memory_allocated_size = paddle.device.cuda.memory_allocated("gpu:0")
    '''
    name = "paddle.device.cuda.memory_allocated"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_current("Allocated", device_id)


def memory_reserved(device=None):
    '''
    Return the current size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The current size of GPU memory that is held by the allocator of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle

            memory_reserved_size = paddle.device.cuda.memory_reserved(paddle.CUDAPlace(0))
            memory_reserved_size = paddle.device.cuda.memory_reserved(0)
            memory_reserved_size = paddle.device.cuda.memory_reserved("gpu:0")
    '''
    name = "paddle.device.cuda.memory_reserved"
    device_id = _check_cuda_and_get_device_id(device, name)
    return core.memory_stat_get_current("Reserved", device_id)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_allocated, max_memory_allocated


class TestMaxMemoryAllocated(unittest.TestCase):
    """Tests for paddle.device.cuda.max_memory_allocated."""

    def test_max_memory_allocated(self, device=None):
        # Repeatedly allocate tensors of random size and track the largest
        # value memory_allocated() ever reports; it must equal the peak
        # reported by max_memory_allocated().
        if not core.is_compiled_with_cuda():
            return
        num_allocs = 100
        size_bound = 10000
        observed_peak = max_memory_allocated(device)
        for _ in range(num_allocs):
            shape = paddle.randint(size_bound)
            tensor = paddle.zeros(shape)
            observed_peak = max(observed_peak, memory_allocated(device))
            del shape
            del tensor

        self.assertEqual(observed_peak, max_memory_allocated(device))

    def test_max_memory_allocated_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_max_memory_allocated(core.CUDAPlace(dev_id))
                self.test_max_memory_allocated(dev_id)
                self.test_max_memory_allocated(f"gpu:{dev_id}")

    def test_max_memory_allocated_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    max_memory_allocated(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                max_memory_allocated()


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_reserved, max_memory_reserved


class TestMaxMemoryreserved(unittest.TestCase):
    """Tests for paddle.device.cuda.max_memory_reserved."""

    def test_max_memory_reserved(self, device=None):
        # Track the largest value memory_reserved() reports across many
        # allocations; it must match the peak from max_memory_reserved().
        if not core.is_compiled_with_cuda():
            return
        num_allocs = 100
        size_bound = 10000
        observed_peak = max_memory_reserved(device)
        for _ in range(num_allocs):
            shape = paddle.randint(size_bound)
            tensor = paddle.zeros(shape)
            observed_peak = max(observed_peak, memory_reserved(device))
            del shape
            del tensor

        self.assertEqual(observed_peak, max_memory_reserved(device))

    def test_max_memory_reserved_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_max_memory_reserved(core.CUDAPlace(dev_id))
                self.test_max_memory_reserved(dev_id)
                self.test_max_memory_reserved(f"gpu:{dev_id}")

    def test_max_memory_reserved_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    max_memory_reserved(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                max_memory_reserved()


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
import numpy as np
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_allocated


class TestMemoryAllocated(unittest.TestCase):
    """Tests for paddle.device.cuda.memory_allocated."""

    def test_memory_allocated(self, device=None):
        if not core.is_compiled_with_cuda():
            return
        # A [256] float32 tensor is 256 * 4 bytes, which is already
        # 256-byte aligned, so the reported size is exact.
        tensor = paddle.zeros(shape=[256])
        expected_size = 4 * 256  # 256 float32 data, with 4 bytes for each one
        self.assertEqual(memory_allocated(device), expected_size)

    def test_memory_allocated_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_memory_allocated(core.CUDAPlace(dev_id))
                self.test_memory_allocated(dev_id)
                self.test_memory_allocated(f"gpu:{dev_id}")

    def test_memory_allocated_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    memory_allocated(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                memory_allocated()


if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import unittest
import numpy as np
from paddle.fluid import core
from paddle.device.cuda import device_count, memory_reserved


class TestMemoryreserved(unittest.TestCase):
    """Tests for paddle.device.cuda.memory_reserved."""

    def test_memory_reserved(self, device=None):
        if not core.is_compiled_with_cuda():
            return
        # NOTE(review): equality with the allocated size presumably holds
        # under the auto_growth allocator (the CMake rule sets
        # FLAGS_allocator_strategy=auto_growth for this test) — verify if the
        # strategy changes.
        tensor = paddle.zeros(shape=[256])
        expected_size = 4 * 256  # 256 float32 data, with 4 bytes for each one
        self.assertEqual(memory_reserved(device), expected_size)

    def test_memory_reserved_for_all_places(self):
        # Exercise every accepted device spelling on every visible GPU.
        if core.is_compiled_with_cuda():
            for dev_id in range(device_count()):
                paddle.device.set_device(f"gpu:{dev_id}")
                self.test_memory_reserved(core.CUDAPlace(dev_id))
                self.test_memory_reserved(dev_id)
                self.test_memory_reserved(f"gpu:{dev_id}")

    def test_memory_reserved_exception(self):
        if core.is_compiled_with_cuda():
            # Every malformed device spec must be rejected; the raised type
            # varies per spec, hence the broad BaseException.
            for bad_device in [
                    core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
            ]:
                with self.assertRaises(BaseException):
                    memory_reserved(bad_device)
        else:
            # On a CPU-only build the API itself must raise.
            with self.assertRaises(BaseException):
                memory_reserved()


if __name__ == "__main__":
    unittest.main()