record memory and op supplement info (#43550)

* record memory and op supplement info * update * update * fix a bug * fix memory recording * fix a bug * update * update * fix a bug * update * fix a bug * fix a bug * fix a bug * Revert "fix a bug" This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5. * fix a bug * fix format * fix

record memory and op supplement info (#43550)
* record memory and op supplement info * update * update * fix a bug * fix memory recording * fix a bug * update * update * fix a bug * update * fix a bug * fix a bug * fix a bug * Revert "fix a bug" This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5. * fix a bug * fix format * fix
8dd0a3b9 · chenjian · GitHub · e64823c1 · 8dd0a3b9 · 8dd0a3b9
19 changed files
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -24,6 +24,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"
 #include "paddle/phi/core/kernel_context.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
        op_with_kernel->Info().infer_shape_(
            instr_node.InnerInferShapeContext().get());
      }
+      infershape_event.End();
+      platform::RecordOpInfoSupplement(op->Type(),
+                                       op->Attrs(),
+                                       *(instr_node.InnerInferShapeContext()),
+                                       *(instr_node.InnerRuntimeContext()));
    }
  }


--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
 cc_library(
  allocator
  SRCS allocator.cc
-  DEPS place stats)
+  DEPS place stats profiler)
 cc_library(
  cpu_allocator
  SRCS cpu_allocator.cc
@@ -21,7 +21,7 @@ cc_library(
 cc_library(
  naive_best_fit_allocator
  SRCS naive_best_fit_allocator.cc
-  DEPS allocator buddy_allocator profiler)
+  DEPS allocator buddy_allocator)
 cc_test(
  naive_best_fit_allocator_test
  SRCS naive_best_fit_allocator_test.cc

--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -32,7 +32,8 @@
 #endif

 PADDLE_DEFINE_EXPORTED_bool(
-    init_allocated_mem, false,
+    init_allocated_mem,
+    false,
    "It is a mistake that the values of the memory allocated by "
    "BuddyAllocator are always zeroed in some op's implementation. "
    "To find this error in time, we use init_allocated_mem to indicate "
@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
  std::call_once(init_flag, []() {
    a = new detail::BuddyAllocator(
        std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+        platform::CpuMinChunkSize(),
+        platform::CpuMaxChunkSize());
  });

  return a;
@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
 }

 template <>
-void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
+void Free<platform::CPUPlace>(const platform::CPUPlace &place,
+                              void *p,
                              size_t size) {
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetCPUBuddyAllocator()->Free(p);
@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
  return p;
 }
 template <>
-void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
+void Free<platform::IPUPlace>(const platform::IPUPlace &place,
+                              void *p,
                              size_t size) {
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetCPUBuddyAllocator()->Free(p);
@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
    ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
  }
  PADDLE_ENFORCE_EQ(
-      ret, XPU_SUCCESS,
+      ret,
+      XPU_SUCCESS,
      platform::errors::External(
          "XPU API return wrong value[%d], no enough memory", ret));
  if (FLAGS_init_allocated_mem) {
@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
 }

 template <>
-void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
+void Free<platform::XPUPlace>(const platform::XPUPlace &place,
+                              void *p,
                              size_t size) {
 #ifdef PADDLE_WITH_XPU
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList {
  BuddyAllocator *Get(int npu_id) {
    auto pos = std::distance(
        devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
-    PADDLE_ENFORCE_LT(pos, devices_.size(),
+    PADDLE_ENFORCE_LT(pos,
+                      devices_.size(),
                      platform::errors::OutOfRange(
                          "The index exceeds the size of devices, the size of "
                          "devices is %d, the index is %d",
-                          devices_.size(), pos));
+                          devices_.size(),
+                          pos));

    std::call_once(*init_flags_[pos], [this, pos] {
      platform::SetNPUDeviceId(devices_[pos]);
@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList {
          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::NPUAllocator(devices_[pos])),
                             platform::NPUMinChunkSize(),
-                             platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE));
+                             platform::NPUMaxChunkSize(),
+                             EXTRA_PADDING_SIZE));
      VLOG(10) << "\n\nNOTE:\n"
               << "You can set GFlags environment variable "
               << "'FLAGS_fraction_of_gpu_memory_to_use' "
@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
        "%s, NpuMaxChunkSize %s, NPU memory used: %s.",
-        string::HumanReadableSize(size), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(size),
+        place.device,
+        string::HumanReadableSize(avail),
+        string::HumanReadableSize(total),
        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
        string::HumanReadableSize(Used<platform::NPUPlace>(place))));
@@ -331,7 +342,8 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
 }

 template <>
-void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
+void Free<platform::NPUPlace>(const platform::NPUPlace &place,
+                              void *p,
                              size_t size) {
 #ifdef PADDLE_WITH_ASCEND_CL
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
@@ -384,7 +396,8 @@ void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,

 template <>
 void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
-                                    void *p, size_t size) {
+                                    void *p,
+                                    size_t size) {
 #ifdef PADDLE_WITH_ASCEND_CL
  GetNPUPinnedBuddyAllocator()->Free(p);
 #else
@@ -430,18 +443,21 @@ class GPUBuddyAllocatorList {
  BuddyAllocator *Get(int gpu_id) {
    auto pos = std::distance(
        devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
-    PADDLE_ENFORCE_LT(pos, devices_.size(),
+    PADDLE_ENFORCE_LT(pos,
+                      devices_.size(),
                      platform::errors::OutOfRange(
                          "The index exceeds the size of devices, the size of "
                          "devices is %d, the index is %d",
-                          devices_.size(), pos));
+                          devices_.size(),
+                          pos));

    std::call_once(*init_flags_[pos], [this, pos] {
      platform::SetDeviceId(devices_[pos]);
-      allocators_[pos].reset(new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
+      allocators_[pos].reset(
+          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::GPUAllocator(devices_[pos])),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
+                             platform::GpuMinChunkSize(),
+                             platform::GpuMaxChunkSize()));
      VLOG(10) << "\n\nNOTE:\n"
               << "You can set GFlags environment variable "
               << "'FLAGS_fraction_of_gpu_memory_to_use' "
@@ -493,8 +509,10 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
        "%s, GpuMaxChunkSize %s, GPU memory used: %s.",
-        string::HumanReadableSize(size), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(size),
+        place.device,
+        string::HumanReadableSize(avail),
+        string::HumanReadableSize(total),
        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
        string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
@@ -515,7 +533,8 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 }

 template <>
-void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
+void Free<platform::CUDAPlace>(const platform::CUDAPlace &place,
+                               void *p,
                               size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  GetGPUBuddyAllocator(place.device)->Free(p);
@@ -584,7 +603,8 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,

 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                     void *p, size_t size) {
+                                     void *p,
+                                     size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
@@ -630,18 +650,21 @@ class MLUBuddyAllocatorList {
  BuddyAllocator *Get(int mlu_id) {
    auto pos = std::distance(
        devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
-    PADDLE_ENFORCE_LT(pos, devices_.size(),
+    PADDLE_ENFORCE_LT(pos,
+                      devices_.size(),
                      platform::errors::OutOfRange(
                          "The index exceeds the size of devices, the size of "
                          "devices is %d, the index is %d",
-                          devices_.size(), pos));
+                          devices_.size(),
+                          pos));

    std::call_once(*init_flags_[pos], [this, pos] {
      platform::SetMLUDeviceId(devices_[pos]);
-      allocators_[pos].reset(new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
+      allocators_[pos].reset(
+          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::MLUAllocator(devices_[pos])),
-          platform::MLUMinChunkSize(), platform::MLUMaxChunkSize()));
+                             platform::MLUMinChunkSize(),
+                             platform::MLUMaxChunkSize()));
      VLOG(10) << "\n\nNOTE:\n"
               << "You can set GFlags environment variable "
               << "(mlu reuse gpu GFlags) "
@@ -693,8 +716,10 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
        "%s, MLUMinChunkSize %s, MLU memory used: %s.",
-        string::HumanReadableSize(size), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(size),
+        place.device,
+        string::HumanReadableSize(avail),
+        string::HumanReadableSize(total),
        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
        string::HumanReadableSize(Used<platform::MLUPlace>(place))));
@@ -711,7 +736,8 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
 }

 template <>
-void Free<platform::MLUPlace>(const platform::MLUPlace &place, void *p,
+void Free<platform::MLUPlace>(const platform::MLUPlace &place,
+                              void *p,
                              size_t size) {
 #ifdef PADDLE_WITH_MLU
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
@@ -759,10 +785,12 @@ class BuddyAllocatorList {
  }

  BuddyAllocator *Get(int dev_id) {
-    PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
+    PADDLE_ENFORCE_NE(init_flags_.find(dev_id),
+                      init_flags_.end(),
                      platform::errors::OutOfRange(
                          "Cannot find %s %d, please check visible devices.",
-                          device_type_, dev_id));
+                          device_type_,
+                          dev_id));

    std::call_once(*init_flags_[dev_id], [this, dev_id] {
      phi::DeviceManager::SetDevice(device_type_, dev_id);
@@ -773,7 +801,8 @@ class BuddyAllocatorList {
              new detail::CustomAllocator(device_type_, dev_id)),
          phi::DeviceManager::GetMinChunkSize(place),
          phi::DeviceManager::GetMaxChunkSize(place),
-          phi::DeviceManager::GetExtraPaddingSize(place), device_type_));
+          phi::DeviceManager::GetExtraPaddingSize(place),
+          device_type_));
    });

    return allocators_[dev_id].get();
@@ -813,8 +842,11 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
    PADDLE_THROW(platform::errors::ResourceExhausted(
        "Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
        "%s. ",
-        string::HumanReadableSize(size), place.GetDeviceType(), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(size),
+        place.GetDeviceType(),
+        place.device,
+        string::HumanReadableSize(avail),
+        string::HumanReadableSize(total),
        string::HumanReadableSize(total - avail)));
  } else {
    if (FLAGS_init_allocated_mem) {
@@ -830,7 +862,8 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
 }

 template <>
-void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
+void Free<platform::CustomPlace>(const platform::CustomPlace &place,
+                                 void *p,
                                 size_t size) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
@@ -922,8 +955,6 @@ namespace allocation {
 phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
  void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
  auto *tmp_alloc = new Allocation(ptr, size, place_);
-  platform::MemEvenRecorder::Instance().PushMemRecord(
-      static_cast<void *>(tmp_alloc), place_, size);
  return tmp_alloc;
 }

@@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
  paddle::platform::VisitPlace(
      allocation->place(),
      legacy::FreeVisitor(allocation->ptr(), allocation->size()));
-  platform::MemEvenRecorder::Instance().PopMemRecord(
-      static_cast<void *>(allocation), place_);
  delete allocation;
 }


--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"

 #include "paddle/fluid/memory/stats.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
  PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
 #endif
  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
+  platform::RecordMemEvent(allocation->ptr(),
+                           allocation->place(),
+                           allocation->size(),
+                           platform::TracerMemEventType::ReservedFree);
  delete allocation;
 }
 phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
@@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
  PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
 #endif
  HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
+  platform::RecordMemEvent(ptr,
+                           platform::CUDAPinnedPlace(),
+                           size,
+                           platform::TracerMemEventType::ReservedAllocate);
  return new Allocation(ptr, size, platform::CUDAPinnedPlace());
 }
 }  // namespace allocation

--- a/paddle/fluid/memory/allocation/stat_allocator.h
+++ b/paddle/fluid/memory/allocation/stat_allocator.h
@@ -16,6 +16,7 @@

 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/stats.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"

 namespace paddle {
 namespace memory {
@@ -30,14 +31,18 @@ class StatAllocator : public Allocator {

 protected:
  void FreeImpl(phi::Allocation* allocation) override {
-    if (platform::is_cpu_place(allocation->place())) {
-      HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-                              -allocation->size());
+    if (platform::is_cpu_place(allocation->place()) ||
+        platform::is_cuda_pinned_place(allocation->place())) {
+      HOST_MEMORY_STAT_UPDATE(
+          Allocated, allocation->place().GetDeviceId(), -allocation->size());
    } else {
-      DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-                                -allocation->size());
+      DEVICE_MEMORY_STAT_UPDATE(
+          Allocated, allocation->place().GetDeviceId(), -allocation->size());
    }
-
+    platform::RecordMemEvent(allocation->ptr(),
+                             allocation->place(),
+                             allocation->size(),
+                             platform::TracerMemEventType::Free);
    underlying_allocator_->Free(allocation);
  }

@@ -48,12 +53,16 @@ class StatAllocator : public Allocator {
    const platform::Place& place = allocation->place();
    if (platform::is_cpu_place(place) ||
        platform::is_cuda_pinned_place(place)) {
-      HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
-                              allocation->size());
+      HOST_MEMORY_STAT_UPDATE(
+          Allocated, place.GetDeviceId(), allocation->size());
    } else {
-      DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
-                                allocation->size());
+      DEVICE_MEMORY_STAT_UPDATE(
+          Allocated, place.GetDeviceId(), allocation->size());
    }
+    platform::RecordMemEvent(allocation->ptr(),
+                             allocation->place(),
+                             allocation->size(),
+                             platform::TracerMemEventType::Allocate);
    return allocation.release();
  }


--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -41,6 +41,7 @@ limitations under the License. */
 #endif

 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"

 DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -64,11 +65,13 @@ void* AlignedMalloc(size_t size) {
 #else
  int error = posix_memalign(&p, alignment, size);
  PADDLE_ENFORCE_EQ(
-      error, 0,
+      error,
+      0,
      platform::errors::ResourceExhausted(
          "Fail to alloc memory of %ld size, error code is %d.", size, error));
 #endif
-  PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted(
+  PADDLE_ENFORCE_NOT_NULL(p,
+                          platform::errors::ResourceExhausted(
                              "Fail to alloc memory of %ld size.", size));
  return p;
 }
@@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
  }

  HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
-
+  platform::RecordMemEvent(
+      p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
  return p;
 }

@@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
 #endif

  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
+  platform::RecordMemEvent(
+      p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
 }

 bool CPUAllocator::UseGpu() const { return false; }
@@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
          "maximum GPU memory usage is limited to %d MB.\n"
          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-          limit_size, limit_size);
+          limit_size,
+          limit_size);
    }

    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
@@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
        "please set it to a higher value but less than 1.0.\n"
        "      The command is "
        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-        gpu_id_, string::HumanReadableSize(size), gpu_id_,
-        string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
-        gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+        gpu_id_,
+        string::HumanReadableSize(size),
+        gpu_id_,
+        string::HumanReadableSize(allocated),
+        string::HumanReadableSize(avail),
+        gpu_id_,
+        FLAGS_fraction_of_gpu_memory_to_use,
+        err_msg));
  }
 }

 void GPUAllocator::Free(void* p, size_t size, size_t index) {
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(gpu_alloc_size_, size,
+  PADDLE_ENFORCE_GE(gpu_alloc_size_,
+                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated gpu memory (%d)",
-                        size, gpu_alloc_size_));
+                        size,
+                        gpu_alloc_size_));
  gpu_alloc_size_ -= size;

  platform::RecordedGpuFree(p, size, gpu_id_);
@@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
    *index = 1;  // PINNED memory
    cuda_pinnd_alloc_size_ += size;
    HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
+    platform::RecordMemEvent(
+        p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
    return p;
  } else {
    LOG(WARNING) << "cudaHostAlloc failed.";
@@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {

 void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
  gpuError_t err;
-  PADDLE_ENFORCE_EQ(index, 1,
+  PADDLE_ENFORCE_EQ(index,
+                    1,
                    platform::errors::InvalidArgument(
                        "The index should be 1, but got %d", index));

-  PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size,
+  PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_,
+                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated cuda pinned memory (%d)",
-                        size, cuda_pinnd_alloc_size_));
+                        size,
+                        cuda_pinnd_alloc_size_));
  cuda_pinnd_alloc_size_ -= size;
 #ifdef PADDLE_WITH_HIP
  err = hipHostFree(p);
  if (err != hipErrorDeinitialized) {
    PADDLE_ENFORCE_EQ(
-        err, hipSuccess,
+        err,
+        hipSuccess,
        platform::errors::Fatal(
            "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
  }
@@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
  // cudaFreeHost succeeds.
  if (err != cudaErrorCudartUnloading) {
    PADDLE_ENFORCE_EQ(
-        err, 0,
+        err,
+        0,
        platform::errors::Fatal(
            "cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
            err));
  }
 #endif
  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
+  platform::RecordMemEvent(
+      p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
 }

 bool CUDAPinnedAllocator::UseGpu() const { return false; }
@@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
          "maximum GPU memory usage is limited to %d MB.\n"
          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-          limit_size, limit_size);
+          limit_size,
+          limit_size);
    }

    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
@@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
        "please set it to a higher value but less than 1.0.\n"
        "      The command is "
        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-        npu_id_, string::HumanReadableSize(size), npu_id_,
-        string::HumanReadableSize(avail), npu_id_,
-        FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+        npu_id_,
+        string::HumanReadableSize(size),
+        npu_id_,
+        string::HumanReadableSize(avail),
+        npu_id_,
+        FLAGS_fraction_of_gpu_memory_to_use,
+        err_msg));
  }
 }

 void NPUAllocator::Free(void* p, size_t size, size_t index) {
  VLOG(4) << "Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(npu_alloc_size_, size,
+  PADDLE_ENFORCE_GE(npu_alloc_size_,
+                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated gpu memory (%d)",
-                        size, npu_alloc_size_));
+                        size,
+                        npu_alloc_size_));
  npu_alloc_size_ -= size;

  platform::RecordedNPUFree(p, size, npu_id_);
@@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {

 void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
  aclError err;
-  PADDLE_ENFORCE_EQ(index, 1,
+  PADDLE_ENFORCE_EQ(index,
+                    1,
                    platform::errors::InvalidArgument(
                        "The index should be 1, but got %d", index));

-  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
+  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
+                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated npu pinned memory (%d)",
-                        size, npu_pinnd_alloc_size_));
+                        size,
+                        npu_pinnd_alloc_size_));
  npu_pinnd_alloc_size_ -= size;
  err = platform::NPUHostFree(p);

  if (err != ACL_ERROR_NONE) {
    PADDLE_ENFORCE_EQ(
-        err, 0,
+        err,
+        0,
        platform::errors::Fatal(
            "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
  }
@@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
          "maximum MLU memory usage is limited to %d MB.\n"
          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-          limit_size, limit_size);
+          limit_size,
+          limit_size);
    }

    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
@@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
        "please set it to a higher value but less than 1.0.\n"
        "      The command is "
        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-        mlu_id_, string::HumanReadableSize(size), mlu_id_,
-        string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
-        mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+        mlu_id_,
+        string::HumanReadableSize(size),
+        mlu_id_,
+        string::HumanReadableSize(allocated),
+        string::HumanReadableSize(avail),
+        mlu_id_,
+        FLAGS_fraction_of_gpu_memory_to_use,
+        err_msg));
  }
 }

 void MLUAllocator::Free(void* p, size_t size, size_t index) {
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(mlu_alloc_size_, size,
+  PADDLE_ENFORCE_GE(mlu_alloc_size_,
+                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated gpu memory (%d)",
-                        size, mlu_alloc_size_));
+                        size,
+                        mlu_alloc_size_));
  mlu_alloc_size_ -= size;

  platform::RecordedMLUFree(p, size, mlu_id_);
@@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
        "\n\nOut of memory error on %s %d. "
        "total memory is %s, used memory is %s, "
        "available memory is only %s.\n\n",
-        dev_type_, dev_id_, string::HumanReadableSize(total),
+        dev_type_,
+        dev_id_,
+        string::HumanReadableSize(total),
        string::HumanReadableSize(total - avail),
        string::HumanReadableSize(avail)));
  }
@@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {

 void CustomAllocator::Free(void* p, size_t size, size_t index) {
  VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                    platform::errors::InvalidArgument(
                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(plug_alloc_size, size,
+  PADDLE_ENFORCE_GE(plug_alloc_size,
+                    size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated gpu memory (%d)",
-                        size, plug_alloc_size));
+                        size,
+                        plug_alloc_size));
  plug_alloc_size -= size;
  auto place = platform::CustomPlace(dev_type_, dev_id_);
  auto device = phi::DeviceManager::GetDeviceWithPlace(place);

--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -354,7 +354,9 @@ if(WITH_GPU)
         enforce
         dynload_cuda
         new_profiler
-         stats)
+         stats
+         op_proto_maker
+         shape_inference)
  nv_library(
    device_memory_aligment
    SRCS device_memory_aligment.cc
@@ -363,7 +365,14 @@ elseif(WITH_ROCM)
  hip_library(
    profiler
    SRCS profiler.cc profiler.cu
-    DEPS os_info device_tracer gpu_info enforce new_profiler stats)
+    DEPS os_info
+         device_tracer
+         gpu_info
+         enforce
+         new_profiler
+         stats
+         op_proto_maker
+         shape_inference)
  hip_library(
    device_memory_aligment
    SRCS device_memory_aligment.cc
@@ -372,7 +381,13 @@ else()
  cc_library(
    profiler
    SRCS profiler.cc
-    DEPS os_info device_tracer enforce new_profiler stats)
+    DEPS os_info
+         device_tracer
+         enforce
+         new_profiler
+         stats
+         op_proto_maker
+         shape_inference)
  cc_library(
    device_memory_aligment
    SRCS device_memory_aligment.cc

--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -29,6 +29,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/monitor.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"
 #include "paddle/fluid/string/split.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"

@@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_uint64(gpu_memory_limit_mb);

-PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
+PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log,
+                            false,
                            "Whether to print the message of gpu memory usage "
                            "at exit, mainly used for UT and CI.");
-PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true,
+PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
+                            true,
                            "Whether to print the message of gpu memory usage "
                            "MB as a unit of measurement.");

@@ -66,7 +69,10 @@ namespace platform {

 void GpuMemoryUsage(size_t *available, size_t *total) {
  size_t actual_available, actual_total;
-  RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total,
+  RecordedGpuMemGetInfo(available,
+                        total,
+                        &actual_available,
+                        &actual_total,
                        platform::GetCurrentDeviceId());
 }

@@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() {
 static size_t GpuAllocSize(bool realloc) {
  size_t available_to_alloc = GpuAvailableMemToAlloc();
  PADDLE_ENFORCE_GT(
-      available_to_alloc, 0,
+      available_to_alloc,
+      0,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
  // allocated by fraction
@@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) {
           ? flag_mb << 20
           : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
  PADDLE_ENFORCE_GE(
-      available_to_alloc, alloc_bytes,
+      available_to_alloc,
+      alloc_bytes,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
           << " MiB, is it Re-alloc: " << realloc;
@@ -192,13 +200,16 @@ class RecordedGpuMallocHelper {
    });

    PADDLE_ENFORCE_GE(
-        dev_id, 0,
+        dev_id,
+        0,
        platform::errors::OutOfRange(
            "Device id must be not less than 0, but got %d.", dev_id));
    PADDLE_ENFORCE_LT(
-        dev_id, instances_.size(),
+        dev_id,
+        instances_.size(),
        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
-                                     dev_id, instances_.size()));
+                                     dev_id,
+                                     instances_.size()));
    return instances_[dev_id].get();
  }

@@ -207,7 +218,8 @@ class RecordedGpuMallocHelper {
   * or cudaSuccess would be returned, and the cudaGetLastError() flag
   * would be clear.
   */
-  gpuError_t Malloc(void **ptr, size_t size,
+  gpuError_t Malloc(void **ptr,
+                    size_t size,
                    bool malloc_managed_memory = false) {
    LockGuardPtr<std::mutex> lock(mtx_);
    if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
@@ -236,7 +248,10 @@ class RecordedGpuMallocHelper {
      cur_size_.fetch_add(size);
      STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
      DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
-
+      platform::RecordMemEvent(ptr,
+                               GPUPlace(dev_id_),
+                               size,
+                               platform::TracerMemEventType::ReservedAllocate);
 #ifdef PADDLE_WITH_TESTING
      gpu_ptrs.insert(*ptr);
 #endif
@@ -275,6 +290,10 @@ class RecordedGpuMallocHelper {
      cur_size_.fetch_sub(size);
      STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
      DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
+      platform::RecordMemEvent(ptr,
+                               GPUPlace(dev_id_),
+                               size,
+                               platform::TracerMemEventType::ReservedFree);
    } else {
      platform::GpuGetLastError();  // clear the error flag when
                                    // cudaErrorCudartUnloading /
@@ -300,7 +319,9 @@ class RecordedGpuMallocHelper {
 #endif
  }

-  bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
+  bool GetMemInfo(size_t *avail,
+                  size_t *total,
+                  size_t *actual_avail,
                  size_t *actual_total) {
    {
      CUDADeviceGuard guard(dev_id_);
@@ -335,7 +356,8 @@ class RecordedGpuMallocHelper {

 #ifdef PADDLE_WITH_CUDA
 #if CUDA_VERSION >= 10020
-  CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+  CUresult MemCreate(CUmemGenericAllocationHandle *handle,
+                     size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {  // NOLINT
    auto result =
@@ -371,7 +393,9 @@ class RecordedGpuMallocHelper {

 std::once_flag RecordedGpuMallocHelper::once_flag_;

-gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
+gpuError_t RecordedGpuMalloc(void **ptr,
+                             size_t size,
+                             int dev_id,
                             bool malloc_managed_memory) {
  return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(
      ptr, size, malloc_managed_memory);
@@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {

 #ifdef PADDLE_WITH_CUDA
 #if CUDA_VERSION >= 10020
-CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
+                              size_t size,
                              const CUmemAllocationProp *prop,
-                              unsigned long long flags, int dev_id) {  // NOLINT
-  return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size,
-                                                              prop, flags);
+                              unsigned long long flags,
+                              int dev_id) {  // NOLINT
+  return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
+      handle, size, prop, flags);
 }

-CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size,
+CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
+                               size_t size,
                               int dev_id) {
  return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size);
 }
 #endif
 #endif

-bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
-                           size_t *actual_total, int dev_id) {
+bool RecordedGpuMemGetInfo(size_t *avail,
+                           size_t *total,
+                           size_t *actual_avail,
+                           size_t *actual_total,
+                           int dev_id) {
  return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo(
      avail, total, actual_avail, actual_total);
 }
@@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) {

 void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); }

-void GpuMemcpyAsync(void *dst, const void *src, size_t count,
-                    gpuMemcpyKind kind, gpuStream_t stream) {
+void GpuMemcpyAsync(void *dst,
+                    const void *src,
+                    size_t count,
+                    gpuMemcpyKind kind,
+                    gpuStream_t stream) {
  phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
 }

-void GpuMemcpySync(void *dst, const void *src, size_t count,
+void GpuMemcpySync(void *dst,
+                   const void *src,
+                   size_t count,
                   gpuMemcpyKind kind) {
  phi::backends::gpu::GpuMemcpySync(dst, src, count, kind);
 }

-void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
-                        int src_device, size_t count, gpuStream_t stream) {
-  phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
-                                         count, stream);
+void GpuMemcpyPeerAsync(void *dst,
+                        int dst_device,
+                        const void *src,
+                        int src_device,
+                        size_t count,
+                        gpuStream_t stream) {
+  phi::backends::gpu::GpuMemcpyPeerAsync(
+      dst, dst_device, src, src_device, count, stream);
 }

-void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
-                       int src_device, size_t count) {
-  phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
-                                        count);
+void GpuMemcpyPeerSync(
+    void *dst, int dst_device, const void *src, int src_device, size_t count) {
+  phi::backends::gpu::GpuMemcpyPeerSync(
+      dst, dst_device, src, src_device, count);
 }

 void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -30,12 +30,16 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/nvtx.h"
 #endif
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/os_info.h"

-PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
+PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
+                            false,
                            "Enable rpc profiler or not.");

-DEFINE_bool(enable_host_event_recorder_hook, false,
+DEFINE_bool(enable_host_event_recorder_hook,
+            false,
            "enable HostEventRecorder, hook Profiler");

 namespace paddle {
@@ -43,8 +47,11 @@ namespace platform {

 MemEvenRecorder MemEvenRecorder::recorder;

-Event::Event(EventType type, std::string name, uint32_t thread_id,
-             EventRole role, std::string attr)
+Event::Event(EventType type,
+             std::string name,
+             uint32_t thread_id,
+             EventRole role,
+             std::string attr)
    : type_(type),
      name_(name),
      thread_id_(thread_id),
@@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const {
 #endif
 }

-RecordEvent::RecordEvent(const char *name, const TracerEventType type,
-                         uint32_t level, const EventRole role) {
+RecordEvent::RecordEvent(const char *name,
+                         const TracerEventType type,
+                         uint32_t level,
+                         const EventRole role) {
 #ifndef _WIN32
 #ifdef PADDLE_WITH_CUDA
  if (g_enable_nvprof_hook) {
@@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type,
  start_ns_ = PosixInNsec();
 }

-RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
-                         uint32_t level, const EventRole role) {
+RecordEvent::RecordEvent(const std::string &name,
+                         const TracerEventType type,
+                         uint32_t level,
+                         const EventRole role) {
 #ifndef _WIN32
 #ifdef PADDLE_WITH_CUDA
  if (g_enable_nvprof_hook) {
@@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
  start_ns_ = PosixInNsec();
 }

-RecordEvent::RecordEvent(const std::string &name, const std::string &attr,
-                         const TracerEventType type, uint32_t level,
+RecordEvent::RecordEvent(const std::string &name,
+                         const std::string &attr,
+                         const TracerEventType type,
+                         uint32_t level,
                         const EventRole role) {
 #ifndef _WIN32
 #ifdef PADDLE_WITH_CUDA
@@ -215,8 +228,8 @@ void RecordEvent::End() {
  DeviceTracer *tracer = GetDeviceTracer();
  if (tracer) {
    uint64_t end_ns = PosixInNsec();
-    tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(),
-                          g_thread_id);
+    tracer->AddCPURecords(
+        CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id);
  }
  ClearCurAnnotation();
  PopEvent(*name_, role_);
@@ -226,7 +239,8 @@ void RecordEvent::End() {
  is_enabled_ = false;
 }

-RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
+RecordInstantEvent::RecordInstantEvent(const char *name,
+                                       TracerEventType type,
                                       uint32_t level) {
  if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
    return;
@@ -236,21 +250,242 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
      name, start_end_ns, start_end_ns, EventRole::kOrdinary, type);
 }

-void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
+// Collects supplementary information about one operator run -- the dims and
+// var types of every input slot plus the op creation callstack attribute --
+// and forwards it to the host event recorder.
+// Nothing is recorded unless FLAGS_enable_host_event_recorder_hook is set.
+RecordOpInfoSupplement::RecordOpInfoSupplement(
+    const std::string &type,
+    const framework::AttributeMap &attrs,
+    const framework::InferShapeContext &shape_ctx,
+    const framework::RuntimeContext &ctx) {
+  if (!FLAGS_enable_host_event_recorder_hook) {
+    return;
+  }
+
+  // Query dims / var types for each input slot listed in the runtime context.
+  std::map<std::string, std::vector<framework::DDim>> input_shapes;
+  std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
+  for (const auto &slot : ctx.inputs) {
+    const std::string &slot_name = slot.first;
+    input_shapes[slot_name] = shape_ctx.GetInputsDim(slot_name);
+    dtypes[slot_name] = shape_ctx.GetInputsVarType(slot_name);
+  }
+
+  // The callstack attribute is optional: an empty vector is recorded when the
+  // attribute map does not contain it.
+  std::vector<std::string> callstack;
+  const auto callstack_attr = attrs.find(
+      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
+  if (callstack_attr != attrs.end()) {
+    callstack =
+        BOOST_GET_CONST(std::vector<std::string>, callstack_attr->second);
+  }
+
+  HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
+      PosixInNsec(), type, input_shapes, dtypes, callstack);
+}
+
+// Records a single memory event for `ptr` on `place`.
+//
+// Allocate / Free events sample the current and peak value of the "Allocated"
+// statistic, ReservedAllocate / ReservedFree events sample the "Reserved"
+// statistic. The pair that is not sampled is passed on as 0, which means
+// "keep the same as before". Allocation-type events are pushed to
+// MemEvenRecorder, free-type events are popped from it. Any other event type
+// is ignored, as is every call made while both the profiler and the host
+// event recorder hook are off.
+RecordMemEvent::RecordMemEvent(const void *ptr,
+                               const phi::Place &place,
+                               size_t size,
+                               const TracerMemEventType type) {
+  if (g_state == ProfilerState::kDisabled &&
+      FLAGS_enable_host_event_recorder_hook == false) {
+    return;
+  }
+
+  const bool tracks_allocated = type == TracerMemEventType::Allocate ||
+                                type == TracerMemEventType::Free;
+  const bool tracks_reserved = type == TracerMemEventType::ReservedAllocate ||
+                               type == TracerMemEventType::ReservedFree;
+  if (!tracks_allocated && !tracks_reserved) {
+    return;
+  }
+
+  // CPU and CUDA-pinned places are accounted in the host statistics, every
+  // other place in the device statistics.
+  const bool on_host =
+      platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place);
+
+  uint64_t current_allocated = 0;  // 0 means keep the same as before
+  uint64_t peak_allocated = 0;     // 0 means keep the same as before
+  uint64_t current_reserved = 0;   // 0 means keep the same as before
+  uint64_t peak_reserved = 0;      // 0 means keep the same as before
+
+  if (tracks_allocated) {
+    if (on_host) {
+      current_allocated =
+          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+      peak_allocated =
+          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+    } else {
+      current_allocated =
+          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
+      peak_allocated =
+          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
+    }
+  } else {
+    if (on_host) {
+      current_reserved =
+          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+      peak_reserved =
+          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+    } else {
+      current_reserved =
+          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
+      peak_reserved =
+          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
+    }
+  }
+
+  const bool is_release = type == TracerMemEventType::Free ||
+                          type == TracerMemEventType::ReservedFree;
+  if (is_release) {
+    platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
+                                                       place,
+                                                       size,
+                                                       type,
+                                                       current_allocated,
+                                                       current_reserved,
+                                                       peak_allocated,
+                                                       peak_reserved);
+  } else {
+    platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
+                                                        place,
+                                                        size,
+                                                        type,
+                                                        current_allocated,
+                                                        current_reserved,
+                                                        peak_allocated,
+                                                        peak_reserved);
+  }
+}
+
+void MemEvenRecorder::PushMemRecord(const void *ptr,
+                                    const Place &place,
                                    size_t size) {
-  if (g_state == ProfilerState::kDisabled) return;
+  if (g_state == ProfilerState::kDisabled) {
+    return;
+  }
  std::lock_guard<std::mutex> guard(mtx_);
  auto &events = address_memevent_[place];
-  PADDLE_ENFORCE_EQ(events.count(ptr), 0,
+  PADDLE_ENFORCE_EQ(events.count(ptr),
+                    0,
                    platform::errors::InvalidArgument(
                        "The Place can't exist in the stage of PushMemRecord"));
-  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
+  events.emplace(ptr,
+                 std::unique_ptr<RecordMemEvent>(
                     new MemEvenRecorder::RecordMemEvent(place, size)));
 }

-void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
+void MemEvenRecorder::PushMemRecord(const void *ptr,
+                                    const Place &place,
+                                    size_t size,
+                                    TracerMemEventType type,
+                                    uint64_t current_allocated,
+                                    uint64_t current_reserved,
+                                    uint64_t peak_allocated,
+                                    uint64_t peak_reserved) {
+  std::lock_guard<std::mutex> guard(mtx_);
+  if (FLAGS_enable_host_event_recorder_hook) {  // new MemRecord
+    HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
+        PosixInNsec(),
+        reinterpret_cast<uint64_t>(ptr),
+        type,
+        size,
+        place,
+        current_allocated,
+        current_reserved,
+        peak_allocated,
+        peak_reserved);
+    return;
+  }
+  if (type == TracerMemEventType::ReservedAllocate) {
+    // old profiler only analyse memory managed by paddle.
+    return;
+  }
  if (g_state == ProfilerState::kDisabled) return;
+  auto &events = address_memevent_[place];
+  PADDLE_ENFORCE_EQ(events.count(ptr),
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The Place can't exist in the stage of PushMemRecord"));
+  events.emplace(ptr,
+                 std::unique_ptr<RecordMemEvent>(
+                     new MemEvenRecorder::RecordMemEvent(place, size)));
+}
+
+// Removes the record stored for `ptr` under `place`, if one exists.
+// Does nothing while the profiler state is kDisabled.
+void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
+  if (g_state != ProfilerState::kDisabled) {
+    std::lock_guard<std::mutex> guard(mtx_);
+    auto &records_on_place = address_memevent_[place];
+    auto record = records_on_place.find(ptr);
+    // ptr may be absent from address_memevent_; only erase when it is found.
+    if (record != records_on_place.end()) {
+      records_on_place.erase(record);
+    }
+  }
+}
+
+void MemEvenRecorder::PopMemRecord(const void *ptr,
+                                   const Place &place,
+                                   size_t size,
+                                   TracerMemEventType type,
+                                   uint64_t current_allocated,
+                                   uint64_t current_reserved,
+                                   uint64_t peak_allocated,
+                                   uint64_t peak_reserved) {
  std::lock_guard<std::mutex> guard(mtx_);
+  if (FLAGS_enable_host_event_recorder_hook) {  // new MemRecord
+    HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
+        PosixInNsec(),
+        reinterpret_cast<uint64_t>(ptr),
+        type,
+        -size,
+        place,
+        current_allocated,
+        current_reserved,
+        peak_allocated,
+        peak_reserved);
+    return;
+  }
+  if (type == TracerMemEventType::ReservedFree) {
+    // old profiler only analyse memory managed by paddle.
+    return;
+  }
+  if (g_state == ProfilerState::kDisabled) return;
  auto &events = address_memevent_[place];
  auto iter = events.find(ptr);
  // The ptr maybe not in address_memevent
@@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {

  auto annotation_free = CurAnnotationName();
  if (tracer) {
-    tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
-                             annotation_free, g_mem_thread_id);
+    tracer->AddMemInfoRecord(start_ns_,
+                             end_ns_,
+                             bytes_,
+                             place_,
+                             alloc_in_,
+                             annotation_free,
+                             g_mem_thread_id);
  }
  PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
 }
@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() {
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
-    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
-                          g_thread_id);
+    tracer->AddCPURecords(
+        name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id);
  }
  ClearCurBlock();
 }

-void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                  const Place &place, const std::string &annotation) {
-  GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
-                           place, g_mem_thread_id, annotation);
-}
-
-void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                 const Place &place, const std::string &annotation) {
-  GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
-                           g_mem_thread_id, annotation);
+void PushMemEvent(uint64_t start_ns,
+                  uint64_t end_ns,
+                  size_t bytes,
+                  const Place &place,
+                  const std::string &annotation) {
+  GetMemEventList().Record(EventType::kPushRange,
+                           start_ns,
+                           end_ns,
+                           bytes,
+                           place,
+                           g_mem_thread_id,
+                           annotation);
+}
+
+void PopMemEvent(uint64_t start_ns,
+                 uint64_t end_ns,
+                 size_t bytes,
+                 const Place &place,
+                 const std::string &annotation) {
+  GetMemEventList().Record(EventType::kPopRange,
+                           start_ns,
+                           end_ns,
+                           bytes,
+                           place,
+                           g_mem_thread_id,
+                           annotation);
 }

 void Mark(const std::string &name) {
@@ -334,17 +590,19 @@ void Mark(const std::string &name) {
  GetEventList().Record(EventType::kMark, name, g_thread_id);
 }

-Event *PushEvent(const std::string &name, const EventRole role,
+Event *PushEvent(const std::string &name,
+                 const EventRole role,
                 std::string attr) {
-  return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role,
-                               attr);
+  return GetEventList().Record(
+      EventType::kPushRange, name, g_thread_id, role, attr);
 }

 void PopEvent(const std::string &name, const EventRole role, std::string attr) {
  GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
 }
 void EnableProfiler(ProfilerState state) {
-  PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
+  PADDLE_ENFORCE_NE(state,
+                    ProfilerState::kDisabled,
                    platform::errors::InvalidArgument(
                        "Can't enable profiling, since the input state is"
                        "ProfilerState::kDisabled"));
@@ -380,7 +638,8 @@ void ResetProfiler() {
    (*it)->Clear();
  }
  for (auto it = g_all_mem_event_lists.begin();
-       it != g_all_mem_event_lists.end(); ++it) {
+       it != g_all_mem_event_lists.end();
+       ++it) {
    (*it)->Clear();
  }
 }
@@ -576,8 +835,8 @@ static void EmulateEventPushAndPop(
      std::string name =
          prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name;
      const char *attr = (evt.attr == nullptr ? "none" : evt.attr);
-      Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid,
-                                             evt.role, attr);
+      Event *orig_evt = cur_thr_list->Record(
+          EventType::kPushRange, name, tid, evt.role, attr);
      (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns);
      cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr);
    }
@@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd(
  for (const auto &thr_sec : host_sec.thr_sections) {
    uint64_t tid = thr_sec.thread_id;
    for (const auto &evt : thr_sec.events) {
-      tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(),
-                            tid);
+      tracer->AddCPURecords(
+          evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid);
    }
  }
 }

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -30,6 +30,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
@@ -102,6 +104,22 @@ struct MemEvenRecorder {
 public:
  void PushMemRecord(const void* ptr, const Place& place, size_t size);
  void PopMemRecord(const void* ptr, const Place& place);
+  void PushMemRecord(const void* ptr,
+                     const Place& place,
+                     size_t size,
+                     TracerMemEventType type,
+                     uint64_t current_allocated,
+                     uint64_t current_reserved,
+                     uint64_t peak_allocated,
+                     uint64_t peak_reserved);
+  void PopMemRecord(const void* ptr,
+                    const Place& place,
+                    size_t size,
+                    TracerMemEventType type,
+                    uint64_t current_allocated,
+                    uint64_t current_reserved,
+                    uint64_t peak_allocated,
+                    uint64_t peak_reserved);
  void Flush();
  static MemEvenRecorder& Instance() { return recorder; }

@@ -160,7 +178,8 @@ struct EventList {
  std::vector<T> Reduce() {
    std::vector<T> result;
    for (auto& block : event_blocks) {
-      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+      result.insert(result.begin(),
+                    std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
@@ -173,13 +192,21 @@ struct EventList {
 };

 void Mark(const std::string& name);
-void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                  const Place& place, const std::string& annotation);
-void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
-                 const Place& place, const std::string& annotation);
-Event* PushEvent(const std::string& name, const EventRole role,
+void PushMemEvent(uint64_t start_ns,
+                  uint64_t end_ns,
+                  size_t bytes,
+                  const Place& place,
+                  const std::string& annotation);
+void PopMemEvent(uint64_t start_ns,
+                 uint64_t end_ns,
+                 size_t bytes,
+                 const Place& place,
+                 const std::string& annotation);
+Event* PushEvent(const std::string& name,
+                 const EventRole role,
                 const std::string attr = "none");
-void PopEvent(const std::string& name, const EventRole role,
+void PopEvent(const std::string& name,
+              const EventRole role,
              const std::string attr = "none");
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.

--- a/paddle/fluid/platform/profiler/CMakeLists.txt
+++ b/paddle/fluid/platform/profiler/CMakeLists.txt
 cc_library(
  host_tracer
  SRCS host_tracer.cc
-  DEPS enforce)
+  DEPS enforce ddim var_type_traits)
 cc_library(
  cuda_tracer
  SRCS cuda_tracer.cc cupti_data_process.cc
@@ -10,7 +10,7 @@ add_subdirectory(mlu)
 cc_library(
  event_node
  SRCS event_node.cc
-  DEPS enforce)
+  DEPS enforce place)
 cc_library(
  profiler_utils
  SRCS utils.cc

--- a/paddle/fluid/platform/profiler/common_event.h
+++ b/paddle/fluid/platform/profiler/common_event.h
@@ -18,16 +18,21 @@
 #include <functional>
 #include <string>

+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/event.h"  // import EventRole, TODO(TIEXING): remove later
 #include "paddle/fluid/platform/profiler/trace_event.h"
+#include "paddle/phi/core/ddim.h"

 namespace paddle {
 namespace platform {

 struct CommonEvent {
 public:
-  CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns,
-              EventRole role, TracerEventType type)
+  CommonEvent(const char *name,
+              uint64_t start_ns,
+              uint64_t end_ns,
+              EventRole role,
+              TracerEventType type)
      : name(name),
        start_ns(start_ns),
        end_ns(end_ns),
@@ -35,8 +40,12 @@ struct CommonEvent {
        type(type) {}

  CommonEvent(std::function<void *(size_t)> arena_allocator,
-              const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
-              EventRole role, TracerEventType type, const std::string &attr_str)
+              const std::string &name_str,
+              uint64_t start_ns,
+              uint64_t end_ns,
+              EventRole role,
+              TracerEventType type,
+              const std::string &attr_str)
      : start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
    auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
    strncpy(buf, name_str.c_str(), name_str.length() + 1);
@@ -47,8 +56,11 @@ struct CommonEvent {
  }

  CommonEvent(std::function<void *(size_t)> arena_allocator,
-              const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
-              EventRole role, TracerEventType type)
+              const std::string &name_str,
+              uint64_t start_ns,
+              uint64_t end_ns,
+              EventRole role,
+              TracerEventType type)
      : start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
    auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
    strncpy(buf, name_str.c_str(), name_str.length() + 1);
@@ -63,5 +75,61 @@ struct CommonEvent {
  const char *attr = nullptr;  // not owned, designed for performance
 };

+// One host-side memory event as captured at record time. The constructor
+// stores every argument in the member of the same name; host_tracer.cc later
+// copies these members one by one into a MemTraceEvent.
+struct CommonMemEvent {
+ public:
+  // All nine members are set from the arguments. The initializer order
+  // follows the member declaration order below.
+  CommonMemEvent(uint64_t timestamp_ns,
+                 uint64_t addr,
+                 TracerMemEventType type,
+                 int64_t increase_bytes,
+                 const Place &place,
+                 uint64_t current_allocated,
+                 uint64_t current_reserved,
+                 uint64_t peak_allocated,
+                 uint64_t peak_reserved)
+      : timestamp_ns(timestamp_ns),
+        addr(addr),
+        type(type),
+        increase_bytes(increase_bytes),
+        place(place),
+        // These two were previously left out of the initializer list, so the
+        // members held indeterminate values when read by the tracer.
+        current_allocated(current_allocated),
+        current_reserved(current_reserved),
+        peak_allocated(peak_allocated),
+        peak_reserved(peak_reserved) {}
+  uint64_t timestamp_ns;
+  uint64_t addr;
+  TracerMemEventType type;
+  // Signed byte delta supplied by the caller.
+  int64_t increase_bytes;
+  Place place;
+  // Allocator statistics supplied by the caller. The names suggest running
+  // and peak totals -- NOTE(review): confirm against the recording site.
+  uint64_t current_allocated;
+  uint64_t current_reserved;
+  uint64_t peak_allocated;
+  uint64_t peak_reserved;
+};
+
+// Raw per-operator supplement record: the operator type name plus the input
+// shapes, dtypes and callstack handed to the constructor. host_tracer.cc
+// converts these records into OperatorSupplementEvent objects.
+struct OperatorSupplementOriginEvent {
+ public:
+  // arena_allocator: called once with type_name.length() + 1 and must return
+  //                  a buffer of at least that many bytes; the type name
+  //                  (including its terminating NUL) is copied into it.
+  // input_shapes, dtypes, callstack: copied into the members of the same
+  //                  name. callstack is taken by const reference so that the
+  //                  only copy made is the one into the member.
+  OperatorSupplementOriginEvent(
+      std::function<void *(size_t)> arena_allocator,
+      uint64_t timestamp_ns,
+      const std::string &type_name,
+      const std::map<std::string, std::vector<framework::DDim>> &input_shapes,
+      const std::map<std::string, std::vector<framework::proto::VarType::Type>>
+          &dtypes,
+      const std::vector<std::string> &callstack)
+      : timestamp_ns(timestamp_ns),
+        input_shapes(input_shapes),
+        dtypes(dtypes),
+        callstack(callstack) {
+    // length() + 1 so that strncpy also copies the terminating NUL.
+    auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
+    strncpy(buf, type_name.c_str(), type_name.length() + 1);
+    op_type = buf;
+  }
+  uint64_t timestamp_ns;
+  const char *op_type = nullptr;  // not owned, designed for performance
+  // input shapes
+  std::map<std::string, std::vector<framework::DDim>> input_shapes;
+  std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
+  // call stack
+  const std::vector<std::string> callstack;
+};
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/profiler/host_tracer.cc
+++ b/paddle/fluid/platform/profiler/host_tracer.cc
@@ -11,9 +11,10 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/platform/profiler/host_tracer.h"

+#include <sstream>
+
 #include "glog/logging.h"
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/profiler/common_event.h"
@@ -21,7 +22,8 @@

 // Used to filter events, works like glog VLOG(level).
 // RecordEvent will works if host_trace_level >= level.
-PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1,
+PADDLE_DEFINE_EXPORTED_int64(host_trace_level,
+                             1,
                             "RecordEvent will works "
                             "if host_trace_level >= level.");

@@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events,
  }
 }

+// Walks every thread section of `host_mem_events`, turns each recorded
+// CommonMemEvent into a MemTraceEvent and hands it to `collector`.
+void ProcessHostMemEvents(
+    const HostEventSection<CommonMemEvent>& host_mem_events,
+    TraceEventCollector* collector) {
+  for (const auto& section : host_mem_events.thr_sections) {
+    const uint64_t thread_id = section.thread_id;
+    // Only non-default thread names are reported to the collector.
+    if (section.thread_name != kDefaultThreadName) {
+      collector->AddThreadName(thread_id, section.thread_name);
+    }
+    for (const auto& recorded : section.events) {
+      MemTraceEvent trace_event;
+      // What happened, when and where.
+      trace_event.timestamp_ns = recorded.timestamp_ns;
+      trace_event.addr = recorded.addr;
+      trace_event.type = recorded.type;
+      trace_event.increase_bytes = recorded.increase_bytes;
+      // The place is exported as its DebugString() text.
+      trace_event.place = recorded.place.DebugString();
+      // Allocator statistics carried by the record.
+      trace_event.current_allocated = recorded.current_allocated;
+      trace_event.current_reserved = recorded.current_reserved;
+      trace_event.peak_allocated = recorded.peak_allocated;
+      trace_event.peak_reserved = recorded.peak_reserved;
+      // Origin of the record.
+      trace_event.process_id = host_mem_events.process_id;
+      trace_event.thread_id = thread_id;
+      collector->AddMemEvent(std::move(trace_event));
+    }
+  }
+}
+
+// Converts the raw per-operator supplement records in `op_supplement_events`
+// into OperatorSupplementEvent objects and hands them to `collector`:
+//   * each DDim is expanded into a std::vector<int64_t>,
+//   * each dtype enum is replaced by its VarType::Type_Name() string,
+//   * the callstack entries are joined into one string, one entry per line.
+void ProcessOperatorSupplementEvents(
+    const HostEventSection<OperatorSupplementOriginEvent>& op_supplement_events,
+    TraceEventCollector* collector) {
+  for (const auto& thr_sec : op_supplement_events.thr_sections) {
+    uint64_t tid = thr_sec.thread_id;
+    // Only non-default thread names are reported to the collector.
+    if (thr_sec.thread_name != kDefaultThreadName) {
+      collector->AddThreadName(tid, thr_sec.thread_name);
+    }
+    for (const auto& evt : thr_sec.events) {
+      OperatorSupplementEvent event;
+      event.timestamp_ns = evt.timestamp_ns;
+      event.op_type = evt.op_type;
+
+      std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
+      for (const auto& name_and_dims : evt.input_shapes) {
+        for (const auto& ddim : name_and_dims.second) {
+          // The map entry is looked up inside this loop, so a name with no
+          // DDims produces no key in the output (same as before).
+          auto& shapes = input_shapes[name_and_dims.first];
+          shapes.emplace_back();
+          auto& shape = shapes.back();
+          for (auto dim_idx = 0; dim_idx < ddim.size(); dim_idx++) {
+            shape.push_back(ddim.at(dim_idx));
+          }
+        }
+      }
+
+      std::map<std::string, std::vector<std::string>> dtypes;
+      for (const auto& name_and_types : evt.dtypes) {
+        for (const auto& dtype : name_and_types.second) {
+          dtypes[name_and_types.first].push_back(
+              framework::proto::VarType::Type_Name(dtype));
+        }
+      }
+
+      // '\n' instead of std::endl: same characters, no flush per entry.
+      std::ostringstream result_string;
+      for (const auto& frame : evt.callstack) {
+        result_string << frame << '\n';
+      }
+
+      // The local maps are not used again, so move them instead of copying.
+      event.input_shapes = std::move(input_shapes);
+      event.dtypes = std::move(dtypes);
+      event.callstack = result_string.str();
+      event.process_id = op_supplement_events.process_id;
+      event.thread_id = tid;
+      collector->AddOperatorSupplementEvent(std::move(event));
+    }
+  }
+}
+
 }  // namespace

 void HostTracer::PrepareTracing() {
@@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() {

 void HostTracer::StartTracing() {
  PADDLE_ENFORCE_EQ(
-      state_ == TracerState::READY || state_ == TracerState::STOPED, true,
+      state_ == TracerState::READY || state_ == TracerState::STOPED,
+      true,
      platform::errors::PreconditionNotMet("TracerState must be READY"));
  HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
+  HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
+  HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
+      .GatherEvents();
  HostTraceLevel::GetInstance().SetLevel(options_.trace_level);
  state_ = TracerState::STARTED;
 }

 void HostTracer::StopTracing() {
  PADDLE_ENFORCE_EQ(
-      state_, TracerState::STARTED,
+      state_,
+      TracerState::STARTED,
      platform::errors::PreconditionNotMet("TracerState must be STARTED"));
  HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled);
  state_ = TracerState::STOPED;
@@ -77,11 +157,19 @@ void HostTracer::StopTracing() {

 void HostTracer::CollectTraceData(TraceEventCollector* collector) {
  PADDLE_ENFORCE_EQ(
-      state_, TracerState::STOPED,
+      state_,
+      TracerState::STOPED,
      platform::errors::PreconditionNotMet("TracerState must be STOPED"));
  HostEventSection<CommonEvent> host_events =
      HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
  ProcessHostEvents(host_events, collector);
+  HostEventSection<CommonMemEvent> host_mem_events =
+      HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
+  ProcessHostMemEvents(host_mem_events, collector);
+  HostEventSection<OperatorSupplementOriginEvent> op_supplement_events =
+      HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
+          .GatherEvents();
+  ProcessOperatorSupplementEvents(op_supplement_events, collector);
 }

 }  // namespace platform

--- a/paddle/fluid/platform/profiler/mem_tracing.h
+++ b/paddle/fluid/platform/profiler/mem_tracing.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler/trace_event.h"
+
+namespace paddle {
+namespace platform {
+// Memory event tracing. A trace marks memory manipulation such as allocation
+// and free.
+// The events can be used to draw memory variation curve.
+class RecordMemEvent {
+ public:
+  /**
+   * Declaration only; the constructor is defined outside this header.
+   *
+   * @param ptr:  Pointer address that was allocated or freed.
+   * @param place: Device for this memory event.
+   * @param size: Size of the memory that was allocated or freed.
+   * @param type: Kind of memory manipulation this event denotes; defaults to
+   *              TracerMemEventType::Allocate when omitted.
+   */
+  explicit RecordMemEvent(
+      const void* ptr,
+      const Place& place,
+      size_t size,
+      const TracerMemEventType type = TracerMemEventType::Allocate);
+};
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/profiler/profiler_test.cc
+++ b/paddle/fluid/platform/profiler/profiler_test.cc
@@ -23,6 +23,8 @@
 #ifdef PADDLE_WITH_HIP
 #include <hip/hip_runtime.h>
 #endif
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler/event_python.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/fluid/platform/profiler/profiler.h"
@@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) {
  profiler->Prepare();
  profiler->Start();
  {
-    RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined,
-                       2);
-    RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined,
-                       3);
+    RecordInstantEvent(
+        "TestTraceLevel_record1", TracerEventType::UserDefined, 2);
+    RecordInstantEvent(
+        "TestTraceLevel_record2", TracerEventType::UserDefined, 3);
  }
  auto profiler_result = profiler->Stop();
  auto nodetree = profiler_result->GetNodeTrees();
@@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) {
  EXPECT_GT(runtime_events.size(), 0u);
 #endif
 }
+
+// Smoke test: runs a profiler while memory events are recorded inside two
+// RecordEvent scopes. The only expectation checked is that the profiler could
+// be created; the collected result is fetched but not inspected.
+TEST(ProfilerTest, TestHostTracerForMem) {
+  using paddle::platform::CPUPlace;
+  using paddle::platform::EnableHostEventRecorder;
+  using paddle::platform::MemTraceEventNode;
+  using paddle::platform::Profiler;
+  using paddle::platform::ProfilerOptions;
+  using paddle::platform::ProfilerResult;
+  using paddle::platform::RecordEvent;
+  using paddle::platform::RecordInstantEvent;
+  using paddle::platform::RecordMemEvent;
+  using paddle::platform::TracerEventType;
+  using paddle::platform::TracerMemEventType;
+
+  ProfilerOptions opts;
+  opts.trace_level = 1;
+  opts.trace_switch = 3;
+  auto host_profiler = Profiler::Create(opts);
+  EXPECT_TRUE(host_profiler);
+  EnableHostEventRecorder();
+  host_profiler->Prepare();
+  host_profiler->Start();
+  {
+    // Phase 1: an Allocate and a Free of 1024 bytes at address 0. The braces
+    // bound the lifetime of the enclosing RecordEvent object.
+    RecordEvent phase1_scope(
+        "TestTracerForMem_phase1", TracerEventType::UserDefined, 1);
+    RecordMemEvent(reinterpret_cast<void*>(0),
+                   CPUPlace(),
+                   1024,
+                   TracerMemEventType::Allocate);
+    RecordMemEvent(
+        reinterpret_cast<void*>(0), CPUPlace(), 1024, TracerMemEventType::Free);
+  }
+  {
+    // Phase 2: the same pair of events at address 1024.
+    RecordEvent phase2_scope(
+        "TestTracerForMem_phase2", TracerEventType::UserDefined, 1);
+    RecordMemEvent(reinterpret_cast<void*>(1024),
+                   CPUPlace(),
+                   1024,
+                   TracerMemEventType::Allocate);
+    RecordMemEvent(reinterpret_cast<void*>(1024),
+                   CPUPlace(),
+                   1024,
+                   TracerMemEventType::Free);
+  }
+  auto result = host_profiler->Stop();
+  auto node_trees = result->GetNodeTrees();
+}
--- a/paddle/fluid/platform/profiler/supplement_tracing.h
+++ b/paddle/fluid/platform/profiler/supplement_tracing.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/type_defs.h"
+#include "paddle/fluid/platform/profiler/trace_event.h"
+
+namespace paddle {
+
+namespace framework {
+class RuntimeContext;
+}
+namespace platform {
+
+class RecordOpInfoSupplement {
+ public:
+  /**
+   * Declaration only: the constructor is defined outside this header, so
+   * what exactly gets recorded cannot be read off from here. All four
+   * arguments are taken by const reference.
+   *
+   * @param type:  Operator type name.
+   * @param attrs: Attribute map of op.
+   * @param shape_ctx: Infershape context object.
+   * @param ctx: Runtime context object.
+   */
+  explicit RecordOpInfoSupplement(const std::string& type,
+                                  const framework::AttributeMap& attrs,
+                                  const framework::InferShapeContext& shape_ctx,
+                                  const framework::RuntimeContext& ctx);
+};
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc