From 8dd0a3b935192ab7eb28f4af0ecc7884bdefd594 Mon Sep 17 00:00:00 2001
From: chenjian
Date: Fri, 24 Jun 2022 10:35:29 +0800
Subject: [PATCH] record memory and op supplement info (#43550)

* record memory and op supplement info
* update
* update
* fix a bug
* fix memory recording
* fix a bug
* update
* update
* fix a bug
* update
* fix a bug
* fix a bug
* fix a bug
* Revert "fix a bug"

This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5.

* fix a bug
* fix format
* fix
---
 .../framework/new_executor/interpretercore.cc |   6 +
 paddle/fluid/framework/operator.cc            | 357 ++++++---
 paddle/fluid/memory/allocation/CMakeLists.txt |   4 +-
 .../allocation/naive_best_fit_allocator.cc    | 113 +--
 .../memory/allocation/pinned_allocator.cc     |   9 +
 .../fluid/memory/allocation/stat_allocator.h  |  29 +-
 .../fluid/memory/detail/system_allocator.cc   | 126 +++-
 paddle/fluid/memory/memcpy.cc                 | 581 ++++++++++-----
 paddle/fluid/platform/CMakeLists.txt          |  21 +-
 paddle/fluid/platform/device/gpu/gpu_info.cc  | 101 ++-
 paddle/fluid/platform/profiler.cc             | 343 +++++++--
 paddle/fluid/platform/profiler.h              |  41 +-
 paddle/fluid/platform/profiler/CMakeLists.txt |   4 +-
 paddle/fluid/platform/profiler/common_event.h |  80 +-
 paddle/fluid/platform/profiler/host_tracer.cc |  98 ++-
 paddle/fluid/platform/profiler/mem_tracing.h  |  43 ++
 .../fluid/platform/profiler/profiler_test.cc  |  56 +-
 .../platform/profiler/supplement_tracing.h    |  45 ++
 paddle/fluid/pybind/pybind.cc                 | 697 ++++++++++++------
 19 files changed, 2031 insertions(+), 723 deletions(-)
 create mode 100644 paddle/fluid/platform/profiler/mem_tracing.h
 create mode 100644 paddle/fluid/platform/profiler/supplement_tracing.h

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index dfa2179f44..c61243041a 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -24,6 +24,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"
 #include "paddle/phi/core/kernel_context.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
       op_with_kernel->Info().infer_shape_(
           instr_node.InnerInferShapeContext().get());
     }
+    infershape_event.End();
+    platform::RecordOpInfoSupplement(op->Type(),
+                                     op->Attrs(),
+                                     *(instr_node.InnerInferShapeContext()),
+                                     *(instr_node.InnerRuntimeContext()));
   }
 }

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index dbf6bec676..31c3ea7607 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -31,6 +31,7 @@ limitations under the License.
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" @@ -70,7 +71,8 @@ std::vector> kKernelPriority = { std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), }; -static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, +static DDim GetDimsDebug(const ScopeBase& scope, + const std::string& name, bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -264,7 +266,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { Type(), platform::TracerEventType::Operator, 1); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, + op_name, + platform::TracerEventType::Operator, FLAGS_enable_host_event_recorder_hook ? 20 : 1, platform::EventRole::kUniqueOp); RunImpl(scope, place); @@ -293,9 +296,11 @@ bool OperatorBase::HasInputs(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE( - ins.size(), 1UL, + ins.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's input %s should contain only one variable.", type_, + "Operator %s's input %s should contain only one variable.", + type_, name)); return ins.empty() ? kEmptyVarName : ins[0]; } @@ -304,9 +309,10 @@ const std::vector& OperatorBase::Inputs( const std::string& name) const { auto it = inputs_.find(name); PADDLE_ENFORCE_NE( - it, inputs_.end(), - platform::errors::NotFound("Operator %s does not have the input %s.", - type_, name)); + it, + inputs_.end(), + platform::errors::NotFound( + "Operator %s does not have the input %s.", type_, name)); return it->second; } @@ -321,9 +327,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const { std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); PADDLE_ENFORCE_LE( - outs.size(), 1UL, + outs.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's output %s should contain only one variable.", type_, + "Operator %s's output %s should contain only one variable.", + type_, name)); return outs.empty() ? 
kEmptyVarName : outs[0]; } @@ -332,7 +340,8 @@ const std::vector& OperatorBase::Outputs( const std::string& name) const { auto it = outputs_.find(name); PADDLE_ENFORCE_NE( - it, outputs_.end(), + it, + outputs_.end(), platform::errors::NotFound( "Operator %s does not have an output called %s.", type_, name)); return it->second; @@ -480,18 +489,20 @@ void OperatorBase::CheckAllInputOutputSet() const { for (auto& in : info_->Proto().inputs()) { if (!in.dispensable() && !in.extra()) { PADDLE_ENFORCE_NE( - inputs_.find(in.name()), inputs_.end(), - platform::errors::NotFound("Operator %s's input (%s) is not set.", - Type(), in.name())); + inputs_.find(in.name()), + inputs_.end(), + platform::errors::NotFound( + "Operator %s's input (%s) is not set.", Type(), in.name())); } } for (auto& out : info_->Proto().outputs()) { if (!out.dispensable() && !out.extra()) { PADDLE_ENFORCE_NE( - outputs_.find(out.name()), outputs_.end(), - platform::errors::NotFound("Operator %s's output (%s) is not set.", - Type(), out.name())); + outputs_.find(out.name()), + outputs_.end(), + platform::errors::NotFound( + "Operator %s's output (%s) is not set.", Type(), out.name())); } } } @@ -564,10 +575,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { if (it == ctx_.inputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's input %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -576,10 +589,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { if (it == ctx_.outputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's output %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -594,10 +609,13 @@ const std::vector ExecutionContext::MultiInput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](const Variable* var) -> const Tensor* { if (var == nullptr) return nullptr; - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), + true, platform::errors::InvalidArgument( "Input variable should be LoDTensor, " "but the received type is %s.", @@ -617,7 +635,9 @@ std::vector ExecutionContext::MultiOutput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](Variable* var) -> Tensor* { return var == nullptr ? 
nullptr : var->GetMutable(); @@ -675,7 +695,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const auto& in = it->second; if (in.size() == 0) return false; PADDLE_ENFORCE_EQ( - in.size(), 1UL, + in.size(), + 1UL, platform::errors::InvalidArgument( "Input %s should not contain more than one inputs.", name)); return in[0] != nullptr; @@ -693,7 +714,8 @@ class RuntimeInferShapeContext : public InferShapeContext { return false; } PADDLE_ENFORCE_EQ( - out.size(), 1UL, + out.size(), + 1UL, platform::errors::InvalidArgument( "Output %s should not contain more than one outputs.", name)); return out[0] != nullptr; @@ -750,11 +772,14 @@ class RuntimeInferShapeContext : public InferShapeContext { std::string GetInputNameByIdx(size_t idx) const override { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; - PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + PADDLE_ENFORCE_LT(idx, + op_proto->inputs().size(), platform::errors::OutOfRange( "The index should be less than the size of inputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->inputs().size())); + op_.Type(), + idx, + op_proto->inputs().size())); return op_proto->inputs()[idx].name(); } @@ -762,42 +787,55 @@ class RuntimeInferShapeContext : public InferShapeContext { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; PADDLE_ENFORCE_LT( - idx, op_proto->outputs().size(), + idx, + op_proto->outputs().size(), platform::errors::OutOfRange( "The index should be less than the size of outputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->outputs().size())); + op_.Type(), + idx, + op_proto->outputs().size())); return op_proto->outputs()[idx].name(); } - void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + void ShareDim(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second[i]; Variable* out_var = out_it->second[j]; PADDLE_ENFORCE_EQ( - in_var->Type(), out_var->Type(), + in_var->Type(), + out_var->Type(), platform::errors::InvalidArgument( - "The type of input (%s) and output (%s) are inconsistent.", in, + "The type of input (%s) and output (%s) are inconsistent.", + in, out)); if (in_var->IsType()) { @@ -821,19 +859,22 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::string& out) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + PADDLE_ENFORCE_NE(in_it, + ctx_.inputs.end(), 
platform::errors::NotFound( "Input [%s] found error in Op [%s]", in, op_.Type())); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output [%s] found error in Op [%s]", out, - op_.Type())); + out_it, + ctx_.outputs.end(), + platform::errors::NotFound( + "Output [%s] found error in Op [%s]", out, op_.Type())); auto& in_var_list = in_it->second; auto& out_var_list = out_it->second; PADDLE_ENFORCE_EQ( - in_var_list.size(), out_var_list.size(), + in_var_list.size(), + out_var_list.size(), platform::errors::PreconditionNotMet( "Op [%s]: Input var size should be equal with output var size", op_.Type())); @@ -848,10 +889,12 @@ class RuntimeInferShapeContext : public InferShapeContext { Variable* in_var = in_var_list[i]; if (!in_var->IsType()) return; Variable* out_var = out_var_list[i]; - PADDLE_ENFORCE_EQ(out_var->IsType(), true, + PADDLE_ENFORCE_EQ(out_var->IsType(), + true, platform::errors::PreconditionNotMet( "The %d-th output of Output(%s) must be LoDTensor.", - i, out_var_names[i])); + i, + out_var_names[i])); auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -862,32 +905,41 @@ class RuntimeInferShapeContext : public InferShapeContext { } } - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + void ShareLoD(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE_EQ( - out_var->IsType(), true, + out_var->IsType(), + true, platform::errors::InvalidArgument( "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); auto& in_tensor = in_var->Get(); @@ -922,7 +974,8 @@ class RuntimeInferShapeContext : public InferShapeContext { "set in the runtime kernel.")); } - void SetLoDLevel(const std::string& out, int32_t lod_level, + void SetLoDLevel(const std::string& out, + int32_t lod_level, size_t j = 0) const override { PADDLE_THROW(platform::errors::PreconditionNotMet( "SetLoDLevel is only used in compile time. 
The calculation of " @@ -965,10 +1018,12 @@ class RuntimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string& name) const override { const std::vector& vars = InputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument( "Input(%s) should hold one element, but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); return this->GetDim(vars[0]); } @@ -994,10 +1049,12 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string& name, const DDim& dim) override { auto& vars = OutputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument("Output(%s) should hold one element, " "but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); SetDim(vars[0], dim); } @@ -1034,7 +1091,9 @@ class RuntimeInferShapeContext : public InferShapeContext { std::vector GetDims(const std::vector& vars) const { std::vector ret; ret.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(ret), [this](Variable* var) { return this->GetDim(var); }); return ret; } @@ -1060,12 +1119,14 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDims(const std::vector& vars, const std::vector& dims) { size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size(), + PADDLE_ENFORCE_EQ(length, + dims.size(), platform::errors::InvalidArgument( "The number of input variables do not match the " "number of input dimensions, the number of variables " "is %zu, the number of dimensions is %zu.", - length, dims.size())); + length, + dims.size())); for (size_t i = 0; i < length; ++i) { if (vars[i] == nullptr) { continue; @@ -1084,9 +1145,12 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& vars) const { std::vector retv; retv.resize(vars.size()); - std::transform(vars.begin(), vars.end(), retv.begin(), + std::transform(vars.begin(), + vars.end(), + retv.begin(), std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), - this, std::placeholders::_1)); + this, + std::placeholders::_1)); return retv; } @@ -1098,7 +1162,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& InputVars(const std::string& name) const { auto it = ctx_.inputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.inputs.end(), + it, + ctx_.inputs.end(), platform::errors::NotFound( "Operator (%s) does not have the input (%s).", op_.Type(), name)); return it->second; @@ -1107,7 +1172,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& OutputVars(const std::string& name) const { auto it = ctx_.outputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.outputs.end(), + it, + ctx_.outputs.end(), platform::errors::NotFound( "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); return it->second; @@ -1143,20 +1209,23 @@ static void CheckTensorNANOrInf(const std::string& op_type, return; } PADDLE_ENFORCE_NE( - framework::TensorContainsInf(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", - op_type, name)); + framework::TensorContainsInf(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains Inf.", op_type, name)); PADDLE_ENFORCE_NE( - framework::TensorContainsNAN(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains NAN.", - op_type, name)); + 
framework::TensorContainsNAN(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains NAN.", op_type, name)); } bool OperatorWithKernel::SupportGPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); @@ -1169,7 +1238,8 @@ bool OperatorWithKernel::SupportGPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_gpu_place(kern_pair.first.place_); }); @@ -1181,7 +1251,8 @@ bool OperatorWithKernel::SupportNPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::NPU; }); @@ -1194,7 +1265,8 @@ bool OperatorWithKernel::SupportNPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_npu_place(kern_pair.first.place_); }); @@ -1214,7 +1286,8 @@ bool OperatorWithKernel::SupportsMKLDNN( return false; } auto& op_kernels = op_kernel_iter->second; - return std::any_of(op_kernels.begin(), op_kernels.end(), + return std::any_of(op_kernels.begin(), + op_kernels.end(), [data_type](OpKernelMap::const_reference kern_pair) { return platform::is_cpu_place(kern_pair.first.place_) && kern_pair.first.library_type_ == @@ -1496,10 +1569,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("prepare_data", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (need_prepare_data_) { - transfer_scope = PrepareData(scope, *kernel_type_, - &transfered_inplace_vars, runtime_ctx); + transfer_scope = PrepareData( + scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); } } // exec scope is the scope that kernel actually executed on. 
@@ -1509,9 +1583,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (!all_kernels_must_compute_runtime_shape_) { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + Type(), Attrs(), infer_shape_ctx, *runtime_ctx); } if (FLAGS_enable_unused_var_check) { @@ -1523,7 +1601,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform before building KernelContext @@ -1663,7 +1742,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::Unavailable( "There are no kernels which are registered in the %s operator.", type_)); @@ -1785,10 +1865,12 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", type_, - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator (%s) does not have kernel for %s.", + type_, + KernelTypeToString(expected_kernel_key))); std::lock_guard lock(cache_update_mutex_); if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { @@ -1798,7 +1880,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { } void OperatorWithKernel::TransferInplaceVarsBack( - const Scope& scope, const std::vector& inplace_vars, + const Scope& scope, + const std::vector& inplace_vars, const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; @@ -1809,8 +1892,9 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( - "The variable[%s] is nullptr.", var_name)); + PADDLE_ENFORCE_NOT_NULL(var, + platform::errors::InvalidArgument( + "The variable[%s] is nullptr.", var_name)); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); @@ -1890,7 +1974,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad( } Scope* OperatorWithKernel::PrepareData( - const Scope& scope, const OpKernelType& expected_kernel_key, + const Scope& scope, + const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars, RuntimeContext* ctx) const { Scope* new_scope = nullptr; @@ -1947,8 +2032,8 @@ Scope* OperatorWithKernel::PrepareData( input_vars[i] = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); - platform::MatchShapeToLayout(out, tensor_in->layout(), - DataLayout::kNHWC); + platform::MatchShapeToLayout( + out, 
tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" << var_name_item.first << " in Operator " << type_; @@ -1995,8 +2080,8 @@ Scope* OperatorWithKernel::PrepareData( if (!run_by_executor_ && (platform::is_gpu_place(kernel_type_for_var.place_) || platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope(kernel_type_for_var, - expected_kernel_key, &scope); + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); enable_cache_transfer_scope_ = true; } if (!new_scope) { @@ -2058,7 +2143,8 @@ Scope* OperatorWithKernel::PrepareData( } void OperatorWithKernel::ParseInputDataType( - const Variable* var, const std::string& name, + const Variable* var, + const std::string& name, proto::VarType::Type* data_type) const { if (var != nullptr) { const Tensor* t = nullptr; @@ -2078,17 +2164,20 @@ void OperatorWithKernel::ParseInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } } void OperatorWithKernel::ParseMultiInputDataType( - const std::vector& vars, const std::string& name, + const std::vector& vars, + const std::string& name, proto::VarType::Type* data_type) const { proto::VarType::Type default_data_type = static_cast(-1); @@ -2112,10 +2201,12 @@ void OperatorWithKernel::ParseMultiInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); proto::VarType::Type tmp = paddle::framework::TransToProtoVarType(t->dtype()); PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, @@ -2125,7 +2216,9 @@ void OperatorWithKernel::ParseMultiInputDataType( "consistent or reigster GetExpectedKernelType. 
The " "current variable type is (%s), but the " "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), + Type(), + name, + DataTypeToString(tmp), DataTypeToString(*data_type))); *data_type = tmp; } @@ -2146,7 +2239,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::NotFound( "DataType should be indicated by input Variable at %s.", Type())); return data_type; @@ -2163,12 +2257,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType( ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type); } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::InvalidArgument( "The Input Variable(%s) of (%s) Operator used to determine kernel " "data type is empty or not LoDTensor or SelectedRows or " "LoDTensorArray.", - name, Type())); + name, + Type())); return data_type; } @@ -2200,11 +2296,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t, platform::errors::InvalidArgument( "The Tensor of variable %s is nullptr when promote complex types.")); - PADDLE_ENFORCE_EQ(t->IsInitialized(), true, + PADDLE_ENFORCE_EQ(t->IsInitialized(), + true, platform::errors::InvalidArgument( "The Tensor in the %s Op's Input Variable %s(%s) is " "not initialized.", - Type(), name, ctx.InputName(name))); + Type(), + name, + ctx.InputName(name))); return t; } @@ -2216,7 +2315,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( * the kernel data type. */ proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes( - const ExecutionContext& ctx, const std::string& name1, + const ExecutionContext& ctx, + const std::string& name1, const std::string& name2) const { // 1. 
Get tensor auto* tensor_a = GetTensorFormInputSafely(ctx, name1); @@ -2238,10 +2338,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( } OpKernelType OperatorWithKernel::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, + const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place(), - tensor.layout()); + return OpKernelType( + expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( @@ -2264,16 +2365,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( } Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, const phi::Kernel& pt_kernel, + const Scope& scope, + const phi::Kernel& pt_kernel, const phi::KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const { const auto& input_names = pt_kernel_signature.input_names; auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); Scope* new_scope = nullptr; auto& name_map = Inputs(); const std::unordered_set* no_buffer_ins = nullptr; @@ -2362,7 +2466,8 @@ Scope* OperatorWithKernel::PreparePhiData( } void OperatorWithKernel::BuildPhiKernelContext( - const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, + const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2374,23 +2479,29 @@ void OperatorWithKernel::BuildPhiKernelContext( auto attr_defs = pt_kernel_->args_def().attribute_defs(); auto output_defs = pt_kernel_->args_def().output_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); - PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(), + PADDLE_ENFORCE_EQ(output_names.size(), + output_defs.size(), platform::errors::InvalidArgument( "The size of outputs_args names (%d) must be equal to " "the size of kernel output_defs (%d).", - output_names.size(), output_defs.size())); + output_names.size(), + output_defs.size())); - PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(), + PADDLE_ENFORCE_EQ(attr_names.size(), + attr_defs.size(), platform::errors::InvalidArgument( "The size of attribute_args names (%d) must be equal " "to the size of kernel attribute_defs (%d).", - attr_names.size(), attr_defs.size())); + attr_names.size(), + attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) { auto it = ctx.inputs.find(input_names[i]); @@ -2572,7 +2683,8 @@ void OperatorWithKernel::BuildPhiKernelContext( break; case phi::AttributeType::SCALARS: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); @@ -2636,7 +2748,8 @@ void OperatorWithKernel::BuildPhiKernelContext( } break; default: { PADDLE_ENFORCE_NE( - attr_iter, 
Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index e1b14c4bae..46a46b04b3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( allocator SRCS allocator.cc - DEPS place stats) + DEPS place stats profiler) cc_library( cpu_allocator SRCS cpu_allocator.cc @@ -21,7 +21,7 @@ cc_library( cc_library( naive_best_fit_allocator SRCS naive_best_fit_allocator.cc - DEPS allocator buddy_allocator profiler) + DEPS allocator buddy_allocator) cc_test( naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 7cc95de831..9d5f048a16 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -32,7 +32,8 @@ #endif PADDLE_DEFINE_EXPORTED_bool( - init_allocated_mem, false, + init_allocated_mem, + false, "It is a mistake that the values of the memory allocated by " "BuddyAllocator are always zeroed in some op's implementation. " "To find this error in time, we use init_allocated_mem to indicate " @@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() { std::call_once(init_flag, []() { a = new detail::BuddyAllocator( std::unique_ptr(new detail::CPUAllocator), - platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); }); return a; @@ -95,7 +97,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p, +void Free(const platform::CPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -125,7 +128,8 @@ void *Alloc(const platform::IPUPlace &place, size_t size) { return p; } template <> -void Free(const platform::IPUPlace &place, void *p, +void Free(const platform::IPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -154,7 +158,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { ret = xpu_malloc(reinterpret_cast(&p), size); } PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, + ret, + XPU_SUCCESS, platform::errors::External( "XPU API return wrong value[%d], no enough memory", ret)); if (FLAGS_init_allocated_mem) { @@ -171,7 +176,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { } template <> -void Free(const platform::XPUPlace &place, void *p, +void Free(const platform::XPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); @@ -234,11 +240,13 @@ class NPUBuddyAllocatorList { BuddyAllocator *Get(int npu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetNPUDeviceId(devices_[pos]); @@ -246,7 +254,8 @@ 
class NPUBuddyAllocatorList { new BuddyAllocator(std::unique_ptr( new detail::NPUAllocator(devices_[pos])), platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE)); + platform::NPUMaxChunkSize(), + EXTRA_PADDING_SIZE)); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -312,8 +321,10 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " "%s, NpuMaxChunkSize %s, NPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -331,7 +342,8 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { } template <> -void Free(const platform::NPUPlace &place, void *p, +void Free(const platform::NPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_ASCEND_CL VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -384,7 +396,8 @@ void *Alloc(const platform::NPUPinnedPlace &place, template <> void Free(const platform::NPUPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #ifdef PADDLE_WITH_ASCEND_CL GetNPUPinnedBuddyAllocator()->Free(p); #else @@ -430,18 +443,21 @@ class GPUBuddyAllocatorList { BuddyAllocator *Get(int gpu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::GPUAllocator(devices_[pos])), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(devices_[pos])), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -493,8 +509,10 @@ void *Alloc(const platform::CUDAPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " "%s, GpuMaxChunkSize %s, GPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -515,7 +533,8 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p, +void Free(const platform::CUDAPlace &place, + void *p, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetGPUBuddyAllocator(place.device)->Free(p); @@ -584,7 
+603,8 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -630,18 +650,21 @@ class MLUBuddyAllocatorList { BuddyAllocator *Get(int mlu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetMLUDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::MLUAllocator(devices_[pos])), - platform::MLUMinChunkSize(), platform::MLUMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::MLUAllocator(devices_[pos])), + platform::MLUMinChunkSize(), + platform::MLUMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "(mlu reuse gpu GFlags) " @@ -693,8 +716,10 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize " "%s, MLUMinChunkSize %s, MLU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -711,7 +736,8 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { } template <> -void Free(const platform::MLUPlace &place, void *p, +void Free(const platform::MLUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_MLU VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -759,10 +785,12 @@ class BuddyAllocatorList { } BuddyAllocator *Get(int dev_id) { - PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), + init_flags_.end(), platform::errors::OutOfRange( "Cannot find %s %d, please check visible devices.", - device_type_, dev_id)); + device_type_, + dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { phi::DeviceManager::SetDevice(device_type_, dev_id); @@ -773,7 +801,8 @@ class BuddyAllocatorList { new detail::CustomAllocator(device_type_, dev_id)), phi::DeviceManager::GetMinChunkSize(place), phi::DeviceManager::GetMaxChunkSize(place), - phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetExtraPaddingSize(place), + device_type_)); }); return allocators_[dev_id].get(); @@ -813,8 +842,11 @@ void *Alloc(const platform::CustomPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. 
", - string::HumanReadableSize(size), place.GetDeviceType(), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.GetDeviceType(), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { @@ -830,7 +862,8 @@ void *Alloc(const platform::CustomPlace &place, } template <> -void Free(const platform::CustomPlace &place, void *p, +void Free(const platform::CustomPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_CUSTOM_DEVICE VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -922,8 +955,6 @@ namespace allocation { phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size)); auto *tmp_alloc = new Allocation(ptr, size, place_); - platform::MemEvenRecorder::Instance().PushMemRecord( - static_cast(tmp_alloc), place_, size); return tmp_alloc; } @@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) { paddle::platform::VisitPlace( allocation->place(), legacy::FreeVisitor(allocation->ptr(), allocation->size())); - platform::MemEvenRecorder::Instance().PopMemRecord( - static_cast(allocation), place_); delete allocation; } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index ad11d81875..f1c0178faf 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { namespace allocation { @@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::ReservedFree); delete allocation; } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { @@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent(ptr, + platform::CUDAPinnedPlace(), + size, + platform::TracerMemEventType::ReservedAllocate); return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 8b54b96159..ef999dddf4 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -16,6 +16,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { @@ -30,14 +31,18 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - if (platform::is_cpu_place(allocation->place())) { - HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place()) || + 
platform::is_cuda_pinned_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); } else { - DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + DEVICE_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); } - + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Free); underlying_allocator_->Free(allocation); } @@ -48,12 +53,16 @@ class StatAllocator : public Allocator { const platform::Place& place = allocation->place(); if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), - allocation->size()); + HOST_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); } else { - DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), - allocation->size()); + DEVICE_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); } + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Allocate); return allocation.release(); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 244445d59b..eb5c74e56d 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -41,6 +41,7 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); @@ -64,12 +65,14 @@ void* AlignedMalloc(size_t size) { #else int error = posix_memalign(&p, alignment, size); PADDLE_ENFORCE_EQ( - error, 0, + error, + 0, platform::errors::ResourceExhausted( "Fail to alloc memory of %ld size, error code is %d.", size, error)); #endif - PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted( - "Fail to alloc memory of %ld size.", size)); + PADDLE_ENFORCE_NOT_NULL(p, + platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size.", size)); return p; } @@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); - + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } @@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CPUAllocator::UseGpu() const { return false; } @@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + gpu_id_, + string::HumanReadableSize(size), + gpu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + gpu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void GPUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(gpu_alloc_size_, size, + PADDLE_ENFORCE_GE(gpu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, gpu_alloc_size_)); + size, + gpu_alloc_size_)); gpu_alloc_size_ -= size; platform::RecordedGpuFree(p, size, gpu_id_); @@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } else { LOG(WARNING) << "cudaHostAlloc failed."; @@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { gpuError_t err; - PADDLE_ENFORCE_EQ(index, 1, + PADDLE_ENFORCE_EQ(index, + 1, platform::errors::InvalidArgument( "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated cuda pinned memory (%d)", - size, cuda_pinnd_alloc_size_)); + size, + cuda_pinnd_alloc_size_)); cuda_pinnd_alloc_size_ -= size; #ifdef PADDLE_WITH_HIP err = hipHostFree(p); if (err != hipErrorDeinitialized) { PADDLE_ENFORCE_EQ( - err, hipSuccess, + err, + hipSuccess, platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } @@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { // cudaFreeHost succeeds. if (err != cudaErrorCudartUnloading) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "cudaFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } #endif HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CUDAPinnedAllocator::UseGpu() const { return false; } @@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - npu_id_, string::HumanReadableSize(size), npu_id_, - string::HumanReadableSize(avail), npu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + npu_id_, + string::HumanReadableSize(size), + npu_id_, + string::HumanReadableSize(avail), + npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void NPUAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(npu_alloc_size_, size, + PADDLE_ENFORCE_GE(npu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, npu_alloc_size_)); + size, + npu_alloc_size_)); npu_alloc_size_ -= size; platform::RecordedNPUFree(p, size, npu_id_); @@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { aclError err; - PADDLE_ENFORCE_EQ(index, 1, + PADDLE_ENFORCE_EQ(index, + 1, platform::errors::InvalidArgument( "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated npu pinned memory (%d)", - size, npu_pinnd_alloc_size_)); + size, + npu_pinnd_alloc_size_)); npu_pinnd_alloc_size_ -= size; err = platform::NPUHostFree(p); if (err != ACL_ERROR_NONE) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err)); } @@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum MLU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - mlu_id_, string::HumanReadableSize(size), mlu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + mlu_id_, + string::HumanReadableSize(size), + mlu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + mlu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void MLUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(mlu_alloc_size_, size, + PADDLE_ENFORCE_GE(mlu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, mlu_alloc_size_)); + size, + mlu_alloc_size_)); mlu_alloc_size_ -= size; platform::RecordedMLUFree(p, size, mlu_id_); @@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { "\n\nOut of memory error on %s %d. " "total memory is %s, used memory is %s, " "available memory is only %s.\n\n", - dev_type_, dev_id_, string::HumanReadableSize(total), + dev_type_, + dev_id_, + string::HumanReadableSize(total), string::HumanReadableSize(total - avail), string::HumanReadableSize(avail))); } @@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void CustomAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "CustomAllocator::Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, + PADDLE_ENFORCE_EQ(index, + 0, platform::errors::InvalidArgument( "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(plug_alloc_size, size, + PADDLE_ENFORCE_GE(plug_alloc_size, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, plug_alloc_size)); + size, + plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); auto device = phi::DeviceManager::GetDeviceWithPlace(place); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index c45180f600..f09cbfc3be 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_XPU @@ -33,8 +33,12 @@ namespace memory { #ifdef PADDLE_WITH_CUSTOM_DEVICE template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -52,8 +56,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); @@ -70,8 +78,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -102,9 +114,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -void Copy(platform::CPUPlace, void* dst, +void Copy(platform::CPUPlace, + void* dst, platform::CPUPlace, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -115,7 +129,8 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -123,7 +138,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -131,15 +147,18 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). template <> -void Copy(phi::IPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::IPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -152,8 +171,10 @@ void Copy(phi::IPUPlace dst_place, void* dst, // NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace). 
template <> -void Copy(phi::Place dst_place, void* dst, - phi::IPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::IPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -170,7 +191,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -182,7 +204,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; @@ -194,7 +217,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -204,8 +228,10 @@ void Copy(platform::XPUPlace dst_place, // NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace). template <> -void Copy(phi::XPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::XPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -218,8 +244,10 @@ void Copy(phi::XPUPlace dst_place, void* dst, // NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::XPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::XPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -236,7 +264,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -248,7 +277,10 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation @@ -267,7 +299,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -279,7 +312,10 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -295,7 +331,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -307,7 +344,10 @@ void Copy(platform::NPUPlace dst_place, 
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -329,7 +369,10 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -346,8 +389,11 @@ void Copy(platform::NPUPlace dst_place, template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -356,8 +402,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -366,8 +415,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, - platform::NPUPinnedPlace src_place, const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -376,8 +428,12 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -389,7 +445,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -404,8 +463,12 @@ void Copy( template <> void Copy( - platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -417,7 +480,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while 
sync operation @@ -435,9 +501,12 @@ void Copy( // NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -504,52 +573,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) template <> -void Copy(phi::NPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::NPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::NPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::NPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -557,16 +650,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { 
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -608,8 +705,12 @@ inline void SyncCUDAStream() { template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -619,10 +720,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -642,8 +749,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -653,10 +764,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -676,8 +793,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -689,10 +810,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToDevice, reinterpret_cast(stream)); #endif } else { @@ -710,22 +837,29 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::GpuMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", 
platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::GpuMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } template <> void Copy( - platform::CPUPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -734,8 +868,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CPUPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -744,8 +881,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -754,8 +894,12 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -765,10 +909,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -785,8 +935,11 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num, + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -798,10 +951,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -818,9 +977,12 @@ void Copy( // NOTE: only for CPUPlace、CUDAPlace and CUDAPinnedPlace. 
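For context (not part of this patch): in the GPU overloads above, a non-null stream selects the GpuMemcpyAsync path, while a null stream falls back to a blocking copy; both paths are wrapped in a RecordEvent so they appear in the profiler timeline. A sketch with assumed buffer names:

    void AsyncUpload(void* gpu_dst, const void* pinned_src, size_t bytes,
                     gpuStream_t stream) {
      paddle::platform::CUDAPlace gpu(0);
      paddle::platform::CUDAPinnedPlace pinned;
      // Non-null stream -> GpuMemcpyAsync; nullptr blocks until the copy is done.
      paddle::memory::Copy(gpu, gpu_dst, pinned, pinned_src, bytes, stream);
    }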
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -887,52 +1049,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace) template <> -void Copy(phi::GPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::GPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::GPUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::GPUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -940,16 +1126,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place 
dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -959,7 +1149,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -970,8 +1161,8 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2HAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2HAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -988,7 +1179,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -999,8 +1191,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyH2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyH2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -1017,7 +1209,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -1029,8 +1222,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -1050,25 +1243,32 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::MLUMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::MLUMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } // NOTE: only for CPUPlace and MLUPlace. 
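Another illustrative sketch, not part of this patch: because each typed wrapper above forwards to the generic Place/Place overload, a caller holding only a runtime phi::Place on one side can still call Copy directly. The function and buffer names below are assumptions.

    void GatherToHost(void* host_dst, const void* dev_src, size_t bytes,
                      const phi::Place& src_place, void* stream) {
      // Deduces the CPUPlace <- Place specialization, which dispatches on
      // src_place.GetType() at runtime.
      paddle::memory::Copy(phi::CPUPlace(), host_dst, src_place, dev_src,
                           bytes, stream);
    }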
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -1110,35 +1310,55 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace) template <> -void Copy(phi::MLUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::MLUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::MLUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::MLUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream. template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream. template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -1146,8 +1366,10 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -1224,16 +1446,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num); } // NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). 
template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num); } @@ -1243,9 +1469,12 @@ void Copy(phi::CPUPlace dst_place, void* dst, !defined(PADDLE_WITH_MLU) template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::CUSTOM) { platform::CPUPlace place_src; @@ -1265,17 +1494,23 @@ void Copy(phi::Place dst_place, void* dst, } template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index e01e2eb599..ffb3f7e6eb 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -354,7 +354,9 @@ if(WITH_GPU) enforce dynload_cuda new_profiler - stats) + stats + op_proto_maker + shape_inference) nv_library( device_memory_aligment SRCS device_memory_aligment.cc @@ -363,7 +365,14 @@ elseif(WITH_ROCM) hip_library( profiler SRCS profiler.cc profiler.cu - DEPS os_info device_tracer gpu_info enforce new_profiler stats) + DEPS os_info + device_tracer + gpu_info + enforce + new_profiler + stats + op_proto_maker + shape_inference) hip_library( device_memory_aligment SRCS device_memory_aligment.cc @@ -372,7 +381,13 @@ else() cc_library( profiler SRCS profiler.cc - DEPS os_info device_tracer enforce new_profiler stats) + DEPS os_info + device_tracer + enforce + new_profiler + stats + op_proto_maker + shape_inference) cc_library( device_memory_aligment SRCS device_memory_aligment.cc diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 6b302d2449..7cceb8ccec 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); -PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, + false, "Whether to print the message of gpu memory usage " "at exit, mainly used for UT and CI."); -PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true, +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, + true, "Whether to print the message of gpu memory usage " "MB as a unit of measurement."); @@ -66,7 +69,10 @@ namespace platform { void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; - RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, + RecordedGpuMemGetInfo(available, + total, + &actual_available, + &actual_total, platform::GetCurrentDeviceId()); } @@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() { static size_t GpuAllocSize(bool realloc) { size_t available_to_alloc = GpuAvailableMemToAlloc(); PADDLE_ENFORCE_GT( - available_to_alloc, 0, + available_to_alloc, + 0, platform::errors::ResourceExhausted("Not enough available GPU memory.")); // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be // allocated by fraction @@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) { ? flag_mb << 20 : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( - available_to_alloc, alloc_bytes, + available_to_alloc, + alloc_bytes, platform::errors::ResourceExhausted("Not enough available GPU memory.")); VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) << " MiB, is it Re-alloc: " << realloc; @@ -192,13 +200,16 @@ class RecordedGpuMallocHelper { }); PADDLE_ENFORCE_GE( - dev_id, 0, + dev_id, + 0, platform::errors::OutOfRange( "Device id must be not less than 0, but got %d.", dev_id)); PADDLE_ENFORCE_LT( - dev_id, instances_.size(), + dev_id, + instances_.size(), platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", - dev_id, instances_.size())); + dev_id, + instances_.size())); return instances_[dev_id].get(); } @@ -207,7 +218,8 @@ class RecordedGpuMallocHelper { * or cudaSuccess would be returned, and the cudaGetLastError() flag * would be clear. 
*/ - gpuError_t Malloc(void **ptr, size_t size, + gpuError_t Malloc(void **ptr, + size_t size, bool malloc_managed_memory = false) { LockGuardPtr lock(mtx_); if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { @@ -236,7 +248,10 @@ class RecordedGpuMallocHelper { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); - + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedAllocate); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); #endif @@ -275,6 +290,10 @@ class RecordedGpuMallocHelper { cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedFree); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / @@ -300,7 +319,9 @@ class RecordedGpuMallocHelper { #endif } - bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + bool GetMemInfo(size_t *avail, + size_t *total, + size_t *actual_avail, size_t *actual_total) { { CUDADeviceGuard guard(dev_id_); @@ -335,7 +356,8 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 - CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, + CUresult MemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, unsigned long long flags) { // NOLINT auto result = @@ -371,7 +393,9 @@ class RecordedGpuMallocHelper { std::once_flag RecordedGpuMallocHelper::once_flag_; -gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, +gpuError_t RecordedGpuMalloc(void **ptr, + size_t size, + int dev_id, bool malloc_managed_memory) { return RecordedGpuMallocHelper::Instance(dev_id)->Malloc( ptr, size, malloc_managed_memory); @@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 -CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id) { // NOLINT - return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, - prop, flags); + unsigned long long flags, + int dev_id) { // NOLINT + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate( + handle, size, prop, flags); } -CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, + size_t size, int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); } #endif #endif -bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id) { +bool RecordedGpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( avail, total, actual_avail, actual_total); } @@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t 
stream) { phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { phi::backends::gpu::GpuMemcpySync(dst, src, count, kind); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { - phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, - count, stream); +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + phi::backends::gpu::GpuMemcpyPeerAsync( + dst, dst_device, src, src_device, count, stream); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { - phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, - count); +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + phi::backends::gpu::GpuMemcpyPeerSync( + dst, dst_device, src, src_device, count); } void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 47141bd73a..0369202284 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -30,12 +30,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" #endif +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" -PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, +PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, + false, "Enable rpc profiler or not."); -DEFINE_bool(enable_host_event_recorder_hook, false, +DEFINE_bool(enable_host_event_recorder_hook, + false, "enable HostEventRecorder, hook Profiler"); namespace paddle { @@ -43,8 +47,11 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; -Event::Event(EventType type, std::string name, uint32_t thread_id, - EventRole role, std::string attr) +Event::Event(EventType type, + std::string name, + uint32_t thread_id, + EventRole role, + std::string attr) : type_(type), name_(name), thread_id_(thread_id), @@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const char *name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const char *name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const std::string &attr, - const TracerEventType type, uint32_t level, +RecordEvent::RecordEvent(const std::string &name, + const std::string &attr, + const TracerEventType type, + uint32_t level, const EventRole role) { #ifndef _WIN32 
#ifdef PADDLE_WITH_CUDA @@ -215,8 +228,8 @@ void RecordEvent::End() { DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { uint64_t end_ns = PosixInNsec(); - tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(*name_, role_); @@ -226,7 +239,8 @@ void RecordEvent::End() { is_enabled_ = false; } -RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, +RecordInstantEvent::RecordInstantEvent(const char *name, + TracerEventType type, uint32_t level) { if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { return; @@ -236,20 +250,206 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); } -void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, +RecordOpInfoSupplement::RecordOpInfoSupplement( + const std::string &type, + const framework::AttributeMap &attrs, + const framework::InferShapeContext &shape_ctx, + const framework::RuntimeContext &ctx) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + std::map> input_shapes; + std::map> dtypes; + for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) { + input_shapes[it->first] = shape_ctx.GetInputsDim(it->first); + dtypes[it->first] = shape_ctx.GetInputsVarType(it->first); + } + + const std::vector *callstack_ptr = nullptr; + std::vector callstack; + auto iter = attrs.find( + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack_ptr = &BOOST_GET_CONST(std::vector, iter->second); + callstack = *callstack_ptr; + } + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), type, input_shapes, dtypes, callstack); +} + +RecordMemEvent::RecordMemEvent(const void *ptr, + const phi::Place &place, + size_t size, + const TracerMemEventType type) { + if (g_state == ProfilerState::kDisabled && + FLAGS_enable_host_event_recorder_hook == false) { + return; + } + if (type == TracerMemEventType::Allocate) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedAllocate) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, 
place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::Free) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedFree) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } +} + +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, size_t size) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; - PADDLE_ENFORCE_EQ(events.count(ptr), 0, + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, platform::errors::InvalidArgument( "The Place can't exist in the stage of PushMemRecord")); - events.emplace(ptr, std::unique_ptr( - new MemEvenRecorder::RecordMemEvent(place, size))); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); +} + +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedAllocate) { + // old profiler only analyse memory managed by paddle. 
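+    // ReservedAllocate marks growth of the allocator's device pool (for
+    // example, the raw cudaMalloc now traced in gpu_info.cc); the legacy
+    // profiler has no notion of reserved memory, so these events are kept
+    // only by the HostEventRecorder path above.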
+ return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, + platform::errors::InvalidArgument( + "The Place can't exist in the stage of PushMemRecord")); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); } void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; auto iter = events.find(ptr); @@ -259,6 +459,41 @@ void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { } } +void MemEvenRecorder::PopMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + -size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedFree) { + // old profiler only analyse memory managed by paddle. + return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + auto iter = events.find(ptr); + // The ptr maybe not in address_memevent + if (iter != events.end()) { + events.erase(iter); + } +} + void MemEvenRecorder::Flush() { std::lock_guard guard(mtx_); address_memevent_.clear(); @@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { auto annotation_free = CurAnnotationName(); if (tracer) { - tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, - annotation_free, g_mem_thread_id); + tracer->AddMemInfoRecord(start_ns_, + end_ns_, + bytes_, + place_, + alloc_in_, + annotation_free, + g_mem_thread_id); } PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } @@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() { if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. 
- tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id); } ClearCurBlock(); } -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, - place, g_mem_thread_id, annotation); -} - -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, - g_mem_thread_id, annotation); +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); +} + +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); } void Mark(const std::string &name) { @@ -334,17 +590,19 @@ void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name, const EventRole role, +Event *PushEvent(const std::string &name, + const EventRole role, std::string attr) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, - attr); + return GetEventList().Record( + EventType::kPushRange, name, g_thread_id, role, attr); } void PopEvent(const std::string &name, const EventRole role, std::string attr) { GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); } void EnableProfiler(ProfilerState state) { - PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, + PADDLE_ENFORCE_NE(state, + ProfilerState::kDisabled, platform::errors::InvalidArgument( "Can't enable profiling, since the input state is" "ProfilerState::kDisabled")); @@ -380,7 +638,8 @@ void ResetProfiler() { (*it)->Clear(); } for (auto it = g_all_mem_event_lists.begin(); - it != g_all_mem_event_lists.end(); ++it) { + it != g_all_mem_event_lists.end(); + ++it) { (*it)->Clear(); } } @@ -576,8 +835,8 @@ static void EmulateEventPushAndPop( std::string name = prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name; const char *attr = (evt.attr == nullptr ? "none" : evt.attr); - Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid, - evt.role, attr); + Event *orig_evt = cur_thr_list->Record( + EventType::kPushRange, name, tid, evt.role, attr); (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns); cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr); } @@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd( for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; for (const auto &evt : thr_sec.events) { - tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(), - tid); + tracer->AddCPURecords( + evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid); } } } diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 78275341cb..4773b1a177 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -30,6 +30,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -102,6 +104,22 @@ struct MemEvenRecorder { public: void PushMemRecord(const void* ptr, const Place& place, size_t size); void PopMemRecord(const void* ptr, const Place& place); + void PushMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); + void PopMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); void Flush(); static MemEvenRecorder& Instance() { return recorder; } @@ -160,7 +178,8 @@ struct EventList { std::vector Reduce() { std::vector result; for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), + result.insert(result.begin(), + std::make_move_iterator(block.begin()), std::make_move_iterator(block.end())); } event_blocks.clear(); @@ -173,13 +192,21 @@ struct EventList { }; void Mark(const std::string& name); -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name, const EventRole role, +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +Event* PushEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); -void PopEvent(const std::string& name, const EventRole role, +void PopEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index ea3111b736..1daed7db1e 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( host_tracer SRCS host_tracer.cc - DEPS enforce) + DEPS enforce ddim var_type_traits) cc_library( cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc @@ -10,7 +10,7 @@ add_subdirectory(mlu) cc_library( event_node SRCS event_node.cc - DEPS enforce) + DEPS enforce place) cc_library( profiler_utils SRCS utils.cc diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index 8fe3b15052..3e166d1d04 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -18,16 +18,21 @@ #include #include +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/profiler/trace_event.h" +#include "paddle/phi/core/ddim.h" namespace paddle { namespace platform { struct CommonEvent { public: - CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + CommonEvent(const char *name, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : name(name), start_ns(start_ns), end_ns(end_ns), @@ -35,8 +40,12 @@ struct CommonEvent { type(type) {} CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type, const std::string &attr_str) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type, + const std::string &attr_str) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -47,8 +56,11 @@ struct CommonEvent { } CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -63,5 +75,61 @@ struct CommonEvent { const char *attr = nullptr; // not owned, designed for performance }; +struct CommonMemEvent { + public: + CommonMemEvent(uint64_t timestamp_ns, + uint64_t addr, + TracerMemEventType type, + int64_t increase_bytes, + const Place &place, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) + : timestamp_ns(timestamp_ns), + addr(addr), + type(type), + increase_bytes(increase_bytes), + place(place), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} + uint64_t timestamp_ns; + uint64_t addr; + TracerMemEventType type; + int64_t increase_bytes; + Place place; + uint64_t current_allocated; + uint64_t current_reserved; + uint64_t peak_allocated; + uint64_t peak_reserved; +}; + +struct OperatorSupplementOriginEvent { + public: + OperatorSupplementOriginEvent( + std::function arena_allocator, + uint64_t timestamp_ns, + const std::string &type_name, + const std::map> &input_shapes, + const std::map> + &dtypes, + const std::vector callstack) + : 
timestamp_ns(timestamp_ns), + input_shapes(input_shapes), + dtypes(dtypes), + callstack(callstack) { + auto buf = static_cast(arena_allocator(type_name.length() + 1)); + strncpy(buf, type_name.c_str(), type_name.length() + 1); + op_type = buf; + } + uint64_t timestamp_ns; + const char *op_type = nullptr; // not owned, designed for performance + // input shapes + std::map> input_shapes; + std::map> dtypes; + // call stack + const std::vector callstack; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index bde1395c12..1c2c00d75b 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -11,9 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/platform/profiler/host_tracer.h" +#include + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" @@ -21,7 +22,8 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, + 1, "RecordEvent will works " "if host_trace_level >= level."); @@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection& host_events, } } +void ProcessHostMemEvents( + const HostEventSection& host_mem_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : host_mem_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + MemTraceEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.addr = evt.addr; + event.type = evt.type; + event.increase_bytes = evt.increase_bytes; + event.place = evt.place.DebugString(); + event.current_allocated = evt.current_allocated; + event.current_reserved = evt.current_reserved; + event.peak_allocated = evt.peak_allocated; + event.peak_reserved = evt.peak_reserved; + event.process_id = host_mem_events.process_id; + event.thread_id = tid; + collector->AddMemEvent(std::move(event)); + } + } +} + +void ProcessOperatorSupplementEvents( + const HostEventSection& op_supplement_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : op_supplement_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + OperatorSupplementEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.op_type = evt.op_type; + std::map>> input_shapes; + std::map> dtypes; + std::string callstack; + for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end(); + it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + input_shapes[it->first].push_back(std::vector()); + for (auto dim_idx = 0; dim_idx < it->second.at(idx).size(); + dim_idx++) { + input_shapes[it->first][idx].push_back( + it->second.at(idx).at(dim_idx)); + } + } + } + for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + dtypes[it->first].push_back( + framework::proto::VarType::Type_Name(it->second.at(idx))); + } + } + + 
std::ostringstream result_string; + for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) { + result_string << (*it) << std::endl; + } + event.input_shapes = input_shapes; + event.dtypes = dtypes; + event.callstack = result_string.str(); + event.process_id = op_supplement_events.process_id; + event.thread_id = tid; + collector->AddOperatorSupplementEvent(std::move(event)); + } + } +} + } // namespace void HostTracer::PrepareTracing() { @@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() { void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( - state_ == TracerState::READY || state_ == TracerState::STOPED, true, + state_ == TracerState::READY || state_ == TracerState::STOPED, + true, platform::errors::PreconditionNotMet("TracerState must be READY")); HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance() + .GatherEvents(); HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } void HostTracer::StopTracing() { PADDLE_ENFORCE_EQ( - state_, TracerState::STARTED, + state_, + TracerState::STARTED, platform::errors::PreconditionNotMet("TracerState must be STARTED")); HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled); state_ = TracerState::STOPED; @@ -77,11 +157,19 @@ void HostTracer::StopTracing() { void HostTracer::CollectTraceData(TraceEventCollector* collector) { PADDLE_ENFORCE_EQ( - state_, TracerState::STOPED, + state_, + TracerState::STOPED, platform::errors::PreconditionNotMet("TracerState must be STOPED")); HostEventSection host_events = HostEventRecorder::GetInstance().GatherEvents(); ProcessHostEvents(host_events, collector); + HostEventSection host_mem_events = + HostEventRecorder::GetInstance().GatherEvents(); + ProcessHostMemEvents(host_mem_events, collector); + HostEventSection op_supplement_events = + HostEventRecorder::GetInstance() + .GatherEvents(); + ProcessOperatorSupplementEvents(op_supplement_events, collector); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h new file mode 100644 index 0000000000..3d3508c7bd --- /dev/null +++ b/paddle/fluid/platform/profiler/mem_tracing.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { +namespace platform { +// Memory event tracing. A trace marks memory manipulation such as allocation +// and free. +// The events can be used to draw memory variation curve. +class RecordMemEvent { + public: + /** + * @param ptr: Pointer address allocated or free. + * @param place: Device for this memory event. + * @param size: Memory size allocated or free. + * @param type: Denote manipulation type for this memory event. 
+ */ + explicit RecordMemEvent( + const void* ptr, + const Place& place, + size_t size, + const TracerMemEventType type = TracerMemEventType::Allocate); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 1f1fbcb71e..db8895576b 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -23,6 +23,8 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" @@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) { profiler->Prepare(); profiler->Start(); { - RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined, - 2); - RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, - 3); + RecordInstantEvent( + "TestTraceLevel_record1", TracerEventType::UserDefined, 2); + RecordInstantEvent( + "TestTraceLevel_record2", TracerEventType::UserDefined, 3); } auto profiler_result = profiler->Stop(); auto nodetree = profiler_result->GetNodeTrees(); @@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) { EXPECT_GT(runtime_events.size(), 0u); #endif } + +TEST(ProfilerTest, TestHostTracerForMem) { + using paddle::platform::CPUPlace; + using paddle::platform::EnableHostEventRecorder; + using paddle::platform::MemTraceEventNode; + using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; + using paddle::platform::RecordEvent; + using paddle::platform::RecordInstantEvent; + using paddle::platform::RecordMemEvent; + using paddle::platform::TracerEventType; + using paddle::platform::TracerMemEventType; + ProfilerOptions options; + options.trace_level = 1; + options.trace_switch = 3; + auto profiler = Profiler::Create(options); + EXPECT_TRUE(profiler); + EnableHostEventRecorder(); + profiler->Prepare(); + profiler->Start(); + { + RecordEvent event1( + "TestTracerForMem_phase1", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(0), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent( + reinterpret_cast(0), CPUPlace(), 1024, TracerMemEventType::Free); + } + { + RecordEvent event2( + "TestTracerForMem_phase2", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Free); + } + auto profiler_result = profiler->Stop(); + auto nodetree = profiler_result->GetNodeTrees(); +} diff --git a/paddle/fluid/platform/profiler/supplement_tracing.h b/paddle/fluid/platform/profiler/supplement_tracing.h new file mode 100644 index 0000000000..46b1616d71 --- /dev/null +++ b/paddle/fluid/platform/profiler/supplement_tracing.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { + +namespace framework { +class RuntimeContext; +} +namespace platform { + +class RecordOpInfoSupplement { + public: + /** + * @param type: Operator type name. + * @param attrs: Attribute map of op. + * @param shape_ctx: Infershape context object. + * @param ctx: Runtime context object. + */ + explicit RecordOpInfoSupplement(const std::string& type, + const framework::AttributeMap& attrs, + const framework::InferShapeContext& shape_ctx, + const framework::RuntimeContext& ctx); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b81f494f1a..b24c3546a3 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -382,7 +382,8 @@ static T PyObjectCast(PyObject *obj) { } catch (py::cast_error &) { PADDLE_THROW(platform::errors::InvalidArgument( "Python object is not type of %s, the real type is %s", - typeid(T).name(), obj->ob_type->tp_name)); + typeid(T).name(), + obj->ob_type->tp_name)); } } @@ -441,7 +442,8 @@ static std::vector inline GetNameList( } static void inline CreateVariableIfNotExit( - const py::handle &py_handle, const framework::Scope &scope, + const py::handle &py_handle, + const framework::Scope &scope, const framework::Executor *exe = nullptr) { std::vector vec_res; @@ -479,8 +481,9 @@ static void inline CreateVariableIfNotExit( PyObject *py_var_desc = PyObject_GetAttrString(PyList_GET_ITEM(py_obj, i), kVarDescField); PADDLE_ENFORCE_NOT_NULL( - py_var_desc, platform::errors::InvalidArgument( - "The var_desc of parameter to set is None")); + py_var_desc, + platform::errors::InvalidArgument( + "The var_desc of parameter to set is None")); auto var_desc = PyObjectCast(py_var_desc); Py_DECREF(py_var_desc); var = const_cast(&scope)->Var(para_name); @@ -515,7 +518,8 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { } } } - PADDLE_ENFORCE_EQ(ops.empty(), true, + PADDLE_ENFORCE_EQ(ops.empty(), + true, platform::errors::Unimplemented( "OperatorWithKernel [%s] have only static graph grad " "maker or have only dygraph grad maker, which is not " @@ -537,8 +541,10 @@ static int GetNCCLVersion() { #endif template -static void TensorCopyFrom(framework::Tensor *dst, const framework::Tensor &src, - const PlaceType &place, int64_t batch_size) { +static void TensorCopyFrom(framework::Tensor *dst, + const framework::Tensor &src, + const PlaceType &place, + int64_t batch_size) { if (batch_size < 0) { framework::TensorCopy(src, place, dst); } else { @@ -624,9 +630,10 @@ PYBIND11_MODULE(core_noavx, m) { PyCapsule_GetPointer(dltensor->ptr(), "dltensor")); PADDLE_ENFORCE_NOT_NULL( - dmt, platform::errors::InvalidArgument( - "from_dlpack received an invalid capsule. " - "Note that a DLPack tensor can be consumed only once.")); + dmt, + platform::errors::InvalidArgument( + "from_dlpack received an invalid capsule. 
" + "Note that a DLPack tensor can be consumed only once.")); PyCapsule_SetName(dltensor->ptr(), "used_dltensor"); DLTensor dl = dmt->dl_tensor; @@ -644,7 +651,8 @@ PYBIND11_MODULE(core_noavx, m) { }); m.def("_create_loaded_parameter", - [](const py::handle &vec_var_list, const Scope &scope, + [](const py::handle &vec_var_list, + const Scope &scope, const Executor *executor) { CreateVariableIfNotExit(vec_var_list, scope, executor); }); @@ -682,11 +690,12 @@ PYBIND11_MODULE(core_noavx, m) { << ", sci_mode=" << print_opt.sci_mode; }); - m.def("broadcast_shape", [](const std::vector &x_dim, - const std::vector &y_dim) { - return phi::vectorize(operators::details::BroadcastTwoDims( - phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1)); - }); + m.def( + "broadcast_shape", + [](const std::vector &x_dim, const std::vector &y_dim) { + return phi::vectorize(operators::details::BroadcastTwoDims( + phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1)); + }); m.def( "_append_python_callable_object_and_return_id", @@ -808,14 +817,22 @@ PYBIND11_MODULE(core_noavx, m) { self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1))); } }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - bool attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - int attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - float attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - int64_t attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, bool attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, int attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, float attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, int64_t attr) { + self.EmplaceBackAttr(attr); + }) .def("add_attr", [](paddle::CustomOpKernelContext &self, const std::string &attr) { self.EmplaceBackAttr(attr); @@ -829,13 +846,14 @@ PYBIND11_MODULE(core_noavx, m) { .def("add_attr", [](paddle::CustomOpKernelContext &self, const std::vector &attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", [](paddle::CustomOpKernelContext &self, - const std::vector &attr) { - self.EmplaceBackAttr(attr); - }); + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { + self.EmplaceBackAttr(attr); + }); - py::class_ framework_tensor(m, "Tensor", - py::buffer_protocol()); + py::class_ framework_tensor( + m, "Tensor", py::buffer_protocol()); g_framework_tensor_pytype = reinterpret_cast(framework_tensor.ptr()); framework_tensor @@ -918,80 +936,135 @@ PYBIND11_MODULE(core_noavx, m) { self.mutable_data(place); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CPUPlace &place, + [](framework::Tensor &self, + paddle::platform::CPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CustomPlace &place, + [](framework::Tensor &self, + paddle::platform::CustomPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::XPUPlace &place, + [](framework::Tensor &self, + 
paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place, + [](framework::Tensor &self, + paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place, + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::MLUPlace &place, + [](framework::Tensor &self, + paddle::platform::MLUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_clear", &framework::Tensor::clear) .def("_mutable_data", - [](framework::Tensor &self, paddle::platform::NPUPlace &place, + [](framework::Tensor &self, + paddle::platform::NPUPlace &place, paddle::framework::proto::VarType::Type type) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("_copy_from", &TensorCopyFrom, - py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) - .def("set", SetTensorFromPyArray, - py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + 
&TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false, R"DOC( Set the data of Tensor on place with given numpy array. @@ -1077,25 +1150,26 @@ PYBIND11_MODULE(core_noavx, m) { ostr << self; return ostr.str(); }) /* ------ End of original Tensor ------ */ - .def( - "__init__", - [](framework::Tensor &instance, const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is %s", - new_lod)); - new (&instance) framework::Tensor(new_offset_lod); - }) + .def("__init__", + [](framework::Tensor &instance, + const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is %s", + new_lod)); + new (&instance) framework::Tensor(new_offset_lod); + }) .def("__init__", [](framework::Tensor &instance) { new (&instance) framework::Tensor(); @@ -1115,12 +1189,14 @@ PYBIND11_MODULE(core_noavx, m) { new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), true, + CheckLoD(new_lod, vectorize(self.dims()).front()), + true, platform::errors::InvalidArgument( "The provided LoD is invalid, the LoD is %s", new_lod)); self.set_lod(new_lod); }, - py::arg("lod"), R"DOC( + py::arg("lod"), + R"DOC( Set LoD of the Tensor. 
Args: @@ -1142,8 +1218,9 @@ PYBIND11_MODULE(core_noavx, m) { )DOC") .def( "set_recursive_sequence_lengths", - [](framework::Tensor &self, const std::vector> - &recursive_sequence_lengths) { + [](framework::Tensor &self, + const std::vector> + &recursive_sequence_lengths) { // the input recursive_sequence_lengths is length-based // level-of-detail info LoD new_lod; @@ -1153,7 +1230,8 @@ PYBIND11_MODULE(core_noavx, m) { std::back_inserter(new_lod)); LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + true, platform::errors::InvalidArgument( "The provided recursive_sequence_lengths info is " "invalid, " @@ -1162,7 +1240,8 @@ PYBIND11_MODULE(core_noavx, m) { new_lod)); self.set_lod(new_offset_lod); }, - py::arg("recursive_sequence_lengths"), R"DOC( + py::arg("recursive_sequence_lengths"), + R"DOC( Set LoD of the Tensor according to recursive sequence lengths. For example, if recursive_sequence_lengths=[[2, 3]], which means @@ -1630,7 +1709,8 @@ PYBIND11_MODULE(core_noavx, m) { new (&instance) phi::SelectedRows(); }) .def("__init__", - [](phi::SelectedRows &instance, const std::vector rows, + [](phi::SelectedRows &instance, + const std::vector rows, const int64_t &height) { new (&instance) phi::SelectedRows(rows, height); }) @@ -1693,8 +1773,10 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self, Strings str_list) { *self.GetMutable() = str_list; }) - .def("set_vocab", [](Variable &self, - Vocab vocab) { *self.GetMutable() = vocab; }) + .def("set_vocab", + [](Variable &self, Vocab vocab) { + *self.GetMutable() = vocab; + }) .def( "get_string_tensor", [](Variable &self) { return self.GetMutable(); }, @@ -1732,7 +1814,8 @@ All parameter, weight, gradient are variables in Paddle. .def( "get_reader", [](Variable &self) -> framework::ReaderHolder * { - PADDLE_ENFORCE_EQ(self.IsType(), true, + PADDLE_ENFORCE_EQ(self.IsType(), + true, platform::errors::InvalidArgument( "The variable is not type of ReaderHolder.")); return self.GetMutable(); @@ -1743,7 +1826,8 @@ All parameter, weight, gradient are variables in Paddle. [](Variable &self) -> Scope * { auto scope_vec = self.GetMutable>(); PADDLE_ENFORCE_GT( - scope_vec->size(), 0, + scope_vec->size(), + 0, platform::errors::InvalidArgument( "The size of scope_vec should be greater than 0")); return scope_vec->front(); @@ -1801,7 +1885,9 @@ All parameter, weight, gradient are variables in Paddle. out (core.Variable): the found or created variable. )DOC", py::return_value_policy::reference) - .def("find_var", &Scope::FindVar, py::arg("name"), + .def("find_var", + &Scope::FindVar, + py::arg("name"), R"DOC( Find variable named :code:`name` in the current scope or its parent scope. Return None if not found. @@ -1814,7 +1900,9 @@ All parameter, weight, gradient are variables in Paddle. )DOC", py::return_value_policy::reference) .def("size", &Scope::Size) - .def("erase", &Scope::EraseVars, py::arg("names"), + .def("erase", + &Scope::EraseVars, + py::arg("names"), R"DOC( Find variable named :code:`name` in the current scope or its parent scope. Return None if not found. @@ -1827,7 +1915,8 @@ All parameter, weight, gradient are variables in Paddle. )DOC", py::return_value_policy::reference) .def( - "new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, + "new_scope", + [](Scope &self) -> Scope * { return &self.NewScope(); }, R"DOC( Create a new sub-scope of the current scope. 
@@ -1835,7 +1924,8 @@ All parameter, weight, gradient are variables in Paddle. out (core._Scope): the created sub-scope. )DOC", py::return_value_policy::reference) - .def("drop_kids", &Scope::DropKids, + .def("drop_kids", + &Scope::DropKids, R"DOC( Delete all sub-scopes of the current scope. )DOC") @@ -1865,7 +1955,8 @@ All parameter, weight, gradient are variables in Paddle. if (info.HasOpProtoAndChecker()) { std::string str; PADDLE_ENFORCE_EQ( - info.Proto().SerializeToString(&str), true, + info.Proto().SerializeToString(&str), + true, platform::errors::Fatal( "Serialize OpProto Error. This could be a bug of Paddle.")); ret_values.emplace_back(str); @@ -1886,22 +1977,24 @@ All parameter, weight, gradient are variables in Paddle. } return res; }); - m.def( - "get_grad_op_desc", [](const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - const std::vector &grad_sub_block) { - std::unordered_map grad_to_var; - std::vector> grad_op_descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, - grad_sub_block); - std::vector grad_op_desc_ptrs(grad_op_descs.size()); - std::transform(grad_op_descs.begin(), grad_op_descs.end(), - grad_op_desc_ptrs.begin(), - [](std::unique_ptr &p) { return p.release(); }); - return std::make_pair(grad_op_desc_ptrs, grad_to_var); - }); + m.def("get_grad_op_desc", + [](const OpDesc &op_desc, + const std::unordered_set &no_grad_set, + const std::vector &grad_sub_block) { + std::unordered_map grad_to_var; + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()( + op_desc, no_grad_set, &grad_to_var, grad_sub_block); + std::vector grad_op_desc_ptrs(grad_op_descs.size()); + std::transform( + grad_op_descs.begin(), + grad_op_descs.end(), + grad_op_desc_ptrs.begin(), + [](std::unique_ptr &p) { return p.release(); }); + return std::make_pair(grad_op_desc_ptrs, grad_to_var); + }); m.def("has_grad_op_maker", [](const std::string op_type) { return framework::OpInfoMap::Instance().Get(op_type).HasGradOpMaker(); }); @@ -1914,7 +2007,8 @@ All parameter, weight, gradient are variables in Paddle. return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace(); }); m.def("infer_no_need_buffer_slots", - [](const std::string op_type, const framework::VariableNameMap &inputs, + [](const std::string op_type, + const framework::VariableNameMap &inputs, const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) { auto infer_func = framework::OpInfoMap::Instance() @@ -1927,20 +2021,21 @@ All parameter, weight, gradient are variables in Paddle. 
return empty; } }); - m.def("prune", [](const ProgramDesc &origin, - const std::set &feeded_var_names, - const std::vector> &targets) { - ProgramDesc prog_with_targets(origin); - - for (const auto &t : targets) { - prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true); - } - proto::ProgramDesc pruned_desc; - auto pruned_origin_block_id_map = - Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc); - return std::make_tuple(ProgramDesc(pruned_desc), - pruned_origin_block_id_map); - }); + m.def("prune", + [](const ProgramDesc &origin, + const std::set &feeded_var_names, + const std::vector> &targets) { + ProgramDesc prog_with_targets(origin); + + for (const auto &t : targets) { + prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true); + } + proto::ProgramDesc pruned_desc; + auto pruned_origin_block_id_map = + Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc); + return std::make_tuple(ProgramDesc(pruned_desc), + pruned_origin_block_id_map); + }); m.def( "prune_backward", [](const framework::ProgramDesc &program) { @@ -2168,7 +2263,8 @@ All parameter, weight, gradient are variables in Paddle. #endif return devices; }); - py::class_ customplace(m, "CustomPlace", + py::class_ customplace(m, + "CustomPlace", R"DOC( CustomPlace is a descriptor of a device. It represents a custom device on which a tensor will be allocated and a model will run. @@ -2182,7 +2278,8 @@ All parameter, weight, gradient are variables in Paddle. g_customplace_pytype = reinterpret_cast(customplace.ptr()); customplace .def("__init__", - [](platform::CustomPlace &self, const std::string &device_type, + [](platform::CustomPlace &self, + const std::string &device_type, int dev_id) { #ifdef PADDLE_WITH_CUSTOM_DEVICE if (UNLIKELY(dev_id < 0)) { @@ -2190,7 +2287,8 @@ All parameter, weight, gradient are variables in Paddle. "Invalid CustomPlace(%s, %d), device id must be 0 " "or " "positive integer", - device_type, dev_id); + device_type, + dev_id); std::exit(-1); } @@ -2211,7 +2309,11 @@ All parameter, weight, gradient are variables in Paddle. "inside " "[0, %d), because %s " "number on your machine is %d", - device_type, dev_id, dev_count, device_type, dev_count); + device_type, + dev_id, + dev_count, + device_type, + dev_count); std::exit(-1); } } @@ -2221,7 +2323,8 @@ All parameter, weight, gradient are variables in Paddle. "Invalid CustomPlace(%s, %d), the device type is " "not registered " "as a custom device.", - device_type, dev_id); + device_type, + dev_id); std::exit(-1); } #else @@ -2293,7 +2396,8 @@ All parameter, weight, gradient are variables in Paddle. LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " "number on your machine is %d", - dev_id, platform::GetGPUDeviceCount(), + dev_id, + platform::GetGPUDeviceCount(), platform::GetGPUDeviceCount()); std::exit(-1); } @@ -2359,7 +2463,8 @@ All parameter, weight, gradient are variables in Paddle. LOG(ERROR) << string::Sprintf( "Invalid XPUPlace(%d), must inside [0, %d), because XPU " "number on your machine is %d", - dev_id, platform::GetXPUDeviceCount(), + dev_id, + platform::GetXPUDeviceCount(), platform::GetXPUDeviceCount()); std::exit(-1); } @@ -2524,7 +2629,8 @@ All parameter, weight, gradient are variables in Paddle. 
LOG(ERROR) << string::Sprintf( "Invalid NPUPlace(%d), must inside [0, %d), because NPU " "number on your machine is %d", - dev_id, platform::GetNPUDeviceCount(), + dev_id, + platform::GetNPUDeviceCount(), platform::GetNPUDeviceCount()); std::exit(-1); } @@ -2640,7 +2746,8 @@ All parameter, weight, gradient are variables in Paddle. LOG(ERROR) << string::Sprintf( "Invalid MLUPlace(%d), must inside [0, %d), because MLU " "number on your machine is %d", - dev_id, platform::GetMLUDeviceCount(), + dev_id, + platform::GetMLUDeviceCount(), platform::GetMLUDeviceCount()); std::exit(-1); } @@ -2713,8 +2820,10 @@ All parameter, weight, gradient are variables in Paddle. .def("mlu_device_id", [](platform::Place &self) { return self.device; }) .def("custom_device_id", [](platform::Place &self) { return self.device; }) - .def("set_place", [](platform::Place &self, - const platform::Place &other) { self = other; }) + .def("set_place", + [](platform::Place &self, const platform::Place &other) { + self = other; + }) .def("set_place", [](platform::Place &self, const platform::CPUPlace &cpu_place) { self = cpu_place; @@ -2759,7 +2868,8 @@ All parameter, weight, gradient are variables in Paddle. true, platform::errors::InvalidArgument( "Cannot parse user input to OpDesc")); - PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, + PADDLE_ENFORCE_EQ(desc.IsInitialized(), + true, platform::errors::InvalidArgument( "The provided OpDesc is not " "initialized, the reason is: %s", @@ -2767,43 +2877,50 @@ All parameter, weight, gradient are variables in Paddle. return OpRegistry::CreateOp(desc); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CPUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::XPUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::NPUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CUDAPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CUDAPinnedPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::MLUPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); }) .def("run", - [](OperatorBase &self, const Scope &scope, + [](OperatorBase &self, + const Scope &scope, const platform::CustomPlace &place) { pybind11::gil_scoped_release release; self.Run(scope, place); @@ -2843,13 +2960,17 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "Executor") .def(py::init()) .def("close", &Executor::Close) - .def("run_from_dataset", &Executor::RunFromDataset, + .def("run_from_dataset", + &Executor::RunFromDataset, py::call_guard()) - .def("release_trainer", &Executor::ReleaseTrainer, + .def("release_trainer", + &Executor::ReleaseTrainer, py::call_guard()) .def("init_for_dataset", - [](Executor &self, const ProgramDesc &prog, - const std::string &trainer_desc, Scope *scope, + [](Executor &self, + const ProgramDesc &prog, + const std::string &trainer_desc, + Scope *scope, Dataset *dataset) -> std::shared_ptr { pybind11::gil_scoped_release release; return self.InitForDataset(prog, trainer_desc, scope, dataset); @@ -2860,42 +2981,64 @@ All parameter, weight, gradient are variables in Paddle. self.RunFromDataset(trainer); }) .def("run_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, + [](Executor &self, + ExecutorPrepareContext *ctx, + Scope *scope, std::map *feed_targets, std::map *fetch_targets, - bool create_local_scope = true, bool create_vars = true, + bool create_local_scope = true, + bool create_vars = true, const std::string &feed_holder_name = "feed", const std::string &fetch_holder_name = "fetch") { pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets, - create_local_scope, create_vars, - feed_holder_name, fetch_holder_name); + self.RunPreparedContext(ctx, + scope, + feed_targets, + fetch_targets, + create_local_scope, + create_vars, + feed_holder_name, + fetch_holder_name); }) .def("run_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, - bool create_local_scope = true, bool create_vars = true, + [](Executor &self, + ExecutorPrepareContext *ctx, + Scope *scope, + bool create_local_scope = true, + bool create_vars = true, bool keep_kids = false) { pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, create_local_scope, - create_vars, keep_kids); + self.RunPreparedContext( + ctx, scope, create_local_scope, create_vars, keep_kids); }) .def("prepare", - [](Executor &self, const ProgramDesc &program, int block_id, + [](Executor &self, + const ProgramDesc &program, + int block_id, const std::vector &skip_ref_cnt_vars = std::vector(), bool force_disable_gc = false) { pybind11::gil_scoped_release release; - return self.Prepare(program, block_id, skip_ref_cnt_vars, - force_disable_gc); + return self.Prepare( + program, block_id, skip_ref_cnt_vars, force_disable_gc); }) .def("create_variables", &Executor::CreateVariables) - .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, - int block_id, bool create_local_scope, bool create_vars, - const std::vector &fetch_vars) { - pybind11::gil_scoped_release release; - self.Run(prog, scope, block_id, create_local_scope, create_vars, - fetch_vars); - }); + .def("run", + [](Executor &self, + const ProgramDesc &prog, + Scope *scope, + int block_id, + bool create_local_scope, + bool create_vars, + const std::vector &fetch_vars) { + pybind11::gil_scoped_release release; + self.Run(prog, + scope, + block_id, + create_local_scope, + create_vars, + fetch_vars); + }); py::class_(m, "CostInfo") .def(py::init<>()) @@ -2906,8 +3049,10 @@ All parameter, weight, gradient are variables in Paddle. }); py::class_(m, "StandaloneExecutor") - .def(py::init()) + .def(py::init()) .def("run", [](StandaloneExecutor &self, const std::unordered_map &input_dict, @@ -2951,7 +3096,8 @@ All parameter, weight, gradient are variables in Paddle. 
return py::cast(std::move(ret)); }) .def("run", - [](StandaloneExecutor &self, std::vector feed_names, + [](StandaloneExecutor &self, + std::vector feed_names, std::vector fetch_names) { paddle::framework::FetchList ret; { @@ -3036,20 +3182,27 @@ All parameter, weight, gradient are variables in Paddle. m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def( "run_cmd", - [](const std::string &cmd, int time_out = -1, + [](const std::string &cmd, + int time_out = -1, int sleep_inter = -1) -> const std::string { - return paddle::framework::shell_get_command_output(cmd, time_out, - sleep_inter); + return paddle::framework::shell_get_command_output( + cmd, time_out, sleep_inter); }, - py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1); + py::arg("cmd"), + py::arg("time_out") = -1, + py::arg("sleep_inter") = -1); m.def( "shell_execute_cmd", - [](const std::string &cmd, int time_out = 0, int sleep_inter = 0, + [](const std::string &cmd, + int time_out = 0, + int sleep_inter = 0, bool redirect_stderr = false) -> std::vector { - return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter, - redirect_stderr); + return paddle::framework::shell_execute_cmd( + cmd, time_out, sleep_inter, redirect_stderr); }, - py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, + py::arg("cmd"), + py::arg("time_out") = 0, + py::arg("sleep_inter") = 0, py::arg("redirect_stderr") = false); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -3064,13 +3217,16 @@ All parameter, weight, gradient are variables in Paddle. #endif m.def("set_feed_variable", - static_cast(&framework::SetFeedVariable)); + static_cast( + &framework::SetFeedVariable)); m.def("set_feed_variable", - static_cast(&framework::SetFeedVariable)); + static_cast( + &framework::SetFeedVariable)); m.def("get_fetch_variable", - [](const Scope &scope, const std::string &var_name, + [](const Scope &scope, + const std::string &var_name, size_t index) -> py::object { auto &var = framework::GetFetchVariable(scope, var_name, index); if (data_is_lod_tensor(var)) { @@ -3125,7 +3281,8 @@ All parameter, weight, gradient are variables in Paddle. .def("__len__", [](LoDTensorArray &self) { return self.size(); }) .def("__setitem__", [](LoDTensorArray &self, size_t i, const LoDTensor &t) { - PADDLE_ENFORCE_LT(i, self.size(), + PADDLE_ENFORCE_LT(i, + self.size(), platform::errors::InvalidArgument( "The index to set is larger than the size " "of LoDTensorArray.")); @@ -3139,7 +3296,8 @@ All parameter, weight, gradient are variables in Paddle. self.back().ShareDataWith(t); self.back().set_lod(t.lod()); }, - py::arg("tensor"), R"DOC( + py::arg("tensor"), + R"DOC( Append a LoDensor to LoDTensorArray. Args: @@ -3376,18 +3534,20 @@ All parameter, weight, gradient are variables in Paddle. m.def("reset_profiler", platform::ResetProfiler); m.def("register_pass", [](const std::string &pass_type, py::object callable) { PADDLE_ENFORCE_EQ( - framework::ir::PassRegistry::Instance().Has(pass_type), false, + framework::ir::PassRegistry::Instance().Has(pass_type), + false, platform::errors::AlreadyExists("Pass '%s' is registered more than " "once. 
Please use another name.", pass_type)); callable.inc_ref(); - framework::ir::PassRegistry::Instance().Insert(pass_type, [pass_type, - callable]() { - py::gil_scoped_acquire guard; - std::unique_ptr pass( - new framework::ir::GeneratePass(py::cast(callable()))); - return pass; - }); + framework::ir::PassRegistry::Instance().Insert( + pass_type, [pass_type, callable]() { + py::gil_scoped_acquire guard; + std::unique_ptr pass( + new framework::ir::GeneratePass( + py::cast(callable()))); + return pass; + }); }); m.def("get_pass", [](const std::string &pass_type) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_type); @@ -3397,11 +3557,32 @@ All parameter, weight, gradient are variables in Paddle. m.def("size_of_dtype", framework::SizeOfType); py::class_(m, "_ProfilerResult") .def(py::init<>()) - .def("get_data", &paddle::platform::ProfilerResult::GetData, + .def("get_data", + &paddle::platform::ProfilerResult::GetData, py::return_value_policy::automatic_reference) .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo); + py::class_(m, "MemPythonNode") + .def(py::init<>()) + .def_readwrite("timestamp_ns", + &paddle::platform::MemPythonNode::timestamp_ns) + .def_readwrite("addr", &paddle::platform::MemPythonNode::addr) + .def_readwrite("type", &paddle::platform::MemPythonNode::type) + .def_readwrite("process_id", &paddle::platform::MemPythonNode::process_id) + .def_readwrite("thread_id", &paddle::platform::MemPythonNode::thread_id) + .def_readwrite("increase_bytes", + &paddle::platform::MemPythonNode::increase_bytes) + .def_readwrite("place", &paddle::platform::MemPythonNode::place) + .def_readwrite("current_allocated", + &paddle::platform::MemPythonNode::current_allocated) + .def_readwrite("current_reserved", + &paddle::platform::MemPythonNode::current_reserved) + .def_readwrite("peak_allocated", + &paddle::platform::MemPythonNode::peak_allocated) + .def_readwrite("peak_reserved", + &paddle::platform::MemPythonNode::peak_reserved); + py::class_(m, "DevicePythonNode") .def(py::init<>()) .def_readwrite("name", &paddle::platform::DevicePythonNode::name) @@ -3424,15 +3605,22 @@ All parameter, weight, gradient are variables in Paddle. .def_readwrite("process_id", &paddle::platform::HostPythonNode::process_id) .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) + .def_readwrite("input_shapes", + &paddle::platform::HostPythonNode::input_shapes) + .def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes) + .def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack) .def_readwrite("children_node", &paddle::platform::HostPythonNode::children_node_ptrs) .def_readwrite("runtime_node", &paddle::platform::HostPythonNode::runtime_node_ptrs) .def_readwrite("device_node", - &paddle::platform::HostPythonNode::device_node_ptrs); + &paddle::platform::HostPythonNode::device_node_ptrs) + .def_readwrite("mem_node", + &paddle::platform::HostPythonNode::mem_node_ptrs); py::class_(m, "_Profiler") - .def("create", &paddle::platform::Profiler::Create, + .def("create", + &paddle::platform::Profiler::Create, py::return_value_policy::take_ownership) .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("is_cnpapi_supported", @@ -3466,6 +3654,14 @@ All parameter, weight, gradient are variables in Paddle. 
})) .def("end", [](platform::RecordEvent *event) { event->End(); }); + py::enum_(m, "TracerMemEventType") + .value("Allocate", paddle::platform::TracerMemEventType::Allocate) + .value("Free", paddle::platform::TracerMemEventType::Free) + .value("ReservedAllocate", + paddle::platform::TracerMemEventType::ReservedAllocate) + .value("ReservedFree", + paddle::platform::TracerMemEventType::ReservedFree); + py::enum_(m, "TracerEventType") .value("Operator", paddle::platform::TracerEventType::Operator) .value("Dataloader", paddle::platform::TracerEventType::Dataloader) @@ -3509,22 +3705,29 @@ All parameter, weight, gradient are variables in Paddle. [](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set", [](ir::Pass &self, const std::string &name, - bool val) { self.Set(name, new bool(val)); }) - .def("set", [](ir::Pass &self, const std::string &name, - int val) { self.Set(name, new int(val)); }) .def("set", - [](ir::Pass &self, const std::string &name, + [](ir::Pass &self, const std::string &name, bool val) { + self.Set(name, new bool(val)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, int val) { + self.Set(name, new int(val)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, std::vector set) { self.Set(name, new std::vector(set)); }) .def("set", - [](ir::Pass &self, const std::string &name, + [](ir::Pass &self, + const std::string &name, std::unordered_set set) { self.Set(name, new std::unordered_set(set)); }) .def("set", - [](ir::Pass &self, const std::string &name, + [](ir::Pass &self, + const std::string &name, std::unordered_set set) { self.Set(name, new std::unordered_set(set)); }) @@ -3769,7 +3972,8 @@ All parameter, weight, gradient are variables in Paddle. "reduce_strategy", [](const BuildStrategy &self) { return self.reduce_; }, [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3799,7 +4003,8 @@ All parameter, weight, gradient are variables in Paddle. [](const BuildStrategy &self) { return self.gradient_scale_; }, [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3864,7 +4069,8 @@ All parameter, weight, gradient are variables in Paddle. "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3891,7 +4097,8 @@ All parameter, weight, gradient are variables in Paddle. return self.enable_sequential_execution_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3917,7 +4124,8 @@ All parameter, weight, gradient are variables in Paddle. 
return self.remove_unnecessary_lock_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -3995,7 +4203,8 @@ All parameter, weight, gradient are variables in Paddle. return self.fuse_elewise_add_act_ops_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4020,7 +4229,8 @@ All parameter, weight, gradient are variables in Paddle. "fuse_gemm_epilogue", [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4045,7 +4255,8 @@ All parameter, weight, gradient are variables in Paddle. "fuse_bn_act_ops", [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4070,7 +4281,8 @@ All parameter, weight, gradient are variables in Paddle. "fuse_bn_add_act_ops", [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4095,7 +4307,8 @@ All parameter, weight, gradient are variables in Paddle. "enable_auto_fusion", [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4123,7 +4336,8 @@ All parameter, weight, gradient are variables in Paddle. return self.fuse_relu_depthwise_conv_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4153,7 +4367,8 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_broadcast_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, " "cannot be configured again.")); @@ -4184,7 +4399,8 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_all_optimizer_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, " "cannot be configured again.")); @@ -4194,7 +4410,8 @@ All parameter, weight, gradient are variables in Paddle. 
"sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), true, + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); @@ -4348,9 +4565,13 @@ All parameter, weight, gradient are variables in Paddle. }); pe.def(py::init &, - const std::vector &, const std::string &, - Scope *, std::vector &, const ExecutionStrategy &, - const BuildStrategy &, ir::Graph *>()) + const std::vector &, + const std::string &, + Scope *, + std::vector &, + const ExecutionStrategy &, + const BuildStrategy &, + ir::Graph *>()) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* @@ -4439,7 +4660,8 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_THROW(platform::errors::Unimplemented( "Failed to convert type: %s when set IpuStrategy " "option: %s", - option.get_type(), option_name)); + option.get_type(), + option_name)); } self.InsertStringOption(option_name, option_val); } @@ -4447,7 +4669,8 @@ All parameter, weight, gradient are variables in Paddle. if (option_name.rfind("location_", 0) == 0) { for (auto option : element.second.cast()) { self.SetTensorLocation( - option_name, option.first.cast(), + option_name, + option.first.cast(), option.second.cast()); } } else if (option_name == "replicated_collectives_settings") { @@ -4501,17 +4724,19 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_THROW(platform::errors::Unimplemented( "Failed to convert value type: %s when set " "IpuStrategy option: %s", - option.second.get_type(), option_key)); + option.second.get_type(), + option_key)); } - self.InsertStringPairOption(option_name, option_key, - option_val); + self.InsertStringPairOption( + option_name, option_key, option_val); } } } else { PADDLE_THROW(platform::errors::InvalidArgument( "Invalid IpuStrategy option value type: %s, please check " "input value for option: %s", - element.second.get_type(), option_name)); + element.second.get_type(), + option_name)); } } }) -- GitLab