Unverified commit 8dd0a3b9, authored by chenjian, committed by GitHub

record memory and op supplement info (#43550)

* record memory and op supplement info

* update

* update

* fix a bug

* fix memory recording

* fix a bug

* update

* update

* fix a bug

* update

* fix a bug

* fix a bug

* fix a bug

* Revert "fix a bug"

This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5.

* fix a bug

* fix format

* fix
Parent e64823c1
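
Apart from the mechanical clang-format reflow (one argument per line in PADDLE_ENFORCE_* and similar calls), the substantive change wires two profiler hooks through the framework: platform::RecordOpInfoSupplement(...) is emitted right after an operator's InferShape runs, and the allocators emit platform::RecordMemEvent(...). Below is a minimal sketch of the op-side pattern, assembled only from calls visible in the hunks that follow; the wrapper function and its parameters are placeholders, and it compiles only inside a Paddle source tree.

#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"

// Sketch, not part of the commit: the recording pattern this diff adds
// around InferShape in OperatorWithKernel::RunImpl and InterpreterCore.
void RunInferShapeTraced(OperatorWithKernel* op, RuntimeContext* runtime_ctx) {
  platform::RecordEvent record_event("infer_shape",
                                     platform::TracerEventType::OperatorInner,
                                     1,
                                     platform::EventRole::kInnerOp);
  RuntimeInferShapeContext infer_shape_ctx(*op, *runtime_ctx);
  op->Info().infer_shape_(&infer_shape_ctx);
  record_event.End();  // close the timing event before attaching supplements
  // New hook: record the op's type, attributes, and input shapes/dtypes
  // so the profiler timeline can display them per op.
  platform::RecordOpInfoSupplement(
      op->Type(), op->Attrs(), infer_shape_ctx, *runtime_ctx);
}
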
......@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
op_with_kernel->Info().infer_shape_(
instr_node.InnerInferShapeContext().get());
}
infershape_event.End();
platform::RecordOpInfoSupplement(op->Type(),
op->Attrs(),
*(instr_node.InnerInferShapeContext()),
*(instr_node.InnerRuntimeContext()));
}
}
......
......@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/kernel_context.h"
......@@ -70,7 +71,8 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name,
static DDim GetDimsDebug(const ScopeBase& scope,
const std::string& name,
bool get_actual_dim = false) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
......@@ -264,7 +266,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
Type(), platform::TracerEventType::Operator, 1);
auto op_name = platform::OpName(outputs_, Type());
platform::RecordEvent op_name_record_event(
op_name, platform::TracerEventType::Operator,
op_name,
platform::TracerEventType::Operator,
FLAGS_enable_host_event_recorder_hook ? 20 : 1,
platform::EventRole::kUniqueOp);
RunImpl(scope, place);
......@@ -293,9 +296,11 @@ bool OperatorBase::HasInputs(const std::string& name) const {
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(
ins.size(), 1UL,
ins.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's input %s should contain only one variable.", type_,
"Operator %s's input %s should contain only one variable.",
type_,
name));
return ins.empty() ? kEmptyVarName : ins[0];
}
......@@ -304,9 +309,10 @@ const std::vector<std::string>& OperatorBase::Inputs(
const std::string& name) const {
auto it = inputs_.find(name);
PADDLE_ENFORCE_NE(
it, inputs_.end(),
platform::errors::NotFound("Operator %s does not have the input %s.",
type_, name));
it,
inputs_.end(),
platform::errors::NotFound(
"Operator %s does not have the input %s.", type_, name));
return it->second;
}
......@@ -321,9 +327,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const {
std::string OperatorBase::Output(const std::string& name) const {
auto& outs = Outputs(name);
PADDLE_ENFORCE_LE(
outs.size(), 1UL,
outs.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's output %s should contain only one variable.", type_,
"Operator %s's output %s should contain only one variable.",
type_,
name));
return outs.empty() ? kEmptyVarName : outs[0];
}
......@@ -332,7 +340,8 @@ const std::vector<std::string>& OperatorBase::Outputs(
const std::string& name) const {
auto it = outputs_.find(name);
PADDLE_ENFORCE_NE(
it, outputs_.end(),
it,
outputs_.end(),
platform::errors::NotFound(
"Operator %s does not have an output called %s.", type_, name));
return it->second;
......@@ -480,18 +489,20 @@ void OperatorBase::CheckAllInputOutputSet() const {
for (auto& in : info_->Proto().inputs()) {
if (!in.dispensable() && !in.extra()) {
PADDLE_ENFORCE_NE(
inputs_.find(in.name()), inputs_.end(),
platform::errors::NotFound("Operator %s's input (%s) is not set.",
Type(), in.name()));
inputs_.find(in.name()),
inputs_.end(),
platform::errors::NotFound(
"Operator %s's input (%s) is not set.", Type(), in.name()));
}
}
for (auto& out : info_->Proto().outputs()) {
if (!out.dispensable() && !out.extra()) {
PADDLE_ENFORCE_NE(
outputs_.find(out.name()), outputs_.end(),
platform::errors::NotFound("Operator %s's output (%s) is not set.",
Type(), out.name()));
outputs_.find(out.name()),
outputs_.end(),
platform::errors::NotFound(
"Operator %s's output (%s) is not set.", Type(), out.name()));
}
}
}
......@@ -564,10 +575,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
if (it == ctx_.inputs.end()) return nullptr;
PADDLE_ENFORCE_LE(
it->second.size(), 1UL,
it->second.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's input %s should contain only one variable.",
op_.Type(), name));
op_.Type(),
name));
return it->second.empty() ? nullptr : it->second[0];
}
......@@ -576,10 +589,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
if (it == ctx_.outputs.end()) return nullptr;
PADDLE_ENFORCE_LE(
it->second.size(), 1UL,
it->second.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's output %s should contain only one variable.",
op_.Type(), name));
op_.Type(),
name));
return it->second.empty() ? nullptr : it->second[0];
}
......@@ -594,10 +609,13 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
}
std::vector<const Tensor*> res;
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
std::transform(vars.begin(),
vars.end(),
std::back_inserter(res),
[&](const Variable* var) -> const Tensor* {
if (var == nullptr) return nullptr;
PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(),
true,
platform::errors::InvalidArgument(
"Input variable should be LoDTensor, "
"but the received type is %s.",
......@@ -617,7 +635,9 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
}
std::vector<Tensor*> res;
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
std::transform(vars.begin(),
vars.end(),
std::back_inserter(res),
[&](Variable* var) -> Tensor* {
return var == nullptr ? nullptr
: var->GetMutable<LoDTensor>();
......@@ -675,7 +695,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const auto& in = it->second;
if (in.size() == 0) return false;
PADDLE_ENFORCE_EQ(
in.size(), 1UL,
in.size(),
1UL,
platform::errors::InvalidArgument(
"Input %s should not contain more than one inputs.", name));
return in[0] != nullptr;
......@@ -693,7 +714,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
return false;
}
PADDLE_ENFORCE_EQ(
out.size(), 1UL,
out.size(),
1UL,
platform::errors::InvalidArgument(
"Output %s should not contain more than one outputs.", name));
return out[0] != nullptr;
......@@ -750,11 +772,14 @@ class RuntimeInferShapeContext : public InferShapeContext {
std::string GetInputNameByIdx(size_t idx) const override {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
PADDLE_ENFORCE_LT(idx,
op_proto->inputs().size(),
platform::errors::OutOfRange(
"The index should be less than the size of inputs of "
"operator %s, but got index is %d and size is %d",
op_.Type(), idx, op_proto->inputs().size()));
op_.Type(),
idx,
op_proto->inputs().size()));
return op_proto->inputs()[idx].name();
}
......@@ -762,42 +787,55 @@ class RuntimeInferShapeContext : public InferShapeContext {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
PADDLE_ENFORCE_LT(
idx, op_proto->outputs().size(),
idx,
op_proto->outputs().size(),
platform::errors::OutOfRange(
"The index should be less than the size of outputs of "
"operator %s, but got index is %d and size is %d",
op_.Type(), idx, op_proto->outputs().size()));
op_.Type(),
idx,
op_proto->outputs().size()));
return op_proto->outputs()[idx].name();
}
void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
void ShareDim(const std::string& in,
const std::string& out,
size_t i = 0,
size_t j = 0) override {
auto in_it = ctx_.inputs.find(in);
auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_NE(
in_it, ctx_.inputs.end(),
in_it,
ctx_.inputs.end(),
platform::errors::NotFound("Input %s does not exist.", in));
PADDLE_ENFORCE_NE(
out_it, ctx_.outputs.end(),
out_it,
ctx_.outputs.end(),
platform::errors::NotFound("Output %s does not exist.", out));
PADDLE_ENFORCE_LT(i, in_it->second.size(),
PADDLE_ENFORCE_LT(i,
in_it->second.size(),
platform::errors::InvalidArgument(
"The index of input dimension is out of range, "
"excepted index less than %zu, but received %zu.",
in_it->second.size(), i));
PADDLE_ENFORCE_LT(j, out_it->second.size(),
in_it->second.size(),
i));
PADDLE_ENFORCE_LT(j,
out_it->second.size(),
platform::errors::InvalidArgument(
"The index of output dimension is out of range, "
"excepted index less than %zu, but received %zu.",
out_it->second.size(), j));
out_it->second.size(),
j));
Variable* in_var = in_it->second[i];
Variable* out_var = out_it->second[j];
PADDLE_ENFORCE_EQ(
in_var->Type(), out_var->Type(),
in_var->Type(),
out_var->Type(),
platform::errors::InvalidArgument(
"The type of input (%s) and output (%s) are inconsistent.", in,
"The type of input (%s) and output (%s) are inconsistent.",
in,
out));
if (in_var->IsType<phi::SelectedRows>()) {
......@@ -821,19 +859,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::string& out) const override {
auto in_it = ctx_.inputs.find(in);
auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(),
PADDLE_ENFORCE_NE(in_it,
ctx_.inputs.end(),
platform::errors::NotFound(
"Input [%s] found error in Op [%s]", in, op_.Type()));
PADDLE_ENFORCE_NE(
out_it, ctx_.outputs.end(),
platform::errors::NotFound("Output [%s] found error in Op [%s]", out,
op_.Type()));
out_it,
ctx_.outputs.end(),
platform::errors::NotFound(
"Output [%s] found error in Op [%s]", out, op_.Type()));
auto& in_var_list = in_it->second;
auto& out_var_list = out_it->second;
PADDLE_ENFORCE_EQ(
in_var_list.size(), out_var_list.size(),
in_var_list.size(),
out_var_list.size(),
platform::errors::PreconditionNotMet(
"Op [%s]: Input var size should be equal with output var size",
op_.Type()));
......@@ -848,10 +889,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
Variable* in_var = in_var_list[i];
if (!in_var->IsType<LoDTensor>()) return;
Variable* out_var = out_var_list[i];
PADDLE_ENFORCE_EQ(out_var->IsType<LoDTensor>(), true,
PADDLE_ENFORCE_EQ(out_var->IsType<LoDTensor>(),
true,
platform::errors::PreconditionNotMet(
"The %d-th output of Output(%s) must be LoDTensor.",
i, out_var_names[i]));
i,
out_var_names[i]));
auto& in_tensor = in_var->Get<LoDTensor>();
auto* out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->set_lod(in_tensor.lod());
......@@ -862,32 +905,41 @@ class RuntimeInferShapeContext : public InferShapeContext {
}
}
void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
void ShareLoD(const std::string& in,
const std::string& out,
size_t i = 0,
size_t j = 0) const override {
auto in_it = ctx_.inputs.find(in);
auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_NE(
in_it, ctx_.inputs.end(),
in_it,
ctx_.inputs.end(),
platform::errors::NotFound("Input %s does not exist.", in));
PADDLE_ENFORCE_NE(
out_it, ctx_.outputs.end(),
out_it,
ctx_.outputs.end(),
platform::errors::NotFound("Output %s does not exist.", out));
PADDLE_ENFORCE_LT(i, in_it->second.size(),
PADDLE_ENFORCE_LT(i,
in_it->second.size(),
platform::errors::InvalidArgument(
"The index of input dimension is out of range, "
"excepted index less than %zu, but received %zu.",
in_it->second.size(), i));
PADDLE_ENFORCE_LT(j, out_it->second.size(),
in_it->second.size(),
i));
PADDLE_ENFORCE_LT(j,
out_it->second.size(),
platform::errors::InvalidArgument(
"The index of output dimension is out of range, "
"excepted index less than %zu, but received %zu.",
out_it->second.size(), j));
out_it->second.size(),
j));
Variable* in_var = in_it->second.at(i);
if (!in_var->IsType<LoDTensor>()) return;
Variable* out_var = out_it->second.at(j);
PADDLE_ENFORCE_EQ(
out_var->IsType<LoDTensor>(), true,
out_var->IsType<LoDTensor>(),
true,
platform::errors::InvalidArgument(
"The %zu-th output of Output(%s) must be LoDTensor.", j, out));
auto& in_tensor = in_var->Get<LoDTensor>();
......@@ -922,7 +974,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
"set in the runtime kernel."));
}
void SetLoDLevel(const std::string& out, int32_t lod_level,
void SetLoDLevel(const std::string& out,
int32_t lod_level,
size_t j = 0) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"SetLoDLevel is only used in compile time. The calculation of "
......@@ -965,10 +1018,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
DDim GetInputDim(const std::string& name) const override {
const std::vector<Variable*>& vars = InputVars(name);
PADDLE_ENFORCE_EQ(
vars.size(), 1UL,
vars.size(),
1UL,
platform::errors::InvalidArgument(
"Input(%s) should hold one element, but now it holds %zu elements.",
name, vars.size()));
name,
vars.size()));
return this->GetDim(vars[0]);
}
......@@ -994,10 +1049,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
void SetOutputDim(const std::string& name, const DDim& dim) override {
auto& vars = OutputVars(name);
PADDLE_ENFORCE_EQ(
vars.size(), 1UL,
vars.size(),
1UL,
platform::errors::InvalidArgument("Output(%s) should hold one element, "
"but now it holds %zu elements.",
name, vars.size()));
name,
vars.size()));
SetDim(vars[0], dim);
}
......@@ -1034,7 +1091,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
std::vector<DDim> ret;
ret.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(ret),
std::transform(vars.begin(),
vars.end(),
std::back_inserter(ret),
[this](Variable* var) { return this->GetDim(var); });
return ret;
}
......@@ -1060,12 +1119,14 @@ class RuntimeInferShapeContext : public InferShapeContext {
void SetDims(const std::vector<Variable*>& vars,
const std::vector<DDim>& dims) {
size_t length = vars.size();
PADDLE_ENFORCE_EQ(length, dims.size(),
PADDLE_ENFORCE_EQ(length,
dims.size(),
platform::errors::InvalidArgument(
"The number of input variables do not match the "
"number of input dimensions, the number of variables "
"is %zu, the number of dimensions is %zu.",
length, dims.size()));
length,
dims.size()));
for (size_t i = 0; i < length; ++i) {
if (vars[i] == nullptr) {
continue;
......@@ -1084,9 +1145,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::vector<Variable*>& vars) const {
std::vector<proto::VarType::Type> retv;
retv.resize(vars.size());
std::transform(vars.begin(), vars.end(), retv.begin(),
std::transform(vars.begin(),
vars.end(),
retv.begin(),
std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType),
this, std::placeholders::_1));
this,
std::placeholders::_1));
return retv;
}
......@@ -1098,7 +1162,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::vector<Variable*>& InputVars(const std::string& name) const {
auto it = ctx_.inputs.find(name);
PADDLE_ENFORCE_NE(
it, ctx_.inputs.end(),
it,
ctx_.inputs.end(),
platform::errors::NotFound(
"Operator (%s) does not have the input (%s).", op_.Type(), name));
return it->second;
......@@ -1107,7 +1172,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::vector<Variable*>& OutputVars(const std::string& name) const {
auto it = ctx_.outputs.find(name);
PADDLE_ENFORCE_NE(
it, ctx_.outputs.end(),
it,
ctx_.outputs.end(),
platform::errors::NotFound(
"Operator (%s) does not have the outputs (%s).", op_.Type(), name));
return it->second;
......@@ -1143,20 +1209,23 @@ static void CheckTensorNANOrInf(const std::string& op_type,
return;
}
PADDLE_ENFORCE_NE(
framework::TensorContainsInf(tensor), true,
platform::errors::Fatal("Operator %s output Tensor %s contains Inf.",
op_type, name));
framework::TensorContainsInf(tensor),
true,
platform::errors::Fatal(
"Operator %s output Tensor %s contains Inf.", op_type, name));
PADDLE_ENFORCE_NE(
framework::TensorContainsNAN(tensor), true,
platform::errors::Fatal("Operator %s output Tensor %s contains NAN.",
op_type, name));
framework::TensorContainsNAN(tensor),
true,
platform::errors::Fatal(
"Operator %s output Tensor %s contains NAN.", op_type, name));
}
bool OperatorWithKernel::SupportGPU() const {
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
phi::TransToPhiKernelName(type_));
auto has_phi_kernel =
std::any_of(phi_kernels.begin(), phi_kernels.end(),
std::any_of(phi_kernels.begin(),
phi_kernels.end(),
[](phi::KernelKeyMap::const_reference kern_pair) {
return kern_pair.first.backend() == phi::Backend::GPU;
});
......@@ -1169,7 +1238,8 @@ bool OperatorWithKernel::SupportGPU() const {
} else {
auto& op_kernels = kernel_iter->second;
return std::any_of(
op_kernels.begin(), op_kernels.end(),
op_kernels.begin(),
op_kernels.end(),
[](OpKernelMap::const_reference kern_pair) {
return platform::is_gpu_place(kern_pair.first.place_);
});
......@@ -1181,7 +1251,8 @@ bool OperatorWithKernel::SupportNPU() const {
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
phi::TransToPhiKernelName(type_));
auto has_phi_kernel =
std::any_of(phi_kernels.begin(), phi_kernels.end(),
std::any_of(phi_kernels.begin(),
phi_kernels.end(),
[](phi::KernelKeyMap::const_reference kern_pair) {
return kern_pair.first.backend() == phi::Backend::NPU;
});
......@@ -1194,7 +1265,8 @@ bool OperatorWithKernel::SupportNPU() const {
} else {
auto& op_kernels = kernel_iter->second;
return std::any_of(
op_kernels.begin(), op_kernels.end(),
op_kernels.begin(),
op_kernels.end(),
[](OpKernelMap::const_reference kern_pair) {
return platform::is_npu_place(kern_pair.first.place_);
});
......@@ -1214,7 +1286,8 @@ bool OperatorWithKernel::SupportsMKLDNN(
return false;
}
auto& op_kernels = op_kernel_iter->second;
return std::any_of(op_kernels.begin(), op_kernels.end(),
return std::any_of(op_kernels.begin(),
op_kernels.end(),
[data_type](OpKernelMap::const_reference kern_pair) {
return platform::is_cpu_place(kern_pair.first.place_) &&
kern_pair.first.library_type_ ==
......@@ -1496,10 +1569,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
{
platform::RecordEvent record_event("prepare_data",
platform::TracerEventType::OperatorInner,
1, platform::EventRole::kInnerOp);
1,
platform::EventRole::kInnerOp);
if (need_prepare_data_) {
transfer_scope = PrepareData(scope, *kernel_type_,
&transfered_inplace_vars, runtime_ctx);
transfer_scope = PrepareData(
scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
}
}
// exec scope is the scope that kernel actually executed on.
......@@ -1509,9 +1583,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
if (!all_kernels_must_compute_runtime_shape_) {
platform::RecordEvent record_event("infer_shape",
platform::TracerEventType::OperatorInner,
1, platform::EventRole::kInnerOp);
1,
platform::EventRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->Info().infer_shape_(&infer_shape_ctx);
record_event.End();
platform::RecordOpInfoSupplement(
Type(), Attrs(), infer_shape_ctx, *runtime_ctx);
}
if (FLAGS_enable_unused_var_check) {
......@@ -1523,7 +1601,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
{
platform::RecordEvent record_event("compute",
platform::TracerEventType::OperatorInner,
1, platform::EventRole::kInnerOp);
1,
platform::EventRole::kInnerOp);
if (run_phi_kernel_) {
phi::KernelContext pt_kernel_context;
// Do data transform before building KernelContext
......@@ -1663,7 +1742,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
PADDLE_ENFORCE_NE(
kernels_iter, all_op_kernels.end(),
kernels_iter,
all_op_kernels.end(),
platform::errors::Unavailable(
"There are no kernels which are registered in the %s operator.",
type_));
......@@ -1785,9 +1865,11 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
"Operator (%s) does not have kernel for %s.", type_,
PADDLE_ENFORCE_NE(
kernel_iter,
kernels.end(),
platform::errors::NotFound("Operator (%s) does not have kernel for %s.",
type_,
KernelTypeToString(expected_kernel_key)));
std::lock_guard<std::mutex> lock(cache_update_mutex_);
......@@ -1798,7 +1880,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
......@@ -1809,7 +1892,8 @@ void OperatorWithKernel::TransferInplaceVarsBack(
auto* original_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
auto* var = transfer_scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
PADDLE_ENFORCE_NOT_NULL(var,
platform::errors::InvalidArgument(
"The variable[%s] is nullptr.", var_name));
auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto original_dims = original_tensor->dims();
......@@ -1890,7 +1974,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad(
}
Scope* OperatorWithKernel::PrepareData(
const Scope& scope, const OpKernelType& expected_kernel_key,
const Scope& scope,
const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars,
RuntimeContext* ctx) const {
Scope* new_scope = nullptr;
......@@ -1947,8 +2032,8 @@ Scope* OperatorWithKernel::PrepareData(
input_vars[i] = trans_var;
auto out = trans_var->GetMutable<LoDTensor>();
out->Resize(tensor_in->dims());
platform::MatchShapeToLayout(out, tensor_in->layout(),
DataLayout::kNHWC);
platform::MatchShapeToLayout(
out, tensor_in->layout(), DataLayout::kNHWC);
VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , "
"but kNHWC layout"
<< var_name_item.first << " in Operator " << type_;
......@@ -1995,8 +2080,8 @@ Scope* OperatorWithKernel::PrepareData(
if (!run_by_executor_ &&
(platform::is_gpu_place(kernel_type_for_var.place_) ||
platform::is_gpu_place(expected_kernel_key.place_))) {
new_scope = TryCreateTransferScope(kernel_type_for_var,
expected_kernel_key, &scope);
new_scope = TryCreateTransferScope(
kernel_type_for_var, expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
}
if (!new_scope) {
......@@ -2058,7 +2143,8 @@ Scope* OperatorWithKernel::PrepareData(
}
void OperatorWithKernel::ParseInputDataType(
const Variable* var, const std::string& name,
const Variable* var,
const std::string& name,
proto::VarType::Type* data_type) const {
if (var != nullptr) {
const Tensor* t = nullptr;
......@@ -2078,17 +2164,20 @@ void OperatorWithKernel::ParseInputDataType(
}
if (t != nullptr) {
PADDLE_ENFORCE_EQ(
t->IsInitialized(), true,
t->IsInitialized(),
true,
platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
"contains uninitialized Tensor.",
Type(), name));
Type(),
name));
*data_type = paddle::framework::TransToProtoVarType(t->dtype());
}
}
}
void OperatorWithKernel::ParseMultiInputDataType(
const std::vector<Variable*>& vars, const std::string& name,
const std::vector<Variable*>& vars,
const std::string& name,
proto::VarType::Type* data_type) const {
proto::VarType::Type default_data_type =
static_cast<proto::VarType::Type>(-1);
......@@ -2112,10 +2201,12 @@ void OperatorWithKernel::ParseMultiInputDataType(
}
if (t != nullptr) {
PADDLE_ENFORCE_EQ(
t->IsInitialized(), true,
t->IsInitialized(),
true,
platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
"contains uninitialized Tensor.",
Type(), name));
Type(),
name));
proto::VarType::Type tmp =
paddle::framework::TransToProtoVarType(t->dtype());
PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type,
......@@ -2125,7 +2216,9 @@ void OperatorWithKernel::ParseMultiInputDataType(
"consistent or reigster GetExpectedKernelType. The "
"current variable type is (%s), but the "
"previous variable type is (%s).",
Type(), name, DataTypeToString(tmp),
Type(),
name,
DataTypeToString(tmp),
DataTypeToString(*data_type)));
*data_type = tmp;
}
......@@ -2146,7 +2239,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
}
}
PADDLE_ENFORCE_NE(
data_type, dafault_data_type,
data_type,
dafault_data_type,
platform::errors::NotFound(
"DataType should be indicated by input Variable at %s.", Type()));
return data_type;
......@@ -2163,12 +2257,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type);
}
PADDLE_ENFORCE_NE(
data_type, dafault_data_type,
data_type,
dafault_data_type,
platform::errors::InvalidArgument(
"The Input Variable(%s) of (%s) Operator used to determine kernel "
"data type is empty or not LoDTensor or SelectedRows or "
"LoDTensorArray.",
name, Type()));
name,
Type()));
return data_type;
}
......@@ -2200,11 +2296,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely(
t,
platform::errors::InvalidArgument(
"The Tensor of variable %s is nullptr when promote complex types."));
PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
PADDLE_ENFORCE_EQ(t->IsInitialized(),
true,
platform::errors::InvalidArgument(
"The Tensor in the %s Op's Input Variable %s(%s) is "
"not initialized.",
Type(), name, ctx.InputName(name)));
Type(),
name,
ctx.InputName(name)));
return t;
}
......@@ -2216,7 +2315,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely(
* the kernel data type.
*/
proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes(
const ExecutionContext& ctx, const std::string& name1,
const ExecutionContext& ctx,
const std::string& name1,
const std::string& name2) const {
// 1. Get tensor
auto* tensor_a = GetTensorFormInputSafely(ctx, name1);
......@@ -2238,10 +2338,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
}
OpKernelType OperatorWithKernel::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const std::string& var_name,
const Tensor& tensor,
const OpKernelType& expected_kernel_type) const {
return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
tensor.layout());
return OpKernelType(
expected_kernel_type.data_type_, tensor.place(), tensor.layout());
}
phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
......@@ -2264,16 +2365,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
}
Scope* OperatorWithKernel::PreparePhiData(
const Scope& scope, const phi::Kernel& pt_kernel,
const Scope& scope,
const phi::Kernel& pt_kernel,
const phi::KernelSignature& pt_kernel_signature,
RuntimeContext* ctx) const {
const auto& input_names = pt_kernel_signature.input_names;
auto input_defs = pt_kernel.args_def().input_defs();
PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
PADDLE_ENFORCE_EQ(input_names.size(),
input_defs.size(),
platform::errors::InvalidArgument(
"The size of inputs_args names (%d) must be equal to "
"the size of kernel input_defs (%d).",
input_names.size(), input_defs.size()));
input_names.size(),
input_defs.size()));
Scope* new_scope = nullptr;
auto& name_map = Inputs();
const std::unordered_set<std::string>* no_buffer_ins = nullptr;
......@@ -2362,7 +2466,8 @@ Scope* OperatorWithKernel::PreparePhiData(
}
void OperatorWithKernel::BuildPhiKernelContext(
const RuntimeContext& ctx, platform::DeviceContext* dev_ctx,
const RuntimeContext& ctx,
platform::DeviceContext* dev_ctx,
phi::KernelContext* pt_kernel_context) const {
pt_kernel_context->SetDeviceContext(dev_ctx);
......@@ -2374,23 +2479,29 @@ void OperatorWithKernel::BuildPhiKernelContext(
auto attr_defs = pt_kernel_->args_def().attribute_defs();
auto output_defs = pt_kernel_->args_def().output_defs();
PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
PADDLE_ENFORCE_EQ(input_names.size(),
input_defs.size(),
platform::errors::InvalidArgument(
"The size of inputs_args names (%d) must be equal to "
"the size of kernel input_defs (%d).",
input_names.size(), input_defs.size()));
input_names.size(),
input_defs.size()));
PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(),
PADDLE_ENFORCE_EQ(output_names.size(),
output_defs.size(),
platform::errors::InvalidArgument(
"The size of outputs_args names (%d) must be equal to "
"the size of kernel output_defs (%d).",
output_names.size(), output_defs.size()));
output_names.size(),
output_defs.size()));
PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(),
PADDLE_ENFORCE_EQ(attr_names.size(),
attr_defs.size(),
platform::errors::InvalidArgument(
"The size of attribute_args names (%d) must be equal "
"to the size of kernel attribute_defs (%d).",
attr_names.size(), attr_defs.size()));
attr_names.size(),
attr_defs.size()));
for (size_t i = 0; i < input_names.size(); ++i) {
auto it = ctx.inputs.find(input_names[i]);
......@@ -2572,7 +2683,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
break;
case phi::AttributeType::SCALARS: {
PADDLE_ENFORCE_NE(
attr_iter, Attrs().end(),
attr_iter,
Attrs().end(),
platform::errors::NotFound("(%s) is not found in AttributeMap when "
"buildind static KernelContext.",
attr_names[i]));
......@@ -2636,7 +2748,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
} break;
default: {
PADDLE_ENFORCE_NE(
attr_iter, Attrs().end(),
attr_iter,
Attrs().end(),
platform::errors::NotFound("(%s) is not found in AttributeMap when "
"buildind static KernelContext.",
attr_names[i]));
......
cc_library(
allocator
SRCS allocator.cc
DEPS place stats)
DEPS place stats profiler)
cc_library(
cpu_allocator
SRCS cpu_allocator.cc
......@@ -21,7 +21,7 @@ cc_library(
cc_library(
naive_best_fit_allocator
SRCS naive_best_fit_allocator.cc
DEPS allocator buddy_allocator profiler)
DEPS allocator buddy_allocator)
cc_test(
naive_best_fit_allocator_test
SRCS naive_best_fit_allocator_test.cc
......
......@@ -32,7 +32,8 @@
#endif
PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false,
init_allocated_mem,
false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
......@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
});
return a;
......@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
}
template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
void Free<platform::CPUPlace>(const platform::CPUPlace &place,
void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
......@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
return p;
}
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
void Free<platform::IPUPlace>(const platform::IPUPlace &place,
void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
......@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
}
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
ret,
XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], no enough memory", ret));
if (FLAGS_init_allocated_mem) {
......@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
void Free<platform::XPUPlace>(const platform::XPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
......@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList {
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
......@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList {
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE));
platform::NPUMaxChunkSize(),
EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
......@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
"%s, NpuMaxChunkSize %s, NPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
......@@ -331,7 +342,8 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
void Free<platform::NPUPlace>(const platform::NPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -384,7 +396,8 @@ void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
void *p, size_t size) {
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
GetNPUPinnedBuddyAllocator()->Free(p);
#else
......@@ -430,18 +443,21 @@ class GPUBuddyAllocatorList {
BuddyAllocator *Get(int gpu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(devices_[pos])),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
......@@ -493,8 +509,10 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
......@@ -515,7 +533,8 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
}
template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place,
void *p,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetGPUBuddyAllocator(place.device)->Free(p);
......@@ -584,7 +603,8 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p, size_t size) {
void *p,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetCUDAPinnedBuddyAllocator()->Free(p);
#else
......@@ -630,18 +650,21 @@ class MLUBuddyAllocatorList {
BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(), platform::MLUMaxChunkSize()));
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) "
......@@ -693,8 +716,10 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place))));
......@@ -711,7 +736,8 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
}
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place, void *p,
void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -759,10 +785,12 @@ class BuddyAllocatorList {
}
BuddyAllocator *Get(int dev_id) {
PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
PADDLE_ENFORCE_NE(init_flags_.find(dev_id),
init_flags_.end(),
platform::errors::OutOfRange(
"Cannot find %s %d, please check visible devices.",
device_type_, dev_id));
device_type_,
dev_id));
std::call_once(*init_flags_[dev_id], [this, dev_id] {
phi::DeviceManager::SetDevice(device_type_, dev_id);
......@@ -773,7 +801,8 @@ class BuddyAllocatorList {
new detail::CustomAllocator(device_type_, dev_id)),
phi::DeviceManager::GetMinChunkSize(place),
phi::DeviceManager::GetMaxChunkSize(place),
phi::DeviceManager::GetExtraPaddingSize(place), device_type_));
phi::DeviceManager::GetExtraPaddingSize(place),
device_type_));
});
return allocators_[dev_id].get();
......@@ -813,8 +842,11 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
"%s. ",
string::HumanReadableSize(size), place.GetDeviceType(), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.GetDeviceType(),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail)));
} else {
if (FLAGS_init_allocated_mem) {
......@@ -830,7 +862,8 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
}
template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
void Free<platform::CustomPlace>(const platform::CustomPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -922,8 +955,6 @@ namespace allocation {
phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
}
......@@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
paddle::platform::VisitPlace(
allocation->place(),
legacy::FreeVisitor(allocation->ptr(), allocation->size()));
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation;
}
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle {
namespace memory {
namespace allocation {
......@@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::ReservedFree);
delete allocation;
}
phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
......@@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(ptr,
platform::CUDAPinnedPlace(),
size,
platform::TracerMemEventType::ReservedAllocate);
return new Allocation(ptr, size, platform::CUDAPinnedPlace());
}
} // namespace allocation
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle {
namespace memory {
......@@ -30,14 +31,18 @@ class StatAllocator : public Allocator {
protected:
void FreeImpl(phi::Allocation* allocation) override {
if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
if (platform::is_cpu_place(allocation->place()) ||
platform::is_cuda_pinned_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(
Allocated, allocation->place().GetDeviceId(), -allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
DEVICE_MEMORY_STAT_UPDATE(
Allocated, allocation->place().GetDeviceId(), -allocation->size());
}
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Free);
underlying_allocator_->Free(allocation);
}
......@@ -48,12 +53,16 @@ class StatAllocator : public Allocator {
const platform::Place& place = allocation->place();
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
HOST_MEMORY_STAT_UPDATE(
Allocated, place.GetDeviceId(), allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
DEVICE_MEMORY_STAT_UPDATE(
Allocated, place.GetDeviceId(), allocation->size());
}
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Allocate);
return allocation.release();
}
......
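
On the memory side, the commit deletes the old MemEvenRecorder::Instance().PushMemRecord/PopMemRecord calls from NaiveBestFitAllocator (two file sections above) and standardizes on platform::RecordMemEvent. As the surrounding hunks show, StatAllocator reports Allocate/Free for memory handed to kernels, while the pinned and system allocators report ReservedAllocate/ReservedFree alongside their HOST_MEMORY_STAT_UPDATE(Reserved, ...) updates. A hedged sketch of the two layers follows; the function names and arguments are placeholders, and it compiles only inside a Paddle source tree.

#include "paddle/fluid/platform/profiler/mem_tracing.h"

// Sketch only; the real call sites are in stat_allocator.h,
// pinned_allocator.cc, and system_allocator.cc.
void OnKernelAlloc(void* ptr, const platform::Place& place, size_t size) {
  // StatAllocator layer: memory actually handed out to ops/kernels.
  platform::RecordMemEvent(
      ptr, place, size, platform::TracerMemEventType::Allocate);
}

void OnSystemReserve(void* ptr, size_t size) {
  // System-allocator layer: raw pages reserved from the OS or driver.
  platform::RecordMemEvent(ptr,
                           platform::CPUPlace(),
                           size,
                           platform::TracerMemEventType::ReservedAllocate);
}
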
......@@ -41,6 +41,7 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
......@@ -64,11 +65,13 @@ void* AlignedMalloc(size_t size) {
#else
int error = posix_memalign(&p, alignment, size);
PADDLE_ENFORCE_EQ(
error, 0,
error,
0,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size, error code is %d.", size, error));
#endif
PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted(
PADDLE_ENFORCE_NOT_NULL(p,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size.", size));
return p;
}
......@@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
}
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p;
}
......@@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}
bool CPUAllocator::UseGpu() const { return false; }
......@@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
gpu_id_, string::HumanReadableSize(size), gpu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
gpu_id_,
string::HumanReadableSize(size),
gpu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
gpu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void GPUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(gpu_alloc_size_, size,
PADDLE_ENFORCE_GE(gpu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, gpu_alloc_size_));
size,
gpu_alloc_size_));
gpu_alloc_size_ -= size;
platform::RecordedGpuFree(p, size, gpu_id_);
......@@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
*index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p;
} else {
LOG(WARNING) << "cudaHostAlloc failed.";
......@@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
gpuError_t err;
PADDLE_ENFORCE_EQ(index, 1,
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size,
PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated cuda pinned memory (%d)",
size, cuda_pinnd_alloc_size_));
size,
cuda_pinnd_alloc_size_));
cuda_pinnd_alloc_size_ -= size;
#ifdef PADDLE_WITH_HIP
err = hipHostFree(p);
if (err != hipErrorDeinitialized) {
PADDLE_ENFORCE_EQ(
err, hipSuccess,
err,
hipSuccess,
platform::errors::Fatal(
"hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
}
......@@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
// cudaFreeHost succeeds.
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE_EQ(
err, 0,
err,
0,
platform::errors::Fatal(
"cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
err));
}
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}
bool CUDAPinnedAllocator::UseGpu() const { return false; }
......@@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
npu_id_,
string::HumanReadableSize(size),
npu_id_,
string::HumanReadableSize(avail),
npu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
PADDLE_ENFORCE_GE(npu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
size,
npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
......@@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
aclError err;
PADDLE_ENFORCE_EQ(index, 1,
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated npu pinned memory (%d)",
size, npu_pinnd_alloc_size_));
size,
npu_pinnd_alloc_size_));
npu_pinnd_alloc_size_ -= size;
err = platform::NPUHostFree(p);
if (err != ACL_ERROR_NONE) {
PADDLE_ENFORCE_EQ(
err, 0,
err,
0,
platform::errors::Fatal(
"NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
}
......@@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_, string::HumanReadableSize(size), mlu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
mlu_id_,
string::HumanReadableSize(size),
mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_, size,
PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, mlu_alloc_size_));
size,
mlu_alloc_size_));
mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_);
......@@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
"\n\nOut of memory error on %s %d. "
"total memory is %s, used memory is %s, "
"available memory is only %s.\n\n",
dev_type_, dev_id_, string::HumanReadableSize(total),
dev_type_,
dev_id_,
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail),
string::HumanReadableSize(avail)));
}
......@@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
void CustomAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(plug_alloc_size, size,
PADDLE_ENFORCE_GE(plug_alloc_size,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, plug_alloc_size));
size,
plug_alloc_size));
plug_alloc_size -= size;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = phi::DeviceManager::GetDeviceWithPlace(place);
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/common/place.h"
#ifdef PADDLE_WITH_XPU
......@@ -33,8 +33,12 @@ namespace memory {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
template <>
void Copy<platform::CPUPlace, platform::CustomPlace>(
platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place,
const void* src, size_t num, void* stream) {
platform::CPUPlace dst_place,
void* dst,
platform::CustomPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
......@@ -52,8 +56,12 @@ void Copy<platform::CPUPlace, platform::CustomPlace>(
template <>
void Copy<platform::CustomPlace, platform::CPUPlace>(
platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, void* stream) {
platform::CustomPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place);
......@@ -70,8 +78,12 @@ void Copy<platform::CustomPlace, platform::CPUPlace>(
template <>
void Copy<platform::CustomPlace, platform::CustomPlace>(
platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place,
const void* src, size_t num, void* stream) {
platform::CustomPlace dst_place,
void* dst,
platform::CustomPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
......@@ -102,9 +114,11 @@ void Copy<platform::CustomPlace, platform::CustomPlace>(
#endif // PADDLE_WITH_CUSTOM_DEVICE
template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace,
void* dst,
platform::CPUPlace,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
std::memcpy(dst, src, num);
......@@ -115,7 +129,8 @@ template <>
void Copy<platform::IPUPlace, platform::CPUPlace>(platform::IPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -123,7 +138,8 @@ template <>
void Copy<platform::CPUPlace, platform::IPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::IPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -131,15 +147,18 @@ template <>
void Copy<platform::IPUPlace, platform::IPUPlace>(platform::IPUPlace dst_place,
void* dst,
platform::IPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
// NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace).
template <>
void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
if (src_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_src;
......@@ -152,8 +171,10 @@ void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place, void* dst,
// NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace).
template <>
void Copy<phi::Place, phi::IPUPlace>(phi::Place dst_place, void* dst,
phi::IPUPlace src_place, const void* src,
void Copy<phi::Place, phi::IPUPlace>(phi::Place dst_place,
void* dst,
phi::IPUPlace src_place,
const void* src,
size_t num) {
if (dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst;
......@@ -170,7 +191,8 @@ template <>
void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (num <= 0) {
VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
......@@ -182,7 +204,8 @@ template <>
void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (num <= 0) {
VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
......@@ -194,7 +217,8 @@ template <>
void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (num <= 0) {
VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
......@@ -204,8 +228,10 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
// NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace).
template <>
void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
if (src_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_src;
......@@ -218,8 +244,10 @@ void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place, void* dst,
// NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace).
template <>
void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place, void* dst,
phi::XPUPlace src_place, const void* src,
void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
void* dst,
phi::XPUPlace src_place,
const void* src,
size_t num) {
if (dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst;
......@@ -236,7 +264,8 @@ template <>
void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -248,7 +277,10 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, async operation after sync operation is ok, while sync operation
......@@ -267,7 +299,8 @@ template <>
void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -279,7 +312,10 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
......@@ -295,7 +331,8 @@ template <>
void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -307,7 +344,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
......@@ -329,7 +369,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
......@@ -346,8 +389,11 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
const void* src, size_t num) {
platform::CPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -356,8 +402,11 @@ void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num) {
platform::NPUPinnedPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -366,8 +415,11 @@ void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
platform::NPUPinnedPlace dst_place, void* dst,
platform::NPUPinnedPlace src_place, const void* src, size_t num) {
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -376,8 +428,12 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place,
const void* src, size_t num, void* stream) {
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
......@@ -389,7 +445,10 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
......@@ -404,8 +463,12 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
const void* src, size_t num, void* stream) {
platform::NPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
......@@ -417,7 +480,10 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, async operation after sync operation is ok, while sync operation
......@@ -435,9 +501,12 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, aclrtStream stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
......@@ -504,52 +573,76 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, aclrtStream stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, aclrtStream stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace)
template <>
void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, aclrtStream stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
src, num, stream);
void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place, void* dst,
phi::NPUPlace src_place, const void* src,
size_t num, aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src, num, stream);
void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place,
void* dst,
phi::NPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
......@@ -557,16 +650,20 @@ void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num) {
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
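A second recurring shape in these hunks is the nullable stream argument: when the caller passes a stream, the copy is enqueued asynchronously on it; when the stream is null, the device context is synchronized first and a blocking copy runs, and each branch is scoped by a RecordEvent with a direction label. A toy sketch of that control flow (the Stream type and function names here are invented scaffolding, not the device API):

// Sketch of the stream-nullable copy pattern: async on a caller-supplied
// stream, otherwise wait and copy synchronously (toy scaffolding only).
#include <cstdio>

struct Stream { int id; };

void AsyncCopy(Stream* s) { std::printf("async on stream %d\n", s->id); }
void SyncCopy() { std::printf("blocking copy after device wait\n"); }

void Copy(void* stream) {
  if (stream) {
    // A RecordEvent("MemcpyAsync:src->dst") would scope this branch.
    AsyncCopy(static_cast<Stream*>(stream));
  } else {
    // A RecordEvent("MemcpySync:src->dst") would scope this branch.
    SyncCopy();
  }
}

int main() {
  Stream s{7};
  Copy(&s);      // async path
  Copy(nullptr); // sync path
}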
......@@ -608,8 +705,12 @@ inline void SyncCUDAStream() {
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, void* stream) {
platform::CPUPlace dst_place,
void* dst,
platform::CUDAPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
......@@ -619,10 +720,16 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::RecordEvent record_event(
"GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -642,8 +749,12 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, void* stream) {
platform::CUDAPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device);
......@@ -653,10 +764,16 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::RecordEvent record_event(
"GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -676,8 +793,12 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, void* stream) {
platform::CUDAPlace dst_place,
void* dst,
platform::CUDAPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
......@@ -689,10 +810,16 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyDeviceToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyDeviceToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -710,22 +837,29 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU",
platform::TracerEventType::UserDefined,
1);
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, reinterpret_cast<gpuStream_t>(stream));
platform::GpuMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<gpuStream_t>(stream));
} else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU",
platform::TracerEventType::UserDefined,
1);
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
platform::GpuMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
template <>
void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
platform::CPUPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
platform::CPUPlace dst_place,
void* dst,
platform::CUDAPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -734,8 +868,11 @@ void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CPUPlace src_place, const void* src, size_t num) {
platform::CUDAPinnedPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -744,8 +881,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
platform::CUDAPinnedPlace dst_place,
void* dst,
platform::CUDAPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -754,8 +894,12 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num, void* stream) {
platform::CUDAPinnedPlace dst_place,
void* dst,
platform::CUDAPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
......@@ -765,10 +909,16 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -785,8 +935,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
platform::CUDAPlace dst_place,
void* dst,
platform::CUDAPinnedPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -798,10 +951,16 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -818,9 +977,12 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
// NOTE: only for CPUPlace, CUDAPlace and CUDAPinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
......@@ -887,52 +1049,76 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace)
template <>
void Copy<phi::GPUPlace, phi::Place>(phi::GPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
src, num, stream);
void Copy<phi::GPUPlace, phi::Place>(phi::GPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace)
template <>
void Copy<phi::Place, phi::GPUPlace>(phi::Place dst_place, void* dst,
phi::GPUPlace src_place, const void* src,
size_t num, void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src, num, stream);
void Copy<phi::Place, phi::GPUPlace>(phi::Place dst_place,
void* dst,
phi::GPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace)
template <>
void Copy<phi::GPUPinnedPlace, phi::Place>(phi::GPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace)
template <>
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::GPUPinnedPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
......@@ -940,16 +1126,20 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace) -> (CUDAPinnedPlace)
template <>
void Copy<phi::GPUPinnedPlace, phi::Place>(phi::GPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num) {
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}
// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::GPUPinnedPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
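After the pair-wise specializations come the type-erased overloads reformatted above: Copy<phi::Place, phi::Place> inspects AllocationType at runtime and forwards to the statically typed routine, and the convenience overloads rebuild a phi::Place from the typed argument before delegating. A compilable sketch of that dispatch follows; the Place struct and AllocationType enum are stand-ins, not phi's definitions.

// Sketch of runtime dispatch over type-erased places (stand-in types).
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <stdexcept>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
  AllocationType GetType() const { return type; }
};

void CopyCpuToCpu(void* dst, const void* src, std::size_t n) {
  std::memcpy(dst, src, n);
}

// The erased overload branches once on both place types, then forwards
// to the statically typed routine.
void Copy(Place dst_place, void* dst, Place src_place, const void* src,
          std::size_t n) {
  if (dst_place.GetType() == AllocationType::CPU &&
      src_place.GetType() == AllocationType::CPU) {
    CopyCpuToCpu(dst, src, n);
  } else {
    throw std::runtime_error("place pair not wired up in this sketch");
  }
}

int main() {
  char src[4] = "abc";
  char dst[4] = {};
  Copy(Place{AllocationType::CPU}, dst, Place{AllocationType::CPU}, src, 4);
  std::printf("%s\n", dst);
}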
......@@ -959,7 +1149,8 @@ template <>
void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -970,8 +1161,8 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2HAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyD2HAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
......@@ -988,7 +1179,8 @@ template <>
void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -999,8 +1191,8 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyH2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyH2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
......@@ -1017,7 +1209,8 @@ template <>
void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -1029,8 +1222,8 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyD2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
......@@ -1050,25 +1243,32 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<mluStream>(stream));
} else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
platform::MLUMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
// NOTE: only for CPUPlace and MLUPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
......@@ -1110,35 +1310,55 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
template <>
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
src, num, stream);
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
template <>
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place, void* dst,
phi::MLUPlace src_place, const void* src,
size_t num, void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src, num, stream);
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
void* dst,
phi::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
......@@ -1146,8 +1366,10 @@ void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
......@@ -1224,16 +1446,20 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num);
}
// NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num);
}
......@@ -1243,9 +1469,12 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
!defined(PADDLE_WITH_MLU)
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
......@@ -1265,17 +1494,23 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
}
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
#endif
......
......@@ -354,7 +354,9 @@ if(WITH_GPU)
enforce
dynload_cuda
new_profiler
stats)
stats
op_proto_maker
shape_inference)
nv_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......@@ -363,7 +365,14 @@ elseif(WITH_ROCM)
hip_library(
profiler
SRCS profiler.cc profiler.cu
DEPS os_info device_tracer gpu_info enforce new_profiler stats)
DEPS os_info
device_tracer
gpu_info
enforce
new_profiler
stats
op_proto_maker
shape_inference)
hip_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......@@ -372,7 +381,13 @@ else()
cc_library(
profiler
SRCS profiler.cc
DEPS os_info device_tracer enforce new_profiler stats)
DEPS os_info
device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference)
cc_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
......@@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_uint64(gpu_memory_limit_mb);
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log,
false,
"Whether to print the message of gpu memory usage "
"at exit, mainly used for UT and CI.");
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true,
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
true,
"Whether to print the message of gpu memory usage "
"MB as a unit of measurement.");
......@@ -66,7 +69,10 @@ namespace platform {
void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total,
RecordedGpuMemGetInfo(available,
total,
&actual_available,
&actual_total,
platform::GetCurrentDeviceId());
}
......@@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() {
static size_t GpuAllocSize(bool realloc) {
size_t available_to_alloc = GpuAvailableMemToAlloc();
PADDLE_ENFORCE_GT(
available_to_alloc, 0,
available_to_alloc,
0,
platform::errors::ResourceExhausted("Not enough available GPU memory."));
// If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
// allocated by fraction
......@@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) {
? flag_mb << 20
: available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(
available_to_alloc, alloc_bytes,
available_to_alloc,
alloc_bytes,
platform::errors::ResourceExhausted("Not enough available GPU memory."));
VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
<< " MiB, is it Re-alloc: " << realloc;
......@@ -192,13 +200,16 @@ class RecordedGpuMallocHelper {
});
PADDLE_ENFORCE_GE(
dev_id, 0,
dev_id,
0,
platform::errors::OutOfRange(
"Device id must be not less than 0, but got %d.", dev_id));
PADDLE_ENFORCE_LT(
dev_id, instances_.size(),
dev_id,
instances_.size(),
platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
dev_id, instances_.size()));
dev_id,
instances_.size()));
return instances_[dev_id].get();
}
......@@ -207,7 +218,8 @@ class RecordedGpuMallocHelper {
* or cudaSuccess would be returned, and the cudaGetLastError() flag
   * would be cleared.
*/
gpuError_t Malloc(void **ptr, size_t size,
gpuError_t Malloc(void **ptr,
size_t size,
bool malloc_managed_memory = false) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
......@@ -236,7 +248,10 @@ class RecordedGpuMallocHelper {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
platform::RecordMemEvent(ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedAllocate);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
#endif
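This hunk is where the new bookkeeping lands: right after a successful raw allocation the helper bumps the reserved-size stat and emits a RecordMemEvent with TracerMemEventType::ReservedAllocate, and the Free path in the next hunk mirrors it with ReservedFree. The sketch below shows the current/peak accounting such a recorder needs; it is a simplified stand-in, not the DEVICE_MEMORY_STAT implementation.

// Simplified current/peak accounting for reserved device memory
// (stand-in for the DEVICE_MEMORY_STAT_* macros; not Paddle code).
#include <atomic>
#include <cstddef>
#include <cstdio>

std::atomic<long long> cur_reserved{0};
std::atomic<long long> peak_reserved{0};

void OnReservedAllocate(std::size_t size) {
  long long now = cur_reserved.fetch_add(static_cast<long long>(size)) +
                  static_cast<long long>(size);
  long long peak = peak_reserved.load();
  // Lock-free peak update: retry while another thread raised the peak.
  while (now > peak && !peak_reserved.compare_exchange_weak(peak, now)) {
  }
}

void OnReservedFree(std::size_t size) {
  cur_reserved.fetch_sub(static_cast<long long>(size));
}

int main() {
  OnReservedAllocate(1 << 20);
  OnReservedAllocate(2 << 20);
  OnReservedFree(1 << 20);
  std::printf("cur=%lld peak=%lld\n", cur_reserved.load(),
              peak_reserved.load());  // cur=2097152 peak=3145728
}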
......@@ -275,6 +290,10 @@ class RecordedGpuMallocHelper {
cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
platform::RecordMemEvent(ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedFree);
} else {
platform::GpuGetLastError(); // clear the error flag when
// cudaErrorCudartUnloading /
......@@ -300,7 +319,9 @@ class RecordedGpuMallocHelper {
#endif
}
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
bool GetMemInfo(size_t *avail,
size_t *total,
size_t *actual_avail,
size_t *actual_total) {
{
CUDADeviceGuard guard(dev_id_);
......@@ -335,7 +356,8 @@ class RecordedGpuMallocHelper {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size,
CUresult MemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop,
unsigned long long flags) { // NOLINT
auto result =
......@@ -371,7 +393,9 @@ class RecordedGpuMallocHelper {
std::once_flag RecordedGpuMallocHelper::once_flag_;
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
gpuError_t RecordedGpuMalloc(void **ptr,
size_t size,
int dev_id,
bool malloc_managed_memory) {
return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(
ptr, size, malloc_managed_memory);
......@@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop,
unsigned long long flags, int dev_id) { // NOLINT
return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size,
prop, flags);
unsigned long long flags,
int dev_id) { // NOLINT
return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
handle, size, prop, flags);
}
CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size,
CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
size_t size,
int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size);
}
#endif
#endif
bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id) {
bool RecordedGpuMemGetInfo(size_t *avail,
size_t *total,
size_t *actual_avail,
size_t *actual_total,
int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo(
avail, total, actual_avail, actual_total);
}
......@@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
phi::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyPeerAsync(
dst, dst_device, src, src_device, count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
phi::backends::gpu::GpuMemcpyPeerSync(
dst, dst_device, src, src_device, count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
......
......@@ -30,12 +30,16 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
false,
"Enable rpc profiler or not.");
DEFINE_bool(enable_host_event_recorder_hook, false,
DEFINE_bool(enable_host_event_recorder_hook,
false,
"enable HostEventRecorder, hook Profiler");
namespace paddle {
......@@ -43,8 +47,11 @@ namespace platform {
MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type, std::string name, uint32_t thread_id,
EventRole role, std::string attr)
Event::Event(EventType type,
std::string name,
uint32_t thread_id,
EventRole role,
std::string attr)
: type_(type),
name_(name),
thread_id_(thread_id),
......@@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const {
#endif
}
RecordEvent::RecordEvent(const char *name, const TracerEventType type,
uint32_t level, const EventRole role) {
RecordEvent::RecordEvent(const char *name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
......@@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type,
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
uint32_t level, const EventRole role) {
RecordEvent::RecordEvent(const std::string &name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
......@@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name, const std::string &attr,
const TracerEventType type, uint32_t level,
RecordEvent::RecordEvent(const std::string &name,
const std::string &attr,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
......@@ -215,8 +228,8 @@ void RecordEvent::End() {
DeviceTracer *tracer = GetDeviceTracer();
if (tracer) {
uint64_t end_ns = PosixInNsec();
tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(),
g_thread_id);
tracer->AddCPURecords(
CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(*name_, role_);
......@@ -226,7 +239,8 @@ void RecordEvent::End() {
is_enabled_ = false;
}
RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
RecordInstantEvent::RecordInstantEvent(const char *name,
TracerEventType type,
uint32_t level) {
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
......@@ -236,21 +250,242 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
name, start_end_ns, start_end_ns, EventRole::kOrdinary, type);
}
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
RecordOpInfoSupplement::RecordOpInfoSupplement(
const std::string &type,
const framework::AttributeMap &attrs,
const framework::InferShapeContext &shape_ctx,
const framework::RuntimeContext &ctx) {
if (FLAGS_enable_host_event_recorder_hook == false) {
return;
}
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) {
input_shapes[it->first] = shape_ctx.GetInputsDim(it->first);
dtypes[it->first] = shape_ctx.GetInputsVarType(it->first);
}
const std::vector<std::string> *callstack_ptr = nullptr;
std::vector<std::string> callstack;
auto iter = attrs.find(
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (iter != attrs.end()) {
callstack_ptr = &BOOST_GET_CONST(std::vector<std::string>, iter->second);
callstack = *callstack_ptr;
}
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
PosixInNsec(), type, input_shapes, dtypes, callstack);
}
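RecordOpInfoSupplement, added above, snapshots each input slot's dimensions and variable types from the InferShape context, plus the Python creation callstack when the op attributes carry one, and hands the bundle to the host event recorder. The sketch below shows the slot-name-to-shapes map being assembled; std::vector<int64_t> stands in for framework::DDim, and the slot names and shapes are invented.

// Sketch of the per-slot shape snapshot (std::vector<int64_t> stands in
// for framework::DDim; slot names and shapes here are invented).
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using DDim = std::vector<int64_t>;

int main() {
  std::map<std::string, std::vector<DDim>> input_shapes;
  // One entry per input slot; a slot may hold several tensors.
  input_shapes["X"] = {{32, 128}, {32, 128}};
  input_shapes["Y"] = {{128, 256}};
  for (const auto& kv : input_shapes) {
    std::printf("slot %s holds %zu tensor(s)\n", kv.first.c_str(),
                kv.second.size());
  }
}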
RecordMemEvent::RecordMemEvent(const void *ptr,
const phi::Place &place,
size_t size,
const TracerMemEventType type) {
if (g_state == ProfilerState::kDisabled &&
FLAGS_enable_host_event_recorder_hook == false) {
return;
}
if (type == TracerMemEventType::Allocate) {
uint64_t current_allocated;
uint64_t peak_allocated;
uint64_t current_reserved = 0; // 0 means keep the same as before
uint64_t peak_reserved = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_allocated =
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
} else {
current_allocated =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
} else if (type == TracerMemEventType::ReservedAllocate) {
uint64_t current_reserved;
uint64_t peak_reserved;
uint64_t current_allocated = 0; // 0 means keep the same as before
uint64_t peak_allocated = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_reserved =
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
} else {
current_reserved =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
} else if (type == TracerMemEventType::Free) {
uint64_t current_allocated;
uint64_t peak_allocated;
uint64_t current_reserved = 0; // 0 means keep the same as before
uint64_t peak_reserved = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_allocated =
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
} else {
current_allocated =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
} else if (type == TracerMemEventType::ReservedFree) {
uint64_t current_reserved;
uint64_t peak_reserved;
uint64_t current_allocated = 0; // 0 means keep the same as before
uint64_t peak_allocated = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_reserved =
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
} else {
current_reserved =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
}
}
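The four-way branch above follows one rule: Allocate/Free sample the allocated stats and leave the reserved pair at 0, ReservedAllocate/ReservedFree do the opposite, and a 0 tells downstream consumers to carry the previous value forward. A condensed, compilable restatement (illustrative types only):

// Condensed restatement of the branch: which stat pair an event samples,
// with 0 meaning "keep the previous value" (illustrative types only).
#include <cstdint>
#include <cstdio>

enum class MemEventType { Allocate, Free, ReservedAllocate, ReservedFree };

struct StatSnapshot {
  uint64_t cur_allocated, peak_allocated, cur_reserved, peak_reserved;
};

StatSnapshot Sample(MemEventType t, uint64_t cur, uint64_t peak) {
  if (t == MemEventType::Allocate || t == MemEventType::Free) {
    return {cur, peak, 0, 0};  // reserved pair carried forward
  }
  return {0, 0, cur, peak};  // allocated pair carried forward
}

int main() {
  StatSnapshot s = Sample(MemEventType::ReservedAllocate, 4096, 8192);
  std::printf("cur_reserved=%llu peak_reserved=%llu\n",
              (unsigned long long)s.cur_reserved,
              (unsigned long long)s.peak_reserved);
}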
void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) return;
if (g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr), 0,
PADDLE_ENFORCE_EQ(events.count(ptr),
0,
platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
events.emplace(ptr,
std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved) {
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(),
reinterpret_cast<uint64_t>(ptr),
type,
size,
place,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
return;
}
if (type == TracerMemEventType::ReservedAllocate) {
    // The old profiler only analyzes memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr),
0,
platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr,
std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
if (g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
  // The ptr may not be in address_memevent.
if (iter != events.end()) {
events.erase(iter);
}
}
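PushMemRecord and PopMemRecord now mirror each other: when the host event recorder hook is on, both short-circuit into HostEventRecorder<CommonMemEvent>, with a free encoded as a negative byte delta. Since size is unsigned, the negation should go through a signed cast first, as in this small sketch (the helper name is illustrative):

// Signed-delta convention for memory events: +size on allocate,
// -size on free, cast to int64_t before negating (illustrative helper).
#include <cstdint>
#include <cstdio>

int64_t ToByteDelta(uint64_t size, bool is_free) {
  int64_t s = static_cast<int64_t>(size);
  return is_free ? -s : s;
}

int main() {
  std::printf("%lld %lld\n",
              (long long)ToByteDelta(1024, /*is_free=*/false),
              (long long)ToByteDelta(1024, /*is_free=*/true));  // 1024 -1024
}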
void MemEvenRecorder::PopMemRecord(const void *ptr,
const Place &place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved) {
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(),
reinterpret_cast<uint64_t>(ptr),
type,
        -static_cast<int64_t>(size),  // negate after casting to signed
place,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
return;
}
if (type == TracerMemEventType::ReservedFree) {
    // The old profiler only analyzes memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
  // The ptr may not be in address_memevent.
......@@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
auto annotation_free = CurAnnotationName();
if (tracer) {
tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
annotation_free, g_mem_thread_id);
tracer->AddMemInfoRecord(start_ns_,
end_ns_,
bytes_,
place_,
alloc_in_,
annotation_free,
g_mem_thread_id);
}
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
......@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() {
if (tracer) {
// We try to put all blocks at the same nested depth in the
    // same timeline lane, and distinguish them using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
g_thread_id);
tracer->AddCPURecords(
name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id);
}
ClearCurBlock();
}
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
place, g_mem_thread_id, annotation);
}
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
g_mem_thread_id, annotation);
void PushMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
}
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
}
void Mark(const std::string &name) {
......@@ -334,17 +590,19 @@ void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id);
}
Event *PushEvent(const std::string &name, const EventRole role,
Event *PushEvent(const std::string &name,
const EventRole role,
std::string attr) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role,
attr);
return GetEventList().Record(
EventType::kPushRange, name, g_thread_id, role, attr);
}
void PopEvent(const std::string &name, const EventRole role, std::string attr) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
PADDLE_ENFORCE_NE(state,
ProfilerState::kDisabled,
platform::errors::InvalidArgument(
"Can't enable profiling, since the input state is"
"ProfilerState::kDisabled"));
......@@ -380,7 +638,8 @@ void ResetProfiler() {
(*it)->Clear();
}
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) {
it != g_all_mem_event_lists.end();
++it) {
(*it)->Clear();
}
}
......@@ -576,8 +835,8 @@ static void EmulateEventPushAndPop(
std::string name =
prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name;
const char *attr = (evt.attr == nullptr ? "none" : evt.attr);
Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid,
evt.role, attr);
Event *orig_evt = cur_thr_list->Record(
EventType::kPushRange, name, tid, evt.role, attr);
(*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns);
cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr);
}
......@@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd(
for (const auto &thr_sec : host_sec.thr_sections) {
uint64_t tid = thr_sec.thread_id;
for (const auto &evt : thr_sec.events) {
tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(),
tid);
tracer->AddCPURecords(
evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid);
}
}
}
......
......@@ -30,6 +30,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
......@@ -102,6 +104,22 @@ struct MemEvenRecorder {
public:
void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place);
void PushMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void PopMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void Flush();
static MemEvenRecorder& Instance() { return recorder; }
......@@ -160,7 +178,8 @@ struct EventList {
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
......@@ -173,13 +192,21 @@ struct EventList {
};
void Mark(const std::string& name);
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
Event* PushEvent(const std::string& name, const EventRole role,
void PushMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
Event* PushEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
void PopEvent(const std::string& name, const EventRole role,
void PopEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
// Return the event list of all threads. Assuming the returned value is
// called event_lists, event_lists[i][j] represents the j-th Event of the
// i-th thread.
......
cc_library(
host_tracer
SRCS host_tracer.cc
DEPS enforce)
DEPS enforce ddim var_type_traits)
cc_library(
cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc
......@@ -10,7 +10,7 @@ add_subdirectory(mlu)
cc_library(
event_node
SRCS event_node.cc
DEPS enforce)
DEPS enforce place)
cc_library(
profiler_utils
SRCS utils.cc
......
......@@ -18,16 +18,21 @@
#include <functional>
#include <string>
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace platform {
struct CommonEvent {
public:
CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type)
CommonEvent(const char *name,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
......@@ -35,8 +40,12 @@ struct CommonEvent {
type(type) {}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type, const std::string &attr_str)
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type,
const std::string &attr_str)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
......@@ -47,8 +56,11 @@ struct CommonEvent {
}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type)
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
......@@ -63,5 +75,61 @@ struct CommonEvent {
const char *attr = nullptr; // not owned, designed for performance
};
struct CommonMemEvent {
public:
CommonMemEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
int64_t increase_bytes,
const Place &place,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved),
peak_allocated(peak_allocated),
peak_reserved(peak_reserved) {}
uint64_t timestamp_ns;
uint64_t addr;
TracerMemEventType type;
int64_t increase_bytes;
Place place;
uint64_t current_allocated;
uint64_t current_reserved;
uint64_t peak_allocated;
uint64_t peak_reserved;
};
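The four current/peak fields are snapshots of allocator statistics taken when the event is built. As a hedged sketch, a recorder could fill them from the stats accessors in paddle/fluid/memory/stats.h; the "Allocated"/"Reserved" stat names and exact signatures are assumptions, inferred from the stats bindings this PR also touches.

#include <cstdint>

#include "paddle/fluid/memory/stats.h"

// Assumed accessor names; snapshots GPU allocator stats for device dev_id.
void SnapshotDeviceStats(int dev_id,
                         uint64_t* current_allocated,
                         uint64_t* current_reserved,
                         uint64_t* peak_allocated,
                         uint64_t* peak_reserved) {
  *current_allocated =
      paddle::memory::DeviceMemoryStatCurrentValue("Allocated", dev_id);
  *current_reserved =
      paddle::memory::DeviceMemoryStatCurrentValue("Reserved", dev_id);
  *peak_allocated =
      paddle::memory::DeviceMemoryStatPeakValue("Allocated", dev_id);
  *peak_reserved =
      paddle::memory::DeviceMemoryStatPeakValue("Reserved", dev_id);
}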
struct OperatorSupplementOriginEvent {
public:
OperatorSupplementOriginEvent(
std::function<void *(size_t)> arena_allocator,
uint64_t timestamp_ns,
const std::string &type_name,
const std::map<std::string, std::vector<framework::DDim>> &input_shapes,
const std::map<std::string, std::vector<framework::proto::VarType::Type>>
&dtypes,
const std::vector<std::string> callstack)
: timestamp_ns(timestamp_ns),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack) {
auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
strncpy(buf, type_name.c_str(), type_name.length() + 1);
op_type = buf;
}
uint64_t timestamp_ns;
const char *op_type = nullptr; // not owned, designed for performance
// input shapes
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
// call stack
const std::vector<std::string> callstack;
};
} // namespace platform
} // namespace paddle
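The constructors above take an arena_allocator callback so an event can copy its name into memory owned by the recorder instead of holding a std::string. A toy allocator that satisfies the contract, purely for illustration (real recorders use a per-thread arena, not a std::deque):

#include <deque>
#include <functional>
#include <string>

static std::deque<std::string> g_demo_arena;  // keeps buffers alive

std::function<void *(size_t)> demo_arena_allocator = [](size_t n) -> void * {
  g_demo_arena.emplace_back(n, '\0');  // reserve n writable bytes
  return &g_demo_arena.back()[0];      // the ctor strncpy's the name here
};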
......@@ -11,9 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include <sstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/common_event.h"
......@@ -21,7 +22,8 @@
// Used to filter events, works like glog VLOG(level).
// RecordEvent will work if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1,
PADDLE_DEFINE_EXPORTED_int64(host_trace_level,
1,
"RecordEvent will works "
"if host_trace_level >= level.");
......@@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events,
}
}
void ProcessHostMemEvents(
const HostEventSection<CommonMemEvent>& host_mem_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : host_mem_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
MemTraceEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.addr = evt.addr;
event.type = evt.type;
event.increase_bytes = evt.increase_bytes;
event.place = evt.place.DebugString();
event.current_allocated = evt.current_allocated;
event.current_reserved = evt.current_reserved;
event.peak_allocated = evt.peak_allocated;
event.peak_reserved = evt.peak_reserved;
event.process_id = host_mem_events.process_id;
event.thread_id = tid;
collector->AddMemEvent(std::move(event));
}
}
}
void ProcessOperatorSupplementEvents(
const HostEventSection<OperatorSupplementOriginEvent>& op_supplement_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : op_supplement_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
OperatorSupplementEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.op_type = evt.op_type;
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
std::string callstack;
for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end();
it++) {
for (auto idx = 0lu; idx < it->second.size(); idx++) {
input_shapes[it->first].push_back(std::vector<int64_t>());
for (auto dim_idx = 0; dim_idx < it->second.at(idx).size();
dim_idx++) {
input_shapes[it->first][idx].push_back(
it->second.at(idx).at(dim_idx));
}
}
}
for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) {
for (auto idx = 0lu; idx < it->second.size(); idx++) {
dtypes[it->first].push_back(
framework::proto::VarType::Type_Name(it->second.at(idx)));
}
}
std::ostringstream result_string;
for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) {
result_string << (*it) << std::endl;
}
event.input_shapes = input_shapes;
event.dtypes = dtypes;
event.callstack = result_string.str();
event.process_id = op_supplement_events.process_id;
event.thread_id = tid;
collector->AddOperatorSupplementEvent(std::move(event));
}
}
}
} // namespace
void HostTracer::PrepareTracing() {
......@@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() {
void HostTracer::StartTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::READY || state_ == TracerState::STOPED, true,
state_ == TracerState::READY || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("TracerState must be READY"));
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
HostTraceLevel::GetInstance().SetLevel(options_.trace_level);
state_ = TracerState::STARTED;
}
void HostTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_, TracerState::STARTED,
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("TracerState must be STARTED"));
HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled);
state_ = TracerState::STOPED;
......@@ -77,11 +157,19 @@ void HostTracer::StopTracing() {
void HostTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ(
state_, TracerState::STOPED,
state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("TracerState must be STOPED"));
HostEventSection<CommonEvent> host_events =
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
ProcessHostEvents(host_events, collector);
HostEventSection<CommonMemEvent> host_mem_events =
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
ProcessHostMemEvents(host_mem_events, collector);
HostEventSection<OperatorSupplementOriginEvent> op_supplement_events =
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
ProcessOperatorSupplementEvents(op_supplement_events, collector);
}
} // namespace platform
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
// Memory event tracing. A trace marks a memory manipulation such as an
// allocation or a free.
// The events can be used to draw a memory variation curve.
class RecordMemEvent {
public:
/**
* @param ptr: Pointer to the memory allocated or freed.
* @param place: Device on which this memory event takes place.
* @param size: Memory size allocated or freed.
* @param type: Denotes the manipulation type of this memory event.
*/
explicit RecordMemEvent(
const void* ptr,
const Place& place,
size_t size,
const TracerMemEventType type = TracerMemEventType::Allocate);
};
} // namespace platform
} // namespace paddle
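RecordMemEvent records at construction time; there is no paired end call. A minimal sketch of instrumenting a toy allocator (MyAlloc/MyFree are hypothetical and std::malloc stands in for the real allocator), matching how the unit test later in this diff drives the API:

#include <cstdlib>

#include "paddle/fluid/platform/profiler/mem_tracing.h"

void* MyAlloc(const paddle::platform::Place& place, size_t size) {
  void* ptr = std::malloc(size);
  paddle::platform::RecordMemEvent(
      ptr, place, size, paddle::platform::TracerMemEventType::Allocate);
  return ptr;
}

void MyFree(const paddle::platform::Place& place, void* ptr, size_t size) {
  paddle::platform::RecordMemEvent(
      ptr, place, size, paddle::platform::TracerMemEventType::Free);
  std::free(ptr);
}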
......@@ -23,6 +23,8 @@
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
......@@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) {
profiler->Prepare();
profiler->Start();
{
RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined,
2);
RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined,
3);
RecordInstantEvent(
"TestTraceLevel_record1", TracerEventType::UserDefined, 2);
RecordInstantEvent(
"TestTraceLevel_record2", TracerEventType::UserDefined, 3);
}
auto profiler_result = profiler->Stop();
auto nodetree = profiler_result->GetNodeTrees();
......@@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) {
EXPECT_GT(runtime_events.size(), 0u);
#endif
}
TEST(ProfilerTest, TestHostTracerForMem) {
using paddle::platform::CPUPlace;
using paddle::platform::EnableHostEventRecorder;
using paddle::platform::MemTraceEventNode;
using paddle::platform::Profiler;
using paddle::platform::ProfilerOptions;
using paddle::platform::ProfilerResult;
using paddle::platform::RecordEvent;
using paddle::platform::RecordInstantEvent;
using paddle::platform::RecordMemEvent;
using paddle::platform::TracerEventType;
using paddle::platform::TracerMemEventType;
ProfilerOptions options;
options.trace_level = 1;
options.trace_switch = 3;
auto profiler = Profiler::Create(options);
EXPECT_TRUE(profiler);
EnableHostEventRecorder();
profiler->Prepare();
profiler->Start();
{
RecordEvent event1(
"TestTracerForMem_phase1", TracerEventType::UserDefined, 1);
RecordMemEvent(reinterpret_cast<void*>(0),
CPUPlace(),
1024,
TracerMemEventType::Allocate);
RecordMemEvent(
reinterpret_cast<void*>(0), CPUPlace(), 1024, TracerMemEventType::Free);
}
{
RecordEvent event2(
"TestTracerForMem_phase2", TracerEventType::UserDefined, 1);
RecordMemEvent(reinterpret_cast<void*>(1024),
CPUPlace(),
1024,
TracerMemEventType::Allocate);
RecordMemEvent(reinterpret_cast<void*>(1024),
CPUPlace(),
1024,
TracerMemEventType::Free);
}
auto profiler_result = profiler->Stop();
auto nodetree = profiler_result->GetNodeTrees();
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace framework {
class RuntimeContext;
}
namespace platform {
class RecordOpInfoSupplement {
public:
/**
* @param type: Operator type name.
* @param attrs: Attribute map of the op.
* @param shape_ctx: InferShape context object.
* @param ctx: Runtime context object.
*/
explicit RecordOpInfoSupplement(const std::string& type,
const framework::AttributeMap& attrs,
const framework::InferShapeContext& shape_ctx,
const framework::RuntimeContext& ctx);
};
} // namespace platform
} // namespace paddle
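RecordOpInfoSupplement is likewise construct-and-forget: creating it captures the op type, attributes, input shapes/dtypes, and the Python call stack. A minimal call-site sketch, assuming op, shape_ctx, and ctx already exist as they do in the framework's op-run path:

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"

void RecordSupplement(const paddle::framework::OperatorBase& op,
                      const paddle::framework::InferShapeContext& shape_ctx,
                      const paddle::framework::RuntimeContext& ctx) {
  // One-shot capture of type, attrs, shapes/dtypes and call stack.
  paddle::platform::RecordOpInfoSupplement record(
      op.Type(), op.Attrs(), shape_ctx, ctx);
}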
......@@ -382,7 +382,8 @@ static T PyObjectCast(PyObject *obj) {
} catch (py::cast_error &) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Python object is not type of %s, the real type is %s",
typeid(T).name(), obj->ob_type->tp_name));
typeid(T).name(),
obj->ob_type->tp_name));
}
}
......@@ -441,7 +442,8 @@ static std::vector<std::string> inline GetNameList(
}
static void inline CreateVariableIfNotExit(
const py::handle &py_handle, const framework::Scope &scope,
const py::handle &py_handle,
const framework::Scope &scope,
const framework::Executor *exe = nullptr) {
std::vector<std::string> vec_res;
......@@ -479,7 +481,8 @@ static void inline CreateVariableIfNotExit(
PyObject *py_var_desc =
PyObject_GetAttrString(PyList_GET_ITEM(py_obj, i), kVarDescField);
PADDLE_ENFORCE_NOT_NULL(
py_var_desc, platform::errors::InvalidArgument(
py_var_desc,
platform::errors::InvalidArgument(
"The var_desc of parameter to set is None"));
auto var_desc = PyObjectCast<framework::VarDesc>(py_var_desc);
Py_DECREF(py_var_desc);
......@@ -515,7 +518,8 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() {
}
}
}
PADDLE_ENFORCE_EQ(ops.empty(), true,
PADDLE_ENFORCE_EQ(ops.empty(),
true,
platform::errors::Unimplemented(
"OperatorWithKernel [%s] have only static graph grad "
"maker or have only dygraph grad maker, which is not "
......@@ -537,8 +541,10 @@ static int GetNCCLVersion() {
#endif
template <typename PlaceType>
static void TensorCopyFrom(framework::Tensor *dst, const framework::Tensor &src,
const PlaceType &place, int64_t batch_size) {
static void TensorCopyFrom(framework::Tensor *dst,
const framework::Tensor &src,
const PlaceType &place,
int64_t batch_size) {
if (batch_size < 0) {
framework::TensorCopy(src, place, dst);
} else {
......@@ -624,7 +630,8 @@ PYBIND11_MODULE(core_noavx, m) {
PyCapsule_GetPointer(dltensor->ptr(), "dltensor"));
PADDLE_ENFORCE_NOT_NULL(
dmt, platform::errors::InvalidArgument(
dmt,
platform::errors::InvalidArgument(
"from_dlpack received an invalid capsule. "
"Note that a DLPack tensor can be consumed only once."));
......@@ -644,7 +651,8 @@ PYBIND11_MODULE(core_noavx, m) {
});
m.def("_create_loaded_parameter",
[](const py::handle &vec_var_list, const Scope &scope,
[](const py::handle &vec_var_list,
const Scope &scope,
const Executor *executor) {
CreateVariableIfNotExit(vec_var_list, scope, executor);
});
......@@ -682,8 +690,9 @@ PYBIND11_MODULE(core_noavx, m) {
<< ", sci_mode=" << print_opt.sci_mode;
});
m.def("broadcast_shape", [](const std::vector<int64_t> &x_dim,
const std::vector<int64_t> &y_dim) {
m.def(
"broadcast_shape",
[](const std::vector<int64_t> &x_dim, const std::vector<int64_t> &y_dim) {
return phi::vectorize(operators::details::BroadcastTwoDims(
phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1));
});
......@@ -808,14 +817,22 @@ PYBIND11_MODULE(core_noavx, m) {
self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1)));
}
})
.def("add_attr", [](paddle::CustomOpKernelContext &self,
bool attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
int attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
float attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
int64_t attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self, bool attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, float attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int64_t attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, const std::string &attr) {
self.EmplaceBackAttr(attr);
......@@ -829,13 +846,14 @@ PYBIND11_MODULE(core_noavx, m) {
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<int64_t> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<std::string> &attr) {
self.EmplaceBackAttr(attr);
});
py::class_<framework::Tensor> framework_tensor(m, "Tensor",
py::buffer_protocol());
py::class_<framework::Tensor> framework_tensor(
m, "Tensor", py::buffer_protocol());
g_framework_tensor_pytype =
reinterpret_cast<PyTypeObject *>(framework_tensor.ptr());
framework_tensor
......@@ -918,80 +936,135 @@ PYBIND11_MODULE(core_noavx, m) {
self.mutable_data<float>(place);
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CPUPlace &place,
[](framework::Tensor &self,
paddle::platform::CPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CustomPlace &place,
[](framework::Tensor &self,
paddle::platform::CustomPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::XPUPlace &place,
[](framework::Tensor &self,
paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place,
[](framework::Tensor &self,
paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place,
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::MLUPlace &place,
[](framework::Tensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &framework::Tensor::clear)
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::NPUPlace &place,
[](framework::Tensor &self,
paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_copy_from", &TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("set",
SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false,
R"DOC(
Set the data of Tensor on place with given numpy array.
......@@ -1077,9 +1150,9 @@ PYBIND11_MODULE(core_noavx, m) {
ostr << self;
return ostr.str();
}) /* ------ End of original Tensor ------ */
.def(
"__init__",
[](framework::Tensor &instance, const std::vector<std::vector<size_t>>
.def("__init__",
[](framework::Tensor &instance,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
LoD new_lod;
new_lod.reserve(recursive_sequence_lengths.size());
......@@ -1088,7 +1161,8 @@ PYBIND11_MODULE(core_noavx, m) {
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, -1), true,
CheckLoD(new_offset_lod, -1),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
......@@ -1115,12 +1189,14 @@ PYBIND11_MODULE(core_noavx, m) {
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
PADDLE_ENFORCE_EQ(
CheckLoD(new_lod, vectorize(self.dims()).front()), true,
CheckLoD(new_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided LoD is invalid, the LoD is %s", new_lod));
self.set_lod(new_lod);
},
py::arg("lod"), R"DOC(
py::arg("lod"),
R"DOC(
Set LoD of the Tensor.
Args:
......@@ -1142,7 +1218,8 @@ PYBIND11_MODULE(core_noavx, m) {
)DOC")
.def(
"set_recursive_sequence_lengths",
[](framework::Tensor &self, const std::vector<std::vector<size_t>>
[](framework::Tensor &self,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
// the input recursive_sequence_lengths is length-based
// level-of-detail info
......@@ -1153,7 +1230,8 @@ PYBIND11_MODULE(core_noavx, m) {
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true,
CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
......@@ -1162,7 +1240,8 @@ PYBIND11_MODULE(core_noavx, m) {
new_lod));
self.set_lod(new_offset_lod);
},
py::arg("recursive_sequence_lengths"), R"DOC(
py::arg("recursive_sequence_lengths"),
R"DOC(
Set LoD of the Tensor according to recursive sequence lengths.
For example, if recursive_sequence_lengths=[[2, 3]], which means
......@@ -1630,7 +1709,8 @@ PYBIND11_MODULE(core_noavx, m) {
new (&instance) phi::SelectedRows();
})
.def("__init__",
[](phi::SelectedRows &instance, const std::vector<int64_t> rows,
[](phi::SelectedRows &instance,
const std::vector<int64_t> rows,
const int64_t &height) {
new (&instance) phi::SelectedRows(rows, height);
})
......@@ -1693,8 +1773,10 @@ All parameter, weight, gradient are variables in Paddle.
[](Variable &self, Strings str_list) {
*self.GetMutable<Strings>() = str_list;
})
.def("set_vocab", [](Variable &self,
Vocab vocab) { *self.GetMutable<Vocab>() = vocab; })
.def("set_vocab",
[](Variable &self, Vocab vocab) {
*self.GetMutable<Vocab>() = vocab;
})
.def(
"get_string_tensor",
[](Variable &self) { return self.GetMutable<Strings>(); },
......@@ -1732,7 +1814,8 @@ All parameter, weight, gradient are variables in Paddle.
.def(
"get_reader",
[](Variable &self) -> framework::ReaderHolder * {
PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(), true,
PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(),
true,
platform::errors::InvalidArgument(
"The variable is not type of ReaderHolder."));
return self.GetMutable<framework::ReaderHolder>();
......@@ -1743,7 +1826,8 @@ All parameter, weight, gradient are variables in Paddle.
[](Variable &self) -> Scope * {
auto scope_vec = self.GetMutable<std::vector<framework::Scope *>>();
PADDLE_ENFORCE_GT(
scope_vec->size(), 0,
scope_vec->size(),
0,
platform::errors::InvalidArgument(
"The size of scope_vec should be greater than 0"));
return scope_vec->front();
......@@ -1801,7 +1885,9 @@ All parameter, weight, gradient are variables in Paddle.
out (core.Variable): the found or created variable.
)DOC",
py::return_value_policy::reference)
.def("find_var", &Scope::FindVar, py::arg("name"),
.def("find_var",
&Scope::FindVar,
py::arg("name"),
R"DOC(
Find variable named :code:`name` in the current scope or
its parent scope. Return None if not found.
......@@ -1814,7 +1900,9 @@ All parameter, weight, gradient are variables in Paddle.
)DOC",
py::return_value_policy::reference)
.def("size", &Scope::Size)
.def("erase", &Scope::EraseVars, py::arg("names"),
.def("erase",
&Scope::EraseVars,
py::arg("names"),
R"DOC(
Delete the variables whose names are given in :code:`names`
from the current scope.
......@@ -1827,7 +1915,8 @@ All parameter, weight, gradient are variables in Paddle.
)DOC",
py::return_value_policy::reference)
.def(
"new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
"new_scope",
[](Scope &self) -> Scope * { return &self.NewScope(); },
R"DOC(
Create a new sub-scope of the current scope.
......@@ -1835,7 +1924,8 @@ All parameter, weight, gradient are variables in Paddle.
out (core._Scope): the created sub-scope.
)DOC",
py::return_value_policy::reference)
.def("drop_kids", &Scope::DropKids,
.def("drop_kids",
&Scope::DropKids,
R"DOC(
Delete all sub-scopes of the current scope.
)DOC")
......@@ -1865,7 +1955,8 @@ All parameter, weight, gradient are variables in Paddle.
if (info.HasOpProtoAndChecker()) {
std::string str;
PADDLE_ENFORCE_EQ(
info.Proto().SerializeToString(&str), true,
info.Proto().SerializeToString(&str),
true,
platform::errors::Fatal(
"Serialize OpProto Error. This could be a bug of Paddle."));
ret_values.emplace_back(str);
......@@ -1886,18 +1977,20 @@ All parameter, weight, gradient are variables in Paddle.
}
return res;
});
m.def(
"get_grad_op_desc", [](const OpDesc &op_desc,
m.def("get_grad_op_desc",
[](const OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set,
const std::vector<BlockDesc *> &grad_sub_block) {
std::unordered_map<std::string, std::string> grad_to_var;
std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
grad_sub_block);
.GradOpMaker()(
op_desc, no_grad_set, &grad_to_var, grad_sub_block);
std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
std::transform(grad_op_descs.begin(), grad_op_descs.end(),
std::transform(
grad_op_descs.begin(),
grad_op_descs.end(),
grad_op_desc_ptrs.begin(),
[](std::unique_ptr<OpDesc> &p) { return p.release(); });
return std::make_pair(grad_op_desc_ptrs, grad_to_var);
......@@ -1914,7 +2007,8 @@ All parameter, weight, gradient are variables in Paddle.
return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
});
m.def("infer_no_need_buffer_slots",
[](const std::string op_type, const framework::VariableNameMap &inputs,
[](const std::string op_type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs) {
auto infer_func = framework::OpInfoMap::Instance()
......@@ -1927,7 +2021,8 @@ All parameter, weight, gradient are variables in Paddle.
return empty;
}
});
m.def("prune", [](const ProgramDesc &origin,
m.def("prune",
[](const ProgramDesc &origin,
const std::set<std::string> &feeded_var_names,
const std::vector<std::array<size_t, 2>> &targets) {
ProgramDesc prog_with_targets(origin);
......@@ -2168,7 +2263,8 @@ All parameter, weight, gradient are variables in Paddle.
#endif
return devices;
});
py::class_<platform::CustomPlace> customplace(m, "CustomPlace",
py::class_<platform::CustomPlace> customplace(m,
"CustomPlace",
R"DOC(
CustomPlace is a descriptor of a device.
It represents a custom device on which a tensor will be allocated and a model will run.
......@@ -2182,7 +2278,8 @@ All parameter, weight, gradient are variables in Paddle.
g_customplace_pytype = reinterpret_cast<PyTypeObject *>(customplace.ptr());
customplace
.def("__init__",
[](platform::CustomPlace &self, const std::string &device_type,
[](platform::CustomPlace &self,
const std::string &device_type,
int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (UNLIKELY(dev_id < 0)) {
......@@ -2190,7 +2287,8 @@ All parameter, weight, gradient are variables in Paddle.
"Invalid CustomPlace(%s, %d), device id must be 0 "
"or "
"positive integer",
device_type, dev_id);
device_type,
dev_id);
std::exit(-1);
}
......@@ -2211,7 +2309,11 @@ All parameter, weight, gradient are variables in Paddle.
"inside "
"[0, %d), because %s "
"number on your machine is %d",
device_type, dev_id, dev_count, device_type, dev_count);
device_type,
dev_id,
dev_count,
device_type,
dev_count);
std::exit(-1);
}
}
......@@ -2221,7 +2323,8 @@ All parameter, weight, gradient are variables in Paddle.
"Invalid CustomPlace(%s, %d), the device type is "
"not registered "
"as a custom device.",
device_type, dev_id);
device_type,
dev_id);
std::exit(-1);
}
#else
......@@ -2293,7 +2396,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
"number on your machine is %d",
dev_id, platform::GetGPUDeviceCount(),
dev_id,
platform::GetGPUDeviceCount(),
platform::GetGPUDeviceCount());
std::exit(-1);
}
......@@ -2359,7 +2463,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), must inside [0, %d), because XPU "
"number on your machine is %d",
dev_id, platform::GetXPUDeviceCount(),
dev_id,
platform::GetXPUDeviceCount(),
platform::GetXPUDeviceCount());
std::exit(-1);
}
......@@ -2524,7 +2629,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), must inside [0, %d), because NPU "
"number on your machine is %d",
dev_id, platform::GetNPUDeviceCount(),
dev_id,
platform::GetNPUDeviceCount(),
platform::GetNPUDeviceCount());
std::exit(-1);
}
......@@ -2640,7 +2746,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), must inside [0, %d), because MLU "
"number on your machine is %d",
dev_id, platform::GetMLUDeviceCount(),
dev_id,
platform::GetMLUDeviceCount(),
platform::GetMLUDeviceCount());
std::exit(-1);
}
......@@ -2713,8 +2820,10 @@ All parameter, weight, gradient are variables in Paddle.
.def("mlu_device_id", [](platform::Place &self) { return self.device; })
.def("custom_device_id",
[](platform::Place &self) { return self.device; })
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
[](platform::Place &self, const platform::Place &other) {
self = other;
})
.def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place;
......@@ -2759,7 +2868,8 @@ All parameter, weight, gradient are variables in Paddle.
true,
platform::errors::InvalidArgument(
"Cannot parse user input to OpDesc"));
PADDLE_ENFORCE_EQ(desc.IsInitialized(), true,
PADDLE_ENFORCE_EQ(desc.IsInitialized(),
true,
platform::errors::InvalidArgument(
"The provided OpDesc is not "
"initialized, the reason is: %s",
......@@ -2767,43 +2877,50 @@ All parameter, weight, gradient are variables in Paddle.
return OpRegistry::CreateOp(desc);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::XPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::NPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CUDAPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CUDAPinnedPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::MLUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CustomPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
......@@ -2843,13 +2960,17 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>())
.def("close", &Executor::Close)
.def("run_from_dataset", &Executor::RunFromDataset,
.def("run_from_dataset",
&Executor::RunFromDataset,
py::call_guard<py::gil_scoped_release>())
.def("release_trainer", &Executor::ReleaseTrainer,
.def("release_trainer",
&Executor::ReleaseTrainer,
py::call_guard<py::gil_scoped_release>())
.def("init_for_dataset",
[](Executor &self, const ProgramDesc &prog,
const std::string &trainer_desc, Scope *scope,
[](Executor &self,
const ProgramDesc &prog,
const std::string &trainer_desc,
Scope *scope,
Dataset *dataset) -> std::shared_ptr<TrainerBase> {
pybind11::gil_scoped_release release;
return self.InitForDataset(prog, trainer_desc, scope, dataset);
......@@ -2860,40 +2981,62 @@ All parameter, weight, gradient are variables in Paddle.
self.RunFromDataset(trainer);
})
.def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
[](Executor &self,
ExecutorPrepareContext *ctx,
Scope *scope,
std::map<std::string, const LoDTensor *> *feed_targets,
std::map<std::string, FetchType *> *fetch_targets,
bool create_local_scope = true, bool create_vars = true,
bool create_local_scope = true,
bool create_vars = true,
const std::string &feed_holder_name = "feed",
const std::string &fetch_holder_name = "fetch") {
pybind11::gil_scoped_release release;
self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets,
create_local_scope, create_vars,
feed_holder_name, fetch_holder_name);
self.RunPreparedContext(ctx,
scope,
feed_targets,
fetch_targets,
create_local_scope,
create_vars,
feed_holder_name,
fetch_holder_name);
})
.def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
bool create_local_scope = true, bool create_vars = true,
[](Executor &self,
ExecutorPrepareContext *ctx,
Scope *scope,
bool create_local_scope = true,
bool create_vars = true,
bool keep_kids = false) {
pybind11::gil_scoped_release release;
self.RunPreparedContext(ctx, scope, create_local_scope,
create_vars, keep_kids);
self.RunPreparedContext(
ctx, scope, create_local_scope, create_vars, keep_kids);
})
.def("prepare",
[](Executor &self, const ProgramDesc &program, int block_id,
[](Executor &self,
const ProgramDesc &program,
int block_id,
const std::vector<std::string> &skip_ref_cnt_vars =
std::vector<std::string>(),
bool force_disable_gc = false) {
pybind11::gil_scoped_release release;
return self.Prepare(program, block_id, skip_ref_cnt_vars,
force_disable_gc);
return self.Prepare(
program, block_id, skip_ref_cnt_vars, force_disable_gc);
})
.def("create_variables", &Executor::CreateVariables)
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars,
.def("run",
[](Executor &self,
const ProgramDesc &prog,
Scope *scope,
int block_id,
bool create_local_scope,
bool create_vars,
const std::vector<std::string> &fetch_vars) {
pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars,
self.Run(prog,
scope,
block_id,
create_local_scope,
create_vars,
fetch_vars);
});
......@@ -2906,8 +3049,10 @@ All parameter, weight, gradient are variables in Paddle.
});
py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
.def(py::init<const platform::Place &, const ProgramDesc &,
const ProgramDesc &, Scope *>())
.def(py::init<const platform::Place &,
const ProgramDesc &,
const ProgramDesc &,
Scope *>())
.def("run",
[](StandaloneExecutor &self,
const std::unordered_map<std::string, py::array> &input_dict,
......@@ -2951,7 +3096,8 @@ All parameter, weight, gradient are variables in Paddle.
return py::cast(std::move(ret));
})
.def("run",
[](StandaloneExecutor &self, std::vector<std::string> feed_names,
[](StandaloneExecutor &self,
std::vector<std::string> feed_names,
std::vector<std::string> fetch_names) {
paddle::framework::FetchList ret;
{
......@@ -3036,20 +3182,27 @@ All parameter, weight, gradient are variables in Paddle.
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def(
"run_cmd",
[](const std::string &cmd, int time_out = -1,
[](const std::string &cmd,
int time_out = -1,
int sleep_inter = -1) -> const std::string {
return paddle::framework::shell_get_command_output(cmd, time_out,
sleep_inter);
return paddle::framework::shell_get_command_output(
cmd, time_out, sleep_inter);
},
py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1);
py::arg("cmd"),
py::arg("time_out") = -1,
py::arg("sleep_inter") = -1);
m.def(
"shell_execute_cmd",
[](const std::string &cmd, int time_out = 0, int sleep_inter = 0,
[](const std::string &cmd,
int time_out = 0,
int sleep_inter = 0,
bool redirect_stderr = false) -> std::vector<std::string> {
return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter,
redirect_stderr);
return paddle::framework::shell_execute_cmd(
cmd, time_out, sleep_inter, redirect_stderr);
},
py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
py::arg("cmd"),
py::arg("time_out") = 0,
py::arg("sleep_inter") = 0,
py::arg("redirect_stderr") = false);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -3064,13 +3217,16 @@ All parameter, weight, gradient are variables in Paddle.
#endif
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const LoDTensor &, const std::string &,
size_t)>(&framework::SetFeedVariable));
static_cast<void (*)(
Scope *, const LoDTensor &, const std::string &, size_t)>(
&framework::SetFeedVariable));
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const Strings &, const std::string &,
size_t)>(&framework::SetFeedVariable));
static_cast<void (*)(
Scope *, const Strings &, const std::string &, size_t)>(
&framework::SetFeedVariable));
m.def("get_fetch_variable",
[](const Scope &scope, const std::string &var_name,
[](const Scope &scope,
const std::string &var_name,
size_t index) -> py::object {
auto &var = framework::GetFetchVariable(scope, var_name, index);
if (data_is_lod_tensor(var)) {
......@@ -3125,7 +3281,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("__len__", [](LoDTensorArray &self) { return self.size(); })
.def("__setitem__",
[](LoDTensorArray &self, size_t i, const LoDTensor &t) {
PADDLE_ENFORCE_LT(i, self.size(),
PADDLE_ENFORCE_LT(i,
self.size(),
platform::errors::InvalidArgument(
"The index to set is larger than the size "
"of LoDTensorArray."));
......@@ -3139,7 +3296,8 @@ All parameter, weight, gradient are variables in Paddle.
self.back().ShareDataWith(t);
self.back().set_lod(t.lod());
},
py::arg("tensor"), R"DOC(
py::arg("tensor"),
R"DOC(
Append a LoDTensor to LoDTensorArray.
Args:
......@@ -3376,16 +3534,18 @@ All parameter, weight, gradient are variables in Paddle.
m.def("reset_profiler", platform::ResetProfiler);
m.def("register_pass", [](const std::string &pass_type, py::object callable) {
PADDLE_ENFORCE_EQ(
framework::ir::PassRegistry::Instance().Has(pass_type), false,
framework::ir::PassRegistry::Instance().Has(pass_type),
false,
platform::errors::AlreadyExists("Pass '%s' is registered more than "
"once. Please use another name.",
pass_type));
callable.inc_ref();
framework::ir::PassRegistry::Instance().Insert(pass_type, [pass_type,
callable]() {
framework::ir::PassRegistry::Instance().Insert(
pass_type, [pass_type, callable]() {
py::gil_scoped_acquire guard;
std::unique_ptr<framework::ir::Pass> pass(
new framework::ir::GeneratePass(py::cast<std::string>(callable())));
new framework::ir::GeneratePass(
py::cast<std::string>(callable())));
return pass;
});
});
......@@ -3397,11 +3557,32 @@ All parameter, weight, gradient are variables in Paddle.
m.def("size_of_dtype", framework::SizeOfType);
py::class_<paddle::platform::ProfilerResult>(m, "_ProfilerResult")
.def(py::init<>())
.def("get_data", &paddle::platform::ProfilerResult::GetData,
.def("get_data",
&paddle::platform::ProfilerResult::GetData,
py::return_value_policy::automatic_reference)
.def("save", &paddle::platform::ProfilerResult::Save)
.def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
py::class_<paddle::platform::MemPythonNode>(m, "MemPythonNode")
.def(py::init<>())
.def_readwrite("timestamp_ns",
&paddle::platform::MemPythonNode::timestamp_ns)
.def_readwrite("addr", &paddle::platform::MemPythonNode::addr)
.def_readwrite("type", &paddle::platform::MemPythonNode::type)
.def_readwrite("process_id", &paddle::platform::MemPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::MemPythonNode::thread_id)
.def_readwrite("increase_bytes",
&paddle::platform::MemPythonNode::increase_bytes)
.def_readwrite("place", &paddle::platform::MemPythonNode::place)
.def_readwrite("current_allocated",
&paddle::platform::MemPythonNode::current_allocated)
.def_readwrite("current_reserved",
&paddle::platform::MemPythonNode::current_reserved)
.def_readwrite("peak_allocated",
&paddle::platform::MemPythonNode::peak_allocated)
.def_readwrite("peak_reserved",
&paddle::platform::MemPythonNode::peak_reserved);
py::class_<paddle::platform::DevicePythonNode>(m, "DevicePythonNode")
.def(py::init<>())
.def_readwrite("name", &paddle::platform::DevicePythonNode::name)
......@@ -3424,15 +3605,22 @@ All parameter, weight, gradient are variables in Paddle.
.def_readwrite("process_id",
&paddle::platform::HostPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
.def_readwrite("input_shapes",
&paddle::platform::HostPythonNode::input_shapes)
.def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes)
.def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack)
.def_readwrite("children_node",
&paddle::platform::HostPythonNode::children_node_ptrs)
.def_readwrite("runtime_node",
&paddle::platform::HostPythonNode::runtime_node_ptrs)
.def_readwrite("device_node",
&paddle::platform::HostPythonNode::device_node_ptrs);
&paddle::platform::HostPythonNode::device_node_ptrs)
.def_readwrite("mem_node",
&paddle::platform::HostPythonNode::mem_node_ptrs);
py::class_<paddle::platform::Profiler>(m, "_Profiler")
.def("create", &paddle::platform::Profiler::Create,
.def("create",
&paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("is_cnpapi_supported",
......@@ -3466,6 +3654,14 @@ All parameter, weight, gradient are variables in Paddle.
}))
.def("end", [](platform::RecordEvent *event) { event->End(); });
py::enum_<paddle::platform::TracerMemEventType>(m, "TracerMemEventType")
.value("Allocate", paddle::platform::TracerMemEventType::Allocate)
.value("Free", paddle::platform::TracerMemEventType::Free)
.value("ReservedAllocate",
paddle::platform::TracerMemEventType::ReservedAllocate)
.value("ReservedFree",
paddle::platform::TracerMemEventType::ReservedFree);
py::enum_<paddle::platform::TracerEventType>(m, "TracerEventType")
.value("Operator", paddle::platform::TracerEventType::Operator)
.value("Dataloader", paddle::platform::TracerEventType::Dataloader)
......@@ -3509,22 +3705,29 @@ All parameter, weight, gradient are variables in Paddle.
[](ir::Pass &self, const std::string &name, const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
})
.def("set", [](ir::Pass &self, const std::string &name,
bool val) { self.Set<bool>(name, new bool(val)); })
.def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); })
.def("set",
[](ir::Pass &self, const std::string &name,
[](ir::Pass &self, const std::string &name, bool val) {
self.Set<bool>(name, new bool(val));
})
.def("set",
[](ir::Pass &self, const std::string &name, int val) {
self.Set<const int>(name, new int(val));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name,
[](ir::Pass &self,
const std::string &name,
std::unordered_set<std::string> set) {
self.Set(name, new std::unordered_set<std::string>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name,
[](ir::Pass &self,
const std::string &name,
std::unordered_set<int> set) {
self.Set(name, new std::unordered_set<int>(set));
})
......@@ -3769,7 +3972,8 @@ All parameter, weight, gradient are variables in Paddle.
"reduce_strategy",
[](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3799,7 +4003,8 @@ All parameter, weight, gradient are variables in Paddle.
[](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3864,7 +4069,8 @@ All parameter, weight, gradient are variables in Paddle.
"debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3891,7 +4097,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.enable_sequential_execution_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3917,7 +4124,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.remove_unnecessary_lock_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3995,7 +4203,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.fuse_elewise_add_act_ops_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4020,7 +4229,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_gemm_epilogue",
[](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4045,7 +4255,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4070,7 +4281,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_bn_add_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4095,7 +4307,8 @@ All parameter, weight, gradient are variables in Paddle.
"enable_auto_fusion",
[](const BuildStrategy &self) { return self.enable_auto_fusion_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4123,7 +4336,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.fuse_relu_depthwise_conv_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4153,7 +4367,8 @@ All parameter, weight, gradient are variables in Paddle.
self.fuse_broadcast_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
......@@ -4184,7 +4399,8 @@ All parameter, weight, gradient are variables in Paddle.
self.fuse_all_optimizer_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
......@@ -4194,7 +4410,8 @@ All parameter, weight, gradient are variables in Paddle.
"sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4348,9 +4565,13 @@ All parameter, weight, gradient are variables in Paddle.
});
pe.def(py::init<const std::vector<platform::Place> &,
const std::vector<std::string> &, const std::string &,
Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
const BuildStrategy &, ir::Graph *>())
const std::vector<std::string> &,
const std::string &,
Scope *,
std::vector<Scope *> &,
const ExecutionStrategy &,
const BuildStrategy &,
ir::Graph *>())
// NOTE: even though we return a vec<Scope*>* to Python under the
// reference policy, we still cannot get local_scope from this vector,
// since the elements of vec<Scope*> will be freed by the Python GC. We
// can only return Scope*
......@@ -4439,7 +4660,8 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW(platform::errors::Unimplemented(
"Failed to convert type: %s when set IpuStrategy "
"option: %s",
option.get_type(), option_name));
option.get_type(),
option_name));
}
self.InsertStringOption(option_name, option_val);
}
......@@ -4447,7 +4669,8 @@ All parameter, weight, gradient are variables in Paddle.
if (option_name.rfind("location_", 0) == 0) {
for (auto option : element.second.cast<py::dict>()) {
self.SetTensorLocation(
option_name, option.first.cast<std::string>(),
option_name,
option.first.cast<std::string>(),
option.second.cast<std::uint64_t>());
}
} else if (option_name == "replicated_collectives_settings") {
......@@ -4501,17 +4724,19 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW(platform::errors::Unimplemented(
"Failed to convert value type: %s when set "
"IpuStrategy option: %s",
option.second.get_type(), option_key));
option.second.get_type(),
option_key));
}
self.InsertStringPairOption(option_name, option_key,
option_val);
self.InsertStringPairOption(
option_name, option_key, option_val);
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid IpuStrategy option value type: %s, please check "
"input value for option: %s",
element.second.get_type(), option_name));
element.second.get_type(),
option_name));
}
}
})
......