未验证 提交 fc208b7e 编写于 作者: F fwenguang 提交者: GitHub

[MLU] add mlu new profiler (#41138)

* [MLU] add mlu new profiler

* fix format
上级 605552a9
...@@ -16,7 +16,9 @@ limitations under the License. */ ...@@ -16,7 +16,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include <cn_api.h> #include <cn_api.h>
#include <cndrv_id.h>
#include <cnnl.h> #include <cnnl.h>
#include <cnpapi.h>
#include <cnrt.h> #include <cnrt.h>
#ifdef PADDLE_WITH_CNCL #ifdef PADDLE_WITH_CNCL
#include <cncl.h> #include <cncl.h>
...@@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t; ...@@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t;
#endif #endif
using mluStream = cnrtQueue_t; using mluStream = cnrtQueue_t;
using mluCnnlHandle = cnnlHandle_t; using mluCnnlHandle = cnnlHandle_t;
using mluEventHandle = CNnotifier; using mluEventHandle = cnrtNotifier_t;
using mluDeviceHandle = CNdev; using mluDeviceHandle = CNdev;
namespace platform { namespace platform {
......
cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
add_subdirectory(mlu)
cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(event_node SRCS event_node.cc DEPS enforce)
cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
add_subdirectory(dump) add_subdirectory(dump)
cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils)
cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(event_bind SRCS event_python.cc DEPS profiler_logger)
cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger)
cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind)
......
...@@ -38,10 +38,12 @@ static std::string DefaultFileName() { ...@@ -38,10 +38,12 @@ static std::string DefaultFileName() {
} }
const char* ChromeTracingLogger::categary_name_[] = { const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep", "CudaRuntime", "Operator", "Dataloader", "ProfileStep",
"Kernel", "Memcpy", "Memset", "UserDefined", "CudaRuntime", "Kernel", "Memcpy",
"OperatorInner", "Forward", "Backward", "Optimization", "Memset", "UserDefined", "OperatorInner",
"Communication", "PythonOp", "PythonUserDefined"}; "Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
void ChromeTracingLogger::OpenFile() { void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_, output_file_stream_.open(filename_,
...@@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName(
(*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1);
} }
#ifdef PADDLE_WITH_MLU
static std::string device_type("MLU");
#else
static std::string device_type("GPU");
#endif
for (auto it = deviceid_streamid_set_.begin(); for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); ++it) { it != deviceid_streamid_set_.end(); ++it) {
output_file_stream_ << string_format( output_file_stream_ << string_format(
...@@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName(
"name": "process_name", "pid": %lld, "tid": %lld, "name": "process_name", "pid": %lld, "tid": %lld,
"ph": "M", "ph": "M",
"args": { "args": {
"name": "Deivce %lld (GPU)" "name": "Deivce %lld (%s)"
} }
}, },
{ {
...@@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName(
} }
}, },
)JSON"), )JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second, (*it).first, (*it).second, (*it).first, device_type.c_str(),
(*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, (*it).first, (*it).second, (*it).second, (*it).first, (*it).second,
(*it).first, (*it).second, (*it).second); (*it).first + 0x10000000, (*it).first, (*it).second, (*it).second);
} }
} }
......
# mlu_info is only built when Paddle is configured with MLU support; leave
# MLU_INFO empty otherwise so the (stubbed) tracer target still links.
if(WITH_MLU)
  set(MLU_INFO mlu_info)
endif()

# MLU tracer library: host-side cnpapi driver plus activity-record parsing.
cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO})
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#include <cstdio>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_MLU
namespace paddle {
namespace platform {
namespace {
// Returns the cached offset (ns) between the host CPU clock (PosixInNsec)
// and the cnpapi device timestamp clock. Adding this gap to a cnpapi
// timestamp converts it onto the host timeline used by the profiler.
// NOTE(review): the subtraction is unsigned; this presumes the cnpapi epoch
// never runs ahead of the CPU clock, otherwise it would wrap — confirm.
inline uint64_t GetTimeGap() {
  static uint64_t time_gap = []() -> uint64_t {
    uint64_t cpu_time = PosixInNsec();
    uint64_t mlu_time = cnpapiGetTimestamp();
    return (cpu_time - mlu_time);
  }();
  return time_gap;
}
// Converts a cnpapi kernel activity record into a DeviceTraceEvent and adds
// it to |collector|. Records that started before |start_ns| (host time, the
// moment tracing began) are dropped.
void AddKernelRecord(const cnpapiActivityKernel* kernel, uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip kernels launched before the profiling window opened.
  if (kernel->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = demangle(kernel->name);
  event.type = TracerEventType::Kernel;
  event.start_ns = kernel->start + time_gap;
  event.end_ns = kernel->end + time_gap;
  event.device_id = kernel->device_id;
  event.context_id = kernel->context_id;
  event.stream_id = kernel->queue_id;
  event.correlation_id = kernel->correlation_id;
  event.kernel_info.block_x = kernel->dimx;
  event.kernel_info.block_y = kernel->dimy;
  event.kernel_info.block_z = kernel->dimz;
  // NOTE(review): grid_x is deliberately repurposed to carry the MLU
  // kernel_type (MLU has no CUDA-style grid); grid_y/grid_z are unused.
  event.kernel_info.grid_x = kernel->kernel_type;
  event.kernel_info.grid_y = 0;
  event.kernel_info.grid_z = 0;
  event.kernel_info.queued = kernel->queued;
  event.kernel_info.submitted = kernel->submitted;
  event.kernel_info.completed = kernel->received;
  collector->AddDeviceEvent(std::move(event));
}
// Maps a cnpapi memcpy direction enum to the human-readable event name
// shown in the trace viewer; unknown directions fall back to "MEMCPY".
const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD) {
    return "MEMCPY_HtoD";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH) {
    return "MEMCPY_DtoH";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD) {
    return "MEMCPY_DtoD";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH) {
    return "MEMCPY_HtoH";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP) {
    return "MEMCPY_PtoP";
  }
  return "MEMCPY";
}
// Converts a cnpapi memcpy activity record into a DeviceTraceEvent and adds
// it to |collector|. Records that started before |start_ns| (host time) are
// dropped.
void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy, uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip copies issued before the profiling window opened.
  if (memcpy->start + time_gap < start_ns) {
    return;
  }
  // Resolve the direction string once; it serves both as the event name and
  // as the copy_kind field (the original resolved it twice).
  const char* copy_kind = MemcpyKind(memcpy->copy_type);
  DeviceTraceEvent event;
  event.name = copy_kind;
  event.type = TracerEventType::Memcpy;
  event.start_ns = memcpy->start + time_gap;
  event.end_ns = memcpy->end + time_gap;
  event.device_id = memcpy->device_id;
  event.context_id = memcpy->context_id;
  event.stream_id = memcpy->queue_id;
  event.correlation_id = memcpy->correlation_id;
  event.memcpy_info.num_bytes = memcpy->bytes;
  snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", copy_kind);
  collector->AddDeviceEvent(std::move(event));
}
// Converts a cnpapi peer-to-peer memcpy activity record into a
// DeviceTraceEvent and adds it to |collector|. Records that started before
// |start_ns| (host time) are dropped.
void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
                      uint64_t start_ns, TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip copies issued before the profiling window opened.
  if (memcpy2->start + time_gap < start_ns) {
    return;
  }
  // Resolve the direction string once; it serves both as the event name and
  // as the copy_kind field (the original resolved it twice).
  const char* copy_kind = MemcpyKind(memcpy2->copy_type);
  DeviceTraceEvent event;
  event.name = copy_kind;
  event.type = TracerEventType::Memcpy;
  event.start_ns = memcpy2->start + time_gap;
  event.end_ns = memcpy2->end + time_gap;
  event.device_id = memcpy2->device_id;
  event.context_id = memcpy2->context_id;
  event.stream_id = memcpy2->queue_id;
  event.correlation_id = memcpy2->correlation_id;
  event.memcpy_info.num_bytes = memcpy2->bytes;
  snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", copy_kind);
  collector->AddDeviceEvent(std::move(event));
}
// Converts a cnpapi memset activity record into a DeviceTraceEvent and adds
// it to |collector|. Records that started before |start_ns| (host time) are
// dropped.
void AddMemsetRecord(const cnpapiActivityMemset* memset, uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip memsets issued before the profiling window opened.
  if (memset->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = "MEMSET";
  event.type = TracerEventType::Memset;
  event.start_ns = memset->start + time_gap;
  event.end_ns = memset->end + time_gap;
  event.device_id = memset->device_id;
  event.context_id = memset->context_id;
  event.stream_id = memset->queue_id;
  event.correlation_id = memset->correlation_id;
  event.memset_info.num_bytes = memset->bytes;
  event.memset_info.value = memset->value;
  collector->AddDeviceEvent(std::move(event));
}
// Singleton mapping cnpapi driver-API callback ids to readable API names.
class CnpapiRuntimeCbidStr {
 public:
  static const CnpapiRuntimeCbidStr& GetInstance() {
    static CnpapiRuntimeCbidStr inst;
    return inst;
  }

  // Returns the registered name for |cbid|, or a generic
  // "MLU Runtime API <id>" string for ids not in the table.
  std::string RuntimeKind(cnpapi_CallbackId cbid) const {
    auto iter = cbid_str_.find(cbid);
    if (iter == cbid_str_.end()) {
      return "MLU Runtime API " + std::to_string(cbid);
    }
    return iter->second;
  }

 private:
  CnpapiRuntimeCbidStr();  // populates cbid_str_; defined out-of-line below
  std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
};
// Registers a readable name for every cnpapi driver-API callback id we care
// about (memory management, memcpy/memset, kernel launch, queue/notifier and
// context APIs). Ids outside this table fall back to a numeric string in
// RuntimeKind().
CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
#define REGISTER_RUNTIME_CBID_STR(cbid) \
  cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid

  REGISTER_RUNTIME_CBID_STR(cnMalloc);
  REGISTER_RUNTIME_CBID_STR(cnMallocHost);
  REGISTER_RUNTIME_CBID_STR(cnFree);
  REGISTER_RUNTIME_CBID_STR(cnFreeHost);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy2D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy3D);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD8);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD16);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD32);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async);
  REGISTER_RUNTIME_CBID_STR(cnInvokeKernel);
  REGISTER_RUNTIME_CBID_STR(cnCreateQueue);
  REGISTER_RUNTIME_CBID_STR(cnDestroyQueue);
  REGISTER_RUNTIME_CBID_STR(cnQueueSync);
  REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier);
  REGISTER_RUNTIME_CBID_STR(cnWaitNotifier);
  REGISTER_RUNTIME_CBID_STR(cnCreateNotifier);
  REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier);
  REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier);
  REGISTER_RUNTIME_CBID_STR(cnCtxCreate);
  REGISTER_RUNTIME_CBID_STR(cnCtxDestroy);
  REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent);
  REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
  REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
  REGISTER_RUNTIME_CBID_STR(cnCtxSync);
#undef REGISTER_RUNTIME_CBID_STR
}
// Converts a cnpapi driver-API activity record into a RuntimeTraceEvent and
// adds it to |collector|. Records that started before |start_ns| (host time)
// are dropped.
void AddApiRecord(const cnpapiActivityAPI* api, uint64_t start_ns,
                  TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip API calls made before the profiling window opened.
  if (api->start + time_gap < start_ns) {
    return;
  }
  RuntimeTraceEvent event;
  event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
  event.start_ns = api->start + time_gap;
  event.end_ns = api->end + time_gap;
  event.process_id = api->process_id;
  event.thread_id = api->thread_id;
  event.correlation_id = api->correlation_id;
  event.callback_id = api->cbid;
  event.type = TracerEventType::MluRuntime;
  collector->AddRuntimeEvent(std::move(event));
}
} // namespace
namespace details {
// Dispatches one generic cnpapi activity record to the matching typed
// handler (kernel / memcpy / p2p memcpy / memset / driver-API call) based on
// record->type. Unknown record types are silently ignored.
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
                                 uint64_t start_ns,
                                 TraceEventCollector* collector) {
  switch (record->type) {
    case CNPAPI_ACTIVITY_TYPE_KERNEL:
      AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
                      start_ns, collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMCPY:
      AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
                      start_ns, collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
      AddMemcpy2Record(
          reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record), start_ns,
          collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMSET:
      AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
                      start_ns, collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
      AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record), start_ns,
                   collector);
      break;
    default:
      break;
  }
}
} // namespace details
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
namespace paddle {
namespace platform {
namespace details {

#ifdef PADDLE_WITH_MLU
// Dispatches a single cnpapi activity record to the collector; records that
// predate |start_ns| (host time) are ignored. Implemented in
// cnpapi_data_process.cc.
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
                                 uint64_t start_ns,
                                 TraceEventCollector* collector);
#endif

}  // namespace details
}  // namespace platform
}  // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
// Invokes a cnpapi API call and logs — but does not propagate — any error.
// Profiling failures must never abort the workload, so errors surface only
// through LOG(ERROR).
#define CNPAPI_CALL(call)                                                      \
  do {                                                                         \
    cnpapiResult _status = call;                                               \
    if (_status != CNPAPI_SUCCESS) {                                           \
      const char* errstr;                                                      \
      cnpapiGetResultString(_status, &errstr);                                 \
      LOG(ERROR) << "Function " << #call << " failed with error " << errstr;   \
    }                                                                          \
  } while (0)
namespace paddle {
namespace platform {
namespace {
// cnpapi buffer-request callback: hands the library an 8 MB, 8-byte-aligned
// buffer for activity records. *max_num_records = 0 tells cnpapi to fill the
// whole buffer with as many records as fit.
void BufferRequestedCallback(uint64_t** buffer, size_t* size,
                             size_t* max_num_records) {
  constexpr size_t kBufferSize = 1 << 23;  // 8 MB
  constexpr size_t kBufferAlignSize = 8;
  *buffer = reinterpret_cast<uint64_t*>(
      paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize));
  *size = kBufferSize;
  *max_num_records = 0;
}
// cnpapi buffer-completed callback: drains all valid records from the buffer
// into the tracer, then frees the buffer allocated by
// BufferRequestedCallback.
void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) {
  if (buffer == nullptr || valid_size == 0) {
    return;
  }
  auto mlu_tracer = &MluTracer::GetInstance();
  mlu_tracer->ProcessCnpapiActivity(buffer, valid_size);
  paddle::framework::AlignedFree(buffer);
}
} // namespace
// Initializes the cnpapi library and registers the buffer callbacks once for
// the lifetime of the (singleton) tracer. No-op in non-MLU builds.
MluTracer::MluTracer() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiInit());
  CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback,
                                              BufferCompletedCallback));
#endif
}
// Transitions UNINITED/STOPED -> READY and turns on the cnpapi activity
// types we collect. Must precede StartTracing().
void MluTracer::PrepareTracing() {
  PADDLE_ENFORCE_EQ(
      state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true,
      platform::errors::PreconditionNotMet("MluTracer must be UNINITED"));
  EnableCnpapiActivity();
  state_ = TracerState::READY;
}
// Transitions READY -> STARTED and records the trace start time; device
// records that predate this timestamp are filtered out later.
void MluTracer::StartTracing() {
  // Fix: the previous error message claimed "READY or STOPPED", but the
  // check only accepts READY (STOPED must go through PrepareTracing first).
  PADDLE_ENFORCE_EQ(state_ == TracerState::READY, true,
                    platform::errors::PreconditionNotMet(
                        "MluTracer must be READY"));
  tracing_start_ns_ = PosixInNsec();
  state_ = TracerState::STARTED;
}
// Transitions STARTED -> STOPED; disabling activity collection also flushes
// all pending cnpapi buffers into collector_ via the buffer callbacks.
void MluTracer::StopTracing() {
  PADDLE_ENFORCE_EQ(
      state_, TracerState::STARTED,
      platform::errors::PreconditionNotMet("MluTracer must be STARTED"));
  DisableCnpapiActivity();
  state_ = TracerState::STOPED;
}
// Transfers every event gathered in the tracer's private collector_ to the
// caller-supplied collector, then clears local state. Only valid after
// StopTracing().
void MluTracer::CollectTraceData(TraceEventCollector* collector) {
  PADDLE_ENFORCE_EQ(
      state_, TracerState::STOPED,
      platform::errors::PreconditionNotMet("MluTracer must be STOPED"));
  // NOTE(review): each loop variable is a by-value copy that is then moved
  // into the target collector; collector_ itself is emptied by ClearAll()
  // below, so the copies are the actual transfer mechanism here.
  for (auto he : collector_.HostEvents()) {
    collector->AddHostEvent(std::move(he));
  }
  for (auto rte : collector_.RuntimeEvents()) {
    collector->AddRuntimeEvent(std::move(rte));
  }
  for (auto de : collector_.DeviceEvents()) {
    collector->AddDeviceEvent(std::move(de));
  }
  for (auto tn : collector_.ThreadNames()) {
    collector->AddThreadName(tn.first, tn.second);
  }
  collector_.ClearAll();
}
// Drains every activity record from |buffer| (first |valid_size| bytes are
// valid) and forwards each to the private collector. Iteration ends when
// cnpapi reports the buffer is exhausted.
void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) {
#ifdef PADDLE_WITH_MLU
  cnpapiActivity* record = nullptr;
  while (true) {
    cnpapiResult status =
        cnpapiActivityGetNextRecord(buffer, valid_size, &record);
    if (status == CNPAPI_SUCCESS) {
      details::ProcessCnpapiActivityRecord(record, tracing_start_ns_,
                                           &collector_);
    } else if (status == CNPAPI_ERROR_INSUFFICIENT_MEMORY ||
               status == CNPAPI_ERROR_MAX_LIMIT_REACHED) {
      // No more records in this buffer.
      break;
    } else {
      // NOTE(review): any other error is logged and the loop retries;
      // a persistently failing status would spin here — confirm cnpapi
      // guarantees eventual success or a terminal status.
      CNPAPI_CALL(status);
    }
  }
#endif
}
// Enables collection of the five cnpapi activity types this tracer handles
// (kernel, memcpy, p2p memcpy, memset, driver-API calls).
void MluTracer::EnableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
  VLOG(3) << "enable cnpapi activity";
#endif
}
// Flushes all buffered records (delivering them through the buffer
// callbacks), then disables the activity types enabled above.
void MluTracer::DisableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiActivityFlushAll());
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
  VLOG(3) << "disable cnpapi activity";
#endif
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <vector>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {
// Tracer that collects MLU device and runtime activity through Cambricon's
// cnpapi library. Singleton: construction initializes cnpapi and registers
// the activity-buffer callbacks; events are buffered internally until
// CollectTraceData() transfers them out.
class MluTracer : public TracerBase {
 public:
  static MluTracer& GetInstance() {
    static MluTracer instance;
    return instance;
  }

  // TracerBase lifecycle: UNINITED/STOPED -> READY -> STARTED -> STOPED.
  void PrepareTracing() override;
  void StartTracing() override;
  void StopTracing() override;
  void CollectTraceData(TraceEventCollector* collector) override;

  // Called from the cnpapi buffer-completed callback: parses one buffer of
  // activity records into the internal collector.
  void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size);

 private:
  MluTracer();  // initializes cnpapi; private to enforce the singleton
  DISABLE_COPY_AND_ASSIGN(MluTracer);

  void EnableCnpapiActivity();
  void DisableCnpapiActivity();

  // Host timestamp of StartTracing(); device records ending earlier are
  // filtered out during record processing.
  uint64_t tracing_start_ns_ = UINT64_MAX;
  TraceEventCollector collector_;
};
} // namespace platform
} // namespace paddle
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h" #include "paddle/fluid/platform/profiler/utils.h"
...@@ -52,6 +53,14 @@ bool Profiler::IsCuptiSupported() { ...@@ -52,6 +53,14 @@ bool Profiler::IsCuptiSupported() {
return supported; return supported;
} }
bool Profiler::IsCnpapiSupported() {
bool supported = false;
#ifdef PADDLE_WITH_MLU
supported = true;
#endif
return supported;
}
Profiler::Profiler(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options) {
options_ = options; options_ = options;
std::bitset<32> trace_switch(options_.trace_switch); std::bitset<32> trace_switch(options_.trace_switch);
...@@ -63,6 +72,9 @@ Profiler::Profiler(const ProfilerOptions& options) { ...@@ -63,6 +72,9 @@ Profiler::Profiler(const ProfilerOptions& options) {
if (trace_switch.test(kProfileGPUOptionBit)) { if (trace_switch.test(kProfileGPUOptionBit)) {
tracers_.emplace_back(&CudaTracer::GetInstance(), false); tracers_.emplace_back(&CudaTracer::GetInstance(), false);
} }
if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false);
}
} }
Profiler::~Profiler() { alive_.store(false); } Profiler::~Profiler() { alive_.store(false); }
......
...@@ -33,9 +33,10 @@ namespace platform { ...@@ -33,9 +33,10 @@ namespace platform {
static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileCPUOptionBit = 0;
static constexpr uint32_t kProfileGPUOptionBit = 1; static constexpr uint32_t kProfileGPUOptionBit = 1;
static constexpr uint32_t kProfileMLUOptionBit = 2;
struct ProfilerOptions { struct ProfilerOptions {
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu
uint32_t trace_level = FLAGS_host_trace_level; uint32_t trace_level = FLAGS_host_trace_level;
}; };
...@@ -45,6 +46,8 @@ class Profiler { ...@@ -45,6 +46,8 @@ class Profiler {
static bool IsCuptiSupported(); static bool IsCuptiSupported();
static bool IsCnpapiSupported();
void Prepare(); void Prepare();
void Start(); void Start();
......
...@@ -50,6 +50,8 @@ enum class TracerEventType { ...@@ -50,6 +50,8 @@ enum class TracerEventType {
PythonOp = 13, PythonOp = 13,
// Used to mark python level userdefined // Used to mark python level userdefined
PythonUserDefined = 14, PythonUserDefined = 14,
// Used to mark mlu runtime record returned by cnpapi
MluRuntime = 15,
// A flag to denote the number of current types // A flag to denote the number of current types
NumTypes NumTypes
}; };
......
...@@ -52,6 +52,13 @@ class TraceEventCollector { ...@@ -52,6 +52,13 @@ class TraceEventCollector {
return thread_names_; return thread_names_;
} }
void ClearAll() {
thread_names_.clear();
host_events_.clear();
runtime_events_.clear();
device_events_.clear();
}
private: private:
std::unordered_map<uint64_t, std::string> thread_names_; std::unordered_map<uint64_t, std::string> thread_names_;
std::list<HostTraceEvent> host_events_; std::list<HostTraceEvent> host_events_;
......
...@@ -3369,6 +3369,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3369,6 +3369,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("create", &paddle::platform::Profiler::Create, .def("create", &paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership) py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("is_cnpapi_supported",
&paddle::platform::Profiler::IsCnpapiSupported)
.def("prepare", .def("prepare",
[](paddle::platform::Profiler *profiler) { [](paddle::platform::Profiler *profiler) {
platform::EnableHostEventRecorder(); platform::EnableHostEventRecorder();
......
...@@ -52,16 +52,19 @@ class ProfilerState(Enum): ...@@ -52,16 +52,19 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum): class ProfilerTarget(Enum):
r""" r"""
ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU and GPU are supported currently. ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU, GPU and MLU are supported currently.
The meaning of each ProfilerState is as following The meaning of each ProfilerState is as following
- **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.CPU** : Profile events on CPU.
- **ProfilerTarget.GPU** : Profile events on GPU. - **ProfilerTarget.GPU** : Profile events on GPU.
- **ProfilerTarget.MLU** : Profile events on MLU.
""" """
CPU = 0 CPU = 0
GPU = 1 GPU = 1
MLU = 2
def make_scheduler(*, def make_scheduler(*,
...@@ -258,6 +261,8 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ...@@ -258,6 +261,8 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
""" """
if _Profiler.is_cupti_supported(): if _Profiler.is_cupti_supported():
return [ProfilerTarget.CPU, ProfilerTarget.GPU] return [ProfilerTarget.CPU, ProfilerTarget.GPU]
if _Profiler.is_cnpapi_supported():
return [ProfilerTarget.CPU, ProfilerTarget.MLU]
return [ProfilerTarget.CPU] return [ProfilerTarget.CPU]
...@@ -266,7 +271,7 @@ class Profiler: ...@@ -266,7 +271,7 @@ class Profiler:
Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table.
Args: Args:
targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` . targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` , :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.MLU <api_paddle_profiler_ProfilerTarget>` .
scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function. scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function.
If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
which means profiling range [start_batch, end_batch). which means profiling range [start_batch, end_batch).
...@@ -407,6 +412,8 @@ class Profiler: ...@@ -407,6 +412,8 @@ class Profiler:
profileoption.trace_switch |= 1 profileoption.trace_switch |= 1
if ProfilerTarget.GPU in self.targets: if ProfilerTarget.GPU in self.targets:
profileoption.trace_switch |= (1 << 1) profileoption.trace_switch |= (1 << 1)
if ProfilerTarget.MLU in self.targets:
profileoption.trace_switch |= (1 << 2)
wrap_optimizers() wrap_optimizers()
self.profiler = _Profiler.create(profileoption) self.profiler = _Profiler.create(profileoption)
if callable(scheduler): if callable(scheduler):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册