Unverified commit fc208b7e authored by fwenguang, committed by GitHub

[MLU] add mlu new profiler (#41138)

* [MLU] add mlu new profiler

* fix format
Parent 605552a9
@@ -16,7 +16,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <cn_api.h>
#include <cndrv_id.h>
#include <cnnl.h>
#include <cnpapi.h>
#include <cnrt.h>
#ifdef PADDLE_WITH_CNCL
#include <cncl.h>
@@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t;
#endif
using mluStream = cnrtQueue_t;
using mluCnnlHandle = cnnlHandle_t;
using mluEventHandle = CNnotifier;
using mluEventHandle = cnrtNotifier_t;
using mluDeviceHandle = CNdev;
namespace platform {
......
cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
add_subdirectory(mlu)
cc_library(event_node SRCS event_node.cc DEPS enforce)
cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
add_subdirectory(dump)
cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils)
cc_library(event_bind SRCS event_python.cc DEPS profiler_logger)
cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger)
cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind)
......
@@ -38,10 +38,12 @@ static std::string DefaultFileName() {
}
const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep", "CudaRuntime",
"Kernel", "Memcpy", "Memset", "UserDefined",
"OperatorInner", "Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined"};
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_,
@@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName(
(*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1);
}
#ifdef PADDLE_WITH_MLU
static std::string device_type("MLU");
#else
static std::string device_type("GPU");
#endif
for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); ++it) {
output_file_stream_ << string_format(
@@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName(
"name": "process_name", "pid": %lld, "tid": %lld,
"ph": "M",
"args": {
"name": "Deivce %lld (GPU)"
"name": "Deivce %lld (%s)"
}
},
{
@@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName(
}
},
)JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second,
(*it).second, (*it).first, (*it).second, (*it).first + 0x10000000,
(*it).first, (*it).second, (*it).second);
(*it).first, (*it).second, (*it).first, device_type.c_str(),
(*it).first, (*it).second, (*it).second, (*it).first, (*it).second,
(*it).first + 0x10000000, (*it).first, (*it).second, (*it).second);
}
}
......
if(WITH_MLU)
set(MLU_INFO mlu_info)
endif()
cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO})
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#include <cstdio>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_MLU
namespace paddle {
namespace platform {
namespace {
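// cnpapi and the host clock both report nanoseconds but from different
// epochs; compute the offset once so device timestamps can be shifted onto
// the host timeline.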
inline uint64_t GetTimeGap() {
static uint64_t time_gap = []() -> uint64_t {
uint64_t cpu_time = PosixInNsec();
uint64_t mlu_time = cnpapiGetTimestamp();
return (cpu_time - mlu_time);
}();
return time_gap;
}
void AddKernelRecord(const cnpapiActivityKernel* kernel, uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
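// Drop activity records from before tracing started.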
if (kernel->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = demangle(kernel->name);
event.type = TracerEventType::Kernel;
event.start_ns = kernel->start + time_gap;
event.end_ns = kernel->end + time_gap;
event.device_id = kernel->device_id;
event.context_id = kernel->context_id;
event.stream_id = kernel->queue_id;
event.correlation_id = kernel->correlation_id;
event.kernel_info.block_x = kernel->dimx;
event.kernel_info.block_y = kernel->dimy;
event.kernel_info.block_z = kernel->dimz;
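// No grid dimensions are reported for MLU kernels; the kernel type is
// stashed in grid_x as a placeholder and the remaining grid fields zeroed.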
event.kernel_info.grid_x = kernel->kernel_type;
event.kernel_info.grid_y = 0;
event.kernel_info.grid_z = 0;
event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->received;
collector->AddDeviceEvent(std::move(event));
}
const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
switch (kind) {
case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD:
return "MEMCPY_HtoD";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH:
return "MEMCPY_DtoH";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD:
return "MEMCPY_DtoD";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH:
return "MEMCPY_HtoH";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP:
return "MEMCPY_PtoP";
default:
break;
}
return "MEMCPY";
}
void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy, uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memcpy->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = MemcpyKind(memcpy->copy_type);
event.type = TracerEventType::Memcpy;
event.start_ns = memcpy->start + time_gap;
event.end_ns = memcpy->end + time_gap;
event.device_id = memcpy->device_id;
event.context_id = memcpy->context_id;
event.stream_id = memcpy->queue_id;
event.correlation_id = memcpy->correlation_id;
event.memcpy_info.num_bytes = memcpy->bytes;
snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s",
MemcpyKind(memcpy->copy_type));
collector->AddDeviceEvent(std::move(event));
}
void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
uint64_t start_ns, TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memcpy2->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = MemcpyKind(memcpy2->copy_type);
event.type = TracerEventType::Memcpy;
event.start_ns = memcpy2->start + time_gap;
event.end_ns = memcpy2->end + time_gap;
event.device_id = memcpy2->device_id;
event.context_id = memcpy2->context_id;
event.stream_id = memcpy2->queue_id;
event.correlation_id = memcpy2->correlation_id;
event.memcpy_info.num_bytes = memcpy2->bytes;
snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s",
MemcpyKind(memcpy2->copy_type));
collector->AddDeviceEvent(std::move(event));
}
void AddMemsetRecord(const cnpapiActivityMemset* memset, uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memset->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = "MEMSET";
event.type = TracerEventType::Memset;
event.start_ns = memset->start + time_gap;
event.end_ns = memset->end + time_gap;
event.device_id = memset->device_id;
event.context_id = memset->context_id;
event.stream_id = memset->queue_id;
event.correlation_id = memset->correlation_id;
event.memset_info.num_bytes = memset->bytes;
event.memset_info.value = memset->value;
collector->AddDeviceEvent(std::move(event));
}
class CnpapiRuntimeCbidStr {
public:
static const CnpapiRuntimeCbidStr& GetInstance() {
static CnpapiRuntimeCbidStr inst;
return inst;
}
std::string RuntimeKind(cnpapi_CallbackId cbid) const {
auto iter = cbid_str_.find(cbid);
if (iter == cbid_str_.end()) {
return "MLU Runtime API " + std::to_string(cbid);
}
return iter->second;
}
private:
CnpapiRuntimeCbidStr();
std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
};
CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
#define REGISTER_RUNTIME_CBID_STR(cbid) \
cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid
REGISTER_RUNTIME_CBID_STR(cnMalloc);
REGISTER_RUNTIME_CBID_STR(cnMallocHost);
REGISTER_RUNTIME_CBID_STR(cnFree);
REGISTER_RUNTIME_CBID_STR(cnFreeHost);
REGISTER_RUNTIME_CBID_STR(cnMemcpy);
REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer);
REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD);
REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D);
REGISTER_RUNTIME_CBID_STR(cnMemcpy2D);
REGISTER_RUNTIME_CBID_STR(cnMemcpy3D);
REGISTER_RUNTIME_CBID_STR(cnMemsetD8);
REGISTER_RUNTIME_CBID_STR(cnMemsetD16);
REGISTER_RUNTIME_CBID_STR(cnMemsetD32);
REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async);
REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async);
REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async);
REGISTER_RUNTIME_CBID_STR(cnInvokeKernel);
REGISTER_RUNTIME_CBID_STR(cnCreateQueue);
REGISTER_RUNTIME_CBID_STR(cnDestroyQueue);
REGISTER_RUNTIME_CBID_STR(cnQueueSync);
REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier);
REGISTER_RUNTIME_CBID_STR(cnWaitNotifier);
REGISTER_RUNTIME_CBID_STR(cnCreateNotifier);
REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier);
REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier);
REGISTER_RUNTIME_CBID_STR(cnCtxCreate);
REGISTER_RUNTIME_CBID_STR(cnCtxDestroy);
REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
REGISTER_RUNTIME_CBID_STR(cnCtxSync);
#undef REGISTER_RUNTIME_CBID_STR
}
void AddApiRecord(const cnpapiActivityAPI* api, uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (api->start + time_gap < start_ns) {
return;
}
RuntimeTraceEvent event;
event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
event.start_ns = api->start + time_gap;
event.end_ns = api->end + time_gap;
event.process_id = api->process_id;
event.thread_id = api->thread_id;
event.correlation_id = api->correlation_id;
event.callback_id = api->cbid;
event.type = TracerEventType::MluRuntime;
collector->AddRuntimeEvent(std::move(event));
}
} // namespace
namespace details {
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector) {
switch (record->type) {
case CNPAPI_ACTIVITY_TYPE_KERNEL:
AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
start_ns, collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY:
AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
start_ns, collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
AddMemcpy2Record(
reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record), start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMSET:
AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
start_ns, collector);
break;
case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record), start_ns,
collector);
break;
default:
break;
}
}
} // namespace details
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
namespace paddle {
namespace platform {
namespace details {
#ifdef PADDLE_WITH_MLU
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector);
#endif
} // namespace details
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#define CNPAPI_CALL(call) \
do { \
cnpapiResult _status = call; \
if (_status != CNPAPI_SUCCESS) { \
const char* errstr; \
cnpapiGetResultString(_status, &errstr); \
LOG(ERROR) << "Function " << #call << " failed with error " << errstr; \
} \
} while (0)
namespace paddle {
namespace platform {
namespace {
void BufferRequestedCallback(uint64_t** buffer, size_t* size,
size_t* max_num_records) {
constexpr size_t kBufferSize = 1 << 23; // 8 MB
constexpr size_t kBufferAlignSize = 8;
*buffer = reinterpret_cast<uint64_t*>(
paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize));
*size = kBufferSize;
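// 0 places no explicit cap on the record count; cnpapi packs as many records
// as fit in the buffer (assumption, mirroring the CUPTI convention).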
*max_num_records = 0;
}
void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) {
if (buffer == nullptr || valid_size == 0) {
return;
}
auto mlu_tracer = &MluTracer::GetInstance();
mlu_tracer->ProcessCnpapiActivity(buffer, valid_size);
paddle::framework::AlignedFree(buffer);
}
} // namespace
MluTracer::MluTracer() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiInit());
CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback,
BufferCompletedCallback));
#endif
}
void MluTracer::PrepareTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true,
platform::errors::PreconditionNotMet("MluTracer must be UNINITED"));
EnableCnpapiActivity();
state_ = TracerState::READY;
}
void MluTracer::StartTracing() {
PADDLE_ENFORCE_EQ(state_ == TracerState::READY, true,
platform::errors::PreconditionNotMet(
"MluTracer must be READY or STOPPED"));
tracing_start_ns_ = PosixInNsec();
state_ = TracerState::STARTED;
}
void MluTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_, TracerState::STARTED,
platform::errors::PreconditionNotMet("MluTracer must be STARTED"));
DisableCnpapiActivity();
state_ = TracerState::STOPED;
}
void MluTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ(
state_, TracerState::STOPED,
platform::errors::PreconditionNotMet("MluTracer must be STOPED"));
for (auto he : collector_.HostEvents()) {
collector->AddHostEvent(std::move(he));
}
for (auto rte : collector_.RuntimeEvents()) {
collector->AddRuntimeEvent(std::move(rte));
}
for (auto de : collector_.DeviceEvents()) {
collector->AddDeviceEvent(std::move(de));
}
for (auto tn : collector_.ThreadNames()) {
collector->AddThreadName(tn.first, tn.second);
}
collector_.ClearAll();
}
void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) {
#ifdef PADDLE_WITH_MLU
cnpapiActivity* record = nullptr;
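// Drain records from the completed buffer until cnpapi reports that none
// remain.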
while (true) {
cnpapiResult status =
cnpapiActivityGetNextRecord(buffer, valid_size, &record);
if (status == CNPAPI_SUCCESS) {
details::ProcessCnpapiActivityRecord(record, tracing_start_ns_,
&collector_);
} else if (status == CNPAPI_ERROR_INSUFFICIENT_MEMORY ||
status == CNPAPI_ERROR_MAX_LIMIT_REACHED) {
break;
} else {
CNPAPI_CALL(status);
}
}
#endif
}
void MluTracer::EnableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
VLOG(3) << "enable cnpapi activity";
#endif
}
void MluTracer::DisableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiActivityFlushAll());
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
VLOG(3) << "disable cnpapi activity";
#endif
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <vector>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {
class MluTracer : public TracerBase {
public:
static MluTracer& GetInstance() {
static MluTracer instance;
return instance;
}
void PrepareTracing() override;
void StartTracing() override;
void StopTracing() override;
void CollectTraceData(TraceEventCollector* collector) override;
void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size);
private:
MluTracer();
DISABLE_COPY_AND_ASSIGN(MluTracer);
void EnableCnpapiActivity();
void DisableCnpapiActivity();
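// UINT64_MAX is a sentinel: until StartTracing() sets the real start time,
// every activity record fails the start-time filter and is dropped.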
uint64_t tracing_start_ns_ = UINT64_MAX;
TraceEventCollector collector_;
};
} // namespace platform
} // namespace paddle
@@ -27,6 +27,7 @@
#include "paddle/fluid/platform/profiler/cuda_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h"
@@ -52,6 +53,14 @@ bool Profiler::IsCuptiSupported() {
return supported;
}
bool Profiler::IsCnpapiSupported() {
bool supported = false;
#ifdef PADDLE_WITH_MLU
supported = true;
#endif
return supported;
}
Profiler::Profiler(const ProfilerOptions& options) {
options_ = options;
std::bitset<32> trace_switch(options_.trace_switch);
@@ -63,6 +72,9 @@ Profiler::Profiler(const ProfilerOptions& options) {
if (trace_switch.test(kProfileGPUOptionBit)) {
tracers_.emplace_back(&CudaTracer::GetInstance(), false);
}
if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false);
}
}
Profiler::~Profiler() { alive_.store(false); }
......
@@ -33,9 +33,10 @@ namespace platform {
static constexpr uint32_t kProfileCPUOptionBit = 0;
static constexpr uint32_t kProfileGPUOptionBit = 1;
static constexpr uint32_t kProfileMLUOptionBit = 2;
struct ProfilerOptions {
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu
uint32_t trace_level = FLAGS_host_trace_level;
};
@@ -45,6 +46,8 @@ class Profiler {
static bool IsCuptiSupported();
static bool IsCnpapiSupported();
void Prepare();
void Start();
......
@@ -50,6 +50,8 @@ enum class TracerEventType {
PythonOp = 13,
// Used to mark python level userdefined
PythonUserDefined = 14,
// Used to mark mlu runtime records returned by cnpapi
MluRuntime = 15,
// A flag to denote the number of current types
NumTypes
};
......
@@ -52,6 +52,13 @@ class TraceEventCollector {
return thread_names_;
}
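// Called after trace data has been handed off to an outer collector (see
// MluTracer::CollectTraceData) so buffered events are not reported twice.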
void ClearAll() {
thread_names_.clear();
host_events_.clear();
runtime_events_.clear();
device_events_.clear();
}
private:
std::unordered_map<uint64_t, std::string> thread_names_;
std::list<HostTraceEvent> host_events_;
......
@@ -3369,6 +3369,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("create", &paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("is_cnpapi_supported",
&paddle::platform::Profiler::IsCnpapiSupported)
.def("prepare",
[](paddle::platform::Profiler *profiler) {
platform::EnableHostEventRecorder();
......
@@ -52,16 +52,19 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum):
r"""
ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU and GPU are supported currently.
ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU, GPU and MLU are supported currently.
The meaning of each ProfilerTarget is as follows:
- **ProfilerTarget.CPU** : Profile events on CPU.
- **ProfilerTarget.GPU** : Profile events on GPU.
- **ProfilerTarget.MLU** : Profile events on MLU.
"""
CPU = 0
GPU = 1
MLU = 2
def make_scheduler(*,
@@ -258,6 +261,8 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
"""
if _Profiler.is_cupti_supported():
return [ProfilerTarget.CPU, ProfilerTarget.GPU]
if _Profiler.is_cnpapi_supported():
return [ProfilerTarget.CPU, ProfilerTarget.MLU]
return [ProfilerTarget.CPU]
@@ -266,7 +271,7 @@ class Profiler:
Profiler context manager, the user interface for managing the profiling process: start, stop, export profiling data, and print a summary table.
Args:
targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` .
targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` , :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.MLU <api_paddle_profiler_ProfilerTarget>` .
scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as a parameter and returns the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by the :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function.
If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values, start_batch and end_batch,
which means the profiling range is [start_batch, end_batch).
@@ -407,6 +412,8 @@ class Profiler:
profileoption.trace_switch |= 1
if ProfilerTarget.GPU in self.targets:
profileoption.trace_switch |= (1 << 1)
if ProfilerTarget.MLU in self.targets:
profileoption.trace_switch |= (1 << 2)
wrap_optimizers()
self.profiler = _Profiler.create(profileoption)
if callable(scheduler):
......
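Taken together, these changes let the Python profiler target MLU devices the same way it targets GPUs. Below is a minimal usage sketch of the API touched by this PR; it assumes a PADDLE_WITH_MLU build (where _get_supported_targets() reports CPU and MLU), and train_one_step is a hypothetical placeholder:

    import paddle.profiler as profiler

    # ProfilerTarget.MLU sets bit 2 of trace_switch, which the C++ Profiler
    # constructor maps to MluTracer::GetInstance().
    prof = profiler.Profiler(
        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.MLU],
        scheduler=(2, 5))  # tuple form: profile batches [2, 5)

    prof.start()
    for step in range(10):
        # train_one_step(batch)  # hypothetical training step
        prof.step()
    prof.stop()
    prof.summary()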