未验证 提交 fc208b7e 编写于 作者: F fwenguang 提交者: GitHub

[MLU] add mlu new profiler (#41138)

* [MLU] add mlu new profiler

* fix format
上级 605552a9
...@@ -16,7 +16,9 @@ limitations under the License. */ ...@@ -16,7 +16,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
#include <cn_api.h> #include <cn_api.h>
#include <cndrv_id.h>
#include <cnnl.h> #include <cnnl.h>
#include <cnpapi.h>
#include <cnrt.h> #include <cnrt.h>
#ifdef PADDLE_WITH_CNCL #ifdef PADDLE_WITH_CNCL
#include <cncl.h> #include <cncl.h>
...@@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t; ...@@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t;
#endif #endif
using mluStream = cnrtQueue_t; using mluStream = cnrtQueue_t;
using mluCnnlHandle = cnnlHandle_t; using mluCnnlHandle = cnnlHandle_t;
using mluEventHandle = CNnotifier; using mluEventHandle = cnrtNotifier_t;
using mluDeviceHandle = CNdev; using mluDeviceHandle = CNdev;
namespace platform { namespace platform {
......
cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
add_subdirectory(mlu)
cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(event_node SRCS event_node.cc DEPS enforce)
cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
add_subdirectory(dump) add_subdirectory(dump)
cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils)
cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(event_bind SRCS event_python.cc DEPS profiler_logger)
cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger)
cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind)
......
...@@ -38,10 +38,12 @@ static std::string DefaultFileName() { ...@@ -38,10 +38,12 @@ static std::string DefaultFileName() {
} }
const char* ChromeTracingLogger::categary_name_[] = { const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep", "CudaRuntime", "Operator", "Dataloader", "ProfileStep",
"Kernel", "Memcpy", "Memset", "UserDefined", "CudaRuntime", "Kernel", "Memcpy",
"OperatorInner", "Forward", "Backward", "Optimization", "Memset", "UserDefined", "OperatorInner",
"Communication", "PythonOp", "PythonUserDefined"}; "Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
void ChromeTracingLogger::OpenFile() { void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_, output_file_stream_.open(filename_,
...@@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName(
(*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1);
} }
#ifdef PADDLE_WITH_MLU
static std::string device_type("MLU");
#else
static std::string device_type("GPU");
#endif
for (auto it = deviceid_streamid_set_.begin(); for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); ++it) { it != deviceid_streamid_set_.end(); ++it) {
output_file_stream_ << string_format( output_file_stream_ << string_format(
...@@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName(
"name": "process_name", "pid": %lld, "tid": %lld, "name": "process_name", "pid": %lld, "tid": %lld,
"ph": "M", "ph": "M",
"args": { "args": {
"name": "Deivce %lld (GPU)" "name": "Deivce %lld (%s)"
} }
}, },
{ {
...@@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName(
} }
}, },
)JSON"), )JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second, (*it).first, (*it).second, (*it).first, device_type.c_str(),
(*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, (*it).first, (*it).second, (*it).second, (*it).first, (*it).second,
(*it).first, (*it).second, (*it).second); (*it).first + 0x10000000, (*it).first, (*it).second, (*it).second);
} }
} }
......
# mlu_info is only built when Paddle is configured with MLU support; leave
# MLU_INFO empty otherwise so the (stubbed) tracer target still links.
if(WITH_MLU)
  set(MLU_INFO mlu_info)
endif()

# MLU tracer library: host-side cnpapi driver plus activity-record parsing.
cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO})
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#include <cstdio>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_MLU
namespace paddle {
namespace platform {
namespace {
// Returns the cached offset (ns) between the host CPU clock (PosixInNsec)
// and the cnpapi device timestamp clock. Adding this gap to a cnpapi
// timestamp converts it onto the host timeline used by the profiler.
// NOTE(review): the subtraction is unsigned; this presumes the cnpapi epoch
// never runs ahead of the CPU clock, otherwise it would wrap — confirm.
inline uint64_t GetTimeGap() {
  static uint64_t time_gap = []() -> uint64_t {
    uint64_t cpu_time = PosixInNsec();
    uint64_t mlu_time = cnpapiGetTimestamp();
    return (cpu_time - mlu_time);
  }();
  return time_gap;
}
// Converts a cnpapi kernel activity record into a DeviceTraceEvent and adds
// it to |collector|. Records that started before |start_ns| (host time, the
// moment tracing began) are dropped.
void AddKernelRecord(const cnpapiActivityKernel* kernel, uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip kernels launched before the profiling window opened.
  if (kernel->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = demangle(kernel->name);
  event.type = TracerEventType::Kernel;
  event.start_ns = kernel->start + time_gap;
  event.end_ns = kernel->end + time_gap;
  event.device_id = kernel->device_id;
  event.context_id = kernel->context_id;
  event.stream_id = kernel->queue_id;
  event.correlation_id = kernel->correlation_id;
  event.kernel_info.block_x = kernel->dimx;
  event.kernel_info.block_y = kernel->dimy;
  event.kernel_info.block_z = kernel->dimz;
  // NOTE(review): grid_x is deliberately repurposed to carry the MLU
  // kernel_type (MLU has no CUDA-style grid); grid_y/grid_z are unused.
  event.kernel_info.grid_x = kernel->kernel_type;
  event.kernel_info.grid_y = 0;
  event.kernel_info.grid_z = 0;
  event.kernel_info.queued = kernel->queued;
  event.kernel_info.submitted = kernel->submitted;
  event.kernel_info.completed = kernel->received;
  collector->AddDeviceEvent(std::move(event));
}
// Maps a cnpapi memcpy direction enum to the human-readable event name
// shown in the trace viewer; unknown directions fall back to "MEMCPY".
const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD) {
    return "MEMCPY_HtoD";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH) {
    return "MEMCPY_DtoH";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD) {
    return "MEMCPY_DtoD";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH) {
    return "MEMCPY_HtoH";
  }
  if (kind == CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP) {
    return "MEMCPY_PtoP";
  }
  return "MEMCPY";
}
// Converts a cnpapi memcpy activity record into a DeviceTraceEvent and adds
// it to |collector|. Records that started before |start_ns| (host time) are
// dropped.
void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy, uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip copies issued before the profiling window opened.
  if (memcpy->start + time_gap < start_ns) {
    return;
  }
  // Resolve the direction string once; it serves both as the event name and
  // as the copy_kind field (the original resolved it twice).
  const char* copy_kind = MemcpyKind(memcpy->copy_type);
  DeviceTraceEvent event;
  event.name = copy_kind;
  event.type = TracerEventType::Memcpy;
  event.start_ns = memcpy->start + time_gap;
  event.end_ns = memcpy->end + time_gap;
  event.device_id = memcpy->device_id;
  event.context_id = memcpy->context_id;
  event.stream_id = memcpy->queue_id;
  event.correlation_id = memcpy->correlation_id;
  event.memcpy_info.num_bytes = memcpy->bytes;
  snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", copy_kind);
  collector->AddDeviceEvent(std::move(event));
}
// Converts a cnpapi peer-to-peer memcpy activity record into a
// DeviceTraceEvent and adds it to |collector|. Records that started before
// |start_ns| (host time) are dropped.
void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
                      uint64_t start_ns, TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip copies issued before the profiling window opened.
  if (memcpy2->start + time_gap < start_ns) {
    return;
  }
  // Resolve the direction string once; it serves both as the event name and
  // as the copy_kind field (the original resolved it twice).
  const char* copy_kind = MemcpyKind(memcpy2->copy_type);
  DeviceTraceEvent event;
  event.name = copy_kind;
  event.type = TracerEventType::Memcpy;
  event.start_ns = memcpy2->start + time_gap;
  event.end_ns = memcpy2->end + time_gap;
  event.device_id = memcpy2->device_id;
  event.context_id = memcpy2->context_id;
  event.stream_id = memcpy2->queue_id;
  event.correlation_id = memcpy2->correlation_id;
  event.memcpy_info.num_bytes = memcpy2->bytes;
  snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", copy_kind);
  collector->AddDeviceEvent(std::move(event));
}
// Converts a cnpapi memset activity record into a DeviceTraceEvent and adds
// it to |collector|. Records that started before |start_ns| (host time) are
// dropped.
void AddMemsetRecord(const cnpapiActivityMemset* memset, uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip memsets issued before the profiling window opened.
  if (memset->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = "MEMSET";
  event.type = TracerEventType::Memset;
  event.start_ns = memset->start + time_gap;
  event.end_ns = memset->end + time_gap;
  event.device_id = memset->device_id;
  event.context_id = memset->context_id;
  event.stream_id = memset->queue_id;
  event.correlation_id = memset->correlation_id;
  event.memset_info.num_bytes = memset->bytes;
  event.memset_info.value = memset->value;
  collector->AddDeviceEvent(std::move(event));
}
// Singleton mapping cnpapi driver-API callback ids to readable API names.
class CnpapiRuntimeCbidStr {
 public:
  static const CnpapiRuntimeCbidStr& GetInstance() {
    static CnpapiRuntimeCbidStr inst;
    return inst;
  }

  // Returns the registered name for |cbid|, or a generic
  // "MLU Runtime API <id>" string for ids not in the table.
  std::string RuntimeKind(cnpapi_CallbackId cbid) const {
    auto iter = cbid_str_.find(cbid);
    if (iter == cbid_str_.end()) {
      return "MLU Runtime API " + std::to_string(cbid);
    }
    return iter->second;
  }

 private:
  CnpapiRuntimeCbidStr();  // populates cbid_str_; defined out-of-line below
  std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
};
// Registers a readable name for every cnpapi driver-API callback id we care
// about (memory management, memcpy/memset, kernel launch, queue/notifier and
// context APIs). Ids outside this table fall back to a numeric string in
// RuntimeKind().
CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
#define REGISTER_RUNTIME_CBID_STR(cbid) \
  cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid

  REGISTER_RUNTIME_CBID_STR(cnMalloc);
  REGISTER_RUNTIME_CBID_STR(cnMallocHost);
  REGISTER_RUNTIME_CBID_STR(cnFree);
  REGISTER_RUNTIME_CBID_STR(cnFreeHost);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy2D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy3D);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD8);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD16);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD32);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async);
  REGISTER_RUNTIME_CBID_STR(cnInvokeKernel);
  REGISTER_RUNTIME_CBID_STR(cnCreateQueue);
  REGISTER_RUNTIME_CBID_STR(cnDestroyQueue);
  REGISTER_RUNTIME_CBID_STR(cnQueueSync);
  REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier);
  REGISTER_RUNTIME_CBID_STR(cnWaitNotifier);
  REGISTER_RUNTIME_CBID_STR(cnCreateNotifier);
  REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier);
  REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier);
  REGISTER_RUNTIME_CBID_STR(cnCtxCreate);
  REGISTER_RUNTIME_CBID_STR(cnCtxDestroy);
  REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent);
  REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
  REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
  REGISTER_RUNTIME_CBID_STR(cnCtxSync);
#undef REGISTER_RUNTIME_CBID_STR
}
// Converts a cnpapi driver-API activity record into a RuntimeTraceEvent and
// adds it to |collector|. Records that started before |start_ns| (host time)
// are dropped.
void AddApiRecord(const cnpapiActivityAPI* api, uint64_t start_ns,
                  TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  // Skip API calls made before the profiling window opened.
  if (api->start + time_gap < start_ns) {
    return;
  }
  RuntimeTraceEvent event;
  event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
  event.start_ns = api->start + time_gap;
  event.end_ns = api->end + time_gap;
  event.process_id = api->process_id;
  event.thread_id = api->thread_id;
  event.correlation_id = api->correlation_id;
  event.callback_id = api->cbid;
  event.type = TracerEventType::MluRuntime;
  collector->AddRuntimeEvent(std::move(event));
}
} // namespace
namespace details {
// Dispatches one generic cnpapi activity record to the matching typed
// handler (kernel / memcpy / p2p memcpy / memset / driver-API call) based on
// record->type. Unknown record types are silently ignored.
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
                                 uint64_t start_ns,
                                 TraceEventCollector* collector) {
  switch (record->type) {
    case CNPAPI_ACTIVITY_TYPE_KERNEL:
      AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
                      start_ns, collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMCPY:
      AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
                      start_ns, collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
      AddMemcpy2Record(
          reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record), start_ns,
          collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMSET:
      AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
                      start_ns, collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
      AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record), start_ns,
                   collector);
      break;
    default:
      break;
  }
}
} // namespace details
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
namespace paddle {
namespace platform {
namespace details {

#ifdef PADDLE_WITH_MLU
// Dispatches a single cnpapi activity record to the collector; records that
// predate |start_ns| (host time) are ignored. Implemented in
// cnpapi_data_process.cc.
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
                                 uint64_t start_ns,
                                 TraceEventCollector* collector);
#endif

}  // namespace details
}  // namespace platform
}  // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
// Invokes a cnpapi API call and logs — but does not propagate — any error.
// Profiling failures must never abort the workload, so errors surface only
// through LOG(ERROR).
#define CNPAPI_CALL(call)                                                      \
  do {                                                                         \
    cnpapiResult _status = call;                                               \
    if (_status != CNPAPI_SUCCESS) {                                           \
      const char* errstr;                                                      \
      cnpapiGetResultString(_status, &errstr);                                 \
      LOG(ERROR) << "Function " << #call << " failed with error " << errstr;   \
    }                                                                          \
  } while (0)
namespace paddle {
namespace platform {
namespace {
// cnpapi buffer-request callback: hands the library an 8 MB, 8-byte-aligned
// buffer for activity records. *max_num_records = 0 tells cnpapi to fill the
// whole buffer with as many records as fit.
void BufferRequestedCallback(uint64_t** buffer, size_t* size,
                             size_t* max_num_records) {
  constexpr size_t kBufferSize = 1 << 23;  // 8 MB
  constexpr size_t kBufferAlignSize = 8;
  *buffer = reinterpret_cast<uint64_t*>(
      paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize));
  *size = kBufferSize;
  *max_num_records = 0;
}
// cnpapi buffer-completed callback: drains all valid records from the buffer
// into the tracer, then frees the buffer allocated by
// BufferRequestedCallback.
void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) {
  if (buffer == nullptr || valid_size == 0) {
    return;
  }
  auto mlu_tracer = &MluTracer::GetInstance();
  mlu_tracer->ProcessCnpapiActivity(buffer, valid_size);
  paddle::framework::AlignedFree(buffer);
}
} // namespace
// Initializes the cnpapi library and registers the buffer callbacks once for
// the lifetime of the (singleton) tracer. No-op in non-MLU builds.
MluTracer::MluTracer() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiInit());
  CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback,
                                              BufferCompletedCallback));
#endif
}
// Transitions UNINITED/STOPED -> READY and turns on the cnpapi activity
// types we collect. Must precede StartTracing().
void MluTracer::PrepareTracing() {
  PADDLE_ENFORCE_EQ(
      state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true,
      platform::errors::PreconditionNotMet("MluTracer must be UNINITED"));
  EnableCnpapiActivity();
  state_ = TracerState::READY;
}
// Transitions READY -> STARTED and records the trace start time; device
// records that predate this timestamp are filtered out later.
void MluTracer::StartTracing() {
  // Fix: the previous error message claimed "READY or STOPPED", but the
  // check only accepts READY (STOPED must go through PrepareTracing first).
  PADDLE_ENFORCE_EQ(state_ == TracerState::READY, true,
                    platform::errors::PreconditionNotMet(
                        "MluTracer must be READY"));
  tracing_start_ns_ = PosixInNsec();
  state_ = TracerState::STARTED;
}
// Transitions STARTED -> STOPED; disabling activity collection also flushes
// all pending cnpapi buffers into collector_ via the buffer callbacks.
void MluTracer::StopTracing() {
  PADDLE_ENFORCE_EQ(
      state_, TracerState::STARTED,
      platform::errors::PreconditionNotMet("MluTracer must be STARTED"));
  DisableCnpapiActivity();
  state_ = TracerState::STOPED;
}
// Transfers every event gathered in the tracer's private collector_ to the
// caller-supplied collector, then clears local state. Only valid after
// StopTracing().
void MluTracer::CollectTraceData(TraceEventCollector* collector) {
  PADDLE_ENFORCE_EQ(
      state_, TracerState::STOPED,
      platform::errors::PreconditionNotMet("MluTracer must be STOPED"));
  // NOTE(review): each loop variable is a by-value copy that is then moved
  // into the target collector; collector_ itself is emptied by ClearAll()
  // below, so the copies are the actual transfer mechanism here.
  for (auto he : collector_.HostEvents()) {
    collector->AddHostEvent(std::move(he));
  }
  for (auto rte : collector_.RuntimeEvents()) {
    collector->AddRuntimeEvent(std::move(rte));
  }
  for (auto de : collector_.DeviceEvents()) {
    collector->AddDeviceEvent(std::move(de));
  }
  for (auto tn : collector_.ThreadNames()) {
    collector->AddThreadName(tn.first, tn.second);
  }
  collector_.ClearAll();
}
// Drains every activity record from |buffer| (first |valid_size| bytes are
// valid) and forwards each to the private collector. Iteration ends when
// cnpapi reports the buffer is exhausted.
void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) {
#ifdef PADDLE_WITH_MLU
  cnpapiActivity* record = nullptr;
  while (true) {
    cnpapiResult status =
        cnpapiActivityGetNextRecord(buffer, valid_size, &record);
    if (status == CNPAPI_SUCCESS) {
      details::ProcessCnpapiActivityRecord(record, tracing_start_ns_,
                                           &collector_);
    } else if (status == CNPAPI_ERROR_INSUFFICIENT_MEMORY ||
               status == CNPAPI_ERROR_MAX_LIMIT_REACHED) {
      // No more records in this buffer.
      break;
    } else {
      // NOTE(review): any other error is logged and the loop retries;
      // a persistently failing status would spin here — confirm cnpapi
      // guarantees eventual success or a terminal status.
      CNPAPI_CALL(status);
    }
  }
#endif
}
// Enables collection of the five cnpapi activity types this tracer handles
// (kernel, memcpy, p2p memcpy, memset, driver-API calls).
void MluTracer::EnableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
  VLOG(3) << "enable cnpapi activity";
#endif
}
// Flushes all buffered records (delivering them through the buffer
// callbacks), then disables the activity types enabled above.
void MluTracer::DisableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiActivityFlushAll());
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
  VLOG(3) << "disable cnpapi activity";
#endif
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <vector>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {
// Tracer that collects MLU device and runtime activity through Cambricon's
// cnpapi library. Singleton: construction initializes cnpapi and registers
// the activity-buffer callbacks; events are buffered internally until
// CollectTraceData() transfers them out.
class MluTracer : public TracerBase {
 public:
  static MluTracer& GetInstance() {
    static MluTracer instance;
    return instance;
  }

  // TracerBase lifecycle: UNINITED/STOPED -> READY -> STARTED -> STOPED.
  void PrepareTracing() override;
  void StartTracing() override;
  void StopTracing() override;
  void CollectTraceData(TraceEventCollector* collector) override;

  // Called from the cnpapi buffer-completed callback: parses one buffer of
  // activity records into the internal collector.
  void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size);

 private:
  MluTracer();  // initializes cnpapi; private to enforce the singleton
  DISABLE_COPY_AND_ASSIGN(MluTracer);

  void EnableCnpapiActivity();
  void DisableCnpapiActivity();

  // Host timestamp of StartTracing(); device records ending earlier are
  // filtered out during record processing.
  uint64_t tracing_start_ns_ = UINT64_MAX;
  TraceEventCollector collector_;
};
} // namespace platform
} // namespace paddle
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h" #include "paddle/fluid/platform/profiler/utils.h"
...@@ -52,6 +53,14 @@ bool Profiler::IsCuptiSupported() { ...@@ -52,6 +53,14 @@ bool Profiler::IsCuptiSupported() {
return supported; return supported;
} }
bool Profiler::IsCnpapiSupported() {
bool supported = false;
#ifdef PADDLE_WITH_MLU
supported = true;
#endif
return supported;
}
Profiler::Profiler(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options) {
options_ = options; options_ = options;
std::bitset<32> trace_switch(options_.trace_switch); std::bitset<32> trace_switch(options_.trace_switch);
...@@ -63,6 +72,9 @@ Profiler::Profiler(const ProfilerOptions& options) { ...@@ -63,6 +72,9 @@ Profiler::Profiler(const ProfilerOptions& options) {
if (trace_switch.test(kProfileGPUOptionBit)) { if (trace_switch.test(kProfileGPUOptionBit)) {
tracers_.emplace_back(&CudaTracer::GetInstance(), false); tracers_.emplace_back(&CudaTracer::GetInstance(), false);
} }
if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false);
}
} }
Profiler::~Profiler() { alive_.store(false); } Profiler::~Profiler() { alive_.store(false); }
......
...@@ -33,9 +33,10 @@ namespace platform { ...@@ -33,9 +33,10 @@ namespace platform {
static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileCPUOptionBit = 0;
static constexpr uint32_t kProfileGPUOptionBit = 1; static constexpr uint32_t kProfileGPUOptionBit = 1;
static constexpr uint32_t kProfileMLUOptionBit = 2;
struct ProfilerOptions { struct ProfilerOptions {
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu
uint32_t trace_level = FLAGS_host_trace_level; uint32_t trace_level = FLAGS_host_trace_level;
}; };
...@@ -45,6 +46,8 @@ class Profiler { ...@@ -45,6 +46,8 @@ class Profiler {
static bool IsCuptiSupported(); static bool IsCuptiSupported();
static bool IsCnpapiSupported();
void Prepare(); void Prepare();
void Start(); void Start();
......
...@@ -50,6 +50,8 @@ enum class TracerEventType { ...@@ -50,6 +50,8 @@ enum class TracerEventType {
PythonOp = 13, PythonOp = 13,
// Used to mark python level userdefined // Used to mark python level userdefined
PythonUserDefined = 14, PythonUserDefined = 14,
// Used to mark mlu runtime record returned by cnpapi
MluRuntime = 15,
// A flag to denote the number of current types // A flag to denote the number of current types
NumTypes NumTypes
}; };
......
...@@ -52,6 +52,13 @@ class TraceEventCollector { ...@@ -52,6 +52,13 @@ class TraceEventCollector {
return thread_names_; return thread_names_;
} }
void ClearAll() {
thread_names_.clear();
host_events_.clear();
runtime_events_.clear();
device_events_.clear();
}
private: private:
std::unordered_map<uint64_t, std::string> thread_names_; std::unordered_map<uint64_t, std::string> thread_names_;
std::list<HostTraceEvent> host_events_; std::list<HostTraceEvent> host_events_;
......
...@@ -3369,6 +3369,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3369,6 +3369,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("create", &paddle::platform::Profiler::Create, .def("create", &paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership) py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("is_cnpapi_supported",
&paddle::platform::Profiler::IsCnpapiSupported)
.def("prepare", .def("prepare",
[](paddle::platform::Profiler *profiler) { [](paddle::platform::Profiler *profiler) {
platform::EnableHostEventRecorder(); platform::EnableHostEventRecorder();
......
...@@ -52,16 +52,19 @@ class ProfilerState(Enum): ...@@ -52,16 +52,19 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum): class ProfilerTarget(Enum):
r""" r"""
ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU and GPU are supported currently. ProfilerTarget is used to specify target device for :ref:`profiling <api_paddle_profiler_Profiler>` . Only CPU, GPU and MLU are supported currently.
The meaning of each ProfilerState is as following The meaning of each ProfilerState is as following
- **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.CPU** : Profile events on CPU.
- **ProfilerTarget.GPU** : Profile events on GPU. - **ProfilerTarget.GPU** : Profile events on GPU.
- **ProfilerTarget.MLU** : Profile events on MLU.
""" """
CPU = 0 CPU = 0
GPU = 1 GPU = 1
MLU = 2
def make_scheduler(*, def make_scheduler(*,
...@@ -258,6 +261,8 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ...@@ -258,6 +261,8 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
""" """
if _Profiler.is_cupti_supported(): if _Profiler.is_cupti_supported():
return [ProfilerTarget.CPU, ProfilerTarget.GPU] return [ProfilerTarget.CPU, ProfilerTarget.GPU]
if _Profiler.is_cnpapi_supported():
return [ProfilerTarget.CPU, ProfilerTarget.MLU]
return [ProfilerTarget.CPU] return [ProfilerTarget.CPU]
...@@ -266,7 +271,7 @@ class Profiler: ...@@ -266,7 +271,7 @@ class Profiler:
Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table.
Args: Args:
targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` . targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU <api_paddle_profiler_ProfilerTarget>` , :ref:`ProfilerTarget.GPU <api_paddle_profiler_ProfilerTarget>` and :ref:`ProfilerTarget.MLU <api_paddle_profiler_ProfilerTarget>` .
scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function. scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState <api_paddle_profiler_ProfilerState>`. This callable object can be generated by :ref:`make_scheduler <api_paddle_profiler_make_scheduler>` function.
If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
which means profiling range [start_batch, end_batch). which means profiling range [start_batch, end_batch).
...@@ -407,6 +412,8 @@ class Profiler: ...@@ -407,6 +412,8 @@ class Profiler:
profileoption.trace_switch |= 1 profileoption.trace_switch |= 1
if ProfilerTarget.GPU in self.targets: if ProfilerTarget.GPU in self.targets:
profileoption.trace_switch |= (1 << 1) profileoption.trace_switch |= (1 << 1)
if ProfilerTarget.MLU in self.targets:
profileoption.trace_switch |= (1 << 2)
wrap_optimizers() wrap_optimizers()
self.profiler = _Profiler.create(profileoption) self.profiler = _Profiler.create(profileoption)
if callable(scheduler): if callable(scheduler):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册