From fc208b7efe7307b0d286410aa9e7ca7c5ca410bd Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Fri, 15 Apr 2022 10:38:56 +0800 Subject: [PATCH] [MLU] add mlu new profiler (#41138) * [MLU] add mlu new profiler * fix format --- paddle/fluid/platform/device/mlu/mlu_info.h | 4 +- paddle/fluid/platform/profiler/CMakeLists.txt | 3 +- .../platform/profiler/chrometracing_logger.cc | 24 +- .../platform/profiler/mlu/CMakeLists.txt | 5 + .../profiler/mlu/cnpapi_data_process.cc | 263 ++++++++++++++++++ .../profiler/mlu/cnpapi_data_process.h | 35 +++ .../fluid/platform/profiler/mlu/mlu_tracer.cc | 154 ++++++++++ .../fluid/platform/profiler/mlu/mlu_tracer.h | 60 ++++ paddle/fluid/platform/profiler/profiler.cc | 12 + paddle/fluid/platform/profiler/profiler.h | 5 +- paddle/fluid/platform/profiler/trace_event.h | 2 + .../platform/profiler/trace_event_collector.h | 7 + paddle/fluid/pybind/pybind.cc | 2 + python/paddle/profiler/profiler.py | 11 +- 14 files changed, 574 insertions(+), 13 deletions(-) create mode 100644 paddle/fluid/platform/profiler/mlu/CMakeLists.txt create mode 100644 paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc create mode 100644 paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h create mode 100644 paddle/fluid/platform/profiler/mlu/mlu_tracer.cc create mode 100644 paddle/fluid/platform/profiler/mlu/mlu_tracer.h diff --git a/paddle/fluid/platform/device/mlu/mlu_info.h b/paddle/fluid/platform/device/mlu/mlu_info.h index fcf06cb4f1c..12c206ef2c4 100644 --- a/paddle/fluid/platform/device/mlu/mlu_info.h +++ b/paddle/fluid/platform/device/mlu/mlu_info.h @@ -16,7 +16,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_MLU #include +#include #include +#include #include #ifdef PADDLE_WITH_CNCL #include @@ -33,7 +35,7 @@ using cnclStatus = cnclResult_t; #endif using mluStream = cnrtQueue_t; using mluCnnlHandle = cnnlHandle_t; -using mluEventHandle = CNnotifier; +using mluEventHandle = cnrtNotifier_t; using mluDeviceHandle = CNdev; namespace platform { diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index c903a52530c..084bc44dbc7 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,12 +1,13 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +add_subdirectory(mlu) cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) add_subdirectory(dump) cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index d7879e7be51..4ee95a530fb 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ 
b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -38,10 +38,12 @@ static std::string DefaultFileName() { } const char* ChromeTracingLogger::categary_name_[] = { - "Operator", "Dataloader", "ProfileStep", "CudaRuntime", - "Kernel", "Memcpy", "Memset", "UserDefined", - "OperatorInner", "Forward", "Backward", "Optimization", - "Communication", "PythonOp", "PythonUserDefined"}; + "Operator", "Dataloader", "ProfileStep", + "CudaRuntime", "Kernel", "Memcpy", + "Memset", "UserDefined", "OperatorInner", + "Forward", "Backward", "Optimization", + "Communication", "PythonOp", "PythonUserDefined", + "MluRuntime"}; void ChromeTracingLogger::OpenFile() { output_file_stream_.open(filename_, @@ -598,6 +600,12 @@ void ChromeTracingLogger::RefineDisplayName( (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); } +#ifdef PADDLE_WITH_MLU + static std::string device_type("MLU"); +#else + static std::string device_type("GPU"); +#endif + for (auto it = deviceid_streamid_set_.begin(); it != deviceid_streamid_set_.end(); ++it) { output_file_stream_ << string_format( @@ -607,7 +615,7 @@ void ChromeTracingLogger::RefineDisplayName( "name": "process_name", "pid": %lld, "tid": %lld, "ph": "M", "args": { - "name": "Deivce %lld (GPU)" + "name": "Deivce %lld (%s)" } }, { @@ -632,9 +640,9 @@ void ChromeTracingLogger::RefineDisplayName( } }, )JSON"), - (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, - (*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, - (*it).first, (*it).second, (*it).second); + (*it).first, (*it).second, (*it).first, device_type.c_str(), + (*it).first, (*it).second, (*it).second, (*it).first, (*it).second, + (*it).first + 0x10000000, (*it).first, (*it).second, (*it).second); } } diff --git a/paddle/fluid/platform/profiler/mlu/CMakeLists.txt b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt new file mode 100644 index 00000000000..01b3757ea69 --- /dev/null +++ b/paddle/fluid/platform/profiler/mlu/CMakeLists.txt @@ -0,0 
+1,5 @@ +if(WITH_MLU) + set(MLU_INFO mlu_info) +endif() + +cc_library(mlu_tracer SRCS mlu_tracer.cc cnpapi_data_process.cc DEPS workqueue_utils enforce glog ${MLU_INFO}) diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc new file mode 100644 index 00000000000..eceb5fabe8d --- /dev/null +++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc @@ -0,0 +1,263 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/os_info.h"
+
+#ifdef PADDLE_WITH_MLU
+namespace paddle {
+namespace platform {
+
+namespace {
+
+// Offset (in ns) added to cnpapi timestamps to express them on the host
+// PosixInNsec() clock. Sampled once, on first use, and reused afterwards.
+inline uint64_t GetTimeGap() {
+  static uint64_t time_gap = []() -> uint64_t {
+    uint64_t cpu_time = PosixInNsec();
+    uint64_t mlu_time = cnpapiGetTimestamp();
+    return (cpu_time - mlu_time);
+  }();
+  return time_gap;
+}
+
+// Converts a kernel activity record into a DeviceTraceEvent. Records that
+// started before `start_ns` (host clock) are dropped.
+void AddKernelRecord(const cnpapiActivityKernel* kernel, uint64_t start_ns,
+                     TraceEventCollector* collector) {
+  const uint64_t time_gap = GetTimeGap();
+  if (kernel->start + time_gap < start_ns) {
+    return;
+  }
+  DeviceTraceEvent event;
+  event.name = demangle(kernel->name);
+  event.type = TracerEventType::Kernel;
+  event.start_ns = kernel->start + time_gap;
+  event.end_ns = kernel->end + time_gap;
+  event.device_id = kernel->device_id;
+  event.context_id = kernel->context_id;
+  event.stream_id = kernel->queue_id;
+  event.correlation_id = kernel->correlation_id;
+  event.kernel_info.block_x = kernel->dimx;
+  event.kernel_info.block_y = kernel->dimy;
+  event.kernel_info.block_z = kernel->dimz;
+  // The grid_x slot carries the cnpapi kernel type; grid_y/grid_z are unused.
+  event.kernel_info.grid_x = kernel->kernel_type;
+  event.kernel_info.grid_y = 0;
+  event.kernel_info.grid_z = 0;
+  event.kernel_info.queued = kernel->queued;
+  event.kernel_info.submitted = kernel->submitted;
+  // NOTE(review): `completed` is filled from `received` - confirm that this
+  // is the intended cnpapi field.
+  event.kernel_info.completed = kernel->received;
+  collector->AddDeviceEvent(std::move(event));
+}
+
+// Maps a cnpapi memcpy type to the label used for the event name and for
+// memcpy_info.copy_kind. Unknown types fall back to "MEMCPY".
+const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
+  switch (kind) {
+    case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD:
+      return "MEMCPY_HtoD";
+    case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH:
+      return "MEMCPY_DtoH";
+    case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD:
+      return "MEMCPY_DtoD";
+    case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH:
+      return "MEMCPY_HtoH";
+    case CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP:
+      return "MEMCPY_PtoP";
+    default:
+      break;
+  }
+  return "MEMCPY";
+}
+
+// Shared conversion for cnpapiActivityMemcpy and cnpapiActivityMemcpyPtoP:
+// both record types are read through the same set of field names here.
+template <typename MemcpyActivity>
+void AddMemcpyLikeRecord(const MemcpyActivity* activity, uint64_t start_ns,
+                         TraceEventCollector* collector) {
+  const uint64_t time_gap = GetTimeGap();
+  if (activity->start + time_gap < start_ns) {
+    return;
+  }
+  const char* kind = MemcpyKind(activity->copy_type);
+  DeviceTraceEvent event;
+  event.name = kind;
+  event.type = TracerEventType::Memcpy;
+  event.start_ns = activity->start + time_gap;
+  event.end_ns = activity->end + time_gap;
+  event.device_id = activity->device_id;
+  event.context_id = activity->context_id;
+  event.stream_id = activity->queue_id;
+  event.correlation_id = activity->correlation_id;
+  event.memcpy_info.num_bytes = activity->bytes;
+  snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s", kind);
+  collector->AddDeviceEvent(std::move(event));
+}
+
+// Converts a memcpy activity record into a DeviceTraceEvent.
+void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy, uint64_t start_ns,
+                     TraceEventCollector* collector) {
+  AddMemcpyLikeRecord(memcpy, start_ns, collector);
+}
+
+// Converts a peer-to-peer memcpy activity record into a DeviceTraceEvent.
+void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
+                      uint64_t start_ns, TraceEventCollector* collector) {
+  AddMemcpyLikeRecord(memcpy2, start_ns, collector);
+}
+
+// Converts a memset activity record into a DeviceTraceEvent.
+void AddMemsetRecord(const cnpapiActivityMemset* memset, uint64_t start_ns,
+                     TraceEventCollector* collector) {
+  const uint64_t time_gap = GetTimeGap();
+  if (memset->start + time_gap < start_ns) {
+    return;
+  }
+  DeviceTraceEvent event;
+  event.name = "MEMSET";
+  event.type = TracerEventType::Memset;
+  event.start_ns = memset->start + time_gap;
+  event.end_ns = memset->end + time_gap;
+  event.device_id = memset->device_id;
+  event.context_id = memset->context_id;
+  event.stream_id = memset->queue_id;
+  event.correlation_id = memset->correlation_id;
+  event.memset_info.num_bytes = memset->bytes;
+  event.memset_info.value = memset->value;
+  collector->AddDeviceEvent(std::move(event));
+}
+
+// Lookup table from cnpapi driver callback ids to the driver API name.
+class CnpapiRuntimeCbidStr {
+ public:
+  static const CnpapiRuntimeCbidStr& GetInstance() {
+    static CnpapiRuntimeCbidStr inst;
+    return inst;
+  }
+
+  // Returns the registered API name, or "MLU Runtime API <id>" when the
+  // callback id is not in the table.
+  std::string RuntimeKind(cnpapi_CallbackId cbid) const {
+    auto iter = cbid_str_.find(cbid);
+    if (iter == cbid_str_.end()) {
+      return "MLU Runtime API " + std::to_string(cbid);
+    }
+    return iter->second;
+  }
+
+ private:
+  CnpapiRuntimeCbidStr();
+
+  std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
+};
+
+CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
+#define REGISTER_RUNTIME_CBID_STR(cbid) \
+  cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid
+
+  REGISTER_RUNTIME_CBID_STR(cnMalloc);
+  REGISTER_RUNTIME_CBID_STR(cnMallocHost);
+  REGISTER_RUNTIME_CBID_STR(cnFree);
+  REGISTER_RUNTIME_CBID_STR(cnFreeHost);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpy);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpy2D);
+  REGISTER_RUNTIME_CBID_STR(cnMemcpy3D);
+  REGISTER_RUNTIME_CBID_STR(cnMemsetD8);
+  REGISTER_RUNTIME_CBID_STR(cnMemsetD16);
+  REGISTER_RUNTIME_CBID_STR(cnMemsetD32);
+  REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async);
+  REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async);
+  REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async);
+  REGISTER_RUNTIME_CBID_STR(cnInvokeKernel);
+  REGISTER_RUNTIME_CBID_STR(cnCreateQueue);
+  REGISTER_RUNTIME_CBID_STR(cnDestroyQueue);
+  REGISTER_RUNTIME_CBID_STR(cnQueueSync);
+  REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier);
+  REGISTER_RUNTIME_CBID_STR(cnWaitNotifier);
+  REGISTER_RUNTIME_CBID_STR(cnCreateNotifier);
+  REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier);
+  REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier);
+  REGISTER_RUNTIME_CBID_STR(cnCtxCreate);
+  REGISTER_RUNTIME_CBID_STR(cnCtxDestroy);
+  REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent);
+  REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
+  REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
+  REGISTER_RUNTIME_CBID_STR(cnCtxSync);
+#undef REGISTER_RUNTIME_CBID_STR
+}
+
+// Converts a driver-API activity record into a RuntimeTraceEvent tagged as
+// TracerEventType::MluRuntime.
+void AddApiRecord(const cnpapiActivityAPI* api, uint64_t start_ns,
+                  TraceEventCollector* collector) {
+  const uint64_t time_gap = GetTimeGap();
+  if (api->start + time_gap < start_ns) {
+    return;
+  }
+  RuntimeTraceEvent event;
+  event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
+  event.start_ns = api->start + time_gap;
+  event.end_ns = api->end + time_gap;
+  event.process_id = api->process_id;
+  event.thread_id = api->thread_id;
+  event.correlation_id = api->correlation_id;
+  event.callback_id = api->cbid;
+  event.type = TracerEventType::MluRuntime;
+  collector->AddRuntimeEvent(std::move(event));
+}
+
+}  // namespace
+
+namespace details {
+
+// Dispatches one activity record to the converter matching `record->type`.
+// Record types without a converter are ignored.
+void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
+                                 uint64_t start_ns,
+                                 TraceEventCollector* collector) {
+  switch (record->type) {
+    case CNPAPI_ACTIVITY_TYPE_KERNEL:
+      AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
+                      start_ns, collector);
+      break;
+    case CNPAPI_ACTIVITY_TYPE_MEMCPY:
+      AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
+                      start_ns, collector);
+      break;
+    case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
+      AddMemcpy2Record(
+          reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record), start_ns,
+          collector);
+      break;
+    case CNPAPI_ACTIVITY_TYPE_MEMSET:
+      AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
+                      start_ns, collector);
+      break;
+    case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
+      AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record), start_ns,
+                   collector);
+      break;
+    default:
+      break;
+  }
+}
+
+}  // namespace details
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h
new file mode 100644
index 00000000000..1f00b46d2c2
--- /dev/null
+++ b/paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdint>
+#ifdef PADDLE_WITH_MLU
+#include "paddle/fluid/platform/device/mlu/mlu_info.h"
+#endif
+#include "paddle/fluid/platform/profiler/trace_event_collector.h"
+
+namespace paddle {
+namespace platform {
+namespace details {
+
+#ifdef PADDLE_WITH_MLU
+// Converts one cnpapi activity record into a trace event and appends it to
+// `collector`; records that started before `start_ns` are dropped.
+void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
+                                 uint64_t start_ns,
+                                 TraceEventCollector* collector);
+#endif
+
+}  // namespace details
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc
new file mode 100644
index 00000000000..2d719a8bbfd
--- /dev/null
+++ b/paddle/fluid/platform/profiler/mlu/mlu_tracer.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include "glog/logging.h"
+#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
+#include "paddle/fluid/platform/os_info.h"
+#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
+
+// Evaluates a cnpapi call (or an already obtained cnpapiResult) and logs a
+// failure. It only logs: control always returns to the caller.
+// `errstr` starts with a fallback so that the log statement never streams an
+// indeterminate pointer if the lookup leaves it untouched.
+#define CNPAPI_CALL(call)                                                    \
+  do {                                                                       \
+    cnpapiResult _status = (call);                                           \
+    if (_status != CNPAPI_SUCCESS) {                                         \
+      const char* errstr = "unknown cnpapi error";                           \
+      cnpapiGetResultString(_status, &errstr);                               \
+      LOG(ERROR) << "Function " << #call << " failed with error " << errstr; \
+    }                                                                        \
+  } while (0)
+
+namespace paddle {
+namespace platform {
+
+namespace {
+
+// Registered with cnpapi as the "buffer requested" callback: hands out a
+// freshly allocated 8 MB buffer. The buffer is released in
+// BufferCompletedCallback.
+void BufferRequestedCallback(uint64_t** buffer, size_t* size,
+                             size_t* max_num_records) {
+  constexpr size_t kBufferSize = 1 << 23;  // 8 MB
+  constexpr size_t kBufferAlignSize = 8;
+  *buffer = reinterpret_cast<uint64_t*>(
+      paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize));
+  *size = kBufferSize;
+  *max_num_records = 0;
+}
+
+// Registered with cnpapi as the "buffer completed" callback: forwards the
+// valid part of the buffer to the tracer and then frees the buffer.
+void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) {
+  if (buffer == nullptr) {
+    return;
+  }
+  if (valid_size > 0) {
+    MluTracer::GetInstance().ProcessCnpapiActivity(buffer, valid_size);
+  }
+  // Free the buffer even when it carries no records; returning early on
+  // valid_size == 0 would leak the allocation made in
+  // BufferRequestedCallback.
+  paddle::framework::AlignedFree(buffer);
+}
+
+}  // namespace
+
+MluTracer::MluTracer() {
+#ifdef PADDLE_WITH_MLU
+  CNPAPI_CALL(cnpapiInit());
+  CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback,
+                                              BufferCompletedCallback));
+#endif
+}
+
+// UNINITED/STOPED -> READY. Enables the cnpapi activity kinds.
+void MluTracer::PrepareTracing() {
+  PADDLE_ENFORCE_EQ(
+      state_ == TracerState::UNINITED || state_ == TracerState::STOPED, true,
+      platform::errors::PreconditionNotMet(
+          "MluTracer must be UNINITED or STOPED"));
+  EnableCnpapiActivity();
+  state_ = TracerState::READY;
+}
+
+// READY -> STARTED. Records the host timestamp used to filter older records.
+void MluTracer::StartTracing() {
+  PADDLE_ENFORCE_EQ(
+      state_ == TracerState::READY, true,
+      platform::errors::PreconditionNotMet("MluTracer must be READY"));
+  tracing_start_ns_ = PosixInNsec();
+  state_ = TracerState::STARTED;
+}
+
+// STARTED -> STOPED. Flushes and disables the cnpapi activity kinds.
+void MluTracer::StopTracing() {
+  PADDLE_ENFORCE_EQ(
+      state_, TracerState::STARTED,
+      platform::errors::PreconditionNotMet("MluTracer must be STARTED"));
+  DisableCnpapiActivity();
+  state_ = TracerState::STOPED;
+}
+
+// Hands every event gathered so far to `collector` and clears the internal
+// collector. Only valid in the STOPED state.
+void MluTracer::CollectTraceData(TraceEventCollector* collector) {
+  PADDLE_ENFORCE_EQ(
+      state_, TracerState::STOPED,
+      platform::errors::PreconditionNotMet("MluTracer must be STOPED"));
+  for (auto he : collector_.HostEvents()) {
+    collector->AddHostEvent(std::move(he));
+  }
+  for (auto rte : collector_.RuntimeEvents()) {
+    collector->AddRuntimeEvent(std::move(rte));
+  }
+  for (auto de : collector_.DeviceEvents()) {
+    collector->AddDeviceEvent(std::move(de));
+  }
+  for (auto tn : collector_.ThreadNames()) {
+    collector->AddThreadName(tn.first, tn.second);
+  }
+  collector_.ClearAll();
+}
+
+// Walks the records of a completed cnpapi buffer and converts each one into
+// a trace event stored in the internal collector.
+void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) {
+#ifdef PADDLE_WITH_MLU
+  cnpapiActivity* record = nullptr;
+  while (true) {
+    cnpapiResult status =
+        cnpapiActivityGetNextRecord(buffer, valid_size, &record);
+    if (status == CNPAPI_SUCCESS) {
+      details::ProcessCnpapiActivityRecord(record, tracing_start_ns_,
+                                           &collector_);
+      continue;
+    }
+    if (status != CNPAPI_ERROR_INSUFFICIENT_MEMORY &&
+        status != CNPAPI_ERROR_MAX_LIMIT_REACHED) {
+      // Unexpected failure. CNPAPI_CALL only logs, so the loop has to be
+      // left explicitly; otherwise a persistent error would spin forever.
+      CNPAPI_CALL(status);
+    }
+    break;
+  }
+#endif
+}
+
+void MluTracer::EnableCnpapiActivity() {
+#ifdef PADDLE_WITH_MLU
+  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL));
+  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
+  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
+  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET));
+  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
+  VLOG(3) << "enable cnpapi activity";
+#endif
+}
+
+void MluTracer::DisableCnpapiActivity() {
+#ifdef PADDLE_WITH_MLU
+  // Flush before disabling so that pending records are still delivered.
+  CNPAPI_CALL(cnpapiActivityFlushAll());
+  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL));
+  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
+  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
+  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET));
+  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
+  VLOG(3) << "disable cnpapi activity";
+#endif
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/mlu/mlu_tracer.h b/paddle/fluid/platform/profiler/mlu/mlu_tracer.h
new file mode 100644
index 00000000000..43c712b13ae
--- /dev/null
+++ b/paddle/fluid/platform/profiler/mlu/mlu_tracer.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#ifdef PADDLE_WITH_MLU
+#include "paddle/fluid/platform/device/mlu/mlu_info.h"
+#endif
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/platform/profiler/tracer_base.h"
+
+namespace paddle {
+namespace platform {
+
+// Tracer that gathers MLU activity records through cnpapi. Accessed through
+// the single instance returned by GetInstance().
+class MluTracer : public TracerBase {
+ public:
+  static MluTracer& GetInstance() {
+    static MluTracer instance;
+    return instance;
+  }
+
+  void PrepareTracing() override;
+
+  void StartTracing() override;
+
+  void StopTracing() override;
+
+  // Moves the gathered events into `collector`.
+  void CollectTraceData(TraceEventCollector* collector) override;
+
+  // Consumes `valid_size` bytes of activity records stored in `buffer`.
+  void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size);
+
+ private:
+  MluTracer();
+
+  DISABLE_COPY_AND_ASSIGN(MluTracer);
+
+  void EnableCnpapiActivity();
+
+  void DisableCnpapiActivity();
+
+  // Host timestamp taken in StartTracing(); UINT64_MAX until tracing starts.
+  uint64_t tracing_start_ns_ = UINT64_MAX;
+
+  // Events converted from cnpapi records, held until CollectTraceData().
+  TraceEventCollector collector_;
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index ac46fbed10a..a417eda1509 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" +#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/utils.h" @@ -52,6 +53,14 @@ bool Profiler::IsCuptiSupported() { return supported; } +bool Profiler::IsCnpapiSupported() { + bool supported = false; +#ifdef PADDLE_WITH_MLU + supported = true; +#endif + return supported; +} + Profiler::Profiler(const ProfilerOptions& options) { options_ = options; std::bitset<32> trace_switch(options_.trace_switch); @@ -63,6 +72,9 @@ Profiler::Profiler(const ProfilerOptions& options) { if (trace_switch.test(kProfileGPUOptionBit)) { 
tracers_.emplace_back(&CudaTracer::GetInstance(), false); } + if (trace_switch.test(kProfileMLUOptionBit)) { + tracers_.emplace_back(&MluTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index d24ee504bc6..ea346a4fb74 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -33,9 +33,10 @@ namespace platform { static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileGPUOptionBit = 1; +static constexpr uint32_t kProfileMLUOptionBit = 2; struct ProfilerOptions { - uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -45,6 +46,8 @@ class Profiler { static bool IsCuptiSupported(); + static bool IsCnpapiSupported(); + void Prepare(); void Start(); diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 16ef62fb515..6d398a26eda 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -50,6 +50,8 @@ enum class TracerEventType { PythonOp = 13, // Used to mark python level userdefined PythonUserDefined = 14, + // Used to mark mlu runtime record returned by cnpapi + MluRuntime = 15, // A flag to denote the number of current types NumTypes }; diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index cc85a178d14..5f2bc9dc90d 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -52,6 +52,13 @@ class TraceEventCollector { return thread_names_; } + void ClearAll() { + thread_names_.clear(); + host_events_.clear(); + runtime_events_.clear(); + device_events_.clear(); + } + private: 
std::unordered_map thread_names_; std::list host_events_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7b63fdd6dd4..982bf764612 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3369,6 +3369,8 @@ All parameter, weight, gradient are variables in Paddle. .def("create", &paddle::platform::Profiler::Create, py::return_value_policy::take_ownership) .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) + .def("is_cnpapi_supported", + &paddle::platform::Profiler::IsCnpapiSupported) .def("prepare", [](paddle::platform::Profiler *profiler) { platform::EnableHostEventRecorder(); diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 2fae583397a..3e60a82f121 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -52,16 +52,19 @@ class ProfilerState(Enum): class ProfilerTarget(Enum): r""" - ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU and GPU are supported currently. + ProfilerTarget is used to specify target device for :ref:`profiling ` . Only CPU, GPU and MLU are supported currently. The meaning of each ProfilerState is as following - **ProfilerTarget.CPU** : Profile events on CPU. - **ProfilerTarget.GPU** : Profile events on GPU. + + - **ProfilerTarget.MLU** : Profile events on MLU. """ CPU = 0 GPU = 1 + MLU = 2 def make_scheduler(*, @@ -258,6 +261,8 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: """ if _Profiler.is_cupti_supported(): return [ProfilerTarget.CPU, ProfilerTarget.GPU] + if _Profiler.is_cnpapi_supported(): + return [ProfilerTarget.CPU, ProfilerTarget.MLU] return [ProfilerTarget.CPU] @@ -266,7 +271,7 @@ class Profiler: Profiler context manager, user interface to manage profiling process to start, stop, export profiling data and print summary table. 
Args: - targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` and :ref:`ProfilerTarget.GPU ` . + targets (list, optional): specify target devices to profile, and all existing and supported devices will be chosen by default. Currently supported values, :ref:`ProfilerTarget.CPU ` , :ref:`ProfilerTarget.GPU ` and :ref:`ProfilerTarget.MLU ` . scheduler (Callable|tuple, optional): If it is a callable object, it takes a step number as parameter and return the corresponding :ref:`ProfilerState `. This callable object can be generated by :ref:`make_scheduler ` function. If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). @@ -407,6 +412,8 @@ class Profiler: profileoption.trace_switch |= 1 if ProfilerTarget.GPU in self.targets: profileoption.trace_switch |= (1 << 1) + if ProfilerTarget.MLU in self.targets: + profileoption.trace_switch |= (1 << 2) wrap_optimizers() self.profiler = _Profiler.create(profileoption) if callable(scheduler): -- GitLab