From da51baf22262e52b1a8822ee646ffba5bad1b6a3 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Tue, 23 Aug 2022 10:48:35 +0800 Subject: [PATCH] [CustomDevice] add profiler apis (#45130) * [CustomDevice] add profiler apis * migrate CalculateEstOccupancy into cuda_tracer * update * add ut --- paddle/fluid/platform/profiler/CMakeLists.txt | 10 +- .../platform/profiler/chrometracing_logger.cc | 43 +------ .../platform/profiler/cupti_data_process.cc | 46 ++++++- .../profiler/custom_device/CMakeLists.txt | 4 + .../profiler/custom_device/custom_tracer.cc | 116 ++++++++++++++++++ .../profiler/custom_device/custom_tracer.h | 64 ++++++++++ paddle/fluid/platform/profiler/profiler.cc | 15 ++- paddle/fluid/platform/profiler/profiler.h | 8 +- paddle/fluid/platform/profiler/trace_event.h | 9 +- paddle/fluid/platform/profiler_helper.h | 14 +++ paddle/phi/backends/custom/custom_device.cc | 52 ++++++++ paddle/phi/backends/device_base.cc | 33 +++++ paddle/phi/backends/device_base.h | 27 ++++ paddle/phi/backends/device_ext.h | 26 ++++ paddle/phi/backends/device_manager.cc | 63 ++++++++-- paddle/phi/backends/device_manager.h | 26 ++++ .../fluid/tests/custom_runtime/CMakeLists.txt | 1 + .../test_custom_cpu_profiler_plugin.py | 65 ++++++++++ python/paddle/profiler/profiler.py | 20 ++- python/setup.py.in | 5 + 20 files changed, 581 insertions(+), 66 deletions(-) create mode 100644 paddle/fluid/platform/profiler/custom_device/CMakeLists.txt create mode 100644 paddle/fluid/platform/profiler/custom_device/custom_tracer.cc create mode 100644 paddle/fluid/platform/profiler/custom_device/custom_tracer.h create mode 100644 python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 1daed7db1e7..68bb45e7b37 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -7,6 +7,7 @@ cc_library( SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) add_subdirectory(mlu) +add_subdirectory(custom_device) cc_library( event_node SRCS event_node.cc @@ -32,8 +33,13 @@ cc_library( cc_library( new_profiler SRCS profiler.cc - DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind - mlu_tracer) + DEPS host_tracer + cuda_tracer + profiler_utils + cpu_utilization + event_bind + mlu_tracer + custom_tracer) cc_test( test_event_node SRCS test_event_node.cc diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 15aa3201239..b825a68fad2 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -397,44 +397,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( void ChromeTracingLogger::HandleTypeKernel( const DeviceTraceEventNode& device_node) { KernelEventInfo kernel_info = device_node.KernelInfo(); - float blocks_per_sm = 0.0; - float warps_per_sm = 0.0; - float occupancy = 0.0; -#if defined(PADDLE_WITH_CUPTI) -#ifdef PADDLE_WITH_HIP - constexpr int threads_per_warp = 64; -#else - constexpr int threads_per_warp = 32; -#endif - const gpuDeviceProp& device_property = - GetDeviceProperties(device_node.DeviceId()); - blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * - kernel_info.grid_z) / - device_property.multiProcessorCount; - warps_per_sm = - blocks_per_sm * - (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / - threads_per_warp; -#ifdef PADDLE_WITH_HIP - occupancy = CalculateEstOccupancy(device_node.DeviceId(), - kernel_info.dynamic_shared_memory, - kernel_info.block_x, - kernel_info.block_y, - kernel_info.block_z, - kernel_info.kernelFunc, - kernel_info.launchType); -#else - occupancy = CalculateEstOccupancy(device_node.DeviceId(), - kernel_info.registers_per_thread, - kernel_info.static_shared_memory, - kernel_info.dynamic_shared_memory, - kernel_info.block_x, - kernel_info.block_y, - kernel_info.block_z, - blocks_per_sm); -#endif // PADDLE_WITH_HIP -#endif float dur = nsToMsFloat(device_node.Duration()); std::string dur_display; if (dur > 1.0) { @@ -480,15 +443,15 @@ void ChromeTracingLogger::HandleTypeKernel( device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, - blocks_per_sm, - warps_per_sm, + kernel_info.blocks_per_sm, + kernel_info.warps_per_sm, kernel_info.grid_x, kernel_info.grid_y, kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, - occupancy * 100); + kernel_info.occupancy * 100); } void ChromeTracingLogger::HandleTypeMemcpy( diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index cf296fe197a..840ba6f2606 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,8 +16,10 @@ #include +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { @@ -52,10 +54,50 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, event.kernel_info.queued = kernel->queued; event.kernel_info.submitted = kernel->submitted; event.kernel_info.completed = kernel->completed; + + float blocks_per_sm = 0.0; + float warps_per_sm = 0.0; + float occupancy = 0.0; + #ifdef PADDLE_WITH_HIP - event.kernel_info.kernelFunc = kernel->kernelFunc; - event.kernel_info.launchType = kernel->launchType; + constexpr int threads_per_warp = 64; +#else + constexpr int threads_per_warp = 32; #endif + const gpuDeviceProp& device_property = + paddle::platform::GetDeviceProperties(kernel->deviceId); + blocks_per_sm = + static_cast(event.kernel_info.grid_x * event.kernel_info.grid_y * + event.kernel_info.grid_z) / + device_property.multiProcessorCount; + warps_per_sm = blocks_per_sm * + (event.kernel_info.block_x * event.kernel_info.block_y * + event.kernel_info.block_z) / + threads_per_warp; +#ifdef PADDLE_WITH_HIP + occupancy = paddle::platform::CalculateEstOccupancy( + kernel->deviceId, + event.kernel_info.dynamic_shared_memory, + event.kernel_info.block_x, + event.kernel_info.block_y, + event.kernel_info.block_z, + kernel->kernelFunc, + kernel->launchType); +#else + occupancy = paddle::platform::CalculateEstOccupancy( + kernel->deviceId, + event.kernel_info.registers_per_thread, + event.kernel_info.static_shared_memory, + event.kernel_info.dynamic_shared_memory, + event.kernel_info.block_x, + event.kernel_info.block_y, + event.kernel_info.block_z, + blocks_per_sm); +#endif // PADDLE_WITH_HIP + event.kernel_info.blocks_per_sm = blocks_per_sm; + event.kernel_info.warps_per_sm = warps_per_sm; + event.kernel_info.occupancy = occupancy; + collector->AddDeviceEvent(std::move(event)); } diff --git a/paddle/fluid/platform/profiler/custom_device/CMakeLists.txt b/paddle/fluid/platform/profiler/custom_device/CMakeLists.txt new file mode 100644 index 00000000000..f4fe05d0e7d --- /dev/null +++ b/paddle/fluid/platform/profiler/custom_device/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_library( + custom_tracer + SRCS custom_tracer.cc + DEPS workqueue_utils enforce glog) diff --git a/paddle/fluid/platform/profiler/custom_device/custom_tracer.cc b/paddle/fluid/platform/profiler/custom_device/custom_tracer.cc new file mode 100644 index 00000000000..70c0ed02a7c --- /dev/null +++ b/paddle/fluid/platform/profiler/custom_device/custom_tracer.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h" + +#include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/os_info.h" +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/backends/device_manager.h" +#endif + +namespace paddle { +namespace platform { + +CustomTracer::CustomTracer(const std::string& dev_type) : dev_type_(dev_type) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::ProfilerInitialize(dev_type_, &collector_, &context_); +#endif +} + +CustomTracer::~CustomTracer() { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::ProfilerFinalize(dev_type_, &collector_, context_); +#endif +} + +void CustomTracer::PrepareTracing() { + PADDLE_ENFORCE_EQ( + state_ == TracerState::UNINITED || state_ == TracerState::STOPED, + true, + platform::errors::PreconditionNotMet("CustomTracer must be UNINITED")); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::ProfilerPrepareTracing(dev_type_, &collector_, context_); +#endif + state_ = TracerState::READY; +} + +void CustomTracer::StartTracing() { + PADDLE_ENFORCE_EQ( + state_ == TracerState::READY, + true, + platform::errors::PreconditionNotMet("Tracer must be READY or STOPPED")); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::ProfilerStartTracing(dev_type_, &collector_, context_); +#endif + tracing_start_ns_ = PosixInNsec(); + state_ = TracerState::STARTED; +} + +void CustomTracer::StopTracing() { + PADDLE_ENFORCE_EQ( + state_, + TracerState::STARTED, + platform::errors::PreconditionNotMet("Tracer must be STARTED")); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::ProfilerStopTracing(dev_type_, &collector_, context_); +#endif + state_ = TracerState::STOPED; +} + +void CustomTracer::CollectTraceData(TraceEventCollector* collector) { + PADDLE_ENFORCE_EQ( + state_, + TracerState::STOPED, + platform::errors::PreconditionNotMet("Tracer must be STOPED")); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::ProfilerCollectTraceData( + dev_type_, &collector_, tracing_start_ns_, context_); +#endif + for (auto he : collector_.HostEvents()) { + collector->AddHostEvent(std::move(he)); + } + for (auto rte : collector_.RuntimeEvents()) { + collector->AddRuntimeEvent(std::move(rte)); + } + for (auto de : collector_.DeviceEvents()) { + collector->AddDeviceEvent(std::move(de)); + } + for (auto tn : collector_.ThreadNames()) { + collector->AddThreadName(tn.first, tn.second); + } + collector_.ClearAll(); +} + +} // namespace platform +} // namespace paddle + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +void profiler_add_runtime_trace_event(C_Profiler prof, void* event) { + paddle::platform::RuntimeTraceEvent re = + *reinterpret_cast(event); + reinterpret_cast(prof) + ->AddRuntimeEvent(std::move(re)); +} + +void profiler_add_device_trace_event(C_Profiler prof, void* event) { + paddle::platform::DeviceTraceEvent de = + *reinterpret_cast(event); + reinterpret_cast(prof) + ->AddDeviceEvent(std::move(de)); +} +#endif diff --git a/paddle/fluid/platform/profiler/custom_device/custom_tracer.h b/paddle/fluid/platform/profiler/custom_device/custom_tracer.h new file mode 100644 index 00000000000..d70f92588d5 --- /dev/null +++ b/paddle/fluid/platform/profiler/custom_device/custom_tracer.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/tracer_base.h" + +namespace paddle { +namespace platform { + +class CustomTracer : public TracerBase { + public: + static CustomTracer& GetInstance(const std::string& device_type) { + static std::unordered_map> + instance; + if (instance.find(device_type) == instance.cend()) { + instance.insert( + {device_type, std::make_shared(device_type)}); + } + return *instance[device_type]; + } + + void PrepareTracing() override; + + void StartTracing() override; + + void StopTracing() override; + + void CollectTraceData(TraceEventCollector* collector) override; + + ~CustomTracer() override; + + explicit CustomTracer(const std::string& dev_type); + + private: + DISABLE_COPY_AND_ASSIGN(CustomTracer); + + TraceEventCollector collector_; + + uint64_t tracing_start_ns_ = UINT64_MAX; + + std::string dev_type_; + + void* context_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 06c5e318240..6365586c684 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -26,6 +26,7 @@ #endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" +#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h" #include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" @@ -39,11 +40,13 @@ void SynchronizeAllDevice(); std::atomic Profiler::alive_{false}; -std::unique_ptr Profiler::Create(const ProfilerOptions& options) { +std::unique_ptr Profiler::Create( + const ProfilerOptions& options, + const std::vector& custom_device_types) { if (alive_.exchange(true)) { return nullptr; } - return std::unique_ptr(new Profiler(options)); + return std::unique_ptr(new Profiler(options, custom_device_types)); } bool Profiler::IsCuptiSupported() { @@ -62,7 +65,8 @@ bool Profiler::IsCnpapiSupported() { return supported; } -Profiler::Profiler(const ProfilerOptions& options) { +Profiler::Profiler(const ProfilerOptions& options, + const std::vector& custom_device_types) { options_ = options; std::bitset<32> trace_switch(options_.trace_switch); if (trace_switch.test(kProfileCPUOptionBit)) { @@ -76,6 +80,11 @@ Profiler::Profiler(const ProfilerOptions& options) { if (trace_switch.test(kProfileMLUOptionBit)) { tracers_.emplace_back(&MluTracer::GetInstance(), false); } + if (trace_switch.test(kProfileCustomDeviceOptionBit)) { + for (const auto& dev_type : custom_device_types) { + tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false); + } + } } Profiler::~Profiler() { alive_.store(false); } diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index 65a3bcc02d8..2480f3a6073 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -35,6 +35,7 @@ namespace platform { static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileGPUOptionBit = 1; static constexpr uint32_t kProfileMLUOptionBit = 2; +static constexpr uint32_t kProfileCustomDeviceOptionBit = 3; struct ProfilerOptions { uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu @@ -43,7 +44,9 @@ struct ProfilerOptions { class Profiler { public: - static std::unique_ptr Create(const ProfilerOptions& options); + static std::unique_ptr Create( + const ProfilerOptions& options, + const std::vector& custom_device_types = {}); static bool IsCuptiSupported(); @@ -75,7 +78,8 @@ class Profiler { bool owned; }; - explicit Profiler(const ProfilerOptions& options); + explicit Profiler(const ProfilerOptions& options, + const std::vector& custom_device_types = {}); DISABLE_COPY_AND_ASSIGN(Profiler); diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index cdd302494c4..3315c7d705b 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -105,10 +105,11 @@ struct KernelEventInfo { uint64_t submitted; // The completed timestamp for the kernel execution, in ns. uint64_t completed; -#ifdef PADDLE_WITH_HIP - void* kernelFunc; - uint8_t launchType; -#endif + + float blocks_per_sm; + float warps_per_sm; + // theoretical achieved occupancy + float occupancy; }; static constexpr size_t kMemKindMaxLen = 50; diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 483220b45b9..5af420ee079 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -38,6 +38,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/backends/device_manager.h" +#endif #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -146,6 +149,17 @@ void SynchronizeAllDevice() { PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice()); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); + for (const auto &dev_type : dev_types) { + auto dev_cnt = phi::DeviceManager::GetDeviceCount(dev_type); + for (size_t i = 0; i < dev_cnt; i++) { + auto place = paddle::platform::CustomPlace(dev_type, i); + phi::DeviceManager::SetDevice(place); + phi::DeviceManager::SynchronizeDevice(place); + } + } +#endif } static double ToMegaBytes(size_t bytes) { diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 75f5433a640..928101b09f2 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -817,6 +817,51 @@ class CustomDevice : public DeviceInterface { y)); } + // Profiler + void ProfilerInitialize(paddle::platform::TraceEventCollector* collector, + void** user_data) override { + CHECK_PTR(pimpl_->profiler_initialize); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_initialize( + reinterpret_cast(collector), user_data)); + } + + void ProfilerFinalize(paddle::platform::TraceEventCollector* collector, + void* user_data) override { + CHECK_PTR(pimpl_->profiler_finalize); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_finalize( + reinterpret_cast(collector), user_data)); + } + + void ProfilerPrepareTracing(paddle::platform::TraceEventCollector* collector, + void* user_data) override { + CHECK_PTR(pimpl_->profiler_prepare_tracing); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_prepare_tracing( + reinterpret_cast(collector), user_data)); + } + + void ProfilerStartTracing(paddle::platform::TraceEventCollector* collector, + void* user_data) override { + CHECK_PTR(pimpl_->profiler_start_tracing); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_start_tracing( + reinterpret_cast(collector), user_data)); + } + + void ProfilerStopTracing(paddle::platform::TraceEventCollector* collector, + void* user_data) override { + CHECK_PTR(pimpl_->profiler_stop_tracing); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_stop_tracing( + reinterpret_cast(collector), user_data)); + } + + void ProfilerCollectTraceData( + paddle::platform::TraceEventCollector* collector, + uint64_t start_ns, + void* user_data) override { + CHECK_PTR(pimpl_->profiler_collect_trace_data); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_collect_trace_data( + reinterpret_cast(collector), start_ns, user_data)); + } + private: inline int PlaceToIdNoCheck(const Place& place) { int dev_id = place.GetDeviceId(); @@ -925,6 +970,13 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { CHECK_INTERFACE(xccl_recv, false); CHECK_INTERFACE(blas_axpby, false); + + CHECK_INTERFACE(profiler_initialize, false); + CHECK_INTERFACE(profiler_finalize, false); + CHECK_INTERFACE(profiler_prepare_tracing, false); + CHECK_INTERFACE(profiler_start_tracing, false); + CHECK_INTERFACE(profiler_stop_tracing, false); + CHECK_INTERFACE(profiler_collect_trace_data, false); return true; #undef CHECK_INTERFACE } diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index fca6a32e4f8..bfa85cbf579 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -368,6 +368,39 @@ void DeviceInterface::BlasAXPBY(size_t dev_id, INTERFACE_UNIMPLEMENT; } +// profiler +void DeviceInterface::ProfilerInitialize( + paddle::platform::TraceEventCollector* collector, void** user_data) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::ProfilerFinalize( + paddle::platform::TraceEventCollector* collector, void* user_data) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::ProfilerPrepareTracing( + paddle::platform::TraceEventCollector* collector, void* user_data) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::ProfilerStartTracing( + paddle::platform::TraceEventCollector* collector, void* user_data) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::ProfilerStopTracing( + paddle::platform::TraceEventCollector* collector, void* user_data) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::ProfilerCollectTraceData( + paddle::platform::TraceEventCollector* collector, + uint64_t start_ns, + void* user_data) { + INTERFACE_UNIMPLEMENT; +} + #undef INTERFACE_UNIMPLEMENT } // namespace phi diff --git a/paddle/phi/backends/device_base.h b/paddle/phi/backends/device_base.h index e5bdc6c8126..a0d95349196 100644 --- a/paddle/phi/backends/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -20,6 +20,12 @@ #include "paddle/phi/backends/event.h" #include "paddle/phi/backends/stream.h" +namespace paddle { +namespace platform { +class TraceEventCollector; +} // namespace platform +} // namespace paddle + namespace phi { class DeviceInterface { // Driver / Runtime @@ -236,6 +242,27 @@ class DeviceInterface { // Driver / Runtime float beta, void* y); + // profiler + virtual void ProfilerInitialize( + paddle::platform::TraceEventCollector* collector, void** user_data); + + virtual void ProfilerFinalize( + paddle::platform::TraceEventCollector* collector, void* user_data); + + virtual void ProfilerPrepareTracing( + paddle::platform::TraceEventCollector* collector, void* user_data); + + virtual void ProfilerStartTracing( + paddle::platform::TraceEventCollector* collector, void* user_data); + + virtual void ProfilerStopTracing( + paddle::platform::TraceEventCollector* collector, void* user_data); + + virtual void ProfilerCollectTraceData( + paddle::platform::TraceEventCollector* collector, + uint64_t start_ns, + void* user_data); + private: const std::string type_; const uint8_t priority_; diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index ca254f8235a..6e4a6fcb663 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -83,6 +83,12 @@ typedef struct C_CCLComm_st* C_CCLComm; typedef enum { SUM = 0, AVG, MAX, MIN, PRODUCT } C_CCLReduceOp; +typedef struct C_Profiler_st* C_Profiler; + +void profiler_add_runtime_trace_event(C_Profiler prof, void* event); + +void profiler_add_device_trace_event(C_Profiler prof, void* event); + struct C_DeviceInterface { // Core fill it and plugin must to check it size_t size; @@ -632,6 +638,26 @@ struct C_DeviceInterface { void* reserved_ccl_api[8]; + ////////////////// + // profiler api // + ////////////////// + + C_Status (*profiler_initialize)(C_Profiler prof, void** user_data); + + C_Status (*profiler_finalize)(C_Profiler prof, void* user_data); + + C_Status (*profiler_prepare_tracing)(C_Profiler prof, void* user_data); + + C_Status (*profiler_start_tracing)(C_Profiler prof, void* user_data); + + C_Status (*profiler_stop_tracing)(C_Profiler prof, void* user_data); + + C_Status (*profiler_collect_trace_data)(C_Profiler prof, + uint64_t start_ns, + void* user_data); + + void* reserved_profiler_api[8]; + /////////////// // other api // /////////////// diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 224bd0a1ff1..2bb57ab8fe6 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -254,14 +254,11 @@ DeviceInterface* DeviceManager::GetDeviceInterfaceWithType( phi::AutoRDLock lock(&_global_device_manager_rw_lock); auto& dev_impl_map = Instance().device_impl_map_; - if (dev_impl_map.find(device_type) != dev_impl_map.end()) { - return dev_impl_map.at(device_type).get(); - } else { - LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n"; - PADDLE_THROW( - phi::errors::Fatal("Unregistered device type %s.", device_type)); - return nullptr; - } + PADDLE_ENFORCE_NE( + dev_impl_map.find(device_type), + dev_impl_map.end(), + phi::errors::NotFound("%s interface not found.", device_type)); + return dev_impl_map.at(device_type).get(); } Device* DeviceManager::GetDeviceWithPlace(const Place& place) { @@ -600,6 +597,56 @@ void DeviceManager::CCLRecv(const std::string& device_type, dev_impl->CCLRecv(recvbuf, num, data_type, src_rank, ccl_comm, stream); } +// profiler +void DeviceManager::ProfilerInitialize( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void** context) { + auto dev_impl = GetDeviceInterfaceWithType(dev_type); + dev_impl->ProfilerInitialize(collector, context); +} + +void DeviceManager::ProfilerFinalize( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context) { + auto dev_impl = GetDeviceInterfaceWithType(dev_type); + dev_impl->ProfilerFinalize(collector, context); +} + +void DeviceManager::ProfilerPrepareTracing( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context) { + auto dev_impl = GetDeviceInterfaceWithType(dev_type); + dev_impl->ProfilerPrepareTracing(collector, context); +} + +void DeviceManager::ProfilerStartTracing( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context) { + auto dev_impl = GetDeviceInterfaceWithType(dev_type); + dev_impl->ProfilerStartTracing(collector, context); +} + +void DeviceManager::ProfilerStopTracing( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context) { + auto dev_impl = GetDeviceInterfaceWithType(dev_type); + dev_impl->ProfilerStopTracing(collector, context); +} + +void DeviceManager::ProfilerCollectTraceData( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + uint64_t start_ns, + void* context) { + auto dev_impl = GetDeviceInterfaceWithType(dev_type); + dev_impl->ProfilerCollectTraceData(collector, start_ns, context); +} + DeviceManager& DeviceManager::Instance() { static DeviceManager platform_manager; return platform_manager; diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index fc8529e5813..54bafd796df 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -241,6 +241,32 @@ class DeviceManager { const ccl::CCLComm& ccl_comm, const stream::Stream& stream); + // profiler + static void ProfilerInitialize( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void** context); + static void ProfilerFinalize(const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context); + static void ProfilerPrepareTracing( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context); + static void ProfilerStartTracing( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context); + static void ProfilerStopTracing( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + void* context); + static void ProfilerCollectTraceData( + const std::string& dev_type, + paddle::platform::TraceEventCollector* collector, + uint64_t start_ns, + void* context); + static void Clear(); private: diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index b825805bdf9..3161afd1192 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -16,5 +16,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) set_tests_properties(test_custom_cpu_plugin PROPERTIES TIMEOUT 120) + set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py new file mode 100644 index 00000000000..7a8356ed932 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import site +import unittest +import numpy as np + + +class TestCustomCPUProfilerPlugin(unittest.TestCase): + + def setUp(self): + # compile so and set to current path + cur_dir = os.path.dirname(os.path.abspath(__file__)) + cmd = 'rm -rf PaddleCustomDevice && git clone https://github.com/PaddlePaddle/PaddleCustomDevice.git && cd PaddleCustomDevice/backends/custom_cpu && mkdir build && cd build && cmake .. && make -j8' + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( + cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build') + + def test_custom_device(self): + import paddle + with paddle.fluid.framework._test_eager_guard(): + self._test_custom_profiler() + + def _test_custom_profiler(self): + import paddle + import paddle.profiler as profiler + + paddle.set_device('custom_cpu') + + x = paddle.to_tensor([1, 2, 3]) + p = profiler.Profiler(targets=[ + profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.CUSTOM_DEVICE + ]) + p.start() + for iter in range(10): + x = x + 1 + p.step() + p.stop() + p.summary() + + def tearDown(self): + del os.environ['CUSTOM_DEVICE_ROOT'] + + +if __name__ == '__main__': + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + exit() + unittest.main() diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index f3f6d778184..c44d2f0f611 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -102,6 +102,7 @@ class ProfilerTarget(Enum): CPU = 0 GPU = 1 MLU = 2 + CUSTOM_DEVICE = 3 def make_scheduler(*, @@ -296,10 +297,14 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: Get the current supported profiler target in the system. """ if _Profiler.is_cupti_supported(): - return [ProfilerTarget.CPU, ProfilerTarget.GPU] + return [ + ProfilerTarget.CPU, ProfilerTarget.GPU, ProfilerTarget.CUSTOM_DEVICE + ] if _Profiler.is_cnpapi_supported(): - return [ProfilerTarget.CPU, ProfilerTarget.MLU] - return [ProfilerTarget.CPU] + return [ + ProfilerTarget.CPU, ProfilerTarget.MLU, ProfilerTarget.CUSTOM_DEVICE + ] + return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE] class Profiler: @@ -437,7 +442,8 @@ class Profiler: record_shapes: Optional[bool] = False, profile_memory=False, timer_only: Optional[bool] = False, - emit_nvtx: Optional[bool] = False): + emit_nvtx: Optional[bool] = False, + custom_device_types: Optional[list] = []): supported_targets = _get_supported_targets() if targets: self.targets = set(targets) @@ -455,8 +461,12 @@ class Profiler: profileoption.trace_switch |= (1 << 1) if ProfilerTarget.MLU in self.targets: profileoption.trace_switch |= (1 << 2) + if ProfilerTarget.CUSTOM_DEVICE in self.targets: + profileoption.trace_switch |= (1 << 3) + if not custom_device_types: + custom_device_types = paddle.device.get_all_custom_device_type() wrap_optimizers() - self.profiler = _Profiler.create(profileoption) + self.profiler = _Profiler.create(profileoption, custom_device_types) if callable(scheduler): self.scheduler = scheduler elif isinstance(scheduler, (tuple, list)): diff --git a/python/setup.py.in b/python/setup.py.in index 55129c47c22..66f0575284d 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -627,6 +627,8 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/primitive')) + # phi kernel primitive api headers # capi headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) + # phi capi headers + # profiler headers + list(find_files('trace_event.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform/profiler')) + # phi profiler headers # utils api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers @@ -681,6 +683,9 @@ class InstallHeaders(Command): if 'fluid/jit' in install_dir: install_dir = re.sub('fluid/jit', 'jit', install_dir) print('fluid/jit install_dir: ', install_dir) + if 'trace_event.h' in install_dir: + install_dir = re.sub('fluid/platform/profiler', 'phi/backends/custom', install_dir) + print('trace_event.h install_dir: ', install_dir) else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) -- GitLab