未验证 提交 da51baf2 编写于 作者: R ronnywang 提交者: GitHub

[CustomDevice] add profiler apis (#45130)

* [CustomDevice] add profiler apis

* migrate CalculateEstOccupancy into cuda_tracer

* update

* add ut
上级 9e5f3a38
...@@ -7,6 +7,7 @@ cc_library( ...@@ -7,6 +7,7 @@ cc_library(
SRCS cuda_tracer.cc cupti_data_process.cc SRCS cuda_tracer.cc cupti_data_process.cc
DEPS workqueue_utils enforce glog) DEPS workqueue_utils enforce glog)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(custom_device)
cc_library( cc_library(
event_node event_node
SRCS event_node.cc SRCS event_node.cc
...@@ -32,8 +33,13 @@ cc_library( ...@@ -32,8 +33,13 @@ cc_library(
cc_library( cc_library(
new_profiler new_profiler
SRCS profiler.cc SRCS profiler.cc
DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind DEPS host_tracer
mlu_tracer) cuda_tracer
profiler_utils
cpu_utilization
event_bind
mlu_tracer
custom_tracer)
cc_test( cc_test(
test_event_node test_event_node
SRCS test_event_node.cc SRCS test_event_node.cc
......
...@@ -397,44 +397,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( ...@@ -397,44 +397,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
void ChromeTracingLogger::HandleTypeKernel( void ChromeTracingLogger::HandleTypeKernel(
const DeviceTraceEventNode& device_node) { const DeviceTraceEventNode& device_node) {
KernelEventInfo kernel_info = device_node.KernelInfo(); KernelEventInfo kernel_info = device_node.KernelInfo();
float blocks_per_sm = 0.0;
float warps_per_sm = 0.0;
float occupancy = 0.0;
#if defined(PADDLE_WITH_CUPTI)
#ifdef PADDLE_WITH_HIP
constexpr int threads_per_warp = 64;
#else
constexpr int threads_per_warp = 32;
#endif
const gpuDeviceProp& device_property =
GetDeviceProperties(device_node.DeviceId());
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
kernel_info.grid_z) /
device_property.multiProcessorCount;
warps_per_sm =
blocks_per_sm *
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
threads_per_warp;
#ifdef PADDLE_WITH_HIP
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
kernel_info.kernelFunc,
kernel_info.launchType);
#else
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory,
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
blocks_per_sm);
#endif // PADDLE_WITH_HIP
#endif
float dur = nsToMsFloat(device_node.Duration()); float dur = nsToMsFloat(device_node.Duration());
std::string dur_display; std::string dur_display;
if (dur > 1.0) { if (dur > 1.0) {
...@@ -480,15 +443,15 @@ void ChromeTracingLogger::HandleTypeKernel( ...@@ -480,15 +443,15 @@ void ChromeTracingLogger::HandleTypeKernel(
device_node.CorrelationId(), device_node.CorrelationId(),
kernel_info.registers_per_thread, kernel_info.registers_per_thread,
kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory,
blocks_per_sm, kernel_info.blocks_per_sm,
warps_per_sm, kernel_info.warps_per_sm,
kernel_info.grid_x, kernel_info.grid_x,
kernel_info.grid_y, kernel_info.grid_y,
kernel_info.grid_z, kernel_info.grid_z,
kernel_info.block_x, kernel_info.block_x,
kernel_info.block_y, kernel_info.block_y,
kernel_info.block_z, kernel_info.block_z,
occupancy * 100); kernel_info.occupancy * 100);
} }
void ChromeTracingLogger::HandleTypeMemcpy( void ChromeTracingLogger::HandleTypeMemcpy(
......
...@@ -16,8 +16,10 @@ ...@@ -16,8 +16,10 @@
#include <cstdio> #include <cstdio>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -52,10 +54,50 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, ...@@ -52,10 +54,50 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel,
event.kernel_info.queued = kernel->queued; event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted; event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->completed; event.kernel_info.completed = kernel->completed;
float blocks_per_sm = 0.0;
float warps_per_sm = 0.0;
float occupancy = 0.0;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
event.kernel_info.kernelFunc = kernel->kernelFunc; constexpr int threads_per_warp = 64;
event.kernel_info.launchType = kernel->launchType; #else
constexpr int threads_per_warp = 32;
#endif #endif
const gpuDeviceProp& device_property =
paddle::platform::GetDeviceProperties(kernel->deviceId);
blocks_per_sm =
static_cast<float>(event.kernel_info.grid_x * event.kernel_info.grid_y *
event.kernel_info.grid_z) /
device_property.multiProcessorCount;
warps_per_sm = blocks_per_sm *
(event.kernel_info.block_x * event.kernel_info.block_y *
event.kernel_info.block_z) /
threads_per_warp;
#ifdef PADDLE_WITH_HIP
occupancy = paddle::platform::CalculateEstOccupancy(
kernel->deviceId,
event.kernel_info.dynamic_shared_memory,
event.kernel_info.block_x,
event.kernel_info.block_y,
event.kernel_info.block_z,
kernel->kernelFunc,
kernel->launchType);
#else
occupancy = paddle::platform::CalculateEstOccupancy(
kernel->deviceId,
event.kernel_info.registers_per_thread,
event.kernel_info.static_shared_memory,
event.kernel_info.dynamic_shared_memory,
event.kernel_info.block_x,
event.kernel_info.block_y,
event.kernel_info.block_z,
blocks_per_sm);
#endif // PADDLE_WITH_HIP
event.kernel_info.blocks_per_sm = blocks_per_sm;
event.kernel_info.warps_per_sm = warps_per_sm;
event.kernel_info.occupancy = occupancy;
collector->AddDeviceEvent(std::move(event)); collector->AddDeviceEvent(std::move(event));
} }
......
# Library target for the custom-device tracer (custom_tracer.cc).
# It is listed as a dependency of the new_profiler target.
cc_library(
custom_tracer
SRCS custom_tracer.cc
DEPS workqueue_utils enforce glog)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include <mutex>
#include <unordered_map>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
namespace paddle {
namespace platform {
// Creates a tracer bound to the custom device type `dev_type`.
// When built with PADDLE_WITH_CUSTOM_DEVICE, the device's profiler is
// initialized right away; it receives this tracer's private collector and the
// address of context_, so the backend can write its own context pointer there.
// Without that build flag the constructor only records the device type.
CustomTracer::CustomTracer(const std::string& dev_type) : dev_type_(dev_type) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerInitialize(dev_type_, &collector_, &context_);
#endif
}
// Finalizes the device profiler (custom-device builds only), handing back the
// same collector and the context value kept since construction.
// NOTE(review): DeviceManager calls appear to be able to throw (they go
// through enforce macros) — confirm this is acceptable inside a destructor.
CustomTracer::~CustomTracer() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerFinalize(dev_type_, &collector_, context_);
#endif
}
// Arms the tracer: moves it to READY so that StartTracing() may be called.
//
// Allowed from UNINITED (first use) and from STOPED (re-arming after a
// previous session); any other state fails the precondition check.
// In custom-device builds the backend's prepare hook is invoked before the
// state changes.
void CustomTracer::PrepareTracing() {
  // The message now names both accepted states; it previously mentioned only
  // UNINITED although STOPED is accepted as well.
  PADDLE_ENFORCE_EQ(
      state_ == TracerState::UNINITED || state_ == TracerState::STOPED,
      true,
      platform::errors::PreconditionNotMet(
          "CustomTracer must be UNINITED or STOPED"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  phi::DeviceManager::ProfilerPrepareTracing(dev_type_, &collector_, context_);
#endif
  state_ = TracerState::READY;
}
// Starts a tracing session.
//
// Only legal in the READY state, i.e. after PrepareTracing(). In
// custom-device builds the backend's start hook is invoked first; then the
// session start timestamp is recorded and the tracer moves to STARTED.
void CustomTracer::StartTracing() {
  // The message now matches the check: READY is the only accepted state
  // (it previously claimed "READY or STOPPED").
  PADDLE_ENFORCE_EQ(
      state_ == TracerState::READY,
      true,
      platform::errors::PreconditionNotMet("Tracer must be READY"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  phi::DeviceManager::ProfilerStartTracing(dev_type_, &collector_, context_);
#endif
  // Remembered so CollectTraceData() can hand it to the backend.
  tracing_start_ns_ = PosixInNsec();
  state_ = TracerState::STARTED;
}
// Ends the running tracing session.
// Requires the STARTED state; in custom-device builds the backend's stop hook
// is invoked, after which the tracer moves to STOPED.
void CustomTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("Tracer must be STARTED"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerStopTracing(dev_type_, &collector_, context_);
#endif
state_ = TracerState::STOPED;
}
// Transfers everything recorded during the last session into `collector`.
//
// Requires the STOPED state. In custom-device builds the backend is first
// asked to fill this tracer's private collector_ (it is given the session
// start timestamp); afterwards host, runtime and device events plus thread
// names are forwarded to the caller's collector and the private one is
// cleared.
//
// @param collector destination for the gathered trace data; must not be null.
void CustomTracer::CollectTraceData(TraceEventCollector* collector) {
  PADDLE_ENFORCE_EQ(
      state_,
      TracerState::STOPED,
      platform::errors::PreconditionNotMet("Tracer must be STOPED"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  phi::DeviceManager::ProfilerCollectTraceData(
      dev_type_, &collector_, tracing_start_ns_, context_);
#endif
  // Each event is deliberately taken by value and then moved into the
  // destination: the loop variable is the copy the receiver takes over.
  for (auto he : collector_.HostEvents()) {
    collector->AddHostEvent(std::move(he));
  }
  for (auto rte : collector_.RuntimeEvents()) {
    collector->AddRuntimeEvent(std::move(rte));
  }
  for (auto de : collector_.DeviceEvents()) {
    collector->AddDeviceEvent(std::move(de));
  }
  // Thread names are only read, so bind by const reference instead of copying
  // every (id, name) pair on each iteration.
  for (const auto& tn : collector_.ThreadNames()) {
    collector->AddThreadName(tn.first, tn.second);
  }
  collector_.ClearAll();
}
} // namespace platform
} // namespace paddle
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// C entry point for plugins: appends one runtime trace event to a collector.
// `prof` is the opaque handle for a paddle::platform::TraceEventCollector and
// `event` points at a paddle::platform::RuntimeTraceEvent; the event is copied,
// so the caller keeps ownership of its own object.
void profiler_add_runtime_trace_event(C_Profiler prof, void* event) {
  auto* sink = reinterpret_cast<paddle::platform::TraceEventCollector*>(prof);
  auto* source = reinterpret_cast<paddle::platform::RuntimeTraceEvent*>(event);
  paddle::platform::RuntimeTraceEvent snapshot = *source;
  sink->AddRuntimeEvent(std::move(snapshot));
}
// C entry point for plugins: appends one device trace event to a collector.
// `prof` is the opaque handle for a paddle::platform::TraceEventCollector and
// `event` points at a paddle::platform::DeviceTraceEvent; the event is copied,
// so the caller keeps ownership of its own object.
void profiler_add_device_trace_event(C_Profiler prof, void* event) {
  auto* sink = reinterpret_cast<paddle::platform::TraceEventCollector*>(prof);
  auto* source = reinterpret_cast<paddle::platform::DeviceTraceEvent*>(event);
  paddle::platform::DeviceTraceEvent snapshot = *source;
  sink->AddDeviceEvent(std::move(snapshot));
}
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {
// Tracer for plug-in ("custom") devices.
//
// One instance exists per device type. Tracing control is forwarded to the
// device backend through phi::DeviceManager (see the .cc file); collected
// events are staged in a private collector and handed to the caller in
// CollectTraceData().
class CustomTracer : public TracerBase {
 public:
  // Returns the tracer for `device_type`, creating it on first request.
  // Instances live in a function-local registry and are never removed.
  // NOTE(review): the registry itself is not guarded by a lock; concurrent
  // first-time calls would race — confirm callers are serialized.
  static CustomTracer& GetInstance(const std::string& device_type) {
    static std::unordered_map<std::string, std::shared_ptr<CustomTracer>>
        instance;
    // Single lookup; the tracer is only constructed when the type is new.
    auto iter = instance.find(device_type);
    if (iter == instance.end()) {
      iter = instance
                 .emplace(device_type,
                          std::make_shared<CustomTracer>(device_type))
                 .first;
    }
    return *iter->second;
  }

  // TracerBase interface; state requirements are enforced in the .cc file.
  void PrepareTracing() override;
  void StartTracing() override;
  void StopTracing() override;
  void CollectTraceData(TraceEventCollector* collector) override;

  ~CustomTracer() override;

  // Public so that std::make_shared can construct instances; prefer
  // GetInstance() to obtain a tracer.
  explicit CustomTracer(const std::string& dev_type);

 private:
  DISABLE_COPY_AND_ASSIGN(CustomTracer);

  // Staging area the backend writes events into.
  TraceEventCollector collector_;
  // Timestamp taken in StartTracing(); UINT64_MAX until a session starts.
  uint64_t tracing_start_ns_ = UINT64_MAX;
  // Custom device type this tracer is bound to.
  std::string dev_type_;
  // Opaque backend context. Initialized to nullptr so it has a defined value
  // even when the backend (or a build without custom-device support) never
  // writes it.
  void* context_ = nullptr;
};
} // namespace platform
} // namespace paddle
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#endif #endif
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h"
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" #include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
...@@ -39,11 +40,13 @@ void SynchronizeAllDevice(); ...@@ -39,11 +40,13 @@ void SynchronizeAllDevice();
std::atomic<bool> Profiler::alive_{false}; std::atomic<bool> Profiler::alive_{false};
std::unique_ptr<Profiler> Profiler::Create(const ProfilerOptions& options) { std::unique_ptr<Profiler> Profiler::Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types) {
if (alive_.exchange(true)) { if (alive_.exchange(true)) {
return nullptr; return nullptr;
} }
return std::unique_ptr<Profiler>(new Profiler(options)); return std::unique_ptr<Profiler>(new Profiler(options, custom_device_types));
} }
bool Profiler::IsCuptiSupported() { bool Profiler::IsCuptiSupported() {
...@@ -62,7 +65,8 @@ bool Profiler::IsCnpapiSupported() { ...@@ -62,7 +65,8 @@ bool Profiler::IsCnpapiSupported() {
return supported; return supported;
} }
Profiler::Profiler(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types) {
options_ = options; options_ = options;
std::bitset<32> trace_switch(options_.trace_switch); std::bitset<32> trace_switch(options_.trace_switch);
if (trace_switch.test(kProfileCPUOptionBit)) { if (trace_switch.test(kProfileCPUOptionBit)) {
...@@ -76,6 +80,11 @@ Profiler::Profiler(const ProfilerOptions& options) { ...@@ -76,6 +80,11 @@ Profiler::Profiler(const ProfilerOptions& options) {
if (trace_switch.test(kProfileMLUOptionBit)) { if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false); tracers_.emplace_back(&MluTracer::GetInstance(), false);
} }
if (trace_switch.test(kProfileCustomDeviceOptionBit)) {
for (const auto& dev_type : custom_device_types) {
tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false);
}
}
} }
Profiler::~Profiler() { alive_.store(false); } Profiler::~Profiler() { alive_.store(false); }
......
...@@ -35,6 +35,7 @@ namespace platform { ...@@ -35,6 +35,7 @@ namespace platform {
static constexpr uint32_t kProfileCPUOptionBit = 0; static constexpr uint32_t kProfileCPUOptionBit = 0;
static constexpr uint32_t kProfileGPUOptionBit = 1; static constexpr uint32_t kProfileGPUOptionBit = 1;
static constexpr uint32_t kProfileMLUOptionBit = 2; static constexpr uint32_t kProfileMLUOptionBit = 2;
// Bit index in ProfilerOptions::trace_switch that enables custom-device tracers.
static constexpr uint32_t kProfileCustomDeviceOptionBit = 3;
struct ProfilerOptions { struct ProfilerOptions {
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu, bit 2: mlu
...@@ -43,7 +44,9 @@ struct ProfilerOptions { ...@@ -43,7 +44,9 @@ struct ProfilerOptions {
class Profiler { class Profiler {
public: public:
static std::unique_ptr<Profiler> Create(const ProfilerOptions& options); static std::unique_ptr<Profiler> Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types = {});
static bool IsCuptiSupported(); static bool IsCuptiSupported();
...@@ -75,7 +78,8 @@ class Profiler { ...@@ -75,7 +78,8 @@ class Profiler {
bool owned; bool owned;
}; };
explicit Profiler(const ProfilerOptions& options); explicit Profiler(const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types = {});
DISABLE_COPY_AND_ASSIGN(Profiler); DISABLE_COPY_AND_ASSIGN(Profiler);
......
...@@ -105,10 +105,11 @@ struct KernelEventInfo { ...@@ -105,10 +105,11 @@ struct KernelEventInfo {
uint64_t submitted; uint64_t submitted;
// The completed timestamp for the kernel execution, in ns. // The completed timestamp for the kernel execution, in ns.
uint64_t completed; uint64_t completed;
#ifdef PADDLE_WITH_HIP
void* kernelFunc; float blocks_per_sm;
uint8_t launchType; float warps_per_sm;
#endif // theoretical achieved occupancy
float occupancy;
}; };
static constexpr size_t kMemKindMaxLen = 50; static constexpr size_t kMemKindMaxLen = 50;
......
...@@ -38,6 +38,9 @@ limitations under the License. */ ...@@ -38,6 +38,9 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
...@@ -146,6 +149,17 @@ void SynchronizeAllDevice() { ...@@ -146,6 +149,17 @@ void SynchronizeAllDevice() {
PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice()); PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
} }
#endif #endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto &dev_type : dev_types) {
auto dev_cnt = phi::DeviceManager::GetDeviceCount(dev_type);
for (size_t i = 0; i < dev_cnt; i++) {
auto place = paddle::platform::CustomPlace(dev_type, i);
phi::DeviceManager::SetDevice(place);
phi::DeviceManager::SynchronizeDevice(place);
}
}
#endif
} }
static double ToMegaBytes(size_t bytes) { static double ToMegaBytes(size_t bytes) {
......
...@@ -817,6 +817,51 @@ class CustomDevice : public DeviceInterface { ...@@ -817,6 +817,51 @@ class CustomDevice : public DeviceInterface {
y)); y));
} }
// Profiler
// Calls the plugin's profiler_initialize callback.
// `collector` is passed to the plugin as an opaque C_Profiler handle and
// `user_data` is forwarded unchanged as an out-pointer (presumably where the
// plugin stores its own profiler context — confirm against the plugin API).
// NOTE(review): CHECK_PTR and PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS are defined
// outside this view; presumably they reject a null callback and a non-success
// C_Status respectively.
void ProfilerInitialize(paddle::platform::TraceEventCollector* collector,
void** user_data) override {
CHECK_PTR(pimpl_->profiler_initialize);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_initialize(
reinterpret_cast<C_Profiler>(collector), user_data));
}
// Calls the plugin's profiler_finalize callback, passing the collector as an
// opaque C_Profiler handle together with the caller-supplied `user_data`.
// The callback's status is checked by PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS
// (macro defined outside this view).
void ProfilerFinalize(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_finalize);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_finalize(
reinterpret_cast<C_Profiler>(collector), user_data));
}
// Calls the plugin's profiler_prepare_tracing callback, passing the collector
// as an opaque C_Profiler handle together with the caller-supplied
// `user_data`. The callback's status is checked by
// PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS (macro defined outside this view).
void ProfilerPrepareTracing(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_prepare_tracing);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_prepare_tracing(
reinterpret_cast<C_Profiler>(collector), user_data));
}
// Calls the plugin's profiler_start_tracing callback, passing the collector
// as an opaque C_Profiler handle together with the caller-supplied
// `user_data`. The callback's status is checked by
// PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS (macro defined outside this view).
void ProfilerStartTracing(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_start_tracing);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_start_tracing(
reinterpret_cast<C_Profiler>(collector), user_data));
}
// Calls the plugin's profiler_stop_tracing callback, passing the collector as
// an opaque C_Profiler handle together with the caller-supplied `user_data`.
// The callback's status is checked by PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS
// (macro defined outside this view).
void ProfilerStopTracing(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_stop_tracing);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_stop_tracing(
reinterpret_cast<C_Profiler>(collector), user_data));
}
// Calls the plugin's profiler_collect_trace_data callback.
// The collector is passed as an opaque C_Profiler handle; `start_ns` and
// `user_data` are forwarded unchanged. The callback's status is checked by
// PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS (macro defined outside this view).
void ProfilerCollectTraceData(
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_collect_trace_data);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_collect_trace_data(
reinterpret_cast<C_Profiler>(collector), start_ns, user_data));
}
private: private:
inline int PlaceToIdNoCheck(const Place& place) { inline int PlaceToIdNoCheck(const Place& place) {
int dev_id = place.GetDeviceId(); int dev_id = place.GetDeviceId();
...@@ -925,6 +970,13 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { ...@@ -925,6 +970,13 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) {
CHECK_INTERFACE(xccl_recv, false); CHECK_INTERFACE(xccl_recv, false);
CHECK_INTERFACE(blas_axpby, false); CHECK_INTERFACE(blas_axpby, false);
CHECK_INTERFACE(profiler_initialize, false);
CHECK_INTERFACE(profiler_finalize, false);
CHECK_INTERFACE(profiler_prepare_tracing, false);
CHECK_INTERFACE(profiler_start_tracing, false);
CHECK_INTERFACE(profiler_stop_tracing, false);
CHECK_INTERFACE(profiler_collect_trace_data, false);
return true; return true;
#undef CHECK_INTERFACE #undef CHECK_INTERFACE
} }
......
...@@ -368,6 +368,39 @@ void DeviceInterface::BlasAXPBY(size_t dev_id, ...@@ -368,6 +368,39 @@ void DeviceInterface::BlasAXPBY(size_t dev_id,
INTERFACE_UNIMPLEMENT; INTERFACE_UNIMPLEMENT;
} }
// profiler
// Base-class fallback for backends that do not override ProfilerInitialize.
// Both arguments are unused; the body is only INTERFACE_UNIMPLEMENT (macro
// defined outside this view — presumably it reports the missing interface;
// confirm).
void DeviceInterface::ProfilerInitialize(
paddle::platform::TraceEventCollector* collector, void** user_data) {
INTERFACE_UNIMPLEMENT;
}
// Base-class fallback for backends that do not override ProfilerFinalize.
// Arguments are unused; the body is only INTERFACE_UNIMPLEMENT (macro defined
// outside this view).
void DeviceInterface::ProfilerFinalize(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
// Base-class fallback for backends that do not override
// ProfilerPrepareTracing. Arguments are unused; the body is only
// INTERFACE_UNIMPLEMENT (macro defined outside this view).
void DeviceInterface::ProfilerPrepareTracing(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
// Base-class fallback for backends that do not override ProfilerStartTracing.
// Arguments are unused; the body is only INTERFACE_UNIMPLEMENT (macro defined
// outside this view).
void DeviceInterface::ProfilerStartTracing(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
// Base-class fallback for backends that do not override ProfilerStopTracing.
// Arguments are unused; the body is only INTERFACE_UNIMPLEMENT (macro defined
// outside this view).
void DeviceInterface::ProfilerStopTracing(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
// Base-class fallback for backends that do not override
// ProfilerCollectTraceData. Arguments are unused; the body is only
// INTERFACE_UNIMPLEMENT (macro defined outside this view).
void DeviceInterface::ProfilerCollectTraceData(
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* user_data) {
INTERFACE_UNIMPLEMENT;
}
#undef INTERFACE_UNIMPLEMENT #undef INTERFACE_UNIMPLEMENT
} // namespace phi } // namespace phi
...@@ -20,6 +20,12 @@ ...@@ -20,6 +20,12 @@
#include "paddle/phi/backends/event.h" #include "paddle/phi/backends/event.h"
#include "paddle/phi/backends/stream.h" #include "paddle/phi/backends/stream.h"
// Forward declaration only: the profiler hooks below take
// TraceEventCollector by pointer, so the full definition is not needed here.
namespace paddle {
namespace platform {
class TraceEventCollector;
} // namespace platform
} // namespace paddle
namespace phi { namespace phi {
class DeviceInterface { // Driver / Runtime class DeviceInterface { // Driver / Runtime
...@@ -236,6 +242,27 @@ class DeviceInterface { // Driver / Runtime ...@@ -236,6 +242,27 @@ class DeviceInterface { // Driver / Runtime
float beta, float beta,
void* y); void* y);
// Profiler hooks. Each takes the collector that trace events are recorded
// into plus an opaque per-backend `user_data` pointer.

// Sets up the backend profiler; `user_data` is an out-pointer.
virtual void ProfilerInitialize(
paddle::platform::TraceEventCollector* collector, void** user_data);
// Tears down the backend profiler.
virtual void ProfilerFinalize(
paddle::platform::TraceEventCollector* collector, void* user_data);
// Called before a tracing session is started.
virtual void ProfilerPrepareTracing(
paddle::platform::TraceEventCollector* collector, void* user_data);
// Begins a tracing session.
virtual void ProfilerStartTracing(
paddle::platform::TraceEventCollector* collector, void* user_data);
// Ends the running tracing session.
virtual void ProfilerStopTracing(
paddle::platform::TraceEventCollector* collector, void* user_data);
// Asks the backend to deliver its trace data; `start_ns` is a timestamp in
// nanoseconds supplied by the caller.
virtual void ProfilerCollectTraceData(
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* user_data);
private: private:
const std::string type_; const std::string type_;
const uint8_t priority_; const uint8_t priority_;
......
...@@ -83,6 +83,12 @@ typedef struct C_CCLComm_st* C_CCLComm; ...@@ -83,6 +83,12 @@ typedef struct C_CCLComm_st* C_CCLComm;
typedef enum { SUM = 0, AVG, MAX, MIN, PRODUCT } C_CCLReduceOp; typedef enum { SUM = 0, AVG, MAX, MIN, PRODUCT } C_CCLReduceOp;
// Opaque profiler handle passed to the profiler callbacks of a plugin.
typedef struct C_Profiler_st* C_Profiler;
// Records one runtime trace event into the profiler behind `prof`.
// `event` is an untyped pointer to the event object to add.
void profiler_add_runtime_trace_event(C_Profiler prof, void* event);
// Records one device trace event into the profiler behind `prof`.
// `event` is an untyped pointer to the event object to add.
void profiler_add_device_trace_event(C_Profiler prof, void* event);
struct C_DeviceInterface { struct C_DeviceInterface {
// Core fill it and plugin must to check it // Core fill it and plugin must to check it
size_t size; size_t size;
...@@ -632,6 +638,26 @@ struct C_DeviceInterface { ...@@ -632,6 +638,26 @@ struct C_DeviceInterface {
void* reserved_ccl_api[8]; void* reserved_ccl_api[8];
//////////////////
// profiler api //
//////////////////
// Set up the plugin profiler; `user_data` is an out-pointer.
C_Status (*profiler_initialize)(C_Profiler prof, void** user_data);
// Tear down the plugin profiler.
C_Status (*profiler_finalize)(C_Profiler prof, void* user_data);
// Called before a tracing session starts.
C_Status (*profiler_prepare_tracing)(C_Profiler prof, void* user_data);
// Begin a tracing session.
C_Status (*profiler_start_tracing)(C_Profiler prof, void* user_data);
// End the running tracing session.
C_Status (*profiler_stop_tracing)(C_Profiler prof, void* user_data);
// Deliver trace data; `start_ns` is a timestamp in nanoseconds supplied by
// the caller.
C_Status (*profiler_collect_trace_data)(C_Profiler prof,
uint64_t start_ns,
void* user_data);
// Reserved slots for future profiler callbacks.
void* reserved_profiler_api[8];
/////////////// ///////////////
// other api // // other api //
/////////////// ///////////////
......
...@@ -254,14 +254,11 @@ DeviceInterface* DeviceManager::GetDeviceInterfaceWithType( ...@@ -254,14 +254,11 @@ DeviceInterface* DeviceManager::GetDeviceInterfaceWithType(
phi::AutoRDLock lock(&_global_device_manager_rw_lock); phi::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_; auto& dev_impl_map = Instance().device_impl_map_;
if (dev_impl_map.find(device_type) != dev_impl_map.end()) { PADDLE_ENFORCE_NE(
return dev_impl_map.at(device_type).get(); dev_impl_map.find(device_type),
} else { dev_impl_map.end(),
LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n"; phi::errors::NotFound("%s interface not found.", device_type));
PADDLE_THROW( return dev_impl_map.at(device_type).get();
phi::errors::Fatal("Unregistered device type %s.", device_type));
return nullptr;
}
} }
Device* DeviceManager::GetDeviceWithPlace(const Place& place) { Device* DeviceManager::GetDeviceWithPlace(const Place& place) {
...@@ -600,6 +597,56 @@ void DeviceManager::CCLRecv(const std::string& device_type, ...@@ -600,6 +597,56 @@ void DeviceManager::CCLRecv(const std::string& device_type,
dev_impl->CCLRecv(recvbuf, num, data_type, src_rank, ccl_comm, stream); dev_impl->CCLRecv(recvbuf, num, data_type, src_rank, ccl_comm, stream);
} }
// profiler
// Looks up the device interface registered for `dev_type` and forwards the
// profiler-initialize request to it.
void DeviceManager::ProfilerInitialize(
    const std::string& dev_type,
    paddle::platform::TraceEventCollector* collector,
    void** context) {
  GetDeviceInterfaceWithType(dev_type)->ProfilerInitialize(collector, context);
}
// Looks up the device interface registered for `dev_type` and forwards the
// profiler-finalize request to it.
void DeviceManager::ProfilerFinalize(
    const std::string& dev_type,
    paddle::platform::TraceEventCollector* collector,
    void* context) {
  GetDeviceInterfaceWithType(dev_type)->ProfilerFinalize(collector, context);
}
// Looks up the device interface registered for `dev_type` and forwards the
// prepare-tracing request to it.
void DeviceManager::ProfilerPrepareTracing(
    const std::string& dev_type,
    paddle::platform::TraceEventCollector* collector,
    void* context) {
  GetDeviceInterfaceWithType(dev_type)->ProfilerPrepareTracing(collector,
                                                               context);
}
// Looks up the device interface registered for `dev_type` and forwards the
// start-tracing request to it.
void DeviceManager::ProfilerStartTracing(
    const std::string& dev_type,
    paddle::platform::TraceEventCollector* collector,
    void* context) {
  GetDeviceInterfaceWithType(dev_type)->ProfilerStartTracing(collector,
                                                             context);
}
// Looks up the device interface registered for `dev_type` and forwards the
// stop-tracing request to it.
void DeviceManager::ProfilerStopTracing(
    const std::string& dev_type,
    paddle::platform::TraceEventCollector* collector,
    void* context) {
  GetDeviceInterfaceWithType(dev_type)->ProfilerStopTracing(collector,
                                                            context);
}
// Looks up the device interface registered for `dev_type` and forwards the
// collect-trace-data request (including the start timestamp) to it.
void DeviceManager::ProfilerCollectTraceData(
    const std::string& dev_type,
    paddle::platform::TraceEventCollector* collector,
    uint64_t start_ns,
    void* context) {
  GetDeviceInterfaceWithType(dev_type)->ProfilerCollectTraceData(
      collector, start_ns, context);
}
DeviceManager& DeviceManager::Instance() { DeviceManager& DeviceManager::Instance() {
static DeviceManager platform_manager; static DeviceManager platform_manager;
return platform_manager; return platform_manager;
......
...@@ -241,6 +241,32 @@ class DeviceManager { ...@@ -241,6 +241,32 @@ class DeviceManager {
const ccl::CCLComm& ccl_comm, const ccl::CCLComm& ccl_comm,
const stream::Stream& stream); const stream::Stream& stream);
// profiler
// Each function below takes the device type name, the collector that trace
// events are recorded into, and an opaque backend context pointer.

// Initializes the profiler of `dev_type`; `context` is an out-pointer.
static void ProfilerInitialize(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void** context);
// Finalizes the profiler of `dev_type`.
static void ProfilerFinalize(const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
// Prepares `dev_type` for a tracing session.
static void ProfilerPrepareTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
// Starts a tracing session on `dev_type`.
static void ProfilerStartTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
// Stops the tracing session on `dev_type`.
static void ProfilerStopTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
// Collects trace data from `dev_type`; `start_ns` is a timestamp in
// nanoseconds supplied by the caller.
static void ProfilerCollectTraceData(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* context);
static void Clear(); static void Clear();
private: private:
......
...@@ -16,5 +16,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) ...@@ -16,5 +16,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU)
PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
set_tests_properties(test_custom_cpu_plugin PROPERTIES TIMEOUT 120) set_tests_properties(test_custom_cpu_plugin PROPERTIES TIMEOUT 120)
set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120)
set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120)
endif() endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import site
import unittest
import numpy as np
class TestCustomCPUProfilerPlugin(unittest.TestCase):
    """Smoke test for profiling on the `custom_cpu` plug-in device.

    setUp clones and builds the PaddleCustomDevice `custom_cpu` backend and
    points CUSTOM_DEVICE_ROOT at the build directory; the test then runs a
    few profiled steps with the CUSTOM_DEVICE target enabled.
    """

    def setUp(self):
        # compile so and set to current path
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        cmd = 'rm -rf PaddleCustomDevice && git clone https://github.com/PaddlePaddle/PaddleCustomDevice.git && cd PaddleCustomDevice/backends/custom_cpu && mkdir build && cd build && cmake .. && make -j8'
        # Fail right here if the clone/build did not succeed, instead of
        # letting the test fail later with an unrelated-looking error.
        ret = os.system(cmd)
        self.assertEqual(
            ret, 0, 'building the custom_cpu plugin failed (status %s)' % ret)
        # set environment for loading and registering compiled custom kernels
        # only valid in current process
        os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
            cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build')

    def test_custom_device(self):
        # paddle is imported lazily so that CUSTOM_DEVICE_ROOT is already set.
        import paddle

        with paddle.fluid.framework._test_eager_guard():
            self._test_custom_profiler()

    def _test_custom_profiler(self):
        """Profile ten trivial steps on custom_cpu and print the summary."""
        import paddle
        import paddle.profiler as profiler

        paddle.set_device('custom_cpu')

        x = paddle.to_tensor([1, 2, 3])
        p = profiler.Profiler(targets=[
            profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.CUSTOM_DEVICE
        ])
        p.start()
        # `_` instead of `iter` to avoid shadowing the builtin.
        for _ in range(10):
            x = x + 1
            p.step()
        p.stop()
        p.summary()

    def tearDown(self):
        del os.environ['CUSTOM_DEVICE_ROOT']
if __name__ == '__main__':
    if os.name == 'nt' or sys.platform.startswith('darwin'):
        # only support Linux now
        # sys.exit() rather than the interactive `exit()` helper, which is
        # injected by the `site` module and is not guaranteed to exist.
        sys.exit()
    unittest.main()
...@@ -102,6 +102,7 @@ class ProfilerTarget(Enum): ...@@ -102,6 +102,7 @@ class ProfilerTarget(Enum):
CPU = 0 CPU = 0
GPU = 1 GPU = 1
MLU = 2 MLU = 2
CUSTOM_DEVICE = 3
def make_scheduler(*, def make_scheduler(*,
...@@ -296,10 +297,14 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ...@@ -296,10 +297,14 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
Get the current supported profiler target in the system. Get the current supported profiler target in the system.
""" """
if _Profiler.is_cupti_supported(): if _Profiler.is_cupti_supported():
return [ProfilerTarget.CPU, ProfilerTarget.GPU] return [
ProfilerTarget.CPU, ProfilerTarget.GPU, ProfilerTarget.CUSTOM_DEVICE
]
if _Profiler.is_cnpapi_supported(): if _Profiler.is_cnpapi_supported():
return [ProfilerTarget.CPU, ProfilerTarget.MLU] return [
return [ProfilerTarget.CPU] ProfilerTarget.CPU, ProfilerTarget.MLU, ProfilerTarget.CUSTOM_DEVICE
]
return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE]
class Profiler: class Profiler:
...@@ -437,7 +442,8 @@ class Profiler: ...@@ -437,7 +442,8 @@ class Profiler:
record_shapes: Optional[bool] = False, record_shapes: Optional[bool] = False,
profile_memory=False, profile_memory=False,
timer_only: Optional[bool] = False, timer_only: Optional[bool] = False,
emit_nvtx: Optional[bool] = False): emit_nvtx: Optional[bool] = False,
custom_device_types: Optional[list] = []):
supported_targets = _get_supported_targets() supported_targets = _get_supported_targets()
if targets: if targets:
self.targets = set(targets) self.targets = set(targets)
...@@ -455,8 +461,12 @@ class Profiler: ...@@ -455,8 +461,12 @@ class Profiler:
profileoption.trace_switch |= (1 << 1) profileoption.trace_switch |= (1 << 1)
if ProfilerTarget.MLU in self.targets: if ProfilerTarget.MLU in self.targets:
profileoption.trace_switch |= (1 << 2) profileoption.trace_switch |= (1 << 2)
if ProfilerTarget.CUSTOM_DEVICE in self.targets:
profileoption.trace_switch |= (1 << 3)
if not custom_device_types:
custom_device_types = paddle.device.get_all_custom_device_type()
wrap_optimizers() wrap_optimizers()
self.profiler = _Profiler.create(profileoption) self.profiler = _Profiler.create(profileoption, custom_device_types)
if callable(scheduler): if callable(scheduler):
self.scheduler = scheduler self.scheduler = scheduler
elif isinstance(scheduler, (tuple, list)): elif isinstance(scheduler, (tuple, list)):
......
...@@ -627,6 +627,8 @@ headers = ( ...@@ -627,6 +627,8 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/primitive')) + # phi kernel primitive api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/primitive')) + # phi kernel primitive api headers
# capi headers # capi headers
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) + # phi capi headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) + # phi capi headers
# profiler headers
list(find_files('trace_event.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform/profiler')) + # phi profiler headers
# utils api headers # utils api headers
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers
...@@ -681,6 +683,9 @@ class InstallHeaders(Command): ...@@ -681,6 +683,9 @@ class InstallHeaders(Command):
if 'fluid/jit' in install_dir: if 'fluid/jit' in install_dir:
install_dir = re.sub('fluid/jit', 'jit', install_dir) install_dir = re.sub('fluid/jit', 'jit', install_dir)
print('fluid/jit install_dir: ', install_dir) print('fluid/jit install_dir: ', install_dir)
if 'trace_event.h' in install_dir:
install_dir = re.sub('fluid/platform/profiler', 'phi/backends/custom', install_dir)
print('trace_event.h install_dir: ', install_dir)
else: else:
# third_party # third_party
install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册