Unverified commit da51baf2, authored by ronnywang, committed by GitHub

[CustomDevice] add profiler apis (#45130)

* [CustomDevice] add profiler apis

* migrate CalculateEstOccupancy into cuda_tracer

* update

* add ut
Parent 9e5f3a38
......@@ -7,6 +7,7 @@ cc_library(
SRCS cuda_tracer.cc cupti_data_process.cc
DEPS workqueue_utils enforce glog)
add_subdirectory(mlu)
add_subdirectory(custom_device)
cc_library(
event_node
SRCS event_node.cc
......@@ -32,8 +33,13 @@ cc_library(
cc_library(
new_profiler
SRCS profiler.cc
DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind
mlu_tracer)
DEPS host_tracer
cuda_tracer
profiler_utils
cpu_utilization
event_bind
mlu_tracer
custom_tracer)
cc_test(
test_event_node
SRCS test_event_node.cc
......
......@@ -397,44 +397,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
void ChromeTracingLogger::HandleTypeKernel(
const DeviceTraceEventNode& device_node) {
KernelEventInfo kernel_info = device_node.KernelInfo();
float blocks_per_sm = 0.0;
float warps_per_sm = 0.0;
float occupancy = 0.0;
#if defined(PADDLE_WITH_CUPTI)
#ifdef PADDLE_WITH_HIP
constexpr int threads_per_warp = 64;
#else
constexpr int threads_per_warp = 32;
#endif
const gpuDeviceProp& device_property =
GetDeviceProperties(device_node.DeviceId());
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
kernel_info.grid_z) /
device_property.multiProcessorCount;
warps_per_sm =
blocks_per_sm *
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
threads_per_warp;
#ifdef PADDLE_WITH_HIP
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
kernel_info.kernelFunc,
kernel_info.launchType);
#else
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory,
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
blocks_per_sm);
#endif // PADDLE_WITH_HIP
#endif
float dur = nsToMsFloat(device_node.Duration());
std::string dur_display;
if (dur > 1.0) {
......@@ -480,15 +443,15 @@ void ChromeTracingLogger::HandleTypeKernel(
device_node.CorrelationId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory,
blocks_per_sm,
warps_per_sm,
kernel_info.blocks_per_sm,
kernel_info.warps_per_sm,
kernel_info.grid_x,
kernel_info.grid_y,
kernel_info.grid_z,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
occupancy * 100);
kernel_info.occupancy * 100);
}
void ChromeTracingLogger::HandleTypeMemcpy(
......
......@@ -16,8 +16,10 @@
#include <cstdio>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
namespace platform {
......@@ -52,10 +54,50 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel,
event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->completed;
float blocks_per_sm = 0.0;
float warps_per_sm = 0.0;
float occupancy = 0.0;
#ifdef PADDLE_WITH_HIP
event.kernel_info.kernelFunc = kernel->kernelFunc;
event.kernel_info.launchType = kernel->launchType;
constexpr int threads_per_warp = 64;
#else
constexpr int threads_per_warp = 32;
#endif
const gpuDeviceProp& device_property =
paddle::platform::GetDeviceProperties(kernel->deviceId);
blocks_per_sm =
static_cast<float>(event.kernel_info.grid_x * event.kernel_info.grid_y *
event.kernel_info.grid_z) /
device_property.multiProcessorCount;
warps_per_sm = blocks_per_sm *
(event.kernel_info.block_x * event.kernel_info.block_y *
event.kernel_info.block_z) /
threads_per_warp;
#ifdef PADDLE_WITH_HIP
occupancy = paddle::platform::CalculateEstOccupancy(
kernel->deviceId,
event.kernel_info.dynamic_shared_memory,
event.kernel_info.block_x,
event.kernel_info.block_y,
event.kernel_info.block_z,
kernel->kernelFunc,
kernel->launchType);
#else
occupancy = paddle::platform::CalculateEstOccupancy(
kernel->deviceId,
event.kernel_info.registers_per_thread,
event.kernel_info.static_shared_memory,
event.kernel_info.dynamic_shared_memory,
event.kernel_info.block_x,
event.kernel_info.block_y,
event.kernel_info.block_z,
blocks_per_sm);
#endif // PADDLE_WITH_HIP
event.kernel_info.blocks_per_sm = blocks_per_sm;
event.kernel_info.warps_per_sm = warps_per_sm;
event.kernel_info.occupancy = occupancy;
collector->AddDeviceEvent(std::move(event));
}
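For context, the blocks_per_sm and warps_per_sm values stored above follow directly from the launch configuration and the device properties. A standalone sketch of the same arithmetic with illustrative numbers (the 80-SM device and warp size of 32 are assumptions for the example, not values from this commit):

```cpp
// Sketch of the per-SM statistics computed in AddKernelRecord above.
// Device numbers are illustrative: 80 SMs, warp size 32 (64 on HIP builds).
#include <cstdio>

int main() {
  constexpr int threads_per_warp = 32;
  const int multi_processor_count = 80;
  // Hypothetical launch: grid = (256, 1, 1), block = (128, 1, 1).
  const float grid_size = 256.0f * 1 * 1;
  const float block_size = 128.0f * 1 * 1;
  const float blocks_per_sm = grid_size / multi_processor_count;             // 3.2
  const float warps_per_sm = blocks_per_sm * block_size / threads_per_warp;  // 12.8
  std::printf("blocks/SM = %.1f, warps/SM = %.1f\n", blocks_per_sm, warps_per_sm);
  return 0;
}
```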
......
cc_library(
custom_tracer
SRCS custom_tracer.cc
DEPS workqueue_utils enforce glog)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include <mutex>
#include <unordered_map>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
namespace paddle {
namespace platform {
CustomTracer::CustomTracer(const std::string& dev_type) : dev_type_(dev_type) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerInitialize(dev_type_, &collector_, &context_);
#endif
}
CustomTracer::~CustomTracer() {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerFinalize(dev_type_, &collector_, context_);
#endif
}
void CustomTracer::PrepareTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::UNINITED || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("CustomTracer must be UNINITED"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerPrepareTracing(dev_type_, &collector_, context_);
#endif
state_ = TracerState::READY;
}
void CustomTracer::StartTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::READY,
true,
platform::errors::PreconditionNotMet("Tracer must be READY or STOPPED"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerStartTracing(dev_type_, &collector_, context_);
#endif
tracing_start_ns_ = PosixInNsec();
state_ = TracerState::STARTED;
}
void CustomTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("Tracer must be STARTED"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerStopTracing(dev_type_, &collector_, context_);
#endif
state_ = TracerState::STOPED;
}
void CustomTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ(
state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("Tracer must be STOPED"));
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::ProfilerCollectTraceData(
dev_type_, &collector_, tracing_start_ns_, context_);
#endif
for (auto he : collector_.HostEvents()) {
collector->AddHostEvent(std::move(he));
}
for (auto rte : collector_.RuntimeEvents()) {
collector->AddRuntimeEvent(std::move(rte));
}
for (auto de : collector_.DeviceEvents()) {
collector->AddDeviceEvent(std::move(de));
}
for (auto tn : collector_.ThreadNames()) {
collector->AddThreadName(tn.first, tn.second);
}
collector_.ClearAll();
}
} // namespace platform
} // namespace paddle
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void profiler_add_runtime_trace_event(C_Profiler prof, void* event) {
paddle::platform::RuntimeTraceEvent re =
*reinterpret_cast<paddle::platform::RuntimeTraceEvent*>(event);
reinterpret_cast<paddle::platform::TraceEventCollector*>(prof)
->AddRuntimeEvent(std::move(re));
}
void profiler_add_device_trace_event(C_Profiler prof, void* event) {
paddle::platform::DeviceTraceEvent de =
*reinterpret_cast<paddle::platform::DeviceTraceEvent*>(event);
reinterpret_cast<paddle::platform::TraceEventCollector*>(prof)
->AddDeviceEvent(std::move(de));
}
#endif
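The new tracer follows the same TracerBase state machine as the CUDA and MLU tracers: UNINITED/STOPED -> READY (PrepareTracing) -> STARTED (StartTracing) -> STOPED (StopTracing), after which CollectTraceData drains the plugin's events into the caller's collector. A minimal lifecycle sketch, assuming a plugin registered under the illustrative device type "custom_cpu" and the include paths used elsewhere in this diff:

```cpp
// Lifecycle sketch; mirrors how Profiler drives each registered tracer.
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h"

void RunOneCapture() {
  // "custom_cpu" is an illustrative plugin name, not part of this commit.
  auto& tracer = paddle::platform::CustomTracer::GetInstance("custom_cpu");
  tracer.PrepareTracing();  // UNINITED or STOPED -> READY
  tracer.StartTracing();    // READY -> STARTED; records tracing_start_ns_
  // ... run the workload to be profiled ...
  tracer.StopTracing();     // STARTED -> STOPED
  paddle::platform::TraceEventCollector collector;
  tracer.CollectTraceData(&collector);  // host/runtime/device events + thread names
}
```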
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {
class CustomTracer : public TracerBase {
public:
static CustomTracer& GetInstance(const std::string& device_type) {
static std::unordered_map<std::string, std::shared_ptr<CustomTracer>>
instance;
if (instance.find(device_type) == instance.cend()) {
instance.insert(
{device_type, std::make_shared<CustomTracer>(device_type)});
}
return *instance[device_type];
}
void PrepareTracing() override;
void StartTracing() override;
void StopTracing() override;
void CollectTraceData(TraceEventCollector* collector) override;
~CustomTracer() override;
explicit CustomTracer(const std::string& dev_type);
private:
DISABLE_COPY_AND_ASSIGN(CustomTracer);
TraceEventCollector collector_;
uint64_t tracing_start_ns_ = UINT64_MAX;
std::string dev_type_;
void* context_;
};
} // namespace platform
} // namespace paddle
......@@ -26,6 +26,7 @@
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/cuda_tracer.h"
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
......@@ -39,11 +40,13 @@ void SynchronizeAllDevice();
std::atomic<bool> Profiler::alive_{false};
std::unique_ptr<Profiler> Profiler::Create(const ProfilerOptions& options) {
std::unique_ptr<Profiler> Profiler::Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types) {
if (alive_.exchange(true)) {
return nullptr;
}
return std::unique_ptr<Profiler>(new Profiler(options));
return std::unique_ptr<Profiler>(new Profiler(options, custom_device_types));
}
bool Profiler::IsCuptiSupported() {
......@@ -62,7 +65,8 @@ bool Profiler::IsCnpapiSupported() {
return supported;
}
Profiler::Profiler(const ProfilerOptions& options) {
Profiler::Profiler(const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types) {
options_ = options;
std::bitset<32> trace_switch(options_.trace_switch);
if (trace_switch.test(kProfileCPUOptionBit)) {
......@@ -76,6 +80,11 @@ Profiler::Profiler(const ProfilerOptions& options) {
if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false);
}
if (trace_switch.test(kProfileCustomDeviceOptionBit)) {
for (const auto& dev_type : custom_device_types) {
tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false);
}
}
}
Profiler::~Profiler() { alive_.store(false); }
......
......@@ -35,6 +35,7 @@ namespace platform {
static constexpr uint32_t kProfileCPUOptionBit = 0;
static constexpr uint32_t kProfileGPUOptionBit = 1;
static constexpr uint32_t kProfileMLUOptionBit = 2;
static constexpr uint32_t kProfileCustomDeviceOptionBit = 3;
struct ProfilerOptions {
uint32_t trace_switch = 0;  // bit 0: cpu, bit 1: gpu, bit 2: mlu, bit 3: custom device
......@@ -43,7 +44,9 @@ struct ProfilerOptions {
class Profiler {
public:
static std::unique_ptr<Profiler> Create(const ProfilerOptions& options);
static std::unique_ptr<Profiler> Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types = {});
static bool IsCuptiSupported();
......@@ -75,7 +78,8 @@ class Profiler {
bool owned;
};
explicit Profiler(const ProfilerOptions& options);
explicit Profiler(const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types = {});
DISABLE_COPY_AND_ASSIGN(Profiler);
......
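With the new overload, enabling bit 3 and naming the device types to trace looks roughly like the sketch below ("custom_cpu" is an illustrative plugin name; the bit positions follow the kProfile*OptionBit constants above):

```cpp
#include <memory>
#include "paddle/fluid/platform/profiler/profiler.h"

// Sketch: build a profiler that traces the CPU plus one custom device type.
std::unique_ptr<paddle::platform::Profiler> MakeCustomDeviceProfiler() {
  paddle::platform::ProfilerOptions options;
  options.trace_switch = (1u << 0) | (1u << 3);  // bit 0: cpu, bit 3: custom device
  // Create() returns nullptr if another Profiler instance is still alive.
  return paddle::platform::Profiler::Create(options, {"custom_cpu"});
}
```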
......@@ -105,10 +105,11 @@ struct KernelEventInfo {
uint64_t submitted;
// The completed timestamp for the kernel execution, in ns.
uint64_t completed;
#ifdef PADDLE_WITH_HIP
void* kernelFunc;
uint8_t launchType;
#endif
float blocks_per_sm;
float warps_per_sm;
// estimated achieved occupancy (0.0 - 1.0)
float occupancy;
};
static constexpr size_t kMemKindMaxLen = 50;
......
......@@ -38,6 +38,9 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
......@@ -146,6 +149,17 @@ void SynchronizeAllDevice() {
PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto &dev_type : dev_types) {
auto dev_cnt = phi::DeviceManager::GetDeviceCount(dev_type);
for (size_t i = 0; i < dev_cnt; i++) {
auto place = paddle::platform::CustomPlace(dev_type, i);
phi::DeviceManager::SetDevice(place);
phi::DeviceManager::SynchronizeDevice(place);
}
}
#endif
}
static double ToMegaBytes(size_t bytes) {
......
......@@ -817,6 +817,51 @@ class CustomDevice : public DeviceInterface {
y));
}
// Profiler
void ProfilerInitialize(paddle::platform::TraceEventCollector* collector,
void** user_data) override {
CHECK_PTR(pimpl_->profiler_initialize);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_initialize(
reinterpret_cast<C_Profiler>(collector), user_data));
}
void ProfilerFinalize(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_finalize);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_finalize(
reinterpret_cast<C_Profiler>(collector), user_data));
}
void ProfilerPrepareTracing(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_prepare_tracing);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_prepare_tracing(
reinterpret_cast<C_Profiler>(collector), user_data));
}
void ProfilerStartTracing(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_start_tracing);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_start_tracing(
reinterpret_cast<C_Profiler>(collector), user_data));
}
void ProfilerStopTracing(paddle::platform::TraceEventCollector* collector,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_stop_tracing);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_stop_tracing(
reinterpret_cast<C_Profiler>(collector), user_data));
}
void ProfilerCollectTraceData(
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* user_data) override {
CHECK_PTR(pimpl_->profiler_collect_trace_data);
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->profiler_collect_trace_data(
reinterpret_cast<C_Profiler>(collector), start_ns, user_data));
}
private:
inline int PlaceToIdNoCheck(const Place& place) {
int dev_id = place.GetDeviceId();
......@@ -925,6 +970,13 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) {
CHECK_INTERFACE(xccl_recv, false);
CHECK_INTERFACE(blas_axpby, false);
CHECK_INTERFACE(profiler_initialize, false);
CHECK_INTERFACE(profiler_finalize, false);
CHECK_INTERFACE(profiler_prepare_tracing, false);
CHECK_INTERFACE(profiler_start_tracing, false);
CHECK_INTERFACE(profiler_stop_tracing, false);
CHECK_INTERFACE(profiler_collect_trace_data, false);
return true;
#undef CHECK_INTERFACE
}
......
......@@ -368,6 +368,39 @@ void DeviceInterface::BlasAXPBY(size_t dev_id,
INTERFACE_UNIMPLEMENT;
}
// profiler
void DeviceInterface::ProfilerInitialize(
paddle::platform::TraceEventCollector* collector, void** user_data) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::ProfilerFinalize(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::ProfilerPrepareTracing(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::ProfilerStartTracing(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::ProfilerStopTracing(
paddle::platform::TraceEventCollector* collector, void* user_data) {
INTERFACE_UNIMPLEMENT;
}
void DeviceInterface::ProfilerCollectTraceData(
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* user_data) {
INTERFACE_UNIMPLEMENT;
}
#undef INTERFACE_UNIMPLEMENT
} // namespace phi
......@@ -20,6 +20,12 @@
#include "paddle/phi/backends/event.h"
#include "paddle/phi/backends/stream.h"
namespace paddle {
namespace platform {
class TraceEventCollector;
} // namespace platform
} // namespace paddle
namespace phi {
class DeviceInterface { // Driver / Runtime
......@@ -236,6 +242,27 @@ class DeviceInterface { // Driver / Runtime
float beta,
void* y);
// profiler
virtual void ProfilerInitialize(
paddle::platform::TraceEventCollector* collector, void** user_data);
virtual void ProfilerFinalize(
paddle::platform::TraceEventCollector* collector, void* user_data);
virtual void ProfilerPrepareTracing(
paddle::platform::TraceEventCollector* collector, void* user_data);
virtual void ProfilerStartTracing(
paddle::platform::TraceEventCollector* collector, void* user_data);
virtual void ProfilerStopTracing(
paddle::platform::TraceEventCollector* collector, void* user_data);
virtual void ProfilerCollectTraceData(
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* user_data);
private:
const std::string type_;
const uint8_t priority_;
......
......@@ -83,6 +83,12 @@ typedef struct C_CCLComm_st* C_CCLComm;
typedef enum { SUM = 0, AVG, MAX, MIN, PRODUCT } C_CCLReduceOp;
typedef struct C_Profiler_st* C_Profiler;
void profiler_add_runtime_trace_event(C_Profiler prof, void* event);
void profiler_add_device_trace_event(C_Profiler prof, void* event);
struct C_DeviceInterface {
// Core fills it in and the plugin must check it
size_t size;
......@@ -632,6 +638,26 @@ struct C_DeviceInterface {
void* reserved_ccl_api[8];
//////////////////
// profiler api //
//////////////////
C_Status (*profiler_initialize)(C_Profiler prof, void** user_data);
C_Status (*profiler_finalize)(C_Profiler prof, void* user_data);
C_Status (*profiler_prepare_tracing)(C_Profiler prof, void* user_data);
C_Status (*profiler_start_tracing)(C_Profiler prof, void* user_data);
C_Status (*profiler_stop_tracing)(C_Profiler prof, void* user_data);
C_Status (*profiler_collect_trace_data)(C_Profiler prof,
uint64_t start_ns,
void* user_data);
void* reserved_profiler_api[8];
///////////////
// other api //
///////////////
......
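On the plugin side, a backend fills these six slots in its InitPlugin entry point and reports events back through the profiler_add_runtime_trace_event / profiler_add_device_trace_event hooks declared above. A hedged, mostly-empty skeleton (function names are illustrative; a real backend would translate its own timeline records into DeviceTraceEvent before handing them to the hook):

```cpp
// Plugin-side sketch; assumes the backend includes device_ext.h the way
// PaddleCustomDevice backends do. All function names here are illustrative.
static C_Status ProfilerInitialize(C_Profiler prof, void** user_data) {
  *user_data = nullptr;  // a real backend could stash its own context here
  return C_SUCCESS;
}
static C_Status ProfilerFinalize(C_Profiler prof, void* user_data) {
  return C_SUCCESS;
}
static C_Status ProfilerPrepareTracing(C_Profiler prof, void* user_data) {
  return C_SUCCESS;
}
static C_Status ProfilerStartTracing(C_Profiler prof, void* user_data) {
  return C_SUCCESS;
}
static C_Status ProfilerStopTracing(C_Profiler prof, void* user_data) {
  return C_SUCCESS;
}
static C_Status ProfilerCollectTraceData(C_Profiler prof, uint64_t start_ns,
                                         void* user_data) {
  // Convert backend timeline records newer than start_ns into
  // paddle::platform::DeviceTraceEvent, then for each one call:
  //   profiler_add_device_trace_event(prof, &event);
  return C_SUCCESS;
}

void InitPlugin(CustomRuntimeParams* params) {
  // ... existing device/memory/stream setup ...
  params->interface->profiler_initialize = ProfilerInitialize;
  params->interface->profiler_finalize = ProfilerFinalize;
  params->interface->profiler_prepare_tracing = ProfilerPrepareTracing;
  params->interface->profiler_start_tracing = ProfilerStartTracing;
  params->interface->profiler_stop_tracing = ProfilerStopTracing;
  params->interface->profiler_collect_trace_data = ProfilerCollectTraceData;
}
```

C_Profiler is deliberately opaque: on the core side it is simply the TraceEventCollector, which is why custom_tracer.cc above can reinterpret_cast it back before appending events.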
......@@ -254,14 +254,11 @@ DeviceInterface* DeviceManager::GetDeviceInterfaceWithType(
phi::AutoRDLock lock(&_global_device_manager_rw_lock);
auto& dev_impl_map = Instance().device_impl_map_;
if (dev_impl_map.find(device_type) != dev_impl_map.end()) {
PADDLE_ENFORCE_NE(
dev_impl_map.find(device_type),
dev_impl_map.end(),
phi::errors::NotFound("%s interface not found.", device_type));
return dev_impl_map.at(device_type).get();
} else {
LOG(ERROR) << "GetDeviceInterfaceWithType - " << device_type << " Failed\n";
PADDLE_THROW(
phi::errors::Fatal("Unregistered device type %s.", device_type));
return nullptr;
}
}
Device* DeviceManager::GetDeviceWithPlace(const Place& place) {
......@@ -600,6 +597,56 @@ void DeviceManager::CCLRecv(const std::string& device_type,
dev_impl->CCLRecv(recvbuf, num, data_type, src_rank, ccl_comm, stream);
}
// profiler
void DeviceManager::ProfilerInitialize(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void** context) {
auto dev_impl = GetDeviceInterfaceWithType(dev_type);
dev_impl->ProfilerInitialize(collector, context);
}
void DeviceManager::ProfilerFinalize(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context) {
auto dev_impl = GetDeviceInterfaceWithType(dev_type);
dev_impl->ProfilerFinalize(collector, context);
}
void DeviceManager::ProfilerPrepareTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context) {
auto dev_impl = GetDeviceInterfaceWithType(dev_type);
dev_impl->ProfilerPrepareTracing(collector, context);
}
void DeviceManager::ProfilerStartTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context) {
auto dev_impl = GetDeviceInterfaceWithType(dev_type);
dev_impl->ProfilerStartTracing(collector, context);
}
void DeviceManager::ProfilerStopTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context) {
auto dev_impl = GetDeviceInterfaceWithType(dev_type);
dev_impl->ProfilerStopTracing(collector, context);
}
void DeviceManager::ProfilerCollectTraceData(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* context) {
auto dev_impl = GetDeviceInterfaceWithType(dev_type);
dev_impl->ProfilerCollectTraceData(collector, start_ns, context);
}
DeviceManager& DeviceManager::Instance() {
static DeviceManager platform_manager;
return platform_manager;
......
......@@ -241,6 +241,32 @@ class DeviceManager {
const ccl::CCLComm& ccl_comm,
const stream::Stream& stream);
// profiler
static void ProfilerInitialize(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void** context);
static void ProfilerFinalize(const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
static void ProfilerPrepareTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
static void ProfilerStartTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
static void ProfilerStopTracing(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
void* context);
static void ProfilerCollectTraceData(
const std::string& dev_type,
paddle::platform::TraceEventCollector* collector,
uint64_t start_ns,
void* context);
static void Clear();
private:
......
......@@ -16,5 +16,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU)
PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
set_tests_properties(test_custom_cpu_plugin PROPERTIES TIMEOUT 120)
set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120)
set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120)
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import site
import unittest
import numpy as np
class TestCustomCPUProfilerPlugin(unittest.TestCase):
def setUp(self):
# compile the plugin .so and place it under the current path
cur_dir = os.path.dirname(os.path.abspath(__file__))
cmd = 'rm -rf PaddleCustomDevice && git clone https://github.com/PaddlePaddle/PaddleCustomDevice.git && cd PaddleCustomDevice/backends/custom_cpu && mkdir build && cd build && cmake .. && make -j8'
os.system(cmd)
# set environment for loading and registering compiled custom kernels
# only valid in current process
os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build')
def test_custom_device(self):
import paddle
with paddle.fluid.framework._test_eager_guard():
self._test_custom_profiler()
def _test_custom_profiler(self):
import paddle
import paddle.profiler as profiler
paddle.set_device('custom_cpu')
x = paddle.to_tensor([1, 2, 3])
p = profiler.Profiler(targets=[
profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.CUSTOM_DEVICE
])
p.start()
for _ in range(10):
x = x + 1
p.step()
p.stop()
p.summary()
def tearDown(self):
del os.environ['CUSTOM_DEVICE_ROOT']
if __name__ == '__main__':
if os.name == 'nt' or sys.platform.startswith('darwin'):
# only Linux is supported for now
exit()
unittest.main()
......@@ -102,6 +102,7 @@ class ProfilerTarget(Enum):
CPU = 0
GPU = 1
MLU = 2
CUSTOM_DEVICE = 3
def make_scheduler(*,
......@@ -296,10 +297,14 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
Get the profiler targets currently supported on this system.
"""
if _Profiler.is_cupti_supported():
return [ProfilerTarget.CPU, ProfilerTarget.GPU]
return [
ProfilerTarget.CPU, ProfilerTarget.GPU, ProfilerTarget.CUSTOM_DEVICE
]
if _Profiler.is_cnpapi_supported():
return [ProfilerTarget.CPU, ProfilerTarget.MLU]
return [ProfilerTarget.CPU]
return [
ProfilerTarget.CPU, ProfilerTarget.MLU, ProfilerTarget.CUSTOM_DEVICE
]
return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE]
class Profiler:
......@@ -437,7 +442,8 @@ class Profiler:
record_shapes: Optional[bool] = False,
profile_memory=False,
timer_only: Optional[bool] = False,
emit_nvtx: Optional[bool] = False):
emit_nvtx: Optional[bool] = False,
custom_device_types: Optional[list] = []):
supported_targets = _get_supported_targets()
if targets:
self.targets = set(targets)
......@@ -455,8 +461,12 @@ class Profiler:
profileoption.trace_switch |= (1 << 1)
if ProfilerTarget.MLU in self.targets:
profileoption.trace_switch |= (1 << 2)
if ProfilerTarget.CUSTOM_DEVICE in self.targets:
profileoption.trace_switch |= (1 << 3)
if not custom_device_types:
custom_device_types = paddle.device.get_all_custom_device_type()
wrap_optimizers()
self.profiler = _Profiler.create(profileoption)
self.profiler = _Profiler.create(profileoption, custom_device_types)
if callable(scheduler):
self.scheduler = scheduler
elif isinstance(scheduler, (tuple, list)):
......
......@@ -627,6 +627,8 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/primitive')) + # phi kernel primitive api headers
# capi headers
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) + # phi capi headers
# profiler headers
list(find_files('trace_event.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform/profiler')) + # phi profiler headers
# utils api headers
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers
......@@ -681,6 +683,9 @@ class InstallHeaders(Command):
if 'fluid/jit' in install_dir:
install_dir = re.sub('fluid/jit', 'jit', install_dir)
print('fluid/jit install_dir: ', install_dir)
if 'trace_event.h' in install_dir:
install_dir = re.sub('fluid/platform/profiler', 'phi/backends/custom', install_dir)
print('trace_event.h install_dir: ', install_dir)
else:
# third_party
install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
......