Unverified commit 8fabca11, authored by YuanRisheng, committed by GitHub

[PHI Decoupling]Remove Profiler header (Part2) (#50183)

* move profiler

* add file

* fix mac compile bugs

* fix ci bugs

* fix mac bugs

* fix ci bugs

* fix compile bugs

* polish code according to review comments
Parent 96006f77
......@@ -439,7 +439,7 @@ copy(
DSTS ${dst_dir}/${module}/allocation)
set(module "platform")
set(platform_lib_deps profiler_proto errors)
set(platform_lib_deps phi_profiler_proto errors)
if(WITH_GPU)
set(platform_lib_deps ${platform_lib_deps} external_error_proto)
endif()
......@@ -449,7 +449,7 @@ copy(
fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h
${src_dir}/${module}/details/*.h
${PADDLE_BINARY_DIR}/paddle/fluid/platform/*.pb.h
${PADDLE_BINARY_DIR}/paddle/phi/api/profiler/*.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload
${dst_dir}/${module}/details ${dst_dir}/${module})
......
......@@ -32,7 +32,7 @@ cc_library(
cc_library(
cost_model
SRCS cost_model.cc
DEPS executor graph profiler proto_desc device_tracer)
DEPS executor graph profiler proto_desc phi_device_tracer)
set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits)
if(WITH_TESTING)
......
......@@ -246,8 +246,8 @@ CostData CostModel::ProfileMeasure(
executor.Run(startup_program, &scope, /*block_id = */ 0);
// TODO(zhhsplendid): handle the case that Profiler is already enabled
SetTracerOption(platform::TracerOption::kAllOpDetail);
EnableProfiler(profiler_state);
platform::SetTracerOption(platform::TracerOption::kAllOpDetail);
platform::EnableProfiler(profiler_state);
executor.Run(main_program, &scope, /*block_id = */ 0);
std::unique_ptr<std::vector<std::vector<Event>>> time_events(
......@@ -255,7 +255,7 @@ CostData CostModel::ProfileMeasure(
std::unique_ptr<std::vector<std::vector<MemEvent>>> mem_events(
new std::vector<std::vector<MemEvent>>());
CompleteProfilerEvents(
platform::CompleteProfilerEvents(
/*tracer_profile= */ nullptr, time_events.get(), mem_events.get());
// TODO(zhhsplendid): remove debug vlog after this series of work
......
......@@ -16,7 +16,7 @@ cc_library(
cc_library(
staticgraph_executor_statistics
SRCS executor_statistics.cc
DEPS enforce glog os_info)
DEPS enforce glog phi_os_info)
# skip win32 since wget is not installed by default on windows machine.
if(WITH_GPU
......
......@@ -5,7 +5,7 @@ cc_library(
cc_library(
workqueue
SRCS workqueue.cc
DEPS workqueue_utils enforce glog os_info)
DEPS workqueue_utils enforce glog phi_os_info)
cc_test(
workqueue_test
SRCS workqueue_test.cc
......
......@@ -53,7 +53,7 @@ if(NOT WIN32)
benchmark.cc
DEPS
jit_kernel_helper
device_tracer
phi_device_tracer
tensor)
endif()
endif()
......
......@@ -19,9 +19,9 @@
#include "glog/logging.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/profiler/device_tracer.h"
DEFINE_int32(burning, 10, "Burning times.");
DEFINE_int32(repeat, 3000, "Repeat times.");
......@@ -97,11 +97,11 @@ struct BenchFunc {
for (int i = 0; i < FLAGS_burning; ++i) {
tgt(args...);
}
auto start = paddle::platform::PosixInNsec() * 1e-3;
auto start = phi::PosixInNsec() * 1e-3;
for (int i = 0; i < FLAGS_repeat; ++i) {
tgt(args...);
}
auto end = paddle::platform::PosixInNsec() * 1e-3;
auto end = phi::PosixInNsec() * 1e-3;
return static_cast<double>(end - start) / FLAGS_repeat;
}
};
......
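BenchFunc's timer now reads phi::PosixInNsec directly. A minimal sketch of the same pattern, assuming only what the hunk shows (PosixInNsec returns a nanosecond wall-clock reading and is declared in paddle/phi/core/os_info.h; TimeUs itself is hypothetical):

```cpp
#include "paddle/phi/core/os_info.h"

// Average wall time of fn in microseconds, mirroring BenchFunc above.
template <typename F>
double TimeUs(F&& fn, int repeat = 3000) {
  auto start = phi::PosixInNsec() * 1e-3;  // ns -> us
  for (int i = 0; i < repeat; ++i) fn();
  auto end = phi::PosixInNsec() * 1e-3;
  return static_cast<double>(end - start) / repeat;
}
```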
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/fc_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
namespace paddle {
......
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto
simple_threadpool)
if(WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E
touch __init__.py)
add_dependencies(profiler_py_proto profiler_py_proto_init)
if(NOT WIN32)
add_custom_command(
TARGET profiler_py_proto
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory
${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else()
string(REPLACE "/" "\\" proto_dstpath
"${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
add_custom_command(
TARGET profiler_py_proto
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory
${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND copy /Y *.py ${proto_dstpath}
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endif()
cc_library(
denormal
SRCS denormal.cc
......@@ -60,14 +27,10 @@ cc_test(
cpu_info_test
SRCS cpu_info_test.cc
DEPS phi_backends)
cc_library(
os_info
SRCS os_info.cc
DEPS enforce)
cc_test(
os_info_test
SRCS os_info_test.cc
DEPS os_info)
DEPS phi_os_info)
if(WITH_GPU)
nv_library(
......@@ -121,7 +84,7 @@ if(WITH_IPU)
set(IPU_CTX_DEPS ipu_info)
else()
set(IPU_CTX_DEPS)
endif(WITH_IPU)
endif()
if(WITH_ASCEND_CL)
set(NPU_CTX_DEPS npu_stream npu_info)
......@@ -343,46 +306,45 @@ cc_test(
add_subdirectory(profiler)
cc_library(
device_tracer
SRCS device_tracer.cc
DEPS profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU)
nv_library(
profiler
SRCS profiler.cc profiler.cu
DEPS os_info
device_tracer
DEPS phi_os_info
phi_device_tracer
gpu_info
enforce
dynload_cuda
new_profiler
stats
op_proto_maker
shape_inference)
shape_inference
phi_profiler)
elseif(WITH_ROCM)
hip_library(
profiler
SRCS profiler.cc profiler.cu
DEPS os_info
device_tracer
DEPS phi_os_info
phi_device_tracer
gpu_info
enforce
new_profiler
stats
op_proto_maker
shape_inference)
shape_inference
phi_profiler)
else()
cc_library(
profiler
SRCS profiler.cc
DEPS os_info
device_tracer
DEPS phi_os_info
phi_device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference)
shape_inference
phi_profiler)
endif()
cc_test(
......
......@@ -24,191 +24,20 @@ limitations under the License. */
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/profiler/event.h"
namespace paddle {
namespace platform {
enum class EventType { kMark, kPushRange, kPopRange };
enum class EventRole {
kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key
kSpecial, // record events such as PE that sit outside thread-local scope
};
class Event {
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventType type,
std::string name,
uint32_t thread_id,
EventRole role = EventRole::kOrdinary,
std::string attr = "none");
const EventType &type() const;
Event *parent() const { return parent_; }
void set_parent(Event *parent) { parent_ = parent; }
std::string name() const { return name_; }
EventRole role() const { return role_; }
uint64_t thread_id() const { return thread_id_; }
void set_name(std::string name) { name_ = name; }
void set_role(EventRole role) { role_ = role; }
std::string attr() const { return attr_; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifndef PADDLE_WITH_CUPTI
gpuEvent_t event() const { return event_; }
int device() const { return device_; }
#endif
#endif
double CpuElapsedMs(const Event &e) const;
double CudaElapsedMs(const Event &e) const;
private:
EventType type_;
std::string name_{};
Event *parent_{nullptr};
uint64_t thread_id_;
EventRole role_{};
int64_t cpu_ns_;
bool visited_status_{false};
std::string attr_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
public:
void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
gpu_ns_ += end_ns - start_ns;
}
private:
#else
gpuEvent_t event_ = nullptr;
int device_ = -1;
#endif
#endif
};
using EventType = phi::EventType;
using EventRole = phi::EventRole;
using Event = phi::Event;
using EventWithStartNs = std::pair<Event *, uint64_t>;
using ThreadEvents = std::map<uint64_t, EventWithStartNs>;
class MemEvent {
public:
MemEvent(EventType type,
uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
Place place,
int64_t thread_id,
const std::string &annotation)
: type_(type),
start_ns_(start_ns),
end_ns_(end_ns),
bytes_(bytes),
place_(place),
thread_id_(thread_id),
annotation_(annotation) {}
const EventType &type() const { return type_; }
uint64_t start_ns() const { return start_ns_; }
uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; }
Place place() const { return place_; }
uint64_t thread_id() const { return thread_id_; }
const std::string &annotation() const { return annotation_; }
private:
EventType type_;
uint64_t start_ns_ = 0;
uint64_t end_ns_ = 0;
size_t bytes_;
Place place_;
uint64_t thread_id_;
std::string annotation_;
};
class CudaEvent {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
public:
CudaEvent() {
#ifdef PADDLE_WITH_HIP
hipEventCreateWithFlags(&event_, flags_);
#else
cudaEventCreateWithFlags(&event_, flags_);
#endif
VLOG(4) << "CudaEvent " << event_;
}
explicit CudaEvent(unsigned int flags) : flags_(flags) {
#ifdef PADDLE_WITH_HIP
hipEventCreateWithFlags(&event_, flags_);
#else
cudaEventCreateWithFlags(&event_, flags_);
#endif
VLOG(4) << "CudaEvent " << event_;
}
~CudaEvent() {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(event_);
#else
cudaEventDestroy(event_);
#endif
}
void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif
}
bool Query() {
#ifdef PADDLE_WITH_HIP
gpuError_t err = hipEventQuery(event_);
if (err == hipSuccess) {
return true;
}
if (err == hipErrorNotReady) {
return false;
}
#else
gpuError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
}
if (err == cudaErrorNotReady) {
return false;
}
#endif
PADDLE_ENFORCE_GPU_SUCCESS(err);
return false;
}
void Synchronize() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
#endif
}
gpuEvent_t GetRawCudaEvent() { return event_; }
private:
#ifdef PADDLE_WITH_HIP
unsigned int flags_ = hipEventDefault;
#else
unsigned int flags_ = cudaEventDefault;
#endif
gpuEvent_t event_;
#endif
};
using MemEvent = phi::MemEvent;
using CudaEvent = phi::CudaEvent;
} // namespace platform
} // namespace paddle
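The header now re-exports the phi types under their old paddle::platform names, so existing call sites compile unchanged. A minimal sketch (LegacyCallSite is hypothetical; the Event constructor signature is the one removed above):

```cpp
#include "paddle/fluid/platform/event.h"

void LegacyCallSite() {
  using paddle::platform::Event;      // alias of phi::Event
  using paddle::platform::EventType;  // alias of phi::EventType
  Event e(EventType::kMark, "my_mark", /*thread_id=*/0);
  (void)e.name();  // the member API is unchanged for callers
}
```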
......@@ -14,63 +14,29 @@ limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#ifdef _POSIX_C_SOURCE
#include <time.h>
#endif
#include "paddle/phi/backends/dynload/port.h"
#include "paddle/phi/core/os_info.h"
namespace paddle {
namespace platform {
// Get system-wide realtime clock in nanoseconds
inline uint64_t PosixInNsec() {
#ifdef _POSIX_C_SOURCE
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return tp.tv_sec * 1000 * 1000 * 1000 + tp.tv_nsec;
#else
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
#endif
}
// All kinds of Ids for OS thread
struct ThreadId {
uint64_t std_tid = 0; // std::hash<std::thread::id>
uint64_t sys_tid = 0; // OS-specific, Linux: gettid
uint32_t cupti_tid = 0; // thread_id used by Nvidia CUPTI
};
// Better performance than GetCurrentThreadId
uint64_t GetCurrentThreadStdId();
// Better performance than GetCurrentThreadId
uint64_t GetCurrentThreadSysId();
ThreadId GetCurrentThreadId();
// Return the map from StdTid to ThreadId
// Returns current snapshot of all threads. Make sure there is no thread
// create/destroy when using it.
std::unordered_map<uint64_t, ThreadId> GetAllThreadIds();
static constexpr const char* kDefaultThreadName = "unnamed";
// Returns kDefaultThreadName if SetCurrentThreadName is never called.
std::string GetCurrentThreadName();
// Return the map from StdTid to ThreadName
// Returns current snapshot of all threads. Make sure there is no thread
// create/destroy when using it.
std::unordered_map<uint64_t, std::string> GetAllThreadNames();
// Thread name is immutable, only the first call will succeed.
// Returns false on failure.
bool SetCurrentThreadName(const std::string& name);
uint32_t GetProcessId();
using phi::PosixInNsec;
using ThreadId = phi::ThreadId;
using phi::GetCurrentThreadStdId;
using phi::GetCurrentThreadSysId;
using phi::GetCurrentThreadId;
using phi::GetAllThreadIds;
using phi::GetCurrentThreadName;
using phi::GetAllThreadNames;
using phi::SetCurrentThreadName;
using phi::GetProcessId;
} // namespace platform
} // namespace paddle
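Same pattern for os_info.h: the implementations move to phi, and using-declarations keep the platform:: spellings valid. A minimal sketch (LogCurrentThread is hypothetical):

```cpp
#include <cstdint>
#include <string>

#include "paddle/fluid/platform/os_info.h"

void LogCurrentThread() {
  // Both calls resolve to the phi definitions via the using-declarations.
  uint64_t tid = paddle::platform::GetCurrentThreadStdId();
  std::string name = paddle::platform::GetCurrentThreadName();
  (void)tid;
  (void)name;
}
```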
......@@ -20,13 +20,13 @@ limitations under the License. */
#include <string>
#include <type_traits>
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/common_event.h"
#include "paddle/fluid/platform/profiler/host_event_recorder.h"
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/profiler.h"
#include "paddle/fluid/platform/profiler_helper.h"
#include "paddle/phi/api/profiler/device_tracer.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
......@@ -38,10 +38,6 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
false,
"Enable rpc profiler or not.");
DEFINE_bool(enable_host_event_recorder_hook,
false,
"enable HostEventRecorder, hook Profiler");
DEFINE_bool(enable_record_op_info,
false,
"enable operator supplement info recorder");
......@@ -53,198 +49,6 @@ namespace platform {
MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type,
std::string name,
uint32_t thread_id,
EventRole role,
std::string attr)
: type_(type),
name_(name),
thread_id_(thread_id),
role_(role),
attr_(attr) {
cpu_ns_ = GetTimeInNsec();
}
const EventType &Event::type() const { return type_; }
double Event::CpuElapsedMs(const Event &e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
}
double Event::CudaElapsedMs(const Event &e) const {
#ifdef PADDLE_WITH_CUPTI
return gpu_ns_ / 1000000.0;
#else
LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
return 0;
#endif
}
RecordEvent::RecordEvent(const char *name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
dynload::nvtxRangePushA(name);
is_pushed_ = true;
}
#endif
#endif
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
}
if (FLAGS_enable_host_event_recorder_hook == false) {
if (g_state != ProfilerState::kDisabled) { // avoid temp string
if (type == TracerEventType::Operator ||
type == TracerEventType::OperatorInner ||
type == TracerEventType::UserDefined) {
OriginalConstruct(name, role, "none");
}
}
return;
}
is_enabled_ = true;
shallow_copy_name_ = name;
role_ = role;
type_ = type;
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
dynload::nvtxRangePushA(name.c_str());
is_pushed_ = true;
}
#endif
#endif
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
}
if (FLAGS_enable_host_event_recorder_hook == false) {
if (type == TracerEventType::Operator ||
type == TracerEventType::OperatorInner ||
type == TracerEventType::UserDefined) {
OriginalConstruct(name, role, "none");
}
return;
}
is_enabled_ = true;
name_ = new std::string(name);
role_ = role;
type_ = type;
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name,
const std::string &attr,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
dynload::nvtxRangePushA(name.c_str());
is_pushed_ = true;
}
#endif
#endif
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
}
if (FLAGS_enable_host_event_recorder_hook == false) {
if (type == TracerEventType::Operator ||
type == TracerEventType::OperatorInner ||
type == TracerEventType::UserDefined) {
OriginalConstruct(name, role, attr);
}
return;
}
is_enabled_ = true;
type_ = type;
name_ = new std::string(name);
start_ns_ = PosixInNsec();
attr_ = new std::string(attr);
}
void RecordEvent::OriginalConstruct(const std::string &name,
const EventRole role,
const std::string &attr) {
if (g_state == ProfilerState::kDisabled || name.empty()) return;
// do some initialization
name_ = new std::string(name);
start_ns_ = PosixInNsec();
role_ = role;
attr_ = new std::string(attr);
is_enabled_ = true;
// lock is not needed, the code below is thread-safe
// Maybe need the same push/pop behavior.
Event *e = PushEvent(name, role, attr);
SetCurAnnotation(e);
*name_ = e->name();
}
void RecordEvent::End() {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook && is_pushed_) {
dynload::nvtxRangePop();
is_pushed_ = false;
}
#endif
#endif
if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) {
uint64_t end_ns = PosixInNsec();
if (LIKELY(shallow_copy_name_ != nullptr)) {
HostEventRecorder<CommonEvent>::GetInstance().RecordEvent(
shallow_copy_name_, start_ns_, end_ns, role_, type_);
} else if (name_ != nullptr) {
if (attr_ == nullptr) {
HostEventRecorder<CommonEvent>::GetInstance().RecordEvent(
*name_, start_ns_, end_ns, role_, type_);
} else {
HostEventRecorder<CommonEvent>::GetInstance().RecordEvent(
*name_, start_ns_, end_ns, role_, type_, *attr_);
delete attr_;
}
delete name_;
}
// use this flag to avoid double End();
is_enabled_ = false;
return;
}
if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
// lock is not needed, the code below is thread-safe
DeviceTracer *tracer = GetDeviceTracer();
if (tracer) {
uint64_t end_ns = PosixInNsec();
tracer->AddCPURecords(
CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(*name_, role_);
delete name_;
delete attr_;
// use this flag to avoid double End();
is_enabled_ = false;
}
RecordInstantEvent::RecordInstantEvent(const char *name,
TracerEventType type,
uint32_t level) {
......@@ -323,11 +127,6 @@ RecordOpInfoSupplement::RecordOpInfoSupplement(
PosixInNsec(), type, input_shapes, dtypes, attrs, op_id);
}
bool RecordEvent::IsEnabled() {
return FLAGS_enable_host_event_recorder_hook || g_enable_nvprof_hook ||
g_state != ProfilerState::kDisabled;
}
bool RecordOpInfoSupplement::IsEnabled() { return FLAGS_enable_record_op_info; }
bool RecordMemEvent::IsEnabled() { return FLAGS_enable_record_memory; }
......@@ -342,7 +141,7 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
const phi::Place &place,
size_t size,
const TracerMemEventType type) {
if (g_state == ProfilerState::kDisabled &&
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled &&
FLAGS_enable_host_event_recorder_hook == false) {
return;
}
......@@ -690,7 +489,7 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) {
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_);
......@@ -730,7 +529,7 @@ void MemEvenRecorder::PushMemRecord(const void *ptr,
// old profiler only analyzes memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr),
0,
......@@ -742,7 +541,7 @@ void MemEvenRecorder::PushMemRecord(const void *ptr,
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
if (g_state == ProfilerState::kDisabled) {
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_);
......@@ -780,7 +579,7 @@ void MemEvenRecorder::PopMemRecord(const void *ptr,
// old profiler only analyzes memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
// The ptr maybe not in address_memevent
......@@ -799,15 +598,15 @@ MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
: place_(place),
bytes_(bytes),
start_ns_(PosixInNsec()),
alloc_in_(CurAnnotationName()) {
alloc_in_(phi::CurAnnotationName()) {
PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
}
MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
DeviceTracer *tracer = GetDeviceTracer();
phi::DeviceTracer *tracer = phi::GetDeviceTracer();
end_ns_ = PosixInNsec();
auto annotation_free = CurAnnotationName();
auto annotation_free = phi::CurAnnotationName();
if (tracer) {
tracer->AddMemInfoRecord(start_ns_,
end_ns_,
......@@ -829,23 +628,27 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
RecordBlock::RecordBlock(int block_id)
: is_enabled_(false), start_ns_(PosixInNsec()) {
// lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled) return;
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
is_enabled_ = true;
SetCurBlock(block_id);
phi::SetCurBlock(block_id);
name_ = string::Sprintf("block_%d", block_id);
}
RecordBlock::~RecordBlock() {
// lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer *tracer = GetDeviceTracer();
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled || !is_enabled_)
return;
phi::DeviceTracer *tracer = phi::GetDeviceTracer();
if (tracer) {
// We try to put all blocks at the same nested depth in the
// same timeline lane, and distinguish them using thread_id.
tracer->AddCPURecords(
name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id);
tracer->AddCPURecords(name_,
start_ns_,
PosixInNsec(),
phi::BlockDepth(),
phi::ProfilerHelper::g_thread_id);
}
ClearCurBlock();
phi::ClearCurBlock();
}
void PushMemEvent(uint64_t start_ns,
......@@ -882,19 +685,10 @@ void Mark(const std::string &name) {
name, 0, 0, EventRole::kOrdinary, TracerEventType::UserDefined);
return;
}
GetEventList().Record(EventType::kMark, name, g_thread_id);
GetEventList().Record(
EventType::kMark, name, phi::ProfilerHelper::g_thread_id);
}
Event *PushEvent(const std::string &name,
const EventRole role,
std::string attr) {
return GetEventList().Record(
EventType::kPushRange, name, g_thread_id, role, attr);
}
void PopEvent(const std::string &name, const EventRole role, std::string attr) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE_NE(state,
ProfilerState::kDisabled,
......@@ -903,20 +697,21 @@ void EnableProfiler(ProfilerState state) {
"ProfilerState::kDisabled"));
SynchronizeAllDevice();
std::lock_guard<std::mutex> l(profiler_mu);
if (state == g_state) {
if (state == phi::ProfilerHelper::g_state) {
return;
}
g_state = state;
phi::ProfilerHelper::g_state = state;
ProfilerOptions option;
HostTraceLevel::GetInstance().SetLevel(option.trace_level);
should_send_profile_state = true;
GetDeviceTracer()->Enable();
phi::GetDeviceTracer()->Enable();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
g_state == ProfilerState::kCPU) {
if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA ||
phi::ProfilerHelper::g_state == ProfilerState::kAll ||
phi::ProfilerHelper::g_state == ProfilerState::kCPU) {
// Generate some dummy events first to reduce the startup overhead.
DummyKernelAndEvent();
GetDeviceTracer()->Reset();
phi::GetDeviceTracer()->Reset();
}
#endif
// Mark the profiling start.
......@@ -925,15 +720,17 @@ void EnableProfiler(ProfilerState state) {
void ResetProfiler() {
SynchronizeAllDevice();
GetDeviceTracer()->Reset();
phi::GetDeviceTracer()->Reset();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
std::lock_guard<std::mutex> guard(
phi::ProfilerHelper::g_all_event_lists_mutex);
for (auto it = phi::ProfilerHelper::g_all_event_lists.begin();
it != phi::ProfilerHelper::g_all_event_lists.end();
++it) {
(*it)->Clear();
}
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end();
for (auto it = phi::ProfilerHelper::g_all_mem_event_lists.begin();
it != phi::ProfilerHelper::g_all_mem_event_lists.end();
++it) {
(*it)->Clear();
}
......@@ -950,12 +747,12 @@ void DisableProfiler(EventSortingKey sorted_key,
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return;
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop.
Mark("_stop_profiler_");
DealWithShowName();
DeviceTracer *tracer = GetDeviceTracer();
phi::DeviceTracer *tracer = phi::GetDeviceTracer();
if (tracer->IsEnabled()) {
tracer->Disable();
DockHostEventRecorderDevicePart(thr_events);
......@@ -972,56 +769,56 @@ void DisableProfiler(EventSortingKey sorted_key,
ParseMemEvents(all_mem_events);
ResetProfiler();
g_state = ProfilerState::kDisabled;
phi::ProfilerHelper::g_state = ProfilerState::kDisabled;
g_tracer_option = TracerOption::kDefault;
should_send_profile_state = true;
}
void CompleteProfilerEvents(proto::Profile *tracer_profile,
void CompleteProfilerEvents(phi::proto::Profile *tracer_profile,
std::vector<std::vector<Event>> *time_events,
std::vector<std::vector<MemEvent>> *mem_events) {
SynchronizeAllDevice();
auto thr_events = DockHostEventRecorderHostPart();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return;
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop.
Mark("_stop_profiler_");
DeviceTracer *tracer = GetDeviceTracer();
phi::DeviceTracer *tracer = phi::GetDeviceTracer();
if (tracer->IsEnabled() && tracer_profile != nullptr) {
tracer->Disable();
DockHostEventRecorderDevicePart(thr_events);
tracer->GenEventKernelCudaElapsedTime();
*tracer_profile = tracer->GetProfile();
}
if (time_events != nullptr) {
*time_events = GetAllEvents();
}
if (mem_events != nullptr) {
*mem_events = GetMemEvents();
}
ResetProfiler();
g_state = ProfilerState::kDisabled;
phi::ProfilerHelper::g_state = ProfilerState::kDisabled;
g_tracer_option = TracerOption::kDefault;
should_send_profile_state = true;
}
std::vector<std::vector<Event>> GetAllEvents() {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
std::lock_guard<std::mutex> guard(
phi::ProfilerHelper::g_all_event_lists_mutex);
std::vector<std::vector<Event>> result;
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
for (auto it = phi::ProfilerHelper::g_all_event_lists.begin();
it != phi::ProfilerHelper::g_all_event_lists.end();
++it) {
result.emplace_back((*it)->Reduce());
}
return result;
}
bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
bool IsProfileEnabled() {
return phi::ProfilerHelper::g_state != ProfilerState::kDisabled;
}
bool ShouldSendProfileState() { return should_send_profile_state; }
......@@ -1063,10 +860,12 @@ int64_t ListenerId() { return profiler_lister_id; }
void NvprofEnableRecordEvent() {
SynchronizeAllDevice();
g_enable_nvprof_hook = true;
phi::ProfilerHelper::g_enable_nvprof_hook = true;
}
void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; }
void NvprofDisableRecordEvent() {
phi::ProfilerHelper::g_enable_nvprof_hook = false;
}
void EnableHostEventRecorder() { FLAGS_enable_host_event_recorder_hook = true; }
......@@ -1103,7 +902,7 @@ static void EmulateEventPushAndPop(
for (const auto &thr_sec : host_sec.thr_sections) {
uint64_t tid = thr_sec.thread_id;
auto cur_thr_list = std::make_shared<EventList<Event>>();
g_all_event_lists.emplace_front(cur_thr_list);
phi::ProfilerHelper::g_all_event_lists.emplace_front(cur_thr_list);
// for nesting events
std::stack<size_t> evt_stk;
std::stack<std::string> prefix_stk;
......@@ -1148,7 +947,7 @@ static void EmulateEventPushAndPop(
static void EmulateCPURecordsAdd(
const HostEventSection<CommonEvent> &host_sec) {
DeviceTracer *tracer = GetDeviceTracer();
phi::DeviceTracer *tracer = phi::GetDeviceTracer();
if (tracer == nullptr) {
return;
}
......@@ -1156,14 +955,14 @@ static void EmulateCPURecordsAdd(
uint64_t tid = thr_sec.thread_id;
for (const auto &evt : thr_sec.events) {
tracer->AddCPURecords(
evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid);
evt.name, evt.start_ns, evt.end_ns, phi::BlockDepth(), tid);
}
}
}
static void EmulateCorrelation(
const std::map<uint64_t, ThreadEvents> &thr_events) {
DeviceTracer *tracer = GetDeviceTracer();
phi::DeviceTracer *tracer = phi::GetDeviceTracer();
if (tracer == nullptr) {
return;
}
......
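Throughout this file, former file-scope globals (g_state, g_thread_id, g_enable_nvprof_hook, the event-list registries) become static members of phi::ProfilerHelper. A minimal sketch of the new access pattern; the header path below is an assumption, as it is not shown in this diff:

```cpp
#include "paddle/phi/api/profiler/profiler_helper.h"  // assumed path

// Hypothetical helper showing the qualified global-state access.
bool ProfilerIsOn() {
  return phi::ProfilerHelper::g_state != phi::ProfilerState::kDisabled;
}
```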
......@@ -35,6 +35,14 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
#include "paddle/phi/api/profiler/profiler.h"
namespace phi {
namespace proto {
class Profile;
}
} // namespace phi
namespace paddle {
namespace platform {
......@@ -45,19 +53,8 @@ class Profile;
const int kEnableProfiler = 1;
const int kDisableProfiler = 2;
enum class ProfilerState {
kDisabled, // disabled state
kCPU, // CPU profiling state
kCUDA, // GPU profiling state
kAll, // Profile both CPU and GPU. (Currently experimental).
};
// it is the flag to control to print the profiling result
enum class TracerOption {
kDefault, // print the different op type profiling result
kOpDetail, // print the detail profiling result of different op type
kAllOpDetail, // print the detail profiling result of different op name
};
using ProfilerState = phi::ProfilerState;
using TracerOption = phi::TracerOption;
// Candidate keys to sort the profiling report
enum class EventSortingKey {
......@@ -159,40 +156,7 @@ struct RecordBlock {
};
template <typename T>
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(T);
constexpr static size_t kEventAlign = alignof(T);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
T* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<T>> event_blocks;
};
using EventList = phi::EventList<T>;
void Mark(const std::string& name);
void PushMemEvent(uint64_t start_ns,
......@@ -205,24 +169,23 @@ void PopMemEvent(uint64_t start_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
Event* PushEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
void PopEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
using phi::PopEvent;
using phi::PushEvent;
// Return the event lists of all threads. Assuming the returned value is
// called event_lists, event_lists[i][j] represents the j-th Event of the
// i-th thread.
std::vector<std::vector<Event>> GetAllEvents();
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
// Clear the g_all_event_lists, which holds the event lists of all threads.
// Clear the phi::ProfilerHelper::g_all_event_lists, which holds the event
// lists of all threads.
void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path);
// Disable profiler but return events instead of print it.
void CompleteProfilerEvents(proto::Profile* tracer_profile,
void CompleteProfilerEvents(phi::proto::Profile* tracer_profile,
std::vector<std::vector<Event>>* time_events,
std::vector<std::vector<MemEvent>>* mem_events);
......
......@@ -29,7 +29,7 @@ cc_library(
cc_library(
cpu_utilization
SRCS cpu_utilization.cc
DEPS phi_backends os_info enforce glog)
DEPS phi_backends phi_os_info enforce glog)
cc_library(
new_profiler
SRCS profiler.cc
......
......@@ -21,90 +21,14 @@
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/api/profiler/common_event.h"
namespace paddle {
namespace platform {
struct CommonEvent {
public:
CommonEvent(const char *name,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
role(role),
type(type) {}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type,
const std::string &attr_str)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
name = buf;
buf = static_cast<char *>(arena_allocator(attr_str.length() + 1));
strncpy(buf, attr_str.c_str(), attr_str.length() + 1);
attr = buf;
}
using CommonEvent = phi::CommonEvent;
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
name = buf;
}
const char *name = nullptr; // not owned, designed for performance
uint64_t start_ns = 0;
uint64_t end_ns = 0;
EventRole role = EventRole::kOrdinary;
TracerEventType type = TracerEventType::NumTypes;
const char *attr = nullptr; // not owned, designed for performance
};
struct CommonMemEvent {
public:
CommonMemEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
int64_t increase_bytes,
const Place &place,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved),
peak_allocated(peak_allocated),
peak_reserved(peak_reserved) {}
uint64_t timestamp_ns;
uint64_t addr;
TracerMemEventType type;
int64_t increase_bytes;
Place place;
uint64_t current_allocated;
uint64_t current_reserved;
uint64_t peak_allocated;
uint64_t peak_reserved;
};
using CommonMemEvent = phi::CommonMemEvent;
struct OperatorSupplementOriginEvent {
public:
......
......@@ -170,11 +170,11 @@ void AddMemcpyRecord(const CUpti_ActivityMemcpy* memcpy,
// snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s",
// MemcpyKind(memcpy->copyKind));
snprintf(event.memcpy_info.src_kind,
kMemKindMaxLen,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy->srcKind));
snprintf(event.memcpy_info.dst_kind,
kMemKindMaxLen,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy->dstKind));
collector->AddDeviceEvent(std::move(event));
......@@ -199,11 +199,11 @@ void AddMemcpy2Record(const CUpti_ActivityMemcpy2* memcpy2,
// snprintf(event.memcpy_info.copy_kind, kMemKindMaxLen, "%s",
// MemcpyKind(memcpy2->copyKind));
snprintf(event.memcpy_info.src_kind,
kMemKindMaxLen,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy2->srcKind));
snprintf(event.memcpy_info.dst_kind,
kMemKindMaxLen,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy2->dstKind));
collector->AddDeviceEvent(std::move(event));
......@@ -226,7 +226,7 @@ void AddMemsetRecord(const CUpti_ActivityMemset* memset,
event.correlation_id = memset->correlationId;
event.memset_info.num_bytes = memset->bytes;
snprintf(event.memset_info.memory_kind,
kMemKindMaxLen,
phi::kMemKindMaxLen,
"%s",
MemoryKind(memset->memoryKind));
event.memset_info.value = memset->value;
......
......@@ -341,13 +341,13 @@ MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto(
memcpy_info.num_bytes = memcpy_info_proto.num_bytes();
std::strncpy(memcpy_info.copy_kind,
memcpy_info_proto.copy_kind().c_str(),
kMemKindMaxLen - 1);
phi::kMemKindMaxLen - 1);
std::strncpy(memcpy_info.src_kind,
memcpy_info_proto.src_kind().c_str(),
kMemKindMaxLen - 1);
phi::kMemKindMaxLen - 1);
std::strncpy(memcpy_info.dst_kind,
memcpy_info_proto.dst_kind().c_str(),
kMemKindMaxLen - 1);
phi::kMemKindMaxLen - 1);
return memcpy_info;
}
......@@ -359,7 +359,7 @@ MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto(
memset_info.num_bytes = memset_info_proto.num_bytes();
std::strncpy(memset_info.memory_kind,
memset_info_proto.memory_kind().c_str(),
kMemKindMaxLen - 1);
phi::kMemKindMaxLen - 1);
memset_info.value = memset_info_proto.value();
return memset_info;
}
......
......@@ -18,14 +18,11 @@ limitations under the License. */
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/api/profiler/event_tracing.h"
namespace paddle {
namespace platform {
// Default tracing level.
// It is recommended to set the level explicitly.
static constexpr uint32_t kDefaultTraceLevel = 4;
// Host event tracing. A trace marks something that happens but has no duration
// associated with it. For example, a thread starts working.
// Chrome Trace Viewer Format: Instant Event
......@@ -40,76 +37,10 @@ struct RecordInstantEvent {
*/
explicit RecordInstantEvent(const char* name,
TracerEventType type,
uint32_t level = kDefaultTraceLevel);
uint32_t level = phi::kDefaultTraceLevel);
};
// Host event tracing. A trace starts when an object of this class is created
// and stops when the object is destroyed.
// Chrome Trace Viewer Format: Duration Event/Complete Event
class RecordEvent {
public:
static bool IsEnabled();
/**
* @param name: If your string argument has a longer lifetime (e.g.: string
* literal, static variables, etc) than the event, use 'const char* name'.
* Do your best to avoid using 'std::string' as the argument type. It will
* cause deep-copy to harm performance.
* @param type: Classification which is used to instruct the profiling
* data statistics.
* @param level: Used to filter events, works like glog VLOG(level).
* RecordEvent will work if HostTraceLevel >= level.
*/
explicit RecordEvent(
const std::string& name,
const TracerEventType type = TracerEventType::UserDefined,
uint32_t level = kDefaultTraceLevel,
const EventRole role = EventRole::kOrdinary);
/**
* @param name: It is the caller's responsibility to manage the underlying
* storage. RecordEvent stores the pointer.
* @param type: Classification which is used to instruct the profiling
* data statistics.
* @param level: Used to filter events, works like glog VLOG(level).
* RecordEvent will work if HostTraceLevel >= level.
*/
explicit RecordEvent(
const char* name,
const TracerEventType type = TracerEventType::UserDefined,
uint32_t level = kDefaultTraceLevel,
const EventRole role = EventRole::kOrdinary);
RecordEvent(const std::string& name,
const std::string& attr,
const TracerEventType type = TracerEventType::UserDefined,
uint32_t level = kDefaultTraceLevel,
const EventRole role = EventRole::kOrdinary);
// Stop event tracing explicitly before the object goes out of scope.
// Sometimes it's inconvenient to use RAII
void End();
~RecordEvent() { End(); }
private:
void OriginalConstruct(const std::string& name,
const EventRole role,
const std::string& attr);
bool is_enabled_{false};
bool is_pushed_{false};
// Event name
std::string* name_{nullptr};
const char* shallow_copy_name_{nullptr};
uint64_t start_ns_;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
// std::string full_name_;
EventRole role_{EventRole::kOrdinary};
TracerEventType type_{TracerEventType::UserDefined};
std::string* attr_{nullptr};
bool finished_{false};
};
using RecordEvent = phi::RecordEvent;
} // namespace platform
} // namespace paddle
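RecordEvent keeps its RAII contract after becoming an alias of phi::RecordEvent: a trace spans construction to destruction (or an explicit End()). A minimal sketch (TracedStep is hypothetical; the constructor arguments match the declaration removed above):

```cpp
#include "paddle/fluid/platform/profiler/event_tracing.h"

void TracedStep() {
  paddle::platform::RecordEvent guard(
      "traced_step",
      paddle::platform::TracerEventType::UserDefined,
      phi::kDefaultTraceLevel);
  // ... work attributed to "traced_step" ...
}  // guard's destructor calls End() and emits the duration event
```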
......@@ -21,274 +21,25 @@
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/phi/api/profiler/host_event_recorder.h"
namespace paddle {
namespace platform {
template <typename HeadType, typename... RestTypes>
struct ContainsStdString
: std::conditional_t<
std::is_same<
std::string,
std::remove_cv_t<std::remove_reference_t<HeadType>>>::value,
std::true_type,
ContainsStdString<RestTypes...>> {};
template <typename TailType>
struct ContainsStdString<TailType>
: std::is_same<std::string,
std::remove_cv_t<std::remove_reference_t<TailType>>> {};
template <typename EventType>
class EventContainer {
public:
EventContainer() {
event_blocks_ = cur_event_block_ = new EventBlock;
str_blocks_ = cur_str_block_ = new StringBlock;
}
~EventContainer() {
Reduce();
delete event_blocks_;
for (auto cur = str_blocks_; cur != nullptr;) {
auto next = cur->next;
delete cur;
cur = next;
}
}
DISABLE_COPY_AND_ASSIGN(EventContainer);
public:
// Record an event
template <typename... Args>
void Record(Args &&...args) {
DoRecord(ContainsStdString<Args...>(), std::forward<Args>(args)...);
}
// Get all events and clear the container
std::vector<EventType> Reduce();
// Return a buffer to store the string attribute of Event.
// HostEventRecorder resides in the static data section.
// So it's safe to use arena to avoid fragmented allocations.
char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); }
private:
struct EventBlock {
union InitDeferedEvent {
InitDeferedEvent() {}
~InitDeferedEvent() {}
EventType event;
};
static constexpr size_t kBlockSize = 1 << 24; // 16 MB
static constexpr size_t kAvailSize =
kBlockSize - sizeof(size_t) - sizeof(nullptr);
static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent);
static constexpr size_t kPadSize =
kAvailSize - kNumEvents * sizeof(InitDeferedEvent);
static constexpr size_t kMinimumEventsPerBlock = 1024;
static_assert(
kNumEvents >= kMinimumEventsPerBlock,
"EventType is too large for kBlockSize, make kBlockSize larger");
size_t offset = 0;
EventBlock *next = nullptr;
InitDeferedEvent events[kNumEvents];
char padding[kPadSize];
};
static_assert(sizeof(EventBlock) == EventBlock::kBlockSize,
"sizeof EventBlock must equal to kBlockSize");
struct StringBlock {
static constexpr size_t kBlockSize = 1 << 22; // 4 MB
static constexpr size_t kAvailSize =
kBlockSize - sizeof(size_t) - sizeof(nullptr);
size_t offset = 0;
StringBlock *next = nullptr;
char storage[kAvailSize];
};
static_assert(sizeof(StringBlock) == StringBlock::kBlockSize,
"sizeof StringBlock must equal to kBlockSize");
// Record an event with string arguments
template <typename... Args>
void DoRecord(std::true_type, Args &&...args) {
auto *storage = GetEventStorage();
std::function<void *(size_t)> allocator = [this](size_t size) {
return GetStrBufFromArena(size);
};
new (storage) EventType(allocator, std::forward<Args>(args)...);
}
// Record an event without any string argument
template <typename... Args>
void DoRecord(std::false_type, Args &&...args) {
auto *storage = GetEventStorage();
new (storage) EventType(std::forward<Args>(args)...);
}
EventType *GetEventStorage();
char *GetStringStorage(size_t sz);
EventBlock *event_blocks_ = nullptr;
EventBlock *cur_event_block_ = nullptr;
StringBlock *str_blocks_ = nullptr;
StringBlock *cur_str_block_ = nullptr;
};
using EventContainer = phi::EventContainer<EventType>;
template <typename EventType>
std::vector<EventType> EventContainer<EventType>::Reduce() {
std::vector<EventType> all_events;
size_t event_cnt = 0;
for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) {
event_cnt += cur->offset;
}
all_events.reserve(event_cnt);
for (auto cur = event_blocks_; cur != nullptr;) {
for (size_t i = 0; i < cur->offset; ++i) {
all_events.emplace_back(cur->events[i].event);
}
auto next = cur->next;
delete cur;
cur = next;
}
event_blocks_ = cur_event_block_ = new EventBlock;
return all_events;
}
using ThreadEventSection = phi::ThreadEventSection<EventType>;
template <typename EventType>
EventType *EventContainer<EventType>::GetEventStorage() {
if (UNLIKELY(cur_event_block_->offset >=
EventBlock::kNumEvents)) { // another block
cur_event_block_->next = new EventBlock;
cur_event_block_ = cur_event_block_->next;
}
auto &obj = cur_event_block_->events[cur_event_block_->offset].event;
++cur_event_block_->offset;
return &obj;
}
using ThreadEventRecorder = phi::ThreadEventRecorder<EventType>;
template <typename EventType>
char *EventContainer<EventType>::GetStringStorage(size_t sz) {
if (UNLIKELY(cur_str_block_->offset + sz >
StringBlock::kAvailSize)) { // another block
cur_str_block_->next = new StringBlock;
cur_str_block_ = cur_str_block_->next;
}
char *storage = cur_str_block_->storage + cur_str_block_->offset;
cur_str_block_->offset += sz;
return storage;
}
using HostEventSection = phi::HostEventSection<EventType>;
template <typename EventType>
struct ThreadEventSection {
std::string thread_name;
uint64_t thread_id;
std::vector<EventType> events;
};
template <typename EventType>
class ThreadEventRecorder {
public:
ThreadEventRecorder() {
thread_id_ = GetCurrentThreadSysId();
thread_name_ = GetCurrentThreadName();
}
DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);
public:
// Forward call to EventContainer::Record
template <typename... Args>
void RecordEvent(Args &&...args) {
base_evt_cntr_.Record(std::forward<Args>(args)...);
}
ThreadEventSection<EventType> GatherEvents() {
ThreadEventSection<EventType> thr_sec;
thr_sec.thread_name = thread_name_;
thr_sec.thread_id = thread_id_;
thr_sec.events = std::move(base_evt_cntr_.Reduce());
return thr_sec;
}
private:
uint64_t thread_id_;
std::string thread_name_;
EventContainer<EventType> base_evt_cntr_;
};
template <typename EventType>
struct HostEventSection {
std::string process_name;
uint64_t process_id;
std::vector<ThreadEventSection<EventType>> thr_sections;
};
template <typename EventType>
class HostEventRecorder {
public:
// singleton
static HostEventRecorder &GetInstance() {
static HostEventRecorder instance;
return instance;
}
// thread-safe
// If your string argument has a longer lifetime than the Event,
// use 'const char*'. e.g.: string literal, op name, etc.
// Do your best to avoid using 'std::string' as the argument type.
// It will cause a deep copy, which harms performance.
template <typename... Args>
void RecordEvent(Args &&...args) {
// Get the thread-local ThreadEventRecorder.
// If one does not exist, we create a new one.
// Both HostEventRecorder and the thread-local variable in
// ThreadEventRecorderRegistry keep the shared pointer. We add this to
// prevent ThreadEventRecorder from being destroyed by the thread-local
// variable in ThreadEventRecorderRegistry and losing data.
if (GetThreadLocalRecorder()->get() == nullptr) {
std::shared_ptr<ThreadEventRecorder<EventType>>
thread_event_recorder_ptr =
std::make_shared<ThreadEventRecorder<EventType>>();
*(GetThreadLocalRecorder()) = thread_event_recorder_ptr;
thr_recorders_.push_back(thread_event_recorder_ptr);
}
(*GetThreadLocalRecorder())->RecordEvent(std::forward<Args>(args)...);
}
// thread-unsafe, make sure there is no running tracing.
// Poor performance, call it at the end.
HostEventSection<EventType> GatherEvents() {
HostEventSection<EventType> host_sec;
host_sec.process_id = GetProcessId();
host_sec.thr_sections.reserve(thr_recorders_.size());
for (auto &v : thr_recorders_) {
host_sec.thr_sections.emplace_back(std::move(v->GatherEvents()));
}
return host_sec;
}
private:
using ThreadEventRecorderRegistry = framework::ThreadDataRegistry<
std::shared_ptr<ThreadEventRecorder<EventType>>>;
HostEventRecorder() = default;
DISABLE_COPY_AND_ASSIGN(HostEventRecorder);
std::shared_ptr<ThreadEventRecorder<EventType>> *GetThreadLocalRecorder() {
return ThreadEventRecorderRegistry::GetInstance()
.GetMutableCurrentThreadData();
}
// Hold all thread-local ThreadEventRecorders
// ThreadEventRecorderRegistry and HostEventRecorder both take care of this
// shared pointer. We add this to prevent ThreadEventRecorder from being
// destroyed by the thread-local variable in ThreadEventRecorderRegistry
// and losing data.
std::vector<std::shared_ptr<ThreadEventRecorder<EventType>>> thr_recorders_;
};
using HostEventRecorder = phi::HostEventRecorder<EventType>;
} // namespace platform
} // namespace paddle
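The recorder machinery (EventContainer, ThreadEventRecorder, HostEventRecorder) is now instantiated from phi templates. A minimal sketch of recording a CommonEvent through the singleton, matching the call profiler.cc makes above (RecordNamedSpan is hypothetical):

```cpp
#include <cstdint>

#include "paddle/phi/api/profiler/common_event.h"
#include "paddle/phi/api/profiler/host_event_recorder.h"

void RecordNamedSpan(uint64_t start_ns, uint64_t end_ns) {
  phi::HostEventRecorder<phi::CommonEvent>::GetInstance().RecordEvent(
      "named_span", start_ns, end_ns, phi::EventRole::kOrdinary,
      phi::TracerEventType::UserDefined);
}
```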
......@@ -37,7 +37,7 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : host_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
if (thr_sec.thread_name != phi::kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
......@@ -58,7 +58,7 @@ void ProcessHostMemEvents(
TraceEventCollector* collector) {
for (const auto& thr_sec : host_mem_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
if (thr_sec.thread_name != phi::kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
......@@ -84,7 +84,7 @@ void ProcessOperatorSupplementEvents(
TraceEventCollector* collector) {
for (const auto& thr_sec : op_supplement_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
if (thr_sec.thread_name != phi::kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
......
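All three processing loops above share the same guard: a thread gets a name entry only if it was explicitly named via SetCurrentThreadName. A minimal sketch (HasExplicitName is hypothetical, and the header below is assumed to be where phi::kDefaultThreadName, "unnamed", now lives):

```cpp
#include <string>

#include "paddle/phi/core/os_info.h"  // assumed location of kDefaultThreadName

bool HasExplicitName(const std::string& thread_name) {
  return thread_name != phi::kDefaultThreadName;
}
```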
......@@ -15,33 +15,13 @@
#pragma once
#include "paddle/fluid/platform/profiler/tracer_base.h"
#include "paddle/phi/api/profiler/host_tracer.h"
namespace paddle {
namespace platform {
class HostTraceLevel {
public:
static constexpr int64_t kDisabled = -1;
static HostTraceLevel& GetInstance() {
static HostTraceLevel instance;
return instance;
}
bool NeedTrace(uint32_t level) {
return trace_level_ >= static_cast<int64_t>(level);
}
void SetLevel(int64_t trace_level) { trace_level_ = trace_level; }
private:
// Verbose trace level, works like VLOG(level)
int trace_level_ = kDisabled;
};
struct HostTracerOptions {
uint32_t trace_level = 0;
};
using HostTraceLevel = phi::HostTraceLevel;
using HostTracerOptions = phi::HostTracerOptions;
class HostTracer : public TracerBase {
public:
......
......@@ -98,7 +98,7 @@ void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy,
event.correlation_id = memcpy->correlation_id;
event.memcpy_info.num_bytes = memcpy->bytes;
snprintf(event.memcpy_info.copy_kind,
kMemKindMaxLen,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy->copy_type));
collector->AddDeviceEvent(std::move(event));
......@@ -122,7 +122,7 @@ void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
event.correlation_id = memcpy2->correlation_id;
event.memcpy_info.num_bytes = memcpy2->bytes;
snprintf(event.memcpy_info.copy_kind,
kMemKindMaxLen,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy2->copy_type));
collector->AddDeviceEvent(std::move(event));
......
......@@ -19,126 +19,20 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/phi/api/profiler/trace_event.h"
namespace paddle {
namespace platform {
enum class TracerEventType {
// Used to mark operator record
Operator = 0,
// Used to mark dataloader record
Dataloader = 1,
// Used to mark profile step record
ProfileStep = 2,
// Used to mark cuda runtime record returned by cupti
CudaRuntime = 3,
// Used to mark kernel computation record returned by cupti
Kernel = 4,
// Used to mark memcpy record returned by cupti
Memcpy = 5,
// Used to mark memset record returned by cupti
Memset = 6,
// Used to mark record defined by user
UserDefined = 7,
// Used to mark operator detail, (such as infer shape, compute)
OperatorInner = 8,
// Used to mark model training or testing perspective, forward process
Forward = 9,
// Used to mark model training perspective, backward process
Backward = 10,
// Used to mark model training perspective, optimization process
Optimization = 11,
// Used to mark distributed training perspective
Communication = 12,
// Used to mark python api
PythonOp = 13,
// Used to mark python level userdefined
PythonUserDefined = 14,
// Used to mark mlu runtime record returned by cnpapi
MluRuntime = 15,
// A flag to denote the number of current types
NumTypes
};
enum class TracerMemEventType {
// Used to mark memory allocation which is managed by paddle
Allocate = 0,
// Used to mark memory free which is managed by paddle
Free = 1,
// Used to mark reserved memory allocation which is applied from device.
ReservedAllocate = 2,
// Used to mark reserved memory free which is released to device.
ReservedFree = 3,
// A flag to denote the number of current types
NumTypes
};
struct KernelEventInfo {
// The X-dimension block size for the kernel.
uint32_t block_x;
// The Y-dimension block size for the kernel.
uint32_t block_y;
// The Z-dimension grid size for the kernel.
uint32_t block_z;
// X-dimension of a grid.
uint32_t grid_x;
// Y-dimension of a grid.
uint32_t grid_y;
// Z-dimension of a grid.
uint32_t grid_z;
// The dynamic shared memory reserved for the kernel, in bytes.
uint32_t dynamic_shared_memory;
// The static shared memory allocated for the kernel, in bytes.
uint32_t static_shared_memory;
// The number of registers required for each thread executing the kernel.
uint32_t registers_per_thread;
// The amount of local memory reserved for each thread, in bytes.
uint32_t local_memory_per_thread;
// The total amount of local memory reserved for the kernel, in bytes.
uint32_t local_memory_total;
// The timestamp when the kernel is queued up in the command buffer, in ns.
// This timestamp is not collected by default. Use API
// cuptiActivityEnableLatencyTimestamps() to enable collection.
uint64_t queued;
// The timestamp when the command buffer containing the kernel launch is
// submitted to the GPU, in ns.
// This timestamp is not collected by default. Use API
// cuptiActivityEnableLatencyTimestamps() to enable collection.
uint64_t submitted;
// The completed timestamp for the kernel execution, in ns.
uint64_t completed;
float blocks_per_sm;
float warps_per_sm;
// theoretical achieved occupancy
float occupancy;
};
static constexpr size_t kMemKindMaxLen = 50;
struct MemcpyEventInfo {
// The number of bytes transferred by the memory copy.
uint64_t num_bytes;
// The kind of the memory copy.
// Each kind represents the source and destination targets of a memory copy.
// Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind
char copy_kind[kMemKindMaxLen];
// The source memory kind read by the memory copy.
// Each kind represents the type of the memory accessed by a memory
// operation/copy. Refer to CUpti_ActivityMemoryKind
char src_kind[kMemKindMaxLen];
// The destination memory kind read by the memory copy.
char dst_kind[kMemKindMaxLen];
};
struct MemsetEventInfo {
// The number of bytes being set by the memory set.
uint64_t num_bytes;
// The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind
char memory_kind[kMemKindMaxLen];
// the value being assigned to memory by the memory set.
uint32_t value;
};
using TracerEventType = phi::TracerEventType;
using TracerMemEventType = phi::TracerMemEventType;
using KernelEventInfo = phi::KernelEventInfo;
using MemcpyEventInfo = phi::MemcpyEventInfo;
using MemsetEventInfo = phi::MemsetEventInfo;
using HostTraceEvent = phi::HostTraceEvent;
using RuntimeTraceEvent = phi::RuntimeTraceEvent;
using DeviceTraceEvent = phi::DeviceTraceEvent;
using MemTraceEvent = phi::MemTraceEvent;
struct OperatorSupplementEvent {
OperatorSupplementEvent() = default;
......@@ -181,201 +75,5 @@ struct OperatorSupplementEvent {
uint64_t thread_id;
};
struct HostTraceEvent {
HostTraceEvent() = default;
HostTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
process_id(process_id),
thread_id(thread_id) {}
// record name
std::string name;
// record type, one of TracerEventType
TracerEventType type;
// start timestamp of the record
uint64_t start_ns;
// end timestamp of the record
uint64_t end_ns;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
};
struct RuntimeTraceEvent {
RuntimeTraceEvent() = default;
RuntimeTraceEvent(const std::string& name,
uint64_t start_ns,
uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id,
uint32_t correlation_id,
uint32_t callback_id)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
process_id(process_id),
thread_id(thread_id),
correlation_id(correlation_id),
callback_id(callback_id) {}
// record name
std::string name;
// record type, one of TracerEventType
TracerEventType type{TracerEventType::CudaRuntime};
// start timestamp of the record
uint64_t start_ns;
// end timestamp of the record
uint64_t end_ns;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// correlation id, used for correlating async activities happened on device
uint32_t correlation_id;
// callback id, used to identify which cuda runtime api is called
uint32_t callback_id;
};
struct DeviceTraceEvent {
DeviceTraceEvent() = default;
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const KernelEventInfo& kernel_info)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
device_id(device_id),
context_id(context_id),
stream_id(stream_id),
correlation_id(correlation_id),
kernel_info(kernel_info) {}
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemcpyEventInfo& memcpy_info)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
device_id(device_id),
context_id(context_id),
stream_id(stream_id),
correlation_id(correlation_id),
memcpy_info(memcpy_info) {}
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemsetEventInfo& memset_info)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
device_id(device_id),
context_id(context_id),
stream_id(stream_id),
correlation_id(correlation_id),
memset_info(memset_info) {}
// record name
std::string name;
// record type, one of TracerEventType
TracerEventType type;
// start timestamp of the record
uint64_t start_ns;
// end timestamp of the record
uint64_t end_ns;
// device id
uint64_t device_id;
// context id
uint64_t context_id;
// stream id
uint64_t stream_id;
// correlation id, used for correlating async activities happened on device
uint32_t correlation_id;
// union, specific device record type has different detail information
union {
// used for TracerEventType::Kernel
KernelEventInfo kernel_info;
// used for TracerEventType::Memcpy
MemcpyEventInfo memcpy_info;
// used for TracerEventType::Memset
MemsetEventInfo memset_info;
};
};
struct MemTraceEvent {
MemTraceEvent() = default;
MemTraceEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
uint64_t process_id,
uint64_t thread_id,
int64_t increase_bytes,
const std::string& place,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
process_id(process_id),
thread_id(thread_id),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved),
peak_allocated(peak_allocated),
peak_reserved(peak_reserved) {}
// timestamp of the record
uint64_t timestamp_ns;
// memory addr of allocation or free
uint64_t addr;
// memory manipulation type
TracerMemEventType type;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// byte delta after this manipulation: positive for an allocation,
// negative for a free
int64_t increase_bytes;
// place
std::string place;
// current total allocated memory
uint64_t current_allocated;
// current total reserved memory
uint64_t current_reserved;
// current peak allocated memory
uint64_t peak_allocated;
// current peak reserved memory
uint64_t peak_reserved;
};
} // namespace platform
} // namespace paddle
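For reference, a minimal sketch (not part of this patch) showing how the structs above are constructed; the names, timestamps, and ids are illustrative only:

#include "paddle/fluid/platform/profiler/trace_event.h"

void BuildToyRecords() {  // toy helper, not in the codebase
  paddle::platform::HostTraceEvent host_evt(
      "conv2d", paddle::platform::TracerEventType::Operator,
      /*start_ns=*/1000, /*end_ns=*/2000, /*process_id=*/0, /*thread_id=*/0);

  paddle::platform::KernelEventInfo kinfo{};  // value-initialize every field
  kinfo.block_x = 256;                        // made-up launch configuration
  kinfo.grid_x = 128;
  paddle::platform::DeviceTraceEvent dev_evt(
      "conv2d_kernel", paddle::platform::TracerEventType::Kernel,
      /*start_ns=*/1500, /*end_ns=*/1800, /*device_id=*/0, /*context_id=*/0,
      /*stream_id=*/0, /*correlation_id=*/42, kinfo);
  (void)host_evt;
  (void)dev_evt;
}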
......@@ -19,52 +19,21 @@ limitations under the License. */
#include <unordered_map>
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/api/profiler/trace_event_collector.h"
namespace paddle {
namespace platform {
class TraceEventCollector {
class TraceEventCollector : public phi::TraceEventCollector {
public:
void AddHostEvent(HostTraceEvent&& event) { host_events_.push_back(event); }
void AddRuntimeEvent(RuntimeTraceEvent&& event) {
runtime_events_.push_back(event);
}
void AddDeviceEvent(DeviceTraceEvent&& event) {
device_events_.push_back(event);
}
void AddThreadName(uint64_t tid, const std::string& name) {
thread_names_[tid] = name;
}
void AddMemEvent(MemTraceEvent&& event) { mem_events_.push_back(event); }
void AddOperatorSupplementEvent(OperatorSupplementEvent&& event) {
op_supplement_events_.push_back(event);
}
const std::list<HostTraceEvent>& HostEvents() const { return host_events_; }
const std::list<RuntimeTraceEvent>& RuntimeEvents() const {
return runtime_events_;
}
const std::list<DeviceTraceEvent>& DeviceEvents() const {
return device_events_;
}
const std::list<MemTraceEvent>& MemEvents() const { return mem_events_; }
const std::list<OperatorSupplementEvent>& OperatorSupplementEvents() const {
return op_supplement_events_;
}
const std::unordered_map<uint64_t, std::string>& ThreadNames() const {
return thread_names_;
}
void ClearAll() {
thread_names_.clear();
host_events_.clear();
......@@ -75,11 +44,6 @@ class TraceEventCollector {
}
private:
std::unordered_map<uint64_t, std::string> thread_names_;
std::list<HostTraceEvent> host_events_;
std::list<RuntimeTraceEvent> runtime_events_;
std::list<DeviceTraceEvent> device_events_;
std::list<MemTraceEvent> mem_events_;
std::list<OperatorSupplementEvent> op_supplement_events_;
};
......
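A minimal usage sketch for the slimmed-down collector, assuming (as the new inheritance implies) that the Add*/accessor methods now come from phi::TraceEventCollector; the include path and event values are illustrative:

#include "paddle/fluid/platform/profiler/trace_event_collector.h"

void CollectOneEvent() {  // toy helper, not in the codebase
  paddle::platform::TraceEventCollector collector;
  collector.AddHostEvent(paddle::platform::HostTraceEvent(
      "fill_constant", paddle::platform::TracerEventType::Operator,
      /*start_ns=*/0, /*end_ns=*/500, /*process_id=*/0, /*thread_id=*/0));
  size_t num_host_events = collector.HostEvents().size();  // == 1
  (void)num_host_events;
}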
......@@ -44,6 +44,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/api/profiler/profiler_helper.h"
namespace paddle {
namespace platform {
......@@ -53,26 +54,7 @@ static bool should_send_profile_state = false;
std::mutex profiler_mu;
static TracerOption g_tracer_option = TracerOption::kDefault;
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// To hook RecordEvent's events, use it to nvtx timeline
static bool g_enable_nvprof_hook = false;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local uint64_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList<Event>> g_event_list;
static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
static std::mutex g_all_mem_event_lists_mutex;
static thread_local int32_t g_mem_thread_id;
static uint32_t g_mem_next_thread_id = 0;
......@@ -88,40 +70,28 @@ static int FindNthReversePos(const std::string &s, const char ch, const int N) {
return found_pos;
}
inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;
return std::chrono::duration_cast<std::chrono::nanoseconds>(
clock::now().time_since_epoch())
.count();
}
using phi::GetTimeInNsec;
inline EventList<Event> &GetEventList() {
if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList<Event>>();
g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id);
}
return *g_event_list;
}
using phi::GetEventList;
inline EventList<MemEvent> &GetMemEventList() {
if (!g_mem_event_list) {
g_mem_event_list = std::make_shared<EventList<MemEvent>>();
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
if (!phi::ProfilerHelper::g_mem_event_list) {
phi::ProfilerHelper::g_mem_event_list =
std::make_shared<EventList<MemEvent>>();
std::lock_guard<std::mutex> guard(
phi::ProfilerHelper::g_all_mem_event_lists_mutex);
g_mem_thread_id = g_mem_next_thread_id++;
g_all_mem_event_lists.emplace_front(g_mem_event_list);
phi::ProfilerHelper::g_all_mem_event_lists.emplace_front(
phi::ProfilerHelper::g_mem_event_list);
}
return *g_mem_event_list;
return *phi::ProfilerHelper::g_mem_event_list;
}
std::vector<std::vector<MemEvent>> GetMemEvents() {
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
std::lock_guard<std::mutex> guard(
phi::ProfilerHelper::g_all_mem_event_lists_mutex);
std::vector<std::vector<MemEvent>> result;
for (auto &it : g_all_mem_event_lists) {
for (auto &it : phi::ProfilerHelper::g_all_mem_event_lists) {
result.emplace_back((*it).Reduce());
}
return result;
......@@ -234,7 +204,7 @@ void PrintMemProfiler(
// parse memory events
void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
if (g_state == ProfilerState::kDisabled) return;
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
// place, annotation, alloc times, alloc size
std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
annotation_report;
......@@ -255,7 +225,8 @@ void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
void DealWithShowName() {
std::unordered_map<std::string, std::vector<std::string>> profiler_name_info;
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
for (auto it = phi::ProfilerHelper::g_all_event_lists.begin();
it != phi::ProfilerHelper::g_all_event_lists.end();
++it) {
for (auto &block : (*it)->event_blocks) {
for (auto &r : block) {
......@@ -382,9 +353,9 @@ void SetEvent(bool merge_thread,
gpu_time = rit->CudaElapsedMs(analyze_event);
#endif
double cpu_time = rit->CpuElapsedMs(analyze_event);
if (g_state == ProfilerState::kCUDA) {
if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA) {
event_time = gpu_time;
} else if (g_state == ProfilerState::kCPU) {
} else if (phi::ProfilerHelper::g_state == ProfilerState::kCPU) {
event_time = cpu_time;
} else {
event_time = gpu_time + cpu_time;
......@@ -653,11 +624,11 @@ void PrintProfiler(
<< " Profiling Report "
<< "<-------------------------\n\n";
std::string place;
if (g_state == ProfilerState::kCPU) {
if (phi::ProfilerHelper::g_state == ProfilerState::kCPU) {
place = "CPU";
} else if (g_state == ProfilerState::kCUDA) {
} else if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA) {
place = "CUDA";
} else if (g_state == ProfilerState::kAll) {
} else if (phi::ProfilerHelper::g_state == ProfilerState::kAll) {
place = "All";
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
......@@ -684,7 +655,7 @@ void PrintProfiler(
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total";
if (g_state == ProfilerState::kAll) {
if (phi::ProfilerHelper::g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
<< std::setw(data_width * 2) << "GPU Time (Ratio)";
}
......@@ -729,7 +700,7 @@ void PrintProfiler(
std::cout << std::setw(name_width) << print_name << std::setw(data_width)
<< event_item.calls << std::setw(data_width)
<< event_item.total_time;
if (g_state == ProfilerState::kAll) {
if (phi::ProfilerHelper::g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2)
<< string::Sprintf(
"%f (%f)",
......@@ -890,7 +861,7 @@ void AnalyzeEvent(
void ParseEvents(const std::vector<std::vector<Event>> &events,
bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return;
if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return;
std::string sorted_domain;
......
......@@ -45,7 +45,7 @@ TEST(RecordEvent, RecordEvent) {
using paddle::platform::RecordEvent;
ProfilerState state = ProfilerState::kCPU;
EnableProfiler(state);
paddle::platform::EnableProfiler(state);
/* Usage 1:
* PushEvent(evt_name);
......
add_subdirectory(profiler)
add_subdirectory(lib)
cc_library(
phi_api
......
......@@ -245,13 +245,23 @@ cc_library(
cc_library(
api_custom_impl
SRCS api_custom_impl.cc
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta
phi_data_transform)
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
backward_infermeta
phi_data_transform
phi_profiler)
cc_library(
phi_function_api
SRCS ${api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform
api_custom_impl)
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
phi_data_transform
api_custom_impl
phi_profiler)
cc_library(
phi_bw_function_api
SRCS ${bw_api_source_file}
......@@ -264,16 +274,22 @@ cc_library(
phi_data_transform
phi_function_api
api_custom_impl
global_utils)
global_utils
phi_profiler)
cc_library(
sparse_api
SRCS ${sparse_api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
cc_library(
sparse_bw_api
SRCS ${sparse_bw_api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api
sparse_backward_infermeta)
DEPS phi_tensor_raw
phi
kernel_dispatch
api_gen_utils
sparse_api
sparse_backward_infermeta
phi_profiler)
cc_library(
phi_dygraph_api
SRCS ${dygraph_api_source_file}
......@@ -283,11 +299,12 @@ cc_library(
api_gen_utils
phi_data_transform
phi_function_api
sparse_api)
sparse_api
phi_profiler)
cc_library(
strings_api
SRCS ${strings_api_source_file}
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_profiler)
cc_library(
phi_tensor
SRCS tensor_method.cc
......
proto_library(phi_profiler_proto SRCS profiler.proto)
if(WITH_PYTHON AND EXISTS ${PADDLE_BINARY_DIR})
set(FLUID_PATH ${PADDLE_BINARY_DIR}/python/paddle/fluid)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E
touch __init__.py)
add_dependencies(profiler_py_proto profiler_py_proto_init)
if(NOT WIN32)
add_custom_command(
TARGET profiler_py_proto
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${FLUID_PATH}/proto/profiler
COMMAND cp *.py ${FLUID_PATH}/proto/profiler
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else()
string(REPLACE "/" "\\" proto_dstpath "${FLUID_PATH}/proto/profiler/")
add_custom_command(
TARGET profiler_py_proto
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${FLUID_PATH}/proto/profiler
COMMAND copy /Y *.py ${proto_dstpath}
COMMENT
"Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endif()
if(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
endif()
cc_library(
phi_device_tracer
SRCS device_tracer.cc
DEPS phi_profiler_proto ${GPU_CTX_DEPS})
cc_library(
phi_profiler
SRCS profiler.cc
DEPS phi_os_info phi_device_tracer phi_enforce)
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstring>
#include <functional>
#include <string>
#include "paddle/phi/api/profiler/event.h" // import EventRole, TODO(TIEXING): remove later
#include "paddle/phi/api/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"
namespace phi {
struct CommonEvent {
public:
CommonEvent(const char *name,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
role(role),
type(type) {}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type,
const std::string &attr_str)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
name = buf;
buf = static_cast<char *>(arena_allocator(attr_str.length() + 1));
strncpy(buf, attr_str.c_str(), attr_str.length() + 1);
attr = buf;
}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
name = buf;
}
const char *name = nullptr; // not owned, designed for performance
uint64_t start_ns = 0;
uint64_t end_ns = 0;
EventRole role = EventRole::kOrdinary;
TracerEventType type = TracerEventType::NumTypes;
const char *attr = nullptr; // not owned, designed for performance
};
struct CommonMemEvent {
public:
CommonMemEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
int64_t increase_bytes,
const Place &place,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved),
peak_allocated(peak_allocated),
peak_reserved(peak_reserved) {}
uint64_t timestamp_ns;
uint64_t addr;
TracerMemEventType type;
int64_t increase_bytes;
Place place;
uint64_t current_allocated;
uint64_t current_reserved;
uint64_t peak_allocated;
uint64_t peak_reserved;
};
} // namespace phi
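A minimal sketch of the arena-allocator constructor above; ToyArenaAlloc is a stand-in for the string-block arena that HostEventRecorder actually provides (the malloc'ed memory is simply leaked in this toy):

#include <cstdlib>
#include <string>

#include "paddle/phi/api/profiler/common_event.h"

static void *ToyArenaAlloc(size_t size) { return std::malloc(size); }

phi::CommonEvent MakeToyEvent() {  // toy helper, not in the codebase
  // The std::string arguments are deep-copied into the arena buffers.
  return phi::CommonEvent(ToyArenaAlloc, std::string("matmul"),
                          /*start_ns=*/10, /*end_ns=*/20,
                          phi::EventRole::kOrdinary,
                          phi::TracerEventType::Operator,
                          /*attr_str=*/std::string("shape=[2,3]"));
}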
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/phi/api/profiler/device_tracer.h"
#include <deque>
#include <forward_list>
......@@ -22,11 +22,11 @@ limitations under the License. */
#include <thread> // NOLINT
#include "glog/logging.h"
#include "paddle/phi/core/enforce.h"
DECLARE_bool(enable_host_event_recorder_hook);
namespace paddle {
namespace platform {
namespace phi {
// Used only by DeviceTracer
uint64_t GetThreadIdFromSystemThreadId(uint32_t id);
......@@ -192,7 +192,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx,
PADDLE_ENFORCE_EQ(
std::this_thread::get_id(),
cupti_thread_id,
platform::errors::PermissionDenied(
errors::PermissionDenied(
"Only one thread is allowed to call bufferCompleted()."));
CUptiResult status;
CUpti_Activity *record = NULL;
......@@ -688,18 +688,18 @@ class DeviceTracerImpl : public DeviceTracer {
for (const auto &r : tmp) {
auto *event = profile_pb.add_mem_events();
event->set_device_id(0);
if (platform::is_cpu_place(r.place)) {
if (r.place.GetType() == phi::AllocationType::CPU) {
event->set_place(proto::MemEvent::CPUPlace);
} else if (platform::is_gpu_place(r.place)) {
} else if (r.place.GetType() == phi::AllocationType::GPU) {
event->set_place(proto::MemEvent::CUDAPlace);
event->set_device_id(r.place.GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
} else if (r.place.GetType() == phi::AllocationType::GPUPINNED) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else if (platform::is_npu_place(r.place)) {
} else if (r.place.GetType() == phi::AllocationType::NPU) {
event->set_place(proto::MemEvent::NPUPlace);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"The current place is not supported."));
PADDLE_THROW(
errors::Unimplemented("The current place is not supported."));
}
event->set_alloc_in(r.alloc_in);
event->set_free_in(r.free_in);
......@@ -910,5 +910,4 @@ void initCuptiCbidStr() {
} // namespace
#endif // PADDLE_WITH_CUPTI
} // namespace platform
} // namespace paddle
} // namespace phi
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -16,14 +16,13 @@ limitations under the License. */
#include <chrono> // NOLINT
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/phi/api/profiler/event.h"
#include "paddle/phi/api/profiler/profiler.pb.h"
#include "paddle/phi/backends/dynload/cupti.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/os_info.h"
namespace paddle {
namespace platform {
namespace phi {
///////////////////////
// WARN: Under Development. Don't depend on it yet.
......@@ -164,5 +163,4 @@ int BlockDepth();
// Set current thread id, so we can map the system thread id to thread id.
void RecoreCurThreadId(uint64_t id);
} // namespace platform
} // namespace paddle
} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <map>
#include <string>
#include <utility>
#include "paddle/phi/common/place.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/core/cuda_stream.h"
#endif
namespace phi {
enum class EventType { kMark, kPushRange, kPopRange };
enum class EventRole {
kOrdinary, // only record op time with op type key
kInnerOp, // record op detail time with op type key
kUniqueOp, // record op detail time with op unique name key
kSpecial, // record event such as PE which is outer of thread local
};
class Event {
public:
// The DeviceContext is used to get the cuda stream.
// In CPU profiling mode, nullptr can be passed.
Event(EventType type,
std::string name,
uint32_t thread_id,
EventRole role = EventRole::kOrdinary,
std::string attr = "none");
const EventType &type() const;
Event *parent() const { return parent_; }
void set_parent(Event *parent) { parent_ = parent; }
std::string name() const { return name_; }
EventRole role() const { return role_; }
uint64_t thread_id() const { return thread_id_; }
void set_name(std::string name) { name_ = name; }
void set_role(EventRole role) { role_ = role; }
std::string attr() const { return attr_; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifndef PADDLE_WITH_CUPTI
gpuEvent_t event() const { return event_; }
int device() const { return device_; }
#endif
#endif
double CpuElapsedMs(const Event &e) const;
double CudaElapsedMs(const Event &e) const;
private:
EventType type_;
std::string name_{};
Event *parent_{nullptr};
uint64_t thread_id_;
EventRole role_{};
int64_t cpu_ns_;
bool visited_status_{false};
std::string attr_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
public:
void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
gpu_ns_ += end_ns - start_ns;
}
private:
#else
gpuEvent_t event_ = nullptr;
int device_ = -1;
#endif
#endif
};
using EventWithStartNs = std::pair<Event *, uint64_t>;
using ThreadEvents = std::map<uint64_t, EventWithStartNs>;
class MemEvent {
public:
MemEvent(EventType type,
uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
Place place,
int64_t thread_id,
const std::string &annotation)
: type_(type),
start_ns_(start_ns),
end_ns_(end_ns),
bytes_(bytes),
place_(place),
thread_id_(thread_id),
annotation_(annotation) {}
const EventType &type() const { return type_; }
uint64_t start_ns() const { return start_ns_; }
uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; }
Place place() const { return place_; }
uint64_t thread_id() const { return thread_id_; }
const std::string &annotation() const { return annotation_; }
private:
EventType type_;
uint64_t start_ns_ = 0;
uint64_t end_ns_ = 0;
size_t bytes_;
Place place_;
uint64_t thread_id_;
std::string annotation_;
};
class CudaEvent {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
public:
CudaEvent() {
#ifdef PADDLE_WITH_HIP
hipEventCreateWithFlags(&event_, flags_);
#else
cudaEventCreateWithFlags(&event_, flags_);
#endif
VLOG(4) << "CudaEvent " << event_;
}
explicit CudaEvent(unsigned int flags) : flags_(flags) {
#ifdef PADDLE_WITH_HIP
hipEventCreateWithFlags(&event_, flags_);
#else
cudaEventCreateWithFlags(&event_, flags_);
#endif
VLOG(4) << "CudaEvent " << event_;
}
~CudaEvent() {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(event_);
#else
cudaEventDestroy(event_);
#endif
}
void Record(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream));
#endif
}
bool Query() {
#ifdef PADDLE_WITH_HIP
gpuError_t err = hipEventQuery(event_);
if (err == hipSuccess) {
return true;
}
if (err == hipErrorNotReady) {
return false;
}
#else
gpuError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
}
if (err == cudaErrorNotReady) {
return false;
}
#endif
PADDLE_ENFORCE_GPU_SUCCESS(err);
return false;
}
void Synchronize() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
#endif
}
gpuEvent_t GetRawCudaEvent() { return event_; }
private:
#ifdef PADDLE_WITH_HIP
unsigned int flags_ = hipEventDefault;
#else
unsigned int flags_ = cudaEventDefault;
#endif
gpuEvent_t event_;
#endif
};
} // namespace phi
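A minimal usage sketch for CudaEvent, assuming a CUDA or HIP build (otherwise the class body above compiles to an empty shell):

#include "paddle/phi/api/profiler/event.h"

void WaitOnStream(gpuStream_t stream) {  // toy helper, not in the codebase
  phi::CudaEvent marker;   // created with the default flags
  marker.Record(stream);   // enqueue the event on the given stream
  if (!marker.Query()) {   // the GPU has not reached the event yet
    marker.Synchronize();  // block the host until it does
  }
}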
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/phi/api/profiler/event.h"
#include "paddle/phi/api/profiler/trace_event.h"
namespace phi {
// Default tracing level.
// It is recommended to set the level explicitly.
static constexpr uint32_t kDefaultTraceLevel = 4;
// Host event tracing. A trace starts when an object of this class is created
// and stops when the object is destroyed.
// Chrome Trace Viewer Format: Duration Event/Complete Event
class RecordEvent {
public:
static bool IsEnabled();
/**
* @param name: If your string argument has a longer lifetime (e.g.: string
* literal, static variables, etc) than the event, use 'const char* name'.
* Do your best to avoid using 'std::string' as the argument type, since it
* causes a deep copy that hurts performance.
* @param type: Classification which is used to instruct the profiling
* data statistics.
* @param level: Used to filter events, works like glog VLOG(level).
* RecordEvent works only if HostTraceLevel >= level.
*/
explicit RecordEvent(
const std::string& name,
const TracerEventType type = TracerEventType::UserDefined,
uint32_t level = kDefaultTraceLevel,
const EventRole role = EventRole::kOrdinary);
/**
* @param name: It is the caller's responsibility to manage the underlying
* storage. RecordEvent stores the pointer.
* @param type: Classification which is used to instruct the profiling
* data statistics.
* @param level: Used to filter events, works like glog VLOG(level).
* RecordEvent works only if HostTraceLevel >= level.
*/
explicit RecordEvent(
const char* name,
const TracerEventType type = TracerEventType::UserDefined,
uint32_t level = kDefaultTraceLevel,
const EventRole role = EventRole::kOrdinary);
RecordEvent(const std::string& name,
const std::string& attr,
const TracerEventType type = TracerEventType::UserDefined,
uint32_t level = kDefaultTraceLevel,
const EventRole role = EventRole::kOrdinary);
// Stop event tracing explicitly before the object goes out of scope.
// Sometimes it is inconvenient to use RAII.
void End();
~RecordEvent() { End(); }
private:
void OriginalConstruct(const std::string& name,
const EventRole role,
const std::string& attr);
bool is_enabled_{false};
bool is_pushed_{false};
// Event name
std::string* name_{nullptr};
const char* shallow_copy_name_{nullptr};
uint64_t start_ns_;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
// std::string full_name_;
EventRole role_{EventRole::kOrdinary};
TracerEventType type_{TracerEventType::UserDefined};
std::string* attr_{nullptr};
bool finished_{false};
};
} // namespace phi
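A minimal usage sketch: the record is emitted when End() runs, either explicitly or from the destructor, provided the trace level admits the event:

#include "paddle/phi/api/profiler/event_tracing.h"

void TracedSection() {  // toy helper, not in the codebase
  phi::RecordEvent guard("my_section",
                         phi::TracerEventType::UserDefined,
                         phi::kDefaultTraceLevel);
  // ... the work being timed ...
}  // ~RecordEvent() calls End() here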
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/phi/common/thread_data_registry.h"
#include "paddle/phi/core/macros.h"
#include "paddle/phi/core/os_info.h"
namespace phi {
template <typename HeadType, typename... RestTypes>
struct ContainsStdString
: std::conditional_t<
std::is_same<
std::string,
std::remove_cv_t<std::remove_reference_t<HeadType>>>::value,
std::true_type,
ContainsStdString<RestTypes...>> {};
template <typename TailType>
struct ContainsStdString<TailType>
: std::is_same<std::string,
std::remove_cv_t<std::remove_reference_t<TailType>>> {};
template <typename EventType>
class EventContainer {
public:
EventContainer() {
event_blocks_ = cur_event_block_ = new EventBlock;
str_blocks_ = cur_str_block_ = new StringBlock;
}
~EventContainer() {
Reduce();
delete event_blocks_;
for (auto cur = str_blocks_; cur != nullptr;) {
auto next = cur->next;
delete cur;
cur = next;
}
}
DISABLE_COPY_AND_ASSIGN(EventContainer);
public:
// Record an event
template <typename... Args>
void Record(Args &&...args) {
DoRecord(ContainsStdString<Args...>(), std::forward<Args>(args)...);
}
// Get all events and clear the container
std::vector<EventType> Reduce();
// Return a buffer to store the string attribute of Event.
// HostEventRecorder lives in the static data section,
// so it is safe to use an arena to avoid fragmented allocations.
char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); }
private:
struct EventBlock {
union InitDeferedEvent {
InitDeferedEvent() {}
~InitDeferedEvent() {}
EventType event;
};
static constexpr size_t kBlockSize = 1 << 24; // 16 MB
static constexpr size_t kAvailSize =
kBlockSize - sizeof(size_t) - sizeof(nullptr);
static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent);
static constexpr size_t kPadSize =
kAvailSize - kNumEvents * sizeof(InitDeferedEvent);
static constexpr size_t kMinimumEventsPerBlock = 1024;
static_assert(
kNumEvents >= kMinimumEventsPerBlock,
"EventType is too large for kBlockSize, make kBlockSize larger");
size_t offset = 0;
EventBlock *next = nullptr;
InitDeferedEvent events[kNumEvents];
char padding[kPadSize];
};
static_assert(sizeof(EventBlock) == EventBlock::kBlockSize,
"sizeof EventBlock must equal to kBlockSize");
struct StringBlock {
static constexpr size_t kBlockSize = 1 << 22; // 4 MB
static constexpr size_t kAvailSize =
kBlockSize - sizeof(size_t) - sizeof(nullptr);
size_t offset = 0;
StringBlock *next = nullptr;
char storage[kAvailSize];
};
static_assert(sizeof(StringBlock) == StringBlock::kBlockSize,
"sizeof StringBlock must equal to kBlockSize");
// Record an event with string arguments
template <typename... Args>
void DoRecord(std::true_type, Args &&...args) {
auto *storage = GetEventStorage();
std::function<void *(size_t)> allocator = [this](size_t size) {
return GetStrBufFromArena(size);
};
new (storage) EventType(allocator, std::forward<Args>(args)...);
}
// Record an event without any string argument
template <typename... Args>
void DoRecord(std::false_type, Args &&...args) {
auto *storage = GetEventStorage();
new (storage) EventType(std::forward<Args>(args)...);
}
EventType *GetEventStorage();
char *GetStringStorage(size_t sz);
EventBlock *event_blocks_ = nullptr;
EventBlock *cur_event_block_ = nullptr;
StringBlock *str_blocks_ = nullptr;
StringBlock *cur_str_block_ = nullptr;
};
template <typename EventType>
std::vector<EventType> EventContainer<EventType>::Reduce() {
std::vector<EventType> all_events;
size_t event_cnt = 0;
for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) {
event_cnt += cur->offset;
}
all_events.reserve(event_cnt);
for (auto cur = event_blocks_; cur != nullptr;) {
for (size_t i = 0; i < cur->offset; ++i) {
all_events.emplace_back(cur->events[i].event);
}
auto next = cur->next;
delete cur;
cur = next;
}
event_blocks_ = cur_event_block_ = new EventBlock;
return all_events;
}
template <typename EventType>
EventType *EventContainer<EventType>::GetEventStorage() {
if (UNLIKELY(cur_event_block_->offset >=
EventBlock::kNumEvents)) { // another block
cur_event_block_->next = new EventBlock;
cur_event_block_ = cur_event_block_->next;
}
auto &obj = cur_event_block_->events[cur_event_block_->offset].event;
++cur_event_block_->offset;
return &obj;
}
template <typename EventType>
char *EventContainer<EventType>::GetStringStorage(size_t sz) {
if (UNLIKELY(cur_str_block_->offset + sz >
StringBlock::kAvailSize)) { // another block
cur_str_block_->next = new StringBlock;
cur_str_block_ = cur_str_block_->next;
}
char *storage = cur_str_block_->storage + cur_str_block_->offset;
cur_str_block_->offset += sz;
return storage;
}
template <typename EventType>
struct ThreadEventSection {
std::string thread_name;
uint64_t thread_id;
std::vector<EventType> events;
};
template <typename EventType>
class ThreadEventRecorder {
public:
ThreadEventRecorder() {
thread_id_ = GetCurrentThreadSysId();
thread_name_ = GetCurrentThreadName();
}
DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);
public:
// Forward call to EventContainer::Record
template <typename... Args>
void RecordEvent(Args &&...args) {
base_evt_cntr_.Record(std::forward<Args>(args)...);
}
ThreadEventSection<EventType> GatherEvents() {
ThreadEventSection<EventType> thr_sec;
thr_sec.thread_name = thread_name_;
thr_sec.thread_id = thread_id_;
thr_sec.events = std::move(base_evt_cntr_.Reduce());
return thr_sec;
}
private:
uint64_t thread_id_;
std::string thread_name_;
EventContainer<EventType> base_evt_cntr_;
};
template <typename EventType>
struct HostEventSection {
std::string process_name;
uint64_t process_id;
std::vector<ThreadEventSection<EventType>> thr_sections;
};
template <typename EventType>
class HostEventRecorder {
public:
// singleton
static HostEventRecorder &GetInstance() {
static HostEventRecorder instance;
return instance;
}
// thread-safe
// If your string argument has a longer lifetime than the Event,
// use 'const char*'. e.g.: string literal, op name, etc.
// Do your best to avoid using 'std::string' as the argument type.
// It causes a deep copy that hurts performance.
template <typename... Args>
void RecordEvent(Args &&...args) {
// Get the thread-local ThreadEventRecorder; if it does not exist, create
// a new one. Both HostEventRecorder and the thread-local variable in
// ThreadEventRecorderRegistry keep the shared pointer. We add this to
// prevent the ThreadEventRecorder from being destroyed by the thread-local
// variable in ThreadEventRecorderRegistry and losing data.
if (GetThreadLocalRecorder()->get() == nullptr) {
std::shared_ptr<ThreadEventRecorder<EventType>>
thread_event_recorder_ptr =
std::make_shared<ThreadEventRecorder<EventType>>();
*(GetThreadLocalRecorder()) = thread_event_recorder_ptr;
thr_recorders_.push_back(thread_event_recorder_ptr);
}
(*GetThreadLocalRecorder())->RecordEvent(std::forward<Args>(args)...);
}
// Thread-unsafe; make sure no tracing is running when calling it.
// Poor performance; call it at the end.
HostEventSection<EventType> GatherEvents() {
HostEventSection<EventType> host_sec;
host_sec.process_id = GetProcessId();
host_sec.thr_sections.reserve(thr_recorders_.size());
for (auto &v : thr_recorders_) {
host_sec.thr_sections.emplace_back(std::move(v->GatherEvents()));
}
return host_sec;
}
private:
using ThreadEventRecorderRegistry =
phi::ThreadDataRegistry<std::shared_ptr<ThreadEventRecorder<EventType>>>;
HostEventRecorder() = default;
DISABLE_COPY_AND_ASSIGN(HostEventRecorder);
std::shared_ptr<ThreadEventRecorder<EventType>> *GetThreadLocalRecorder() {
return ThreadEventRecorderRegistry::GetInstance()
.GetMutableCurrentThreadData();
}
// Hold all thread-local ThreadEventRecorders
// ThreadEventRecorderRegistry and HostEventRecorder both take care of this
// shared pointer. We add this to prevent ThreadEventRecorder being destroyed
// by thread-local variable in ThreadEventRecorderRegistry and lose data.
std::vector<std::shared_ptr<ThreadEventRecorder<EventType>>> thr_recorders_;
};
} // namespace phi
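A minimal sketch of the recorder specialized for phi::CommonEvent; a const char* name keeps the call on the no-arena path, since no std::string appears among the arguments (function name and values are illustrative):

#include "paddle/phi/api/profiler/common_event.h"
#include "paddle/phi/api/profiler/host_event_recorder.h"

void RecordAndGather() {  // toy helper, not in the codebase
  auto &recorder = phi::HostEventRecorder<phi::CommonEvent>::GetInstance();
  recorder.RecordEvent("toy_op", /*start_ns=*/uint64_t{0},
                       /*end_ns=*/uint64_t{100},
                       phi::EventRole::kOrdinary,
                       phi::TracerEventType::UserDefined);
  // Call only after tracing has stopped; this drains every thread section.
  phi::HostEventSection<phi::CommonEvent> section = recorder.GatherEvents();
  (void)section;
}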
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace phi {
class HostTraceLevel {
public:
static constexpr int64_t kDisabled = -1;
static HostTraceLevel& GetInstance() {
static HostTraceLevel instance;
return instance;
}
bool NeedTrace(uint32_t level) {
return trace_level_ >= static_cast<int64_t>(level);
}
void SetLevel(int64_t trace_level) { trace_level_ = trace_level; }
private:
// Verbose trace level, works like VLOG(level)
int64_t trace_level_ = kDisabled;
};
struct HostTracerOptions {
uint32_t trace_level = 0;
};
} // namespace phi
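A minimal sketch of the level filter above; RecordEvent consults it before emitting anything:

#include "paddle/phi/api/profiler/host_tracer.h"

void ConfigureTraceLevel() {  // toy helper, not in the codebase
  auto &level = phi::HostTraceLevel::GetInstance();
  level.SetLevel(4);                   // admit events up to level 4
  bool trace_l4 = level.NeedTrace(4);  // true:  4 >= 4
  bool trace_l5 = level.NeedTrace(5);  // false: 4 <  5
  (void)trace_l4;
  (void)trace_l5;
}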
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/api/profiler/profiler.h"
#include <mutex> // NOLINT
#include <random>
#include <sstream>
#include <string>
#include <type_traits>
#include "paddle/phi/api/profiler/common_event.h"
#include "paddle/phi/api/profiler/device_tracer.h"
#include "paddle/phi/api/profiler/host_event_recorder.h"
#include "paddle/phi/api/profiler/host_tracer.h"
#include "paddle/phi/api/profiler/profiler_helper.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/os_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/nvtx.h"
#endif
DEFINE_bool(enable_host_event_recorder_hook,
false,
"enable HostEventRecorder, hook Profiler");
namespace phi {
ProfilerState ProfilerHelper::g_state = ProfilerState::kDisabled;
bool ProfilerHelper::g_enable_nvprof_hook = false;
thread_local uint64_t ProfilerHelper::g_thread_id;
uint32_t ProfilerHelper::g_next_thread_id = 0;
std::mutex ProfilerHelper::g_all_event_lists_mutex;
std::list<std::shared_ptr<EventList<Event>>> ProfilerHelper::g_all_event_lists;
thread_local std::shared_ptr<EventList<Event>> ProfilerHelper::g_event_list;
std::list<std::shared_ptr<EventList<MemEvent>>>
ProfilerHelper::g_all_mem_event_lists;
thread_local std::shared_ptr<EventList<MemEvent>>
ProfilerHelper::g_mem_event_list;
std::mutex ProfilerHelper::g_all_mem_event_lists_mutex;
Event::Event(EventType type,
std::string name,
uint32_t thread_id,
EventRole role,
std::string attr)
: type_(type),
name_(name),
thread_id_(thread_id),
role_(role),
attr_(attr) {
cpu_ns_ = GetTimeInNsec();
}
const EventType &Event::type() const { return type_; }
double Event::CpuElapsedMs(const Event &e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
}
double Event::CudaElapsedMs(const Event &e) const {
#ifdef PADDLE_WITH_CUPTI
return gpu_ns_ / 1000000.0;
#else
LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
return 0;
#endif
}
Event *PushEvent(const std::string &name,
const EventRole role,
std::string attr) {
return GetEventList().Record(
EventType::kPushRange, name, ProfilerHelper::g_thread_id, role, attr);
}
void PopEvent(const std::string &name, const EventRole role, std::string attr) {
GetEventList().Record(
EventType::kPopRange, name, ProfilerHelper::g_thread_id, role, attr);
}
RecordEvent::RecordEvent(const char *name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (ProfilerHelper::g_enable_nvprof_hook) {
dynload::nvtxRangePushA(name);
is_pushed_ = true;
}
#endif
#endif
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
}
if (FLAGS_enable_host_event_recorder_hook == false) {
if (ProfilerHelper::g_state !=
ProfilerState::kDisabled) { // avoid temp string
if (type == TracerEventType::Operator ||
type == TracerEventType::OperatorInner ||
type == TracerEventType::UserDefined) {
OriginalConstruct(name, role, "none");
}
}
return;
}
is_enabled_ = true;
shallow_copy_name_ = name;
role_ = role;
type_ = type;
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (ProfilerHelper::g_enable_nvprof_hook) {
dynload::nvtxRangePushA(name.c_str());
is_pushed_ = true;
}
#endif
#endif
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
}
if (FLAGS_enable_host_event_recorder_hook == false) {
if (type == TracerEventType::Operator ||
type == TracerEventType::OperatorInner ||
type == TracerEventType::UserDefined) {
OriginalConstruct(name, role, "none");
}
return;
}
is_enabled_ = true;
name_ = new std::string(name);
role_ = role;
type_ = type;
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name,
const std::string &attr,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (ProfilerHelper::g_enable_nvprof_hook) {
dynload::nvtxRangePushA(name.c_str());
is_pushed_ = true;
}
#endif
#endif
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
}
if (FLAGS_enable_host_event_recorder_hook == false) {
if (type == TracerEventType::Operator ||
type == TracerEventType::OperatorInner ||
type == TracerEventType::UserDefined) {
OriginalConstruct(name, role, attr);
}
return;
}
is_enabled_ = true;
type_ = type;
name_ = new std::string(name);
start_ns_ = PosixInNsec();
attr_ = new std::string(attr);
}
void RecordEvent::OriginalConstruct(const std::string &name,
const EventRole role,
const std::string &attr) {
if (ProfilerHelper::g_state == ProfilerState::kDisabled || name.empty())
return;
// do some initialization
name_ = new std::string(name);
start_ns_ = PosixInNsec();
role_ = role;
attr_ = new std::string(attr);
is_enabled_ = true;
// lock is not needed, the code below is thread-safe
// Maybe need the same push/pop behavior.
Event *e = PushEvent(name, role, attr);
SetCurAnnotation(e);
*name_ = e->name();
}
void RecordEvent::End() {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (ProfilerHelper::g_enable_nvprof_hook && is_pushed_) {
dynload::nvtxRangePop();
is_pushed_ = false;
}
#endif
#endif
if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) {
uint64_t end_ns = PosixInNsec();
if (LIKELY(shallow_copy_name_ != nullptr)) {
HostEventRecorder<CommonEvent>::GetInstance().RecordEvent(
shallow_copy_name_, start_ns_, end_ns, role_, type_);
} else if (name_ != nullptr) {
if (attr_ == nullptr) {
HostEventRecorder<CommonEvent>::GetInstance().RecordEvent(
*name_, start_ns_, end_ns, role_, type_);
} else {
HostEventRecorder<CommonEvent>::GetInstance().RecordEvent(
*name_, start_ns_, end_ns, role_, type_, *attr_);
delete attr_;
}
delete name_;
}
// use this flag to avoid double End();
is_enabled_ = false;
return;
}
if (ProfilerHelper::g_state == ProfilerState::kDisabled || !is_enabled_)
return;
// lock is not needed, the code below is thread-safe
DeviceTracer *tracer = GetDeviceTracer();
if (tracer) {
uint64_t end_ns = PosixInNsec();
tracer->AddCPURecords(CurAnnotationName(),
start_ns_,
end_ns,
BlockDepth(),
ProfilerHelper::g_thread_id);
}
ClearCurAnnotation();
PopEvent(*name_, role_);
delete name_;
delete attr_;
// use this flag to avoid double End();
is_enabled_ = false;
}
bool RecordEvent::IsEnabled() {
return FLAGS_enable_host_event_recorder_hook ||
ProfilerHelper::g_enable_nvprof_hook ||
ProfilerHelper::g_state != ProfilerState::kDisabled;
}
} // namespace phi
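A minimal end-to-end sketch of the recorder path implemented above: the trace level must admit the event and the gflag must be on before End() runs (the settings shown are illustrative, not defaults):

#include "gflags/gflags.h"

#include "paddle/phi/api/profiler/event_tracing.h"
#include "paddle/phi/api/profiler/host_tracer.h"

DECLARE_bool(enable_host_event_recorder_hook);

void RecordViaHostRecorder() {  // toy helper, not in the codebase
  phi::HostTraceLevel::GetInstance().SetLevel(phi::kDefaultTraceLevel);
  FLAGS_enable_host_event_recorder_hook = true;  // take the recorder path
  phi::RecordEvent evt("toy_section", phi::TracerEventType::UserDefined,
                       phi::kDefaultTraceLevel);
  evt.End();  // emits a CommonEvent into HostEventRecorder<CommonEvent>
}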
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <forward_list>
#include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/phi/api/profiler/event_tracing.h"
DECLARE_bool(enable_host_event_recorder_hook);
namespace phi {
enum class ProfilerState {
kDisabled, // disabled state
kCPU, // CPU profiling state
kCUDA, // GPU profiling state
kAll, // Profile both CPU and GPU. (Currently experimental).
};
// It is the flag that controls how the profiling result is printed.
enum class TracerOption {
kDefault, // print the different op type profiling result
kOpDetail, // print the detail profiling result of different op type
kAllOpDetail, // print the detail profiling result of different op name
};
template <typename T>
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(T);
constexpr static size_t kEventAlign = alignof(T);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
T* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<T>> event_blocks;
};
Event* PushEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
void PopEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
} // namespace phi
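A minimal sketch of the block-allocated EventList above, instantiated with phi::Event as the profiler does:

#include <string>
#include <vector>

#include "paddle/phi/api/profiler/profiler.h"

void UseEventList() {  // toy helper, not in the codebase
  phi::EventList<phi::Event> list;
  // Record() constructs the Event in place inside the current block.
  list.Record(phi::EventType::kPushRange, std::string("op"),
              /*thread_id=*/0u, phi::EventRole::kOrdinary,
              std::string("none"));
  std::vector<phi::Event> events = list.Reduce();  // drains all blocks
  (void)events;
}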
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
package phi.proto;
message MemCopy { optional uint64 bytes = 1; }
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iomanip>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <random>
#include <set>
#include <stack>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/phi/api/profiler/device_tracer.h"
namespace phi {
class ProfilerHelper {
public:
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state;
// To hook RecordEvent's events, use it to nvtx timeline
static bool g_enable_nvprof_hook;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local uint64_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id
// and g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList<Event>> g_event_list;
static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
static std::mutex g_all_mem_event_lists_mutex;
};
inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;
return std::chrono::duration_cast<std::chrono::nanoseconds>(
clock::now().time_since_epoch())
.count();
}
inline EventList<Event> &GetEventList() {
if (!ProfilerHelper::g_event_list) {
std::lock_guard<std::mutex> guard(ProfilerHelper::g_all_event_lists_mutex);
ProfilerHelper::g_event_list = std::make_shared<EventList<Event>>();
ProfilerHelper::g_thread_id = ProfilerHelper::g_next_thread_id++;
ProfilerHelper::g_all_event_lists.emplace_front(
ProfilerHelper::g_event_list);
RecoreCurThreadId(ProfilerHelper::g_thread_id);
}
return *ProfilerHelper::g_event_list;
}
} // namespace phi
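A minimal sketch mirroring what PushEvent/PopEvent in profiler.cc do with the helpers above:

#include <string>

#include "paddle/phi/api/profiler/profiler_helper.h"

void MarkToyRange() {  // toy helper, not in the codebase
  // The first call lazily creates the thread-local list, assigns
  // ProfilerHelper::g_thread_id, and registers the thread id mapping.
  phi::EventList<phi::Event> &list = phi::GetEventList();
  list.Record(phi::EventType::kPushRange, std::string("toy_range"),
              phi::ProfilerHelper::g_thread_id, phi::EventRole::kOrdinary,
              std::string("none"));
  list.Record(phi::EventType::kPopRange, std::string("toy_range"),
              phi::ProfilerHelper::g_thread_id, phi::EventRole::kOrdinary,
              std::string("none"));
}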
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <string>
#include <vector>
namespace phi {
enum class TracerEventType {
// Used to mark operator record
Operator = 0,
// Used to mark dataloader record
Dataloader = 1,
// Used to mark profile step record
ProfileStep = 2,
// Used to mark cuda runtime record returned by cupti
CudaRuntime = 3,
// Used to mark kernel computation record returned by cupti
Kernel = 4,
// Used to mark memcpy record returned by cupti
Memcpy = 5,
// Used to mark memset record returned by cupti
Memset = 6,
// Used to mark record defined by user
UserDefined = 7,
// Used to mark operator detail, (such as infer shape, compute)
OperatorInner = 8,
// Used to mark model training or testing perspective, forward process
Forward = 9,
// Used to mark model training perspective, backward process
Backward = 10,
// Used to mark model training perspective, optimization process
Optimization = 11,
// Used to mark distributed training perspective
Communication = 12,
// Used to mark python api
PythonOp = 13,
// Used to mark python level userdefined
PythonUserDefined = 14,
// Used to mark mlu runtime record returned by cnpapi
MluRuntime = 15,
// A flag to denote the number of current types
NumTypes
};
enum class TracerMemEventType {
// Used to mark memory allocation which is managed by paddle
Allocate = 0,
// Used to mark memory free which is managed by paddle
Free = 1,
// Used to mark reserved memory allocation which is applied from device.
ReservedAllocate = 2,
// Used to mark reserved memory free which is released to device.
ReservedFree = 3,
// A flag to denote the number of current types
NumTypes
};
struct KernelEventInfo {
// The X-dimension block size for the kernel.
uint32_t block_x;
// The Y-dimension block size for the kernel.
uint32_t block_y;
// The Z-dimension block size for the kernel.
uint32_t block_z;
// X-dimension of a grid.
uint32_t grid_x;
// Y-dimension of a grid.
uint32_t grid_y;
// Z-dimension of a grid.
uint32_t grid_z;
// The dynamic shared memory reserved for the kernel, in bytes.
uint32_t dynamic_shared_memory;
// The static shared memory allocated for the kernel, in bytes.
uint32_t static_shared_memory;
// The number of registers required for each thread executing the kernel.
uint32_t registers_per_thread;
// The amount of local memory reserved for each thread, in bytes.
uint32_t local_memory_per_thread;
// The total amount of local memory reserved for the kernel, in bytes.
uint32_t local_memory_total;
// The timestamp when the kernel is queued up in the command buffer, in ns.
// This timestamp is not collected by default. Use API
// cuptiActivityEnableLatencyTimestamps() to enable collection.
uint64_t queued;
// The timestamp when the command buffer containing the kernel launch is
// submitted to the GPU, in ns.
// This timestamp is not collected by default. Use API
// cuptiActivityEnableLatencyTimestamps() to enable collection.
uint64_t submitted;
// The completed timestamp for the kernel execution, in ns.
uint64_t completed;
float blocks_per_sm;
float warps_per_sm;
// theoretical achieved occupancy
float occupancy;
};
static constexpr size_t kMemKindMaxLen = 50;
struct MemcpyEventInfo {
// The number of bytes transferred by the memory copy.
uint64_t num_bytes;
// The kind of the memory copy.
// Each kind represents the source and destination targets of a memory copy.
// Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind
char copy_kind[kMemKindMaxLen];
// The source memory kind read by the memory copy.
// Each kind represents the type of the memory accessed by a memory
// operation/copy. Refer to CUpti_ActivityMemoryKind
char src_kind[kMemKindMaxLen];
  // The destination memory kind written by the memory copy.
char dst_kind[kMemKindMaxLen];
};
struct MemsetEventInfo {
// The number of bytes being set by the memory set.
uint64_t num_bytes;
// The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind
char memory_kind[kMemKindMaxLen];
  // The value being assigned to memory by the memory set.
uint32_t value;
};
struct HostTraceEvent {
HostTraceEvent() = default;
HostTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
process_id(process_id),
thread_id(thread_id) {}
// record name
std::string name;
// record type, one of TracerEventType
TracerEventType type;
// start timestamp of the record
uint64_t start_ns;
// end timestamp of the record
uint64_t end_ns;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
};
struct RuntimeTraceEvent {
RuntimeTraceEvent() = default;
RuntimeTraceEvent(const std::string& name,
uint64_t start_ns,
uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id,
uint32_t correlation_id,
uint32_t callback_id)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
process_id(process_id),
thread_id(thread_id),
correlation_id(correlation_id),
callback_id(callback_id) {}
// record name
std::string name;
// record type, one of TracerEventType
TracerEventType type{TracerEventType::CudaRuntime};
// start timestamp of the record
uint64_t start_ns;
// end timestamp of the record
uint64_t end_ns;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
  // correlation id, used to correlate async activities that happen on the device
  uint32_t correlation_id;
  // callback id, used to identify which CUDA runtime API is called
  uint32_t callback_id;
};
struct DeviceTraceEvent {
DeviceTraceEvent() = default;
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const KernelEventInfo& kernel_info)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
device_id(device_id),
context_id(context_id),
stream_id(stream_id),
correlation_id(correlation_id),
kernel_info(kernel_info) {}
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemcpyEventInfo& memcpy_info)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
device_id(device_id),
context_id(context_id),
stream_id(stream_id),
correlation_id(correlation_id),
memcpy_info(memcpy_info) {}
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemsetEventInfo& memset_info)
: name(name),
type(type),
start_ns(start_ns),
end_ns(end_ns),
device_id(device_id),
context_id(context_id),
stream_id(stream_id),
correlation_id(correlation_id),
memset_info(memset_info) {}
// record name
std::string name;
// record type, one of TracerEventType
TracerEventType type;
// start timestamp of the record
uint64_t start_ns;
// end timestamp of the record
uint64_t end_ns;
// device id
uint64_t device_id;
// context id
uint64_t context_id;
// stream id
uint64_t stream_id;
  // correlation id, used to correlate async activities that happen on the device
  uint32_t correlation_id;
// union, specific device record type has different detail information
union {
// used for TracerEventType::Kernel
KernelEventInfo kernel_info;
// used for TracerEventType::Memcpy
MemcpyEventInfo memcpy_info;
// used for TracerEventType::Memset
MemsetEventInfo memset_info;
};
};
struct MemTraceEvent {
MemTraceEvent() = default;
MemTraceEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
uint64_t process_id,
uint64_t thread_id,
int64_t increase_bytes,
const std::string& place,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
process_id(process_id),
thread_id(thread_id),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved),
peak_allocated(peak_allocated),
peak_reserved(peak_reserved) {}
// timestamp of the record
uint64_t timestamp_ns;
// memory addr of allocation or free
uint64_t addr;
// memory manipulation type
TracerMemEventType type;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
  // bytes changed by this operation: positive for an allocation, negative
  // for a free
  int64_t increase_bytes;
  // device place where the memory resides
  std::string place;
// current total allocated memory
uint64_t current_allocated;
// current total reserved memory
uint64_t current_reserved;
  // peak allocated memory so far
  uint64_t peak_allocated;
  // peak reserved memory so far
  uint64_t peak_reserved;
};
} // namespace phi
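// Not part of this commit: a minimal sketch of how a tracer backend might
// populate these structs. The function name and all values are hypothetical.
#include <cstdint>
#include "paddle/phi/api/profiler/trace_event.h"

phi::DeviceTraceEvent MakeKernelEvent(uint64_t start_ns, uint64_t end_ns) {
  phi::KernelEventInfo info{};  // zero-initialize all fields
  info.block_x = 256;
  info.block_y = info.block_z = 1;
  info.grid_x = 1024;
  info.grid_y = info.grid_z = 1;
  info.occupancy = 0.75f;
  return phi::DeviceTraceEvent("my_kernel",
                               phi::TracerEventType::Kernel,
                               start_ns,
                               end_ns,
                               /*device_id=*/0,
                               /*context_id=*/1,
                               /*stream_id=*/7,
                               /*correlation_id=*/42,
                               info);
}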
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <list>
#include <string>
#include <unordered_map>
#include <utility>
#include "paddle/phi/api/profiler/trace_event.h"
namespace phi {
class TraceEventCollector {
public:
  void AddHostEvent(HostTraceEvent&& event) {
    host_events_.push_back(std::move(event));
  }
  void AddRuntimeEvent(RuntimeTraceEvent&& event) {
    runtime_events_.push_back(std::move(event));
  }
  void AddDeviceEvent(DeviceTraceEvent&& event) {
    device_events_.push_back(std::move(event));
  }
  void AddThreadName(uint64_t tid, const std::string& name) {
    thread_names_[tid] = name;
  }
  void AddMemEvent(MemTraceEvent&& event) {
    mem_events_.push_back(std::move(event));
  }
const std::list<HostTraceEvent>& HostEvents() const { return host_events_; }
const std::list<RuntimeTraceEvent>& RuntimeEvents() const {
return runtime_events_;
}
const std::list<DeviceTraceEvent>& DeviceEvents() const {
return device_events_;
}
const std::list<MemTraceEvent>& MemEvents() const { return mem_events_; }
const std::unordered_map<uint64_t, std::string>& ThreadNames() const {
return thread_names_;
}
void ClearAll() {
thread_names_.clear();
host_events_.clear();
runtime_events_.clear();
device_events_.clear();
mem_events_.clear();
}
protected:
std::unordered_map<uint64_t, std::string> thread_names_;
std::list<HostTraceEvent> host_events_;
std::list<RuntimeTraceEvent> runtime_events_;
std::list<DeviceTraceEvent> device_events_;
std::list<MemTraceEvent> mem_events_;
};
} // namespace phi
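// Not part of this commit: a sketch of typical collector usage by a tracer.
// The FillCollector name and event values are illustrative.
#include "paddle/phi/api/profiler/trace_event_collector.h"

void FillCollector(phi::TraceEventCollector* collector) {
  collector->AddThreadName(/*tid=*/1, "main");
  collector->AddHostEvent(phi::HostTraceEvent("forward",
                                              phi::TracerEventType::Forward,
                                              /*start_ns=*/100,
                                              /*end_ns=*/200,
                                              /*process_id=*/0,
                                              /*thread_id=*/1));
  for (const auto& evt : collector->HostEvents()) {
    // consume evt.name, evt.start_ns, evt.end_ns ...
    (void)evt;
  }
}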
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/api/profiler/trace_event_collector.h"
namespace phi {
class TracerBase {
public:
// The state machine for a Tracer.
enum class TracerState { UNINITED, READY, STARTED, STOPED };
virtual void PrepareTracing() { state_ = TracerState::READY; }
virtual void StartTracing() = 0;
virtual void StopTracing() = 0;
virtual void CollectTraceData(TraceEventCollector* collector) = 0;
virtual ~TracerBase() {}
protected:
TracerState state_ = TracerState::UNINITED;
};
} // namespace phi
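// Not part of this commit: the smallest possible tracer, to show the intended
// state transitions. The class name is hypothetical.
#include "paddle/phi/api/profiler/trace_event_collector.h"

class NoopTracer : public phi::TracerBase {
 public:
  void StartTracing() override { state_ = TracerState::STARTED; }
  // TracerState::STOPED is spelled as declared in the enum above.
  void StopTracing() override { state_ = TracerState::STOPED; }
  void CollectTraceData(phi::TraceEventCollector* collector) override {
    // A real tracer would convert its backend records into
    // Host/Runtime/Device/Mem events here.
  }
};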
......@@ -1207,9 +1207,9 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
{code_indent} auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend);
{input_tensors}
{output_create}
{code_indent} paddle::platform::RecordEvent *infer_shape_record_event = nullptr;
{code_indent} if(paddle::platform::RecordEvent::IsEnabled()){{
{code_indent} infer_shape_record_event = new paddle::platform::RecordEvent(\"{self.api} infer_meta\", paddle::platform::TracerEventType::OperatorInner, 1);
{code_indent} phi::RecordEvent *infer_shape_record_event = nullptr;
{code_indent} if(phi::RecordEvent::IsEnabled()){{
{code_indent} infer_shape_record_event = new phi::RecordEvent(\"{self.api} infer_meta\", phi::TracerEventType::OperatorInner, 1);
{code_indent} }}
{self.gene_infer_meta(kernel_output_names, code_indent)}
{code_indent} if(infer_shape_record_event != nullptr){{
......@@ -1217,9 +1217,9 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
{code_indent} }}
{code_indent} using kernel_signature = {kernel_signature};
{code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
{code_indent} paddle::platform::RecordEvent* kernel_record_event = nullptr;
{code_indent} if(paddle::platform::RecordEvent::IsEnabled()){{
{code_indent} kernel_record_event = new paddle::platform::RecordEvent(\"{self.api} compute\", paddle::platform::TracerEventType::OperatorInner, 1);
{code_indent} phi::RecordEvent* kernel_record_event = nullptr;
{code_indent} if(phi::RecordEvent::IsEnabled()){{
{code_indent} kernel_record_event = new phi::RecordEvent(\"{self.api} compute\", phi::TracerEventType::OperatorInner, 1);
{code_indent} }}
{code_indent} (*kernel_fn)({kernel_args}, {", ".join(outputs_args)});
{code_indent} if(kernel_record_event != nullptr){{
......
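The generated code above constructs the RecordEvent lazily so nothing is allocated when profiling is disabled. A standalone sketch of the same guard, written with unique_ptr instead of raw new/delete; the constructor signature follows the template above, and the event name is illustrative:

#include <memory>
#include "paddle/phi/api/profiler/event_tracing.h"

void RunWithProfilingGuard() {
  std::unique_ptr<phi::RecordEvent> guard;
  if (phi::RecordEvent::IsEnabled()) {
    guard = std::make_unique<phi::RecordEvent>(
        "my_api compute", phi::TracerEventType::OperatorInner, 1);
  }
  // ... launch the kernel here ...
  guard.reset();  // ends the event if one was created
}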
......@@ -343,7 +343,7 @@ def source_include(header_file_path):
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/infermeta/ternary.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/api/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
DECLARE_bool(conv2d_disable_cudnn);
......
......@@ -286,7 +286,7 @@ def source_include(header_file_path):
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/api/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
DECLARE_bool(conv2d_disable_cudnn);
......
......@@ -52,7 +52,7 @@ def source_include(header_file_path):
#include "paddle/phi/infermeta/sparse/binary.h"
#include "paddle/phi/infermeta/sparse/multiary.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/api/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
DECLARE_int32(low_precision_op_list);
......
......@@ -21,7 +21,6 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/backends/onednn/onednn_helper.h"
#include "paddle/phi/common/data_type.h"
......@@ -393,11 +392,6 @@ class OneDNNHandlerT {
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......@@ -446,11 +440,6 @@ class OneDNNHandlerT {
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......@@ -472,11 +461,6 @@ class OneDNNHandlerT {
auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
dev_ctx_.GetBlob(key_reorder_p));
if (reorder_p != nullptr) {
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......@@ -660,11 +644,6 @@ class OneDNNHandlerNoCachingT {
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......@@ -691,11 +670,6 @@ class OneDNNHandlerNoCachingT {
std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <thread>
#include <type_traits>
#include <unordered_map>
namespace phi {
template <typename T>
class ThreadDataRegistry {
public:
// Singleton
static ThreadDataRegistry& GetInstance() {
static ThreadDataRegistry instance;
return instance;
}
T* GetMutableCurrentThreadData() { return &CurrentThreadData(); }
const T& GetCurrentThreadData() { return CurrentThreadData(); }
template <typename Alias = T,
typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
void SetCurrentThreadData(const T& val) {
CurrentThreadData() = val;
}
  // Returns a snapshot of all threads' data. Make sure no thread is
  // created/destroyed while using it.
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
return impl_->GetAllThreadDataByValue();
}
  // Returns a snapshot of all threads' data. Make sure no thread is
  // created/destroyed while using it.
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
return impl_->GetAllThreadDataByRef();
}
private:
// types
// Lock types
#if defined(__clang__) || defined(__GNUC__) // CLANG or GCC
#ifndef __APPLE__
#if __cplusplus >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif __cplusplus >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
// Special case: macOS. See https://github.com/facebook/react-native/issues/31250
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#elif defined(_MSC_VER) // MSVC
#if _MSVC_LANG >= 201703L
using LockType = std::shared_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
#elif _MSVC_LANG >= 201402L
using LockType = std::shared_timed_mutex;
using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
#else
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
#else // other compilers
using LockType = std::mutex;
using SharedLockGuardType = std::lock_guard<std::mutex>;
#endif
class ThreadDataHolder;
class ThreadDataRegistryImpl {
public:
void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) {
std::lock_guard<LockType> guard(lock_);
tid_map_[tid] = tls_obj;
}
void UnregisterData(uint64_t tid) {
std::lock_guard<LockType> guard(lock_);
tid_map_.erase(tid);
}
template <
typename Alias = T,
typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
std::unordered_map<uint64_t, T> data_copy;
SharedLockGuardType guard(lock_);
data_copy.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_copy.emplace(kv.first, kv.second->GetData());
}
return data_copy;
}
std::unordered_map<uint64_t, std::reference_wrapper<T>>
GetAllThreadDataByRef() {
std::unordered_map<uint64_t, std::reference_wrapper<T>> data_ref;
SharedLockGuardType guard(lock_);
data_ref.reserve(tid_map_.size());
for (auto& kv : tid_map_) {
data_ref.emplace(kv.first, std::ref(kv.second->GetData()));
}
return data_ref;
}
private:
LockType lock_;
std::unordered_map<uint64_t, ThreadDataHolder*> tid_map_; // not owned
};
class ThreadDataHolder {
public:
explicit ThreadDataHolder(
std::shared_ptr<ThreadDataRegistryImpl> registry) {
registry_ = std::move(registry);
tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
registry_->RegisterData(tid_, this);
}
~ThreadDataHolder() { registry_->UnregisterData(tid_); }
T& GetData() { return data_; }
private:
std::shared_ptr<ThreadDataRegistryImpl> registry_;
uint64_t tid_;
T data_;
};
// methods
ThreadDataRegistry() { impl_ = std::make_shared<ThreadDataRegistryImpl>(); }
ThreadDataRegistry(const ThreadDataRegistry&) = delete;
ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete;
T& CurrentThreadData() {
static thread_local ThreadDataHolder thread_data(impl_);
return thread_data.GetData();
}
// data
std::shared_ptr<ThreadDataRegistryImpl> impl_;
};
} // namespace phi
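// Not part of this commit: a sketch of per-thread statistics kept through the
// registry. The CounterRegistry alias and function names are hypothetical.
#include <cstdint>
#include "paddle/phi/common/thread_data_registry.h"

using CounterRegistry = phi::ThreadDataRegistry<int64_t>;

void BumpThreadLocalCounter() {
  // Cheap after the first call: resolves to a thread_local slot.
  ++(*CounterRegistry::GetInstance().GetMutableCurrentThreadData());
}

int64_t SumAllThreads() {
  int64_t total = 0;
  // Snapshot; make sure no thread is created/destroyed meanwhile.
  for (const auto& kv :
       CounterRegistry::GetInstance().GetAllThreadDataByValue()) {
    total += kv.second;
  }
  return total;
}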
......@@ -21,6 +21,11 @@ cc_library(
SRCS enforce.cc
DEPS ${phi_enforce_deps})
cc_library(
phi_os_info
SRCS os_info.cc
DEPS phi_enforce)
if(WITH_XPU)
cc_library(
kernel_factory
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/os_info.h"
#include "paddle/phi/core/os_info.h"
#include <functional>
#include <sstream>
......@@ -28,14 +28,12 @@ limitations under the License. */
#include <unistd.h>
#endif
#include "glog/logging.h"
#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN
#include "paddle/phi/common/thread_data_registry.h"
namespace paddle {
namespace platform {
namespace phi {
namespace internal {
using framework::ThreadDataRegistry;
using phi::ThreadDataRegistry;
class InternalThreadId {
public:
......@@ -128,5 +126,4 @@ uint32_t GetProcessId() {
#endif
}
} // namespace platform
} // namespace paddle
} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <unordered_map>
#ifdef _POSIX_C_SOURCE
#include <time.h>
#endif
#include "paddle/phi/backends/dynload/port.h"
namespace phi {
// Get system-wide realtime clock in nanoseconds
inline uint64_t PosixInNsec() {
#ifdef _POSIX_C_SOURCE
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return tp.tv_sec * 1000 * 1000 * 1000 + tp.tv_nsec;
#else
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
#endif
}
// All kinds of Ids for OS thread
struct ThreadId {
uint64_t std_tid = 0; // std::hash<std::thread::id>
uint64_t sys_tid = 0; // OS-specific, Linux: gettid
uint32_t cupti_tid = 0; // thread_id used by Nvidia CUPTI
};
// Better performance than GetCurrentThreadId
uint64_t GetCurrentThreadStdId();
// Better performance than GetCurrentThreadId
uint64_t GetCurrentThreadSysId();
ThreadId GetCurrentThreadId();
// Returns the map from StdTid to ThreadId, a snapshot of all current threads.
// Make sure no thread is created/destroyed while using it.
std::unordered_map<uint64_t, ThreadId> GetAllThreadIds();
static constexpr const char* kDefaultThreadName = "unnamed";
// Returns kDefaultThreadName if SetCurrentThreadName is never called.
std::string GetCurrentThreadName();
// Returns the map from StdTid to ThreadName, a snapshot of all current threads.
// Make sure no thread is created/destroyed while using it.
std::unordered_map<uint64_t, std::string> GetAllThreadNames();
// Thread name is immutable; only the first call will succeed.
// Returns false on failure.
bool SetCurrentThreadName(const std::string& name);
uint32_t GetProcessId();
} // namespace phi
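// Not part of this commit: a sketch of the timing and thread-naming helpers
// declared above. The workload and variable names are illustrative.
#include <cstdint>
#include "paddle/phi/core/os_info.h"

void TimedWorker() {
  phi::SetCurrentThreadName("worker-0");  // only the first call succeeds
  uint64_t t0 = phi::PosixInNsec();
  // ... workload ...
  uint64_t t1 = phi::PosixInNsec();
  double elapsed_us = static_cast<double>(t1 - t0) / 1000.0;
  (void)elapsed_us;
  uint64_t tid = phi::GetCurrentThreadStdId();  // faster than GetCurrentThreadId
  (void)tid;
}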
......@@ -16,7 +16,6 @@
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/common/bfloat16.h"
......@@ -98,11 +97,6 @@ void TransDataLayoutFromOneDNN(DataLayout in_layout,
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
auto& astream = OneDNNContext::tls().get_stream();
::paddle::platform::RecordEvent record_reorder(
"ext_reorder",
::paddle::platform::TracerEventType::UserDefined,
1,
::paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
astream.wait();
} else {
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/data_layout_transform.h"
......@@ -140,11 +139,6 @@ void ConvGradKernel(const Context& dev_ctx,
diff_weights_memory_p);
{
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream, *diff_weights_memory_p, *reorder_dst_memory_p);
astream.wait();
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/conv_transpose_kernel.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/onednn/onednn_helper.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/core/expect.h"
......@@ -264,11 +263,6 @@ class ConvTransposeOneDNNHandlerT
dev_ctx.SetBlob(key_reorder_p, reorder_p);
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......@@ -290,11 +284,6 @@ class ConvTransposeOneDNNHandlerT
auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
dev_ctx.GetBlob(key_reorder_p));
if (reorder_p != nullptr) {
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
1,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......
......@@ -55,12 +55,6 @@ inline void AddSubNonBroadcast(ReorderOneDNNHandler* reorder_handler,
auto reorder_p =
reorder_handler->AcquireReorder(dst_memory, src_memory, reorder_attr);
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
2,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
OneDNNContext::tls().get_stream(), *src_memory, *dst_memory);
}
......
......@@ -135,7 +135,7 @@ void MatmulKernel(const Context &dev_ctx,
funcs::ExecuteMatmul<T, float>(
dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out);
} else if (is_bfloat16) {
funcs::ExecuteMatmul<T, paddle::platform::bfloat16>(
funcs::ExecuteMatmul<T, phi::dtype::bfloat16>(
dev_ctx, x, y, x_bd_dims, y_bd_dims, transpose_x, transpose_y, out);
} else {
funcs::ExecuteMatmul<T, int8_t>(
......@@ -219,11 +219,6 @@ class MulPrimitiveFactory {
auto &astream = OneDNNContext::tls().get_stream();
{
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
2,
paddle::platform::EventRole::kUniqueOp);
reorder.execute(astream, src_mem, dst_mem);
astream.wait();
}
......@@ -406,11 +401,6 @@ class MulPrimitiveFactory {
auto &astream = OneDNNContext::tls().get_stream();
{
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
2,
paddle::platform::EventRole::kUniqueOp);
reorder.execute(astream, src_mem, dst_mem);
astream.wait();
}
......