未验证 提交 851637fd 编写于 作者: L liutiexing 提交者: GitHub

Make profiler better (#38280)

* add align for WorkQueue

* add spinlock

* merge develop

* merge

* Add EventsWaiter

* Revert "Add EventsWaiter"

This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2.

* update OS info

* split host_event_recorder

* split host_event_recorder

* update

* update

* update

* update

* update

* update

* update
Co-authored-by: Nliutiexing <liutiexing@google.com>
上级 14658d8f
...@@ -46,7 +46,7 @@ IF(WITH_XBYAK) ...@@ -46,7 +46,7 @@ IF(WITH_XBYAK)
ENDIF() ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
cc_library(os_info SRCS os_info.cc DEPS enforce device_tracer) cc_library(os_info SRCS os_info.cc DEPS enforce)
IF(WITH_GPU) IF(WITH_GPU)
nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
...@@ -169,15 +169,16 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer) ...@@ -169,15 +169,16 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto)
cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer)
cc_library(host_event_recorder SRCS host_event_recorder.cc DEPS os_info)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU) if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda) nv_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce dynload_cuda)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM) elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce) hip_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else() else()
cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce) cc_library(profiler SRCS profiler.cc DEPS host_event_recorder os_info device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif() endif()
......
...@@ -348,7 +348,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -348,7 +348,7 @@ class DeviceTracerImpl : public DeviceTracer {
} }
void AddCPURecords(const std::string &anno, uint64_t start_ns, void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) { uint64_t end_ns, int64_t device_id, uint64_t thread_id) {
if (anno.empty()) { if (anno.empty()) {
VLOG(1) << "Empty timeline annotation."; VLOG(1) << "Empty timeline annotation.";
return; return;
...@@ -383,7 +383,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -383,7 +383,7 @@ class DeviceTracerImpl : public DeviceTracer {
void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes, void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &alloc_in, const Place &place, const std::string &alloc_in,
const std::string &free_in, int64_t thread_id) { const std::string &free_in, uint64_t thread_id) {
if (0 == start_ns || 0 == end_ns) { if (0 == start_ns || 0 == end_ns) {
VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
return; return;
......
...@@ -50,7 +50,7 @@ class DeviceTracer { ...@@ -50,7 +50,7 @@ class DeviceTracer {
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
int64_t device_id; int64_t device_id;
int64_t thread_id; uint64_t thread_id;
}; };
struct MemRecord { struct MemRecord {
...@@ -68,7 +68,7 @@ class DeviceTracer { ...@@ -68,7 +68,7 @@ class DeviceTracer {
uint64_t end_ns; uint64_t end_ns;
size_t bytes; size_t bytes;
Place place; Place place;
int64_t thread_id; uint64_t thread_id;
std::string alloc_in; std::string alloc_in;
std::string free_in; std::string free_in;
}; };
...@@ -105,7 +105,7 @@ class DeviceTracer { ...@@ -105,7 +105,7 @@ class DeviceTracer {
virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0; uint64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
uint64_t thread_id, uint64_t thread_id,
...@@ -115,7 +115,7 @@ class DeviceTracer { ...@@ -115,7 +115,7 @@ class DeviceTracer {
size_t bytes, const Place& place, size_t bytes, const Place& place,
const std::string& alloc_in, const std::string& alloc_in,
const std::string& free_in, const std::string& free_in,
int64_t thread_id) = 0; uint64_t thread_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <functional>
#include <map> #include <map>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -45,9 +46,9 @@ class Event { ...@@ -45,9 +46,9 @@ class Event {
Event(EventType type, std::string name, uint32_t thread_id, Event(EventType type, std::string name, uint32_t thread_id,
EventRole role = EventRole::kOrdinary, std::string attr = "none"); EventRole role = EventRole::kOrdinary, std::string attr = "none");
const EventType& type() const; const EventType &type() const;
Event* parent() const { return parent_; } Event *parent() const { return parent_; }
void set_parent(Event* parent) { parent_ = parent; } void set_parent(Event *parent) { parent_ = parent; }
std::string name() const { return name_; } std::string name() const { return name_; }
EventRole role() const { return role_; } EventRole role() const { return role_; }
uint64_t thread_id() const { return thread_id_; } uint64_t thread_id() const { return thread_id_; }
...@@ -61,13 +62,13 @@ class Event { ...@@ -61,13 +62,13 @@ class Event {
#endif #endif
#endif #endif
double CpuElapsedMs(const Event& e) const; double CpuElapsedMs(const Event &e) const;
double CudaElapsedMs(const Event& e) const; double CudaElapsedMs(const Event &e) const;
private: private:
EventType type_; EventType type_;
std::string name_{}; std::string name_{};
Event* parent_{nullptr}; Event *parent_{nullptr};
uint64_t thread_id_; uint64_t thread_id_;
EventRole role_{}; EventRole role_{};
int64_t cpu_ns_; int64_t cpu_ns_;
...@@ -90,13 +91,13 @@ class Event { ...@@ -90,13 +91,13 @@ class Event {
#endif #endif
}; };
using EventWithStartNs = std::pair<Event*, uint64_t>; using EventWithStartNs = std::pair<Event *, uint64_t>;
using ThreadEvents = std::map<uint64_t, EventWithStartNs>; using ThreadEvents = std::map<uint64_t, EventWithStartNs>;
class MemEvent { class MemEvent {
public: public:
MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes, MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
Place place, int64_t thread_id, const std::string& annotation) Place place, int64_t thread_id, const std::string &annotation)
: type_(type), : type_(type),
start_ns_(start_ns), start_ns_(start_ns),
end_ns_(end_ns), end_ns_(end_ns),
...@@ -105,13 +106,13 @@ class MemEvent { ...@@ -105,13 +106,13 @@ class MemEvent {
thread_id_(thread_id), thread_id_(thread_id),
annotation_(annotation) {} annotation_(annotation) {}
const EventType& type() const { return type_; } const EventType &type() const { return type_; }
uint64_t start_ns() const { return start_ns_; } uint64_t start_ns() const { return start_ns_; }
uint64_t end_ns() const { return end_ns_; } uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; } size_t bytes() const { return bytes_; }
Place place() const { return place_; } Place place() const { return place_; }
uint64_t thread_id() const { return thread_id_; } uint64_t thread_id() const { return thread_id_; }
const std::string& annotation() const { return annotation_; } const std::string &annotation() const { return annotation_; }
private: private:
EventType type_; EventType type_;
...@@ -151,7 +152,7 @@ class CudaEvent { ...@@ -151,7 +152,7 @@ class CudaEvent {
#endif #endif
} }
void Record(const paddle::platform::stream::CUDAStream& stream) { void Record(const paddle::platform::stream::CUDAStream &stream) {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
#else #else
...@@ -200,5 +201,39 @@ class CudaEvent { ...@@ -200,5 +201,39 @@ class CudaEvent {
#endif #endif
}; };
// Plain record describing one host-side event: a name, a [start_ns, end_ns]
// interval, a role and an optional attribute string.
//
// `name` and `attr` are raw, non-owning pointers. Depending on the constructor
// used they either alias the caller's string (which must then outlive the
// event) or point into memory handed out by the supplied arena allocator.
struct CommonEvent {
 public:
  // Shallow constructor: stores `name` as-is without copying it.
  CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns,
              EventRole role)
      : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {}

  // Deep-copying constructor: `name_str` and `attr_str` are copied
  // (including the terminating NUL) into buffers obtained from
  // `arena_allocator`. The allocator is taken by const reference, matching
  // the attribute-less overload below, so temporaries can be passed too.
  CommonEvent(const std::function<void *(size_t)> &arena_allocator,
              const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
              EventRole role, const std::string &attr_str)
      : name(CopyToArena(arena_allocator, name_str)),
        start_ns(start_ns),
        end_ns(end_ns),
        role(role),
        attr(CopyToArena(arena_allocator, attr_str)) {}

  // Deep-copying constructor without an attribute; `attr` stays nullptr.
  CommonEvent(const std::function<void *(size_t)> &arena_allocator,
              const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
              EventRole role)
      : name(CopyToArena(arena_allocator, name_str)),
        start_ns(start_ns),
        end_ns(end_ns),
        role(role) {}

  const char *name = nullptr;  // not owned, designed for performance
  uint64_t start_ns = 0;
  uint64_t end_ns = 0;
  EventRole role = EventRole::kOrdinary;
  const char *attr = nullptr;  // not owned, designed for performance

 private:
  // Copies `str` plus its terminating NUL into a buffer of exactly
  // `str.length() + 1` bytes requested from `arena_allocator` and returns
  // that buffer.
  static const char *CopyToArena(
      const std::function<void *(size_t)> &arena_allocator,
      const std::string &str) {
    const size_t len_with_nul = str.length() + 1;
    auto *buf = static_cast<char *>(arena_allocator(len_with_nul));
    strncpy(buf, str.c_str(), len_with_nul);
    return buf;
  }
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/event.h"
namespace paddle {
namespace platform {
// CPU event tracing. A trace marks something that happens but has no duration
// associated with it. For example, thread starts working.
// Chrome Trace Viewer Format: Instant Event
//
// Only the constructor is declared here; the recording itself lives in the
// out-of-line definition, which is not part of this header.
struct RecordInstantEvent {
// name: label of the instant event, passed as a C string.
// role: event role, defaults to EventRole::kOrdinary.
explicit RecordInstantEvent(const char* name,
const EventRole role = EventRole::kOrdinary);
};
// CPU event tracing. A trace starts when an object of this class is created
// and stops when the object is destroyed.
// Chrome Trace Viewer Format: Duration Event/Complete Event
class RecordEvent {
public:
// Starts an event named by a std::string; role defaults to kOrdinary.
explicit RecordEvent(const std::string& name,
const EventRole role = EventRole::kOrdinary);
// Starts an event named by a C string; role defaults to kOrdinary.
// NOTE(review): presumably the pointer is kept as-is (see
// shallow_copy_name_), so the string would have to outlive the event --
// confirm against the definition.
explicit RecordEvent(const char* name,
const EventRole role = EventRole::kOrdinary);
// Starts an event with an explicit role and an extra attribute string.
RecordEvent(const std::string& name, const EventRole role,
const std::string& attr);
// Stop event tracing explicitly before the object goes out of scope.
// Sometimes it's inconvenient to use RAII
void End();
// The destructor always calls End().
~RecordEvent() { End(); }
private:
// Helper declared for the constructors; its definition is not in this header.
void OriginalConstruct(const std::string& name, const EventRole role,
const std::string& attr);
bool is_enabled_{false};
bool is_pushed_{false};
// Event name
std::string* name_{nullptr};
const char* shallow_copy_name_{nullptr};
// Start timestamp; the _ns suffix suggests nanoseconds. It has no default
// initializer here.
uint64_t start_ns_;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
// std::string full_name_;
EventRole role_{EventRole::kOrdinary};
std::string* attr_{nullptr};
bool finished_{false};
};
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/host_event_recorder.h"
#include "paddle/fluid/platform/os_info.h"
namespace paddle {
namespace platform {
// Looks up the calling thread's main tid and registers this recorder under
// that id with the process-wide HostEventRecorder.
ThreadEventRecorder::ThreadEventRecorder() {
  const auto &current_tid = ThreadIdRegistry::GetInstance().CurrentThreadId();
  thread_id_ = current_tid.MainTid();

  HostEventRecorder &host_recorder = HostEventRecorder::GetInstance();
  host_recorder.RegisterThreadRecorder(thread_id_, this);
}
// Drains every registered per-thread recorder and returns one
// ThreadEventSection per recorder.
//
// Only `thr_sections` is filled in here; `process_name` and `process_id` are
// left at their value-initialized defaults.
HostEventSection HostEventRecorder::GatherEvents() {
  // Value-initialize so that process_id is zero instead of indeterminate.
  HostEventSection host_sec{};

  // RegisterThreadRecorder() mutates thread_recorders_ under this mutex, so
  // the map must not be walked without holding it.
  // NOTE(review): this guards the map only; it does not synchronize with a
  // thread that is still recording into its own container.
  const std::lock_guard<std::mutex> guard(thread_recorders_lock_);

  host_sec.thr_sections.reserve(thread_recorders_.size());
  for (auto &kv : thread_recorders_) {
    // GatherEvents() returns a prvalue; wrapping it in std::move adds nothing.
    host_sec.thr_sections.emplace_back(kv.second->GatherEvents());
  }
  // Return by name so copy elision (or an implicit move) can apply.
  return host_sec;
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstring>
#include <mutex>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/platform/event.h"
namespace paddle {
namespace platform {
// Compile-time predicate: derives from std::true_type when at least one type
// in the pack is std::string once references and cv-qualifiers are ignored,
// and from std::false_type otherwise.
//
// Recursive case: check the head and, only if it does not match, defer to the
// remaining types.
template <typename HeadType, typename... RestTypes>
struct ContainsStdString
    : std::conditional_t<
          std::is_same<std::decay_t<HeadType>, std::string>::value,
          std::true_type, ContainsStdString<RestTypes...>> {};

// Base case: a single remaining type decides the result.
template <typename TailType>
struct ContainsStdString<TailType>
    : std::is_same<std::decay_t<TailType>, std::string> {};
// Append-only store for events of type EventType.
//
// Events are constructed in place inside a singly linked chain of fixed-size
// EventBlocks. Strings that events refer to can be copied into a separate
// chain of StringBlocks. Reduce() drains the events; the string chain is only
// released by the destructor.
template <typename EventType>
class EventContainer {
 public:
  EventContainer() {
    event_blocks_ = cur_event_block_ = new EventBlock;
    str_blocks_ = cur_str_block_ = new StringBlock;
  }
  ~EventContainer() {
    // Release both chains directly. Going through Reduce() here would copy
    // every pending event into a vector and allocate a fresh 16 MB block,
    // only to throw both away immediately.
    for (auto *blk = event_blocks_; blk != nullptr;) {
      // Events were created with placement new, so end their lifetime
      // explicitly (a no-op for trivially destructible event types).
      for (size_t i = 0; i < blk->offset; ++i) {
        blk->events[i].event.~EventType();
      }
      auto *next = blk->next;
      delete blk;
      blk = next;
    }
    for (auto *blk = str_blocks_; blk != nullptr;) {
      auto *next = blk->next;
      delete blk;
      blk = next;
    }
  }
  DISABLE_COPY_AND_ASSIGN(EventContainer);

 public:
  // Record an event. The arguments are forwarded to an EventType
  // constructor; when any of them is a std::string, an arena allocator is
  // passed as an extra leading argument (see DoRecord).
  template <typename... Args>
  void Record(Args &&... args) {
    DoRecord(ContainsStdString<Args...>(), std::forward<Args>(args)...);
  }

  // Get all events and clear the container
  std::vector<EventType> Reduce();

  // Return a buffer to store the string attribute of Event.
  // HostEventRecorder locates in the static data section.
  // So it's safe to use arena to avoid fragmented allocations.
  char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); }

 private:
  struct EventBlock {
    // Union wrapper whose constructor and destructor do nothing, so the
    // array below can be created without constructing any EventType.
    union InitDeferedEvent {
      InitDeferedEvent() {}
      ~InitDeferedEvent() {}

      EventType event;
    };

    static constexpr size_t kBlockSize = 1 << 24;  // 16 MB
    // Room left for events after the `offset` and `next` header fields.
    static constexpr size_t kAvailSize =
        kBlockSize - sizeof(size_t) - sizeof(EventBlock *);
    static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent);
    static constexpr size_t kPadSize =
        kAvailSize - kNumEvents * sizeof(InitDeferedEvent);
    static constexpr size_t kMinimumEventsPerBlock = 1024;
    static_assert(
        kNumEvents >= kMinimumEventsPerBlock,
        "EventType is too large for kBlockSize, make kBlockSize larger");

    size_t offset = 0;  // number of slots already handed out
    EventBlock *next = nullptr;
    InitDeferedEvent events[kNumEvents];
    char padding[kPadSize];
  };
  static_assert(sizeof(EventBlock) == EventBlock::kBlockSize,
                "sizeof EventBlock must equal to kBlockSize");

  struct StringBlock {
    static constexpr size_t kBlockSize = 1 << 22;  // 4 MB
    // Room left for characters after the `offset` and `next` header fields.
    static constexpr size_t kAvailSize =
        kBlockSize - sizeof(size_t) - sizeof(StringBlock *);

    size_t offset = 0;  // number of bytes already handed out
    StringBlock *next = nullptr;
    char storage[kAvailSize];
  };
  static_assert(sizeof(StringBlock) == StringBlock::kBlockSize,
                "sizeof StringBlock must equal to kBlockSize");

  // Record an event with string arguments
  template <typename... Args>
  void DoRecord(std::true_type, Args &&... args) {
    auto *storage = GetEventStorage();
    std::function<void *(size_t)> allocator = [this](size_t size) {
      return GetStrBufFromArena(size);
    };
    new (storage) EventType(allocator, std::forward<Args>(args)...);
  }

  // Record an event without any string argument
  template <typename... Args>
  void DoRecord(std::false_type, Args &&... args) {
    auto *storage = GetEventStorage();
    new (storage) EventType(std::forward<Args>(args)...);
  }

  EventType *GetEventStorage();

  char *GetStringStorage(size_t sz);

  EventBlock *event_blocks_ = nullptr;     // head of the event chain
  EventBlock *cur_event_block_ = nullptr;  // block currently being filled
  StringBlock *str_blocks_ = nullptr;      // head of the string chain
  StringBlock *cur_str_block_ = nullptr;   // block currently being filled
};
// Moves every recorded event into a vector, in recording order, and resets
// the container to an empty state.
//
// The head block is kept and reused rather than freed and reallocated; all
// further blocks in the chain are deleted. String storage is not touched.
template <typename EventType>
std::vector<EventType> EventContainer<EventType>::Reduce() {
  size_t event_cnt = 0;
  for (auto *cur = event_blocks_; cur != nullptr; cur = cur->next) {
    event_cnt += cur->offset;
  }

  std::vector<EventType> all_events;
  all_events.reserve(event_cnt);

  EventBlock *const head = event_blocks_;
  for (auto *cur = head; cur != nullptr;) {
    for (size_t i = 0; i < cur->offset; ++i) {
      EventType &evt = cur->events[i].event;
      all_events.emplace_back(std::move(evt));
      // The slot was filled with placement new, so end its lifetime here
      // (a no-op for trivially destructible event types).
      evt.~EventType();
    }
    auto *next = cur->next;
    if (cur != head) {
      delete cur;
    }
    cur = next;
  }

  // Reusing the head avoids a 16 MB free + allocate per call and never leaves
  // event_blocks_ pointing at freed memory if an allocation were to fail.
  head->offset = 0;
  head->next = nullptr;
  cur_event_block_ = head;

  // Return by name so copy elision (or an implicit move) can apply.
  return all_events;
}
// Hands out the next unconstructed event slot, appending a new block to the
// chain first when the current block has no free slot left. The caller is
// expected to construct an EventType in the returned storage.
template <typename EventType>
EventType *EventContainer<EventType>::GetEventStorage() {
  EventBlock *block = cur_event_block_;
  if (UNLIKELY(block->offset >= EventBlock::kNumEvents)) {  // another block
    block = new EventBlock;
    cur_event_block_->next = block;
    cur_event_block_ = block;
  }
  const size_t slot = block->offset;
  block->offset = slot + 1;
  return &block->events[slot].event;
}
// Reserves `sz` bytes in the string arena and returns a pointer to them,
// appending a new block to the chain first when the current block cannot
// hold the request.
// NOTE(review): a request larger than StringBlock::kAvailSize is not handled
// specially -- the fresh block would still be too small. Confirm callers
// never pass such sizes.
template <typename EventType>
char *EventContainer<EventType>::GetStringStorage(size_t sz) {
  StringBlock *block = cur_str_block_;
  const size_t needed_end = block->offset + sz;
  if (UNLIKELY(needed_end > StringBlock::kAvailSize)) {  // another block
    block = new StringBlock;
    cur_str_block_->next = block;
    cur_str_block_ = block;
  }
  char *const storage = block->storage + block->offset;
  block->offset += sz;
  return storage;
}
// Events gathered from one thread, together with that thread's name and id.
struct ThreadEventSection {
  std::string thread_name;
  // Zero-initialized so a default-constructed section never carries an
  // indeterminate id.
  uint64_t thread_id = 0;
  std::vector<CommonEvent> events;
};
// Per-thread event recorder: owns an EventContainer<CommonEvent> and the id
// of the thread it was created on.
// NOTE(review): the constructor is defined out of line and registers `this`
// with HostEventRecorder; no matching unregistration is visible in this
// class -- confirm the recorder cannot be destroyed while still registered.
class ThreadEventRecorder {
 public:
  ThreadEventRecorder();
  DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);

 public:
  // Forward call to EventContainer::Record
  template <typename... Args>
  void RecordEvent(Args &&... args) {
    base_evt_cntr_.Record(std::forward<Args>(args)...);
  }

  // Drains the container and returns its events together with this
  // recorder's thread name and id.
  ThreadEventSection GatherEvents() {
    ThreadEventSection thr_sec;
    thr_sec.thread_name = thread_name_;
    thr_sec.thread_id = thread_id_;
    // Reduce() returns a prvalue, which is move-assigned without std::move.
    thr_sec.events = base_evt_cntr_.Reduce();
    // Return by name so copy elision (or an implicit move) can apply.
    return thr_sec;
  }

 private:
  uint64_t thread_id_ = 0;
  std::string thread_name_;
  EventContainer<CommonEvent> base_evt_cntr_;
};
// Events gathered from every registered thread of the process.
struct HostEventSection {
  std::string process_name;
  // Zero-initialized: the visible HostEventRecorder::GatherEvents() never
  // assigns this field, so without a default it would be returned
  // indeterminate.
  uint64_t process_id = 0;
  std::vector<ThreadEventSection> thr_sections;
};
class HostEventRecorder {
public:
// singleton
static HostEventRecorder &GetInstance() {
static HostEventRecorder instance;
return instance;
}
// If your string argument has a longer lifetime than the Event,
// use 'const char*'. e.g.: string literal, op name, etc.
// Do your best to avoid using 'std::string' as the argument type.
// It will cause deep-copy to harm performance.
template <typename... Args>
void RecordEvent(Args &&... args) {
GetThreadLocalRecorder().RecordEvent(std::forward<Args>(args)...);
}
// Poor performance, call it at the ending
HostEventSection GatherEvents();
void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) {
const std::lock_guard<std::mutex> guard(thread_recorders_lock_);
thread_recorders_[tid] = recorder;
}
private:
HostEventRecorder() = default;
DISABLE_COPY_AND_ASSIGN(HostEventRecorder);
ThreadEventRecorder &GetThreadLocalRecorder() {
static thread_local ThreadEventRecorder tls_recorder;
return tls_recorder;
}
std::mutex thread_recorders_lock_;
std::unordered_map<uint64_t, ThreadEventRecorder *> thread_recorders_;
};
} // namespace platform
} // namespace paddle
...@@ -14,17 +14,32 @@ limitations under the License. */ ...@@ -14,17 +14,32 @@ limitations under the License. */
#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/os_info.h"
#include <sstream> #include <sstream>
#include "paddle/fluid/platform/device_tracer.h" #if defined(__linux__)
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#elif defined(_MSC_VER)
#include <processthreadsapi.h>
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
ThreadId::ThreadId() { ThreadId::ThreadId() {
// C++ std tid
std_tid_ = std::hash<std::thread::id>()(std::this_thread::get_id()); std_tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
// system tid
#if defined(__linux__)
sys_tid_ = syscall(SYS_gettid);
#elif defined(_MSC_VER)
sys_tid_ = GetCurrentThreadId();
#else // unsupported platforms
sys_tid_ = 0;
#endif
// cupti tid
std::stringstream ss; std::stringstream ss;
ss << std::this_thread::get_id(); ss << std::this_thread::get_id();
cupti_tid_ = static_cast<uint32_t>(std::stoull(ss.str())); cupti_tid_ = static_cast<uint32_t>(std::stoull(ss.str()));
RecoreCurThreadId(MainTid()); // For DeviceTracer
} }
ThreadIdRegistry::~ThreadIdRegistry() { ThreadIdRegistry::~ThreadIdRegistry() {
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include <mutex> #include <mutex>
#include <thread> #include <thread>
#include <unordered_map> #include <unordered_map>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h" // import LIKELY
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#ifdef _POSIX_C_SOURCE #ifdef _POSIX_C_SOURCE
#include <time.h> #include <time.h>
...@@ -27,7 +27,7 @@ limitations under the License. */ ...@@ -27,7 +27,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// Get current time in nanoseconds // Get system-wide realtime clock in nanoseconds
inline uint64_t PosixInNsec() { inline uint64_t PosixInNsec() {
#ifdef _POSIX_C_SOURCE #ifdef _POSIX_C_SOURCE
struct timespec tp; struct timespec tp;
...@@ -45,13 +45,13 @@ class ThreadId { ...@@ -45,13 +45,13 @@ class ThreadId {
public: public:
ThreadId(); ThreadId();
uint64_t MainTid() const { return std_tid_; } uint64_t MainTid() const { return SysTid(); }
uint64_t StdTid() const { return std_tid_; } uint64_t StdTid() const { return std_tid_; }
uint32_t CuptiTid() const { return cupti_tid_; } uint32_t CuptiTid() const { return cupti_tid_; }
uint64_t SysTid() const { return sys_tid_; } uint64_t SysTid() const { return sys_tid_ != 0 ? sys_tid_ : std_tid_; }
private: private:
uint64_t std_tid_ = 0; // std::hash<std::thread::id> uint64_t std_tid_ = 0; // std::hash<std::thread::id>
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/host_event_recorder.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler_helper.h" #include "paddle/fluid/platform/profiler_helper.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -36,286 +37,6 @@ DEFINE_bool(enable_host_event_recorder_hook, false, ...@@ -36,286 +37,6 @@ DEFINE_bool(enable_host_event_recorder_hook, false,
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// Plain record describing one host-side duration event: a name, a
// [start_ns, end_ns] interval, a role and an optional attribute string.
//
// `name` and `attr` are raw, non-owning pointers. Depending on the constructor
// used they either alias the caller's string (which must then outlive the
// event) or point into memory handed out by the supplied arena allocator.
struct DurationEvent {
 public:
  // Shallow constructor: stores `name` as-is without copying it.
  DurationEvent(const char *name, uint64_t start_ns, uint64_t end_ns,
                EventRole role)
      : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {}

  // Deep-copying constructor: `name_str` and `attr_str` are copied
  // (including the terminating NUL) into buffers obtained from
  // `arena_allocator`. The allocator is taken by const reference, matching
  // the attribute-less overload below, so temporaries can be passed too.
  DurationEvent(const std::function<void *(size_t)> &arena_allocator,
                const std::string &name_str, uint64_t start_ns,
                uint64_t end_ns, EventRole role, const std::string &attr_str)
      : name(CopyToArena(arena_allocator, name_str)),
        start_ns(start_ns),
        end_ns(end_ns),
        role(role),
        attr(CopyToArena(arena_allocator, attr_str)) {}

  // Deep-copying constructor without an attribute; `attr` stays nullptr.
  DurationEvent(const std::function<void *(size_t)> &arena_allocator,
                const std::string &name_str, uint64_t start_ns,
                uint64_t end_ns, EventRole role)
      : name(CopyToArena(arena_allocator, name_str)),
        start_ns(start_ns),
        end_ns(end_ns),
        role(role) {}

  const char *name = nullptr;  // not owned, designed for performance
  uint64_t start_ns = 0;
  uint64_t end_ns = 0;
  EventRole role = EventRole::kOrdinary;
  const char *attr = nullptr;  // not owned, designed for performance

 private:
  // Copies `str` plus its terminating NUL into a buffer of exactly
  // `str.length() + 1` bytes requested from `arena_allocator` and returns
  // that buffer.
  static const char *CopyToArena(
      const std::function<void *(size_t)> &arena_allocator,
      const std::string &str) {
    const size_t len_with_nul = str.length() + 1;
    auto *buf = static_cast<char *>(arena_allocator(len_with_nul));
    strncpy(buf, str.c_str(), len_with_nul);
    return buf;
  }
};
// Compile-time predicate: derives from std::true_type when at least one type
// in the pack is std::string once references and cv-qualifiers are ignored,
// and from std::false_type otherwise.
//
// Recursive case: check the head and, only if it does not match, defer to the
// remaining types.
template <typename HeadType, typename... RestTypes>
struct ContainsStdString
    : std::conditional_t<
          std::is_same<std::decay_t<HeadType>, std::string>::value,
          std::true_type, ContainsStdString<RestTypes...>> {};

// Base case: a single remaining type decides the result.
template <typename TailType>
struct ContainsStdString<TailType>
    : std::is_same<std::decay_t<TailType>, std::string> {};
// Append-only store for events of type EventType.
//
// Events are constructed in place inside a singly linked chain of fixed-size
// EventBlocks. Strings that events refer to can be copied into a separate
// chain of StringBlocks. Reduce() drains the events; the string chain is only
// released by the destructor.
template <typename EventType>
class EventContainer {
 public:
  EventContainer() {
    event_blocks_ = cur_event_block_ = new EventBlock;
    str_blocks_ = cur_str_block_ = new StringBlock;
  }
  ~EventContainer() {
    // Release both chains directly. Going through Reduce() here would copy
    // every pending event into a vector and allocate a fresh 16 MB block,
    // only to throw both away immediately.
    for (auto *blk = event_blocks_; blk != nullptr;) {
      // Events were created with placement new, so end their lifetime
      // explicitly (a no-op for trivially destructible event types).
      for (size_t i = 0; i < blk->offset; ++i) {
        blk->events[i].event.~EventType();
      }
      auto *next = blk->next;
      delete blk;
      blk = next;
    }
    for (auto *blk = str_blocks_; blk != nullptr;) {
      auto *next = blk->next;
      delete blk;
      blk = next;
    }
  }
  DISABLE_COPY_AND_ASSIGN(EventContainer);

 public:
  // Record an event. The arguments are forwarded to an EventType
  // constructor; when any of them is a std::string, an arena allocator is
  // passed as an extra leading argument (see DoRecord).
  template <typename... Args>
  void Record(Args &&... args) {
    DoRecord(ContainsStdString<Args...>(), std::forward<Args>(args)...);
  }

  // Get all events and clear the container
  std::vector<EventType> Reduce();

  // Return a buffer to store the string attribute of Event.
  // HostEventRecorder locates in the static data section.
  // So it's safe to use arena to avoid fragmented allocations.
  char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); }

 private:
  struct EventBlock {
    // Union wrapper whose constructor and destructor do nothing, so the
    // array below can be created without constructing any EventType.
    union InitDeferedEvent {
      InitDeferedEvent() {}
      ~InitDeferedEvent() {}

      EventType event;
    };

    static constexpr size_t kBlockSize = 1 << 24;  // 16 MB
    // Room left for events after the `offset` and `next` header fields.
    static constexpr size_t kAvailSize =
        kBlockSize - sizeof(size_t) - sizeof(EventBlock *);
    static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent);
    static constexpr size_t kPadSize =
        kAvailSize - kNumEvents * sizeof(InitDeferedEvent);
    static constexpr size_t kMinimumEventsPerBlock = 1024;
    static_assert(
        kNumEvents >= kMinimumEventsPerBlock,
        "EventType is too large for kBlockSize, make kBlockSize larger");

    size_t offset = 0;  // number of slots already handed out
    EventBlock *next = nullptr;
    InitDeferedEvent events[kNumEvents];
    char padding[kPadSize];
  };
  static_assert(sizeof(EventBlock) == EventBlock::kBlockSize,
                "sizeof EventBlock must equal to kBlockSize");

  struct StringBlock {
    static constexpr size_t kBlockSize = 1 << 22;  // 4 MB
    // Room left for characters after the `offset` and `next` header fields.
    static constexpr size_t kAvailSize =
        kBlockSize - sizeof(size_t) - sizeof(StringBlock *);

    size_t offset = 0;  // number of bytes already handed out
    StringBlock *next = nullptr;
    char storage[kAvailSize];
  };
  static_assert(sizeof(StringBlock) == StringBlock::kBlockSize,
                "sizeof StringBlock must equal to kBlockSize");

  // Record an event with string arguments
  template <typename... Args>
  void DoRecord(std::true_type, Args &&... args) {
    auto *storage = GetEventStorage();
    std::function<void *(size_t)> allocator = [this](size_t size) {
      return GetStrBufFromArena(size);
    };
    new (storage) EventType(allocator, std::forward<Args>(args)...);
  }

  // Record an event without any string argument
  template <typename... Args>
  void DoRecord(std::false_type, Args &&... args) {
    auto *storage = GetEventStorage();
    new (storage) EventType(std::forward<Args>(args)...);
  }

  EventType *GetEventStorage();

  char *GetStringStorage(size_t sz);

  EventBlock *event_blocks_ = nullptr;     // head of the event chain
  EventBlock *cur_event_block_ = nullptr;  // block currently being filled
  StringBlock *str_blocks_ = nullptr;      // head of the string chain
  StringBlock *cur_str_block_ = nullptr;   // block currently being filled
};
// Moves every recorded event into a vector, in recording order, and resets
// the container to an empty state.
//
// The head block is kept and reused rather than freed and reallocated; all
// further blocks in the chain are deleted. String storage is not touched.
template <typename EventType>
std::vector<EventType> EventContainer<EventType>::Reduce() {
  size_t event_cnt = 0;
  for (auto *cur = event_blocks_; cur != nullptr; cur = cur->next) {
    event_cnt += cur->offset;
  }

  std::vector<EventType> all_events;
  all_events.reserve(event_cnt);

  EventBlock *const head = event_blocks_;
  for (auto *cur = head; cur != nullptr;) {
    for (size_t i = 0; i < cur->offset; ++i) {
      EventType &evt = cur->events[i].event;
      all_events.emplace_back(std::move(evt));
      // The slot was filled with placement new, so end its lifetime here
      // (a no-op for trivially destructible event types).
      evt.~EventType();
    }
    auto *next = cur->next;
    if (cur != head) {
      delete cur;
    }
    cur = next;
  }

  // Reusing the head avoids a 16 MB free + allocate per call and never leaves
  // event_blocks_ pointing at freed memory if an allocation were to fail.
  head->offset = 0;
  head->next = nullptr;
  cur_event_block_ = head;

  // Return by name so copy elision (or an implicit move) can apply.
  return all_events;
}
// Hands out the next unconstructed event slot, appending a new block to the
// chain first when the current block has no free slot left. The caller is
// expected to construct an EventType in the returned storage.
template <typename EventType>
EventType *EventContainer<EventType>::GetEventStorage() {
  EventBlock *block = cur_event_block_;
  if (UNLIKELY(block->offset >= EventBlock::kNumEvents)) {  // another block
    block = new EventBlock;
    cur_event_block_->next = block;
    cur_event_block_ = block;
  }
  const size_t slot = block->offset;
  block->offset = slot + 1;
  return &block->events[slot].event;
}
// Reserves `sz` bytes in the string arena and returns a pointer to them,
// appending a new block to the chain first when the current block cannot
// hold the request.
// NOTE(review): a request larger than StringBlock::kAvailSize is not handled
// specially -- the fresh block would still be too small. Confirm callers
// never pass such sizes.
template <typename EventType>
char *EventContainer<EventType>::GetStringStorage(size_t sz) {
  StringBlock *block = cur_str_block_;
  const size_t needed_end = block->offset + sz;
  if (UNLIKELY(needed_end > StringBlock::kAvailSize)) {  // another block
    block = new StringBlock;
    cur_str_block_->next = block;
    cur_str_block_ = block;
  }
  char *const storage = block->storage + block->offset;
  block->offset += sz;
  return storage;
}
// Events gathered from one thread, together with that thread's name and id.
struct ThreadEventSection {
  std::string thread_name;
  // Zero-initialized so a default-constructed section never carries an
  // indeterminate id.
  uint64_t thread_id = 0;
  std::vector<DurationEvent> events;
};
// Per-thread event recorder: owns an EventContainer<DurationEvent> and the id
// of the thread it was created on.
// NOTE(review): the constructor is defined out of line and registers `this`
// with HostEventRecorder; no matching unregistration is visible in this
// class -- confirm the recorder cannot be destroyed while still registered.
class ThreadEventRecorder {
 public:
  ThreadEventRecorder();
  DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder);

 public:
  // Forward call to EventContainer::Record
  template <typename... Args>
  void RecordEvent(Args &&... args) {
    base_evt_cntr_.Record(std::forward<Args>(args)...);
  }

  // Drains the container and returns its events together with this
  // recorder's thread name and id.
  ThreadEventSection GatherEvents() {
    ThreadEventSection thr_sec;
    thr_sec.thread_name = thread_name_;
    thr_sec.thread_id = thread_id_;
    // Reduce() returns a prvalue, which is move-assigned without std::move.
    thr_sec.events = base_evt_cntr_.Reduce();
    // Return by name so copy elision (or an implicit move) can apply.
    return thr_sec;
  }

 private:
  uint64_t thread_id_ = 0;
  std::string thread_name_;
  EventContainer<DurationEvent> base_evt_cntr_;
};
// Result type of HostEventRecorder::GatherEvents(): the per-thread event
// sections of this process.
struct HostEventSection {
// NOTE(review): process_name and process_id are not assigned by the
// HostEventRecorder::GatherEvents() definition in this file -- presumably
// filled in by the caller; confirm. process_id has no default initializer.
std::string process_name;
uint64_t process_id;
// One entry per registered ThreadEventRecorder.
std::vector<ThreadEventSection> thr_sections;
};
class HostEventRecorder {
public:
// singleton
static HostEventRecorder &GetInstance() {
static HostEventRecorder instance;
return instance;
}
// If your string argument has a longer lifetime than the Event,
// use 'const char*'. e.g.: string literal, op name, etc.
// Do your best to avoid using 'std::string' as the argument type.
// It will cause deep-copy to harm performance.
template <typename... Args>
void RecordEvent(Args &&... args) {
GetThreadLocalRecorder().RecordEvent(std::forward<Args>(args)...);
}
// Poor performance, call it at the ending
HostEventSection GatherEvents();
void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) {
const std::lock_guard<std::mutex> guard(thread_recorders_lock_);
thread_recorders_[tid] = recorder;
}
private:
HostEventRecorder() = default;
DISABLE_COPY_AND_ASSIGN(HostEventRecorder);
ThreadEventRecorder &GetThreadLocalRecorder() {
static thread_local ThreadEventRecorder tls_recorder;
return tls_recorder;
}
std::mutex thread_recorders_lock_;
std::unordered_map<uint64_t, ThreadEventRecorder *> thread_recorders_;
};
// Looks up the calling thread's id in ThreadIdRegistry, remembers it, and
// registers this recorder with the HostEventRecorder singleton under that id.
ThreadEventRecorder::ThreadEventRecorder() {
  thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid();
  HostEventRecorder &host_recorder = HostEventRecorder::GetInstance();
  host_recorder.RegisterThreadRecorder(thread_id_, this);
}
// Collects the events of every registered thread recorder into one
// HostEventSection (one ThreadEventSection per recorder). Only thr_sections
// is filled in here.
//
// thread_recorders_lock_ is held for the whole traversal:
// RegisterThreadRecorder() modifies the map under the same lock, and an
// insertion during iteration would otherwise be a data race.
// NOTE(review): the lock does not synchronize with threads that are still
// recording into their own ThreadEventRecorder -- presumably callers invoke
// this only after recording has stopped; confirm.
HostEventSection HostEventRecorder::GatherEvents() {
  const std::lock_guard<std::mutex> guard(thread_recorders_lock_);
  HostEventSection host_sec;
  host_sec.thr_sections.reserve(thread_recorders_.size());
  for (auto &kv : thread_recorders_) {
    // GatherEvents() yields a prvalue; no std::move needed to move it in.
    host_sec.thr_sections.emplace_back(kv.second->GatherEvents());
  }
  // Return by name so the result can be elided instead of moved.
  return host_sec;
}
MemEvenRecorder MemEvenRecorder::recorder; MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type, std::string name, uint32_t thread_id, Event::Event(EventType type, std::string name, uint32_t thread_id,
...@@ -416,7 +137,11 @@ void RecordEvent::OriginalConstruct(const std::string &name, ...@@ -416,7 +137,11 @@ void RecordEvent::OriginalConstruct(const std::string &name,
*name_ = e->name(); *name_ = e->name();
} }
RecordEvent::~RecordEvent() { void RecordEvent::End() {
if (UNLIKELY(finished_)) {
return;
}
finished_ = true;
#ifndef _WIN32 #ifndef _WIN32
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook && is_pushed_) { if (g_enable_nvprof_hook && is_pushed_) {
...@@ -456,6 +181,15 @@ RecordEvent::~RecordEvent() { ...@@ -456,6 +181,15 @@ RecordEvent::~RecordEvent() {
delete attr_; delete attr_;
} }
// Records an event whose start and end timestamps are identical (a single
// PosixInNsec() reading) through HostEventRecorder. Does nothing when
// FLAGS_enable_host_event_recorder_hook is off.
RecordInstantEvent::RecordInstantEvent(const char *name, const EventRole role) {
  if (UNLIKELY(!FLAGS_enable_host_event_recorder_hook)) return;

  auto timestamp_ns = PosixInNsec();
  HostEventRecorder &host_recorder = HostEventRecorder::GetInstance();
  host_recorder.RecordEvent(name, timestamp_ns, timestamp_ns, role);
}
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
size_t size) { size_t size) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
...@@ -740,8 +474,9 @@ std::string PrintHostEvents() { ...@@ -740,8 +474,9 @@ std::string PrintHostEvents() {
for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { for (const auto &thr_evt_sec : host_evt_sec.thr_sections) {
oss << thr_evt_sec.thread_id << std::endl; oss << thr_evt_sec.thread_id << std::endl;
for (const auto &evt : thr_evt_sec.events) { for (const auto &evt : thr_evt_sec.events) {
oss << "{ " << evt.name << " | " << evt.start_ns << " | " << evt.end_ns oss << "{ " << evt.name << " | " << evt.start_ns << "ns | " << evt.end_ns
<< " }" << std::endl; << "ns | " << (evt.end_ns - evt.start_ns) / 1000.000 << "us }"
<< std::endl;
} }
} }
return oss.str(); return oss.str();
......
...@@ -27,9 +27,9 @@ limitations under the License. */ ...@@ -27,9 +27,9 @@ limitations under the License. */
#include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/event_tracing.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif #endif
...@@ -127,43 +127,6 @@ struct MemEvenRecorder { ...@@ -127,43 +127,6 @@ struct MemEvenRecorder {
DISABLE_COPY_AND_ASSIGN(MemEvenRecorder); DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
}; };
// Marks one profiled event; the three constructors differ only in how the
// name (and optional attribute string) is supplied.
// NOTE(review): name_ and attr_ are raw pointers and no copy/move operations
// are declared here; if the destructor frees them, copying a RecordEvent
// would be unsafe -- confirm against the .cc file.
struct RecordEvent {
// Name given as std::string; role defaults to EventRole::kOrdinary.
explicit RecordEvent(const std::string& name,
const EventRole role = EventRole::kOrdinary);
// Name given as C string; role defaults to EventRole::kOrdinary.
explicit RecordEvent(const char* name,
const EventRole role = EventRole::kOrdinary);
// Name, role and an additional attribute string.
RecordEvent(const std::string& name, const EventRole role,
const std::string& attr);
~RecordEvent();
// Takes the same arguments as the three-argument constructor; defined in the
// .cc file.
void OriginalConstruct(const std::string& name, const EventRole role,
const std::string& attr);
bool is_enabled_{false};
bool is_pushed_{false};
// Event name
std::string* name_{nullptr};
// NOTE(review): presumably the non-owning name used by the const char*
// constructor -- confirm.
const char* shallow_copy_name_{nullptr};
// No default initializer: indeterminate until assigned.
uint64_t start_ns_;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
// std::string full_name_;
EventRole role_{EventRole::kOrdinary};
std::string* attr_{nullptr};
};
/*class RecordRPCEvent {
public:
explicit RecordRPCEvent(const std::string& name);
~RecordRPCEvent() {}
private:
std::unique_ptr<RecordEvent> event_;
};*/
struct RecordBlock { struct RecordBlock {
explicit RecordBlock(int block_id); explicit RecordBlock(int block_id);
~RecordBlock(); ~RecordBlock();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册