diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 26bf5d8b1be9d2807ee4aca28426d09d967ee438..1031d1ed6357df3962fc5827acfcd73daa5bd0e3 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -46,7 +46,7 @@ IF(WITH_XBYAK) ENDIF() cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -cc_library(os_info SRCS os_info.cc DEPS enforce device_tracer) +cc_library(os_info SRCS os_info.cc DEPS enforce) IF(WITH_GPU) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) @@ -169,15 +169,16 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) +cc_library(host_event_recorder SRCS host_event_recorder.cc DEPS os_info) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce dynload_cuda) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce) + cc_library(profiler SRCS profiler.cc DEPS host_event_recorder os_info device_tracer enforce) 
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 28c51251627c5c58b26de6a6cde61f48355f8d49..ff11bfd62c1385407f6eddf2478c447244c7ee0e 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -348,7 +348,7 @@ class DeviceTracerImpl : public DeviceTracer { } void AddCPURecords(const std::string &anno, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t thread_id) { + uint64_t end_ns, int64_t device_id, uint64_t thread_id) { if (anno.empty()) { VLOG(1) << "Empty timeline annotation."; return; @@ -383,7 +383,7 @@ class DeviceTracerImpl : public DeviceTracer { void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place &place, const std::string &alloc_in, - const std::string &free_in, int64_t thread_id) { + const std::string &free_in, uint64_t thread_id) { if (0 == start_ns || 0 == end_ns) { VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; return; diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 9d6e435c8457fbf88bc3b33ac34684b0aedf8c93..4cb01529506b7677da99a8fafc7df6515e047463 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -50,7 +50,7 @@ class DeviceTracer { uint64_t start_ns; uint64_t end_ns; int64_t device_id; - int64_t thread_id; + uint64_t thread_id; }; struct MemRecord { @@ -68,7 +68,7 @@ class DeviceTracer { uint64_t end_ns; size_t bytes; Place place; - int64_t thread_id; + uint64_t thread_id; std::string alloc_in; std::string free_in; }; @@ -105,7 +105,7 @@ class DeviceTracer { virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, - int64_t thread_id) = 0; + uint64_t thread_id) = 0; virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, 
int64_t device_id, uint64_t thread_id, @@ -115,7 +115,7 @@ class DeviceTracer { size_t bytes, const Place& place, const std::string& alloc_in, const std::string& free_in, - int64_t thread_id) = 0; + uint64_t thread_id) = 0; // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 0d1eee316846c22081904c2e44678be294d25771..919266575e6ce7128958c144cd7d616a05590ff5 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -45,9 +46,9 @@ class Event { Event(EventType type, std::string name, uint32_t thread_id, EventRole role = EventRole::kOrdinary, std::string attr = "none"); - const EventType& type() const; - Event* parent() const { return parent_; } - void set_parent(Event* parent) { parent_ = parent; } + const EventType &type() const; + Event *parent() const { return parent_; } + void set_parent(Event *parent) { parent_ = parent; } std::string name() const { return name_; } EventRole role() const { return role_; } uint64_t thread_id() const { return thread_id_; } @@ -61,13 +62,13 @@ class Event { #endif #endif - double CpuElapsedMs(const Event& e) const; - double CudaElapsedMs(const Event& e) const; + double CpuElapsedMs(const Event &e) const; + double CudaElapsedMs(const Event &e) const; private: EventType type_; std::string name_{}; - Event* parent_{nullptr}; + Event *parent_{nullptr}; uint64_t thread_id_; EventRole role_{}; int64_t cpu_ns_; @@ -90,13 +91,13 @@ class Event { #endif }; -using EventWithStartNs = std::pair; +using EventWithStartNs = std::pair; using ThreadEvents = std::map; class MemEvent { public: MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes, - Place place, int64_t thread_id, const std::string& annotation) + Place place, int64_t thread_id, const std::string 
&annotation) : type_(type), start_ns_(start_ns), end_ns_(end_ns), @@ -105,13 +106,13 @@ class MemEvent { thread_id_(thread_id), annotation_(annotation) {} - const EventType& type() const { return type_; } + const EventType &type() const { return type_; } uint64_t start_ns() const { return start_ns_; } uint64_t end_ns() const { return end_ns_; } size_t bytes() const { return bytes_; } Place place() const { return place_; } uint64_t thread_id() const { return thread_id_; } - const std::string& annotation() const { return annotation_; } + const std::string &annotation() const { return annotation_; } private: EventType type_; @@ -151,7 +152,7 @@ class CudaEvent { #endif } - void Record(const paddle::platform::stream::CUDAStream& stream) { + void Record(const paddle::platform::stream::CUDAStream &stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); #else @@ -200,5 +201,39 @@ class CudaEvent { #endif }; +struct CommonEvent { + public: + CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} + + CommonEvent(std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, const std::string &attr_str) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + buf = static_cast(arena_allocator(attr_str.length() + 1)); + strncpy(buf, attr_str.c_str(), attr_str.length() + 1); + attr = buf; + } + + CommonEvent(const std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + } + + const char *name = 
nullptr; // not owned, designed for performance + uint64_t start_ns = 0; + uint64_t end_ns = 0; + EventRole role = EventRole::kOrdinary; + const char *attr = nullptr; // not owned, designed for performance +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/event_tracing.h b/paddle/fluid/platform/event_tracing.h new file mode 100644 index 0000000000000000000000000000000000000000..f68b4b5162a9f25fde21a02bb4b7e90ca11a81ab --- /dev/null +++ b/paddle/fluid/platform/event_tracing.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/event.h" + +namespace paddle { +namespace platform { + +// CPU event tracing. A trace marks something that happens but has no duration +// associated with it. For example, a thread starts working. +// Chrome Trace Viewer Format: Instant Event +struct RecordInstantEvent { + explicit RecordInstantEvent(const char* name, + const EventRole role = EventRole::kOrdinary); +}; + +// CPU event tracing. A trace starts when an object of this class is created and +// stops when the object is destroyed. 
+// Chrome Trace Viewer Format: Duration Event/Complete Event +class RecordEvent { + public: + explicit RecordEvent(const std::string& name, + const EventRole role = EventRole::kOrdinary); + + explicit RecordEvent(const char* name, + const EventRole role = EventRole::kOrdinary); + + RecordEvent(const std::string& name, const EventRole role, + const std::string& attr); + + // Stop event tracing explicitly before the object goes out of scope. + // Sometimes it's inconvenient to use RAII + void End(); + + ~RecordEvent() { End(); } + + private: + void OriginalConstruct(const std::string& name, const EventRole role, + const std::string& attr); + + bool is_enabled_{false}; + bool is_pushed_{false}; + // Event name + std::string* name_{nullptr}; + const char* shallow_copy_name_{nullptr}; + uint64_t start_ns_; + // Need to distinguish name by op type, block_id, program_id and perhaps + // different kernel invocations within an op. + // std::string full_name_; + EventRole role_{EventRole::kOrdinary}; + std::string* attr_{nullptr}; + bool finished_{false}; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/host_event_recorder.cc b/paddle/fluid/platform/host_event_recorder.cc new file mode 100644 index 0000000000000000000000000000000000000000..750f39118d7d99d17dd1d16b7552469b861a4be9 --- /dev/null +++ b/paddle/fluid/platform/host_event_recorder.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/host_event_recorder.h" +#include "paddle/fluid/platform/os_info.h" + +namespace paddle { +namespace platform { + +ThreadEventRecorder::ThreadEventRecorder() { + thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid(); + HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); +} + +HostEventSection HostEventRecorder::GatherEvents() { + HostEventSection host_sec; + host_sec.thr_sections.reserve(thread_recorders_.size()); + for (auto &kv : thread_recorders_) { // NOTE(review): map read unlocked + host_sec.thr_sections.emplace_back(kv.second->GatherEvents()); + } + return host_sec; // plain return keeps copy elision / implicit move +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/host_event_recorder.h b/paddle/fluid/platform/host_event_recorder.h new file mode 100644 index 0000000000000000000000000000000000000000..e8dd59ad4c6f1b31b84ab5d618ab8465c3bd2c1c --- /dev/null +++ b/paddle/fluid/platform/host_event_recorder.h @@ -0,0 +1,261 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/event.h" + +namespace paddle { +namespace platform { + +template +struct ContainsStdString + : std::conditional_t< + std::is_same>>::value, + std::true_type, ContainsStdString> {}; + +template +struct ContainsStdString + : std::is_same>> {}; + +template +class EventContainer { + public: + EventContainer() { + event_blocks_ = cur_event_block_ = new EventBlock; + str_blocks_ = cur_str_block_ = new StringBlock; + } + ~EventContainer() { + Reduce(); + delete event_blocks_; + for (auto cur = str_blocks_; cur != nullptr;) { + auto next = cur->next; + delete cur; + cur = next; + } + } + DISABLE_COPY_AND_ASSIGN(EventContainer); + + public: + // Record an event + template + void Record(Args &&... args) { + DoRecord(ContainsStdString(), std::forward(args)...); + } + + // Get all events and clear the container + std::vector Reduce(); + + // Return a buffer to store the string attribute of Event. + // HostEventRecorder is located in the static data section. + // So it's safe to use arena to avoid fragmented allocations. 
+ char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); } + + private: + struct EventBlock { + union InitDeferedEvent { + InitDeferedEvent() {} + ~InitDeferedEvent() {} + + EventType event; + }; + + static constexpr size_t kBlockSize = 1 << 24; // 16 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent); + static constexpr size_t kPadSize = + kAvailSize - kNumEvents * sizeof(InitDeferedEvent); + static constexpr size_t kMinimumEventsPerBlock = 1024; + static_assert( + kNumEvents >= kMinimumEventsPerBlock, + "EventType is too large for kBlockSize, make kBlockSize larger"); + + size_t offset = 0; + EventBlock *next = nullptr; + InitDeferedEvent events[kNumEvents]; + char padding[kPadSize]; + }; + static_assert(sizeof(EventBlock) == EventBlock::kBlockSize, + "sizeof EventBlock must equal to kBlockSize"); + + struct StringBlock { + static constexpr size_t kBlockSize = 1 << 22; // 4 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + + size_t offset = 0; + StringBlock *next = nullptr; + char storage[kAvailSize]; + }; + static_assert(sizeof(StringBlock) == StringBlock::kBlockSize, + "sizeof StringBlock must equal to kBlockSize"); + + // Record an event with string arguments + template + void DoRecord(std::true_type, Args &&... args) { + auto *storage = GetEventStorage(); + std::function allocator = [this](size_t size) { + return GetStrBufFromArena(size); + }; + new (storage) EventType(allocator, std::forward(args)...); + } + + // Record an event without any string argument + template + void DoRecord(std::false_type, Args &&... 
args) { + auto *storage = GetEventStorage(); + new (storage) EventType(std::forward(args)...); + } + + EventType *GetEventStorage(); + + char *GetStringStorage(size_t sz); + + EventBlock *event_blocks_ = nullptr; + EventBlock *cur_event_block_ = nullptr; + StringBlock *str_blocks_ = nullptr; + StringBlock *cur_str_block_ = nullptr; +}; + +template +std::vector EventContainer::Reduce() { + std::vector all_events; + size_t event_cnt = 0; + for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) { + event_cnt += cur->offset; + } + all_events.reserve(event_cnt); + for (auto cur = event_blocks_; cur != nullptr;) { + for (size_t i = 0; i < cur->offset; ++i) { + all_events.emplace_back(cur->events[i].event); + } + auto next = cur->next; + delete cur; + cur = next; + } + event_blocks_ = cur_event_block_ = new EventBlock; + return all_events; +} + +template +EventType *EventContainer::GetEventStorage() { + if (UNLIKELY(cur_event_block_->offset >= + EventBlock::kNumEvents)) { // another block + cur_event_block_->next = new EventBlock; + cur_event_block_ = cur_event_block_->next; + } + auto &obj = cur_event_block_->events[cur_event_block_->offset].event; + ++cur_event_block_->offset; + return &obj; +} + +template +char *EventContainer::GetStringStorage(size_t sz) { + if (UNLIKELY(cur_str_block_->offset + sz > + StringBlock::kAvailSize)) { // another block + cur_str_block_->next = new StringBlock; + cur_str_block_ = cur_str_block_->next; + } + char *storage = cur_str_block_->storage + cur_str_block_->offset; + cur_str_block_->offset += sz; + return storage; +} + +struct ThreadEventSection { + std::string thread_name; + uint64_t thread_id; + std::vector events; +}; + +class ThreadEventRecorder { + public: + ThreadEventRecorder(); + DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); + + public: + // Forward call to EventContainer::Record + template + void RecordEvent(Args &&... 
args) { + base_evt_cntr_.Record(std::forward(args)...); + } + + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; + thr_sec.thread_name = thread_name_; + thr_sec.thread_id = thread_id_; + thr_sec.events = base_evt_cntr_.Reduce(); + return thr_sec; + } + + private: + uint64_t thread_id_; + std::string thread_name_; + EventContainer base_evt_cntr_; +}; + +struct HostEventSection { + std::string process_name; + uint64_t process_id; + std::vector thr_sections; +}; + +class HostEventRecorder { + public: + // singleton + static HostEventRecorder &GetInstance() { + static HostEventRecorder instance; + return instance; + } + + // If your string argument has a longer lifetime than the Event, + // use 'const char*'. e.g.: string literal, op name, etc. + // Do your best to avoid using 'std::string' as the argument type. + // It will cause deep-copy to harm performance. + template + void RecordEvent(Args &&... args) { + GetThreadLocalRecorder().RecordEvent(std::forward(args)...); + } + + // Poor performance, call it at the end + HostEventSection GatherEvents(); + + void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { + const std::lock_guard guard(thread_recorders_lock_); + thread_recorders_[tid] = recorder; + } + + private: + HostEventRecorder() = default; + DISABLE_COPY_AND_ASSIGN(HostEventRecorder); + + ThreadEventRecorder &GetThreadLocalRecorder() { + static thread_local ThreadEventRecorder tls_recorder; + return tls_recorder; + } + + std::mutex thread_recorders_lock_; + std::unordered_map thread_recorders_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 9af89645f54eabc0285bc422b27dec56ca05e305..5ba7f1d144e12eb465286bc706ad741af53d3109 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -14,17 +14,32 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/os_info.h" #include -#include "paddle/fluid/platform/device_tracer.h" +#if defined(__linux__) +#include +#include +#include +#elif defined(_MSC_VER) +#include +#endif namespace paddle { namespace platform { ThreadId::ThreadId() { + // C++ std tid std_tid_ = std::hash()(std::this_thread::get_id()); +// system tid +#if defined(__linux__) + sys_tid_ = syscall(SYS_gettid); +#elif defined(_MSC_VER) + sys_tid_ = GetCurrentThreadId(); +#else // unsupported platforms + sys_tid_ = 0; +#endif + // cupti tid std::stringstream ss; ss << std::this_thread::get_id(); cupti_tid_ = static_cast(std::stoull(ss.str())); - RecoreCurThreadId(MainTid()); // For DeviceTracer } ThreadIdRegistry::~ThreadIdRegistry() { diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index b243429fd5a89ce25b306454a4cc63c16aa7b2f2..c38198f91b36bcf1e0393336059d4b38bbfa5216 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/enforce.h" // import LIKELY +#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/port.h" #ifdef _POSIX_C_SOURCE #include @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace platform { -// Get current time in nanoseconds +// Get system-wide realtime clock in nanoseconds inline uint64_t PosixInNsec() { #ifdef _POSIX_C_SOURCE struct timespec tp; @@ -45,13 +45,13 @@ class ThreadId { public: ThreadId(); - uint64_t MainTid() const { return std_tid_; } + uint64_t MainTid() const { return SysTid(); } uint64_t StdTid() const { return std_tid_; } uint32_t CuptiTid() const { return cupti_tid_; } - uint64_t SysTid() const { return sys_tid_; } + uint64_t SysTid() const { return sys_tid_ != 0 ? 
sys_tid_ : std_tid_; } private: uint64_t std_tid_ = 0; // std::hash diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 1bedd5b130844aeccc358c10d005c893e8d24fe0..eaa77273c8fd4cd1b6f7038349d2b7027d988597 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/host_event_recorder.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler_helper.h" #ifdef PADDLE_WITH_CUDA @@ -36,286 +37,6 @@ DEFINE_bool(enable_host_event_recorder_hook, false, namespace paddle { namespace platform { -struct DurationEvent { - public: - DurationEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} - - DurationEvent(std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, const std::string &attr_str) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - buf = static_cast(arena_allocator(attr_str.length() + 1)); - strncpy(buf, attr_str.c_str(), attr_str.length() + 1); - attr = buf; - } - - DurationEvent(const std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - } - - const char *name = nullptr; // not owned, designed for performance - uint64_t start_ns = 0; - uint64_t end_ns = 0; - EventRole role = EventRole::kOrdinary; - const char *attr = nullptr; // not owned, designed for performance 
-}; - -template -struct ContainsStdString - : std::conditional_t< - std::is_same>>::value, - std::true_type, ContainsStdString> {}; - -template -struct ContainsStdString - : std::is_same>> {}; - -template -class EventContainer { - public: - EventContainer() { - event_blocks_ = cur_event_block_ = new EventBlock; - str_blocks_ = cur_str_block_ = new StringBlock; - } - ~EventContainer() { - Reduce(); - delete event_blocks_; - for (auto cur = str_blocks_; cur != nullptr;) { - auto next = cur->next; - delete cur; - cur = next; - } - } - DISABLE_COPY_AND_ASSIGN(EventContainer); - - public: - // Record an event - template - void Record(Args &&... args) { - DoRecord(ContainsStdString(), std::forward(args)...); - } - - // Get all events and clear the container - std::vector Reduce(); - - // Return a buffer to store the string attribute of Event. - // HostEventRecorder locates in the static data section. - // So it's safe to use arena to avoid fragmented allocations. - char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); } - - private: - struct EventBlock { - union InitDeferedEvent { - InitDeferedEvent() {} - ~InitDeferedEvent() {} - - EventType event; - }; - - static constexpr size_t kBlockSize = 1 << 24; // 16 MB - static constexpr size_t kAvailSize = - kBlockSize - sizeof(size_t) - sizeof(nullptr); - static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent); - static constexpr size_t kPadSize = - kAvailSize - kNumEvents * sizeof(InitDeferedEvent); - static constexpr size_t kMinimumEventsPerBlock = 1024; - static_assert( - kNumEvents >= kMinimumEventsPerBlock, - "EventType is too large for kBlockSize, make kBlockSize larger"); - - size_t offset = 0; - EventBlock *next = nullptr; - InitDeferedEvent events[kNumEvents]; - char padding[kPadSize]; - }; - static_assert(sizeof(EventBlock) == EventBlock::kBlockSize, - "sizeof EventBlock must equal to kBlockSize"); - - struct StringBlock { - static constexpr size_t kBlockSize = 1 << 22; // 4 MB - 
static constexpr size_t kAvailSize = - kBlockSize - sizeof(size_t) - sizeof(nullptr); - - size_t offset = 0; - StringBlock *next = nullptr; - char storage[kAvailSize]; - }; - static_assert(sizeof(StringBlock) == StringBlock::kBlockSize, - "sizeof StringBlock must equal to kBlockSize"); - - // Record an event with string arguments - template - void DoRecord(std::true_type, Args &&... args) { - auto *storage = GetEventStorage(); - std::function allocator = [this](size_t size) { - return GetStrBufFromArena(size); - }; - new (storage) EventType(allocator, std::forward(args)...); - } - - // Record an event without any string argument - template - void DoRecord(std::false_type, Args &&... args) { - auto *storage = GetEventStorage(); - new (storage) EventType(std::forward(args)...); - } - - EventType *GetEventStorage(); - - char *GetStringStorage(size_t sz); - - EventBlock *event_blocks_ = nullptr; - EventBlock *cur_event_block_ = nullptr; - StringBlock *str_blocks_ = nullptr; - StringBlock *cur_str_block_ = nullptr; -}; - -template -std::vector EventContainer::Reduce() { - std::vector all_events; - size_t event_cnt = 0; - for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) { - event_cnt += cur->offset; - } - all_events.reserve(event_cnt); - for (auto cur = event_blocks_; cur != nullptr;) { - for (size_t i = 0; i < cur->offset; ++i) { - all_events.emplace_back(cur->events[i].event); - } - auto next = cur->next; - delete cur; - cur = next; - } - event_blocks_ = cur_event_block_ = new EventBlock; - return std::move(all_events); -} - -template -EventType *EventContainer::GetEventStorage() { - if (UNLIKELY(cur_event_block_->offset >= - EventBlock::kNumEvents)) { // another block - cur_event_block_->next = new EventBlock; - cur_event_block_ = cur_event_block_->next; - } - auto &obj = cur_event_block_->events[cur_event_block_->offset].event; - ++cur_event_block_->offset; - return &obj; -} - -template -char *EventContainer::GetStringStorage(size_t sz) { - if 
(UNLIKELY(cur_str_block_->offset + sz > - StringBlock::kAvailSize)) { // another block - cur_str_block_->next = new StringBlock; - cur_str_block_ = cur_str_block_->next; - } - char *storage = cur_str_block_->storage + cur_str_block_->offset; - cur_str_block_->offset += sz; - return storage; -} - -struct ThreadEventSection { - std::string thread_name; - uint64_t thread_id; - std::vector events; -}; - -class ThreadEventRecorder { - public: - ThreadEventRecorder(); - DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); - - public: - // Forward call to EventContainer::Record - template - void RecordEvent(Args &&... args) { - base_evt_cntr_.Record(std::forward(args)...); - } - - ThreadEventSection GatherEvents() { - ThreadEventSection thr_sec; - thr_sec.thread_name = thread_name_; - thr_sec.thread_id = thread_id_; - thr_sec.events = std::move(base_evt_cntr_.Reduce()); - return std::move(thr_sec); - } - - private: - uint64_t thread_id_; - std::string thread_name_; - EventContainer base_evt_cntr_; -}; - -struct HostEventSection { - std::string process_name; - uint64_t process_id; - std::vector thr_sections; -}; - -class HostEventRecorder { - public: - // singleton - static HostEventRecorder &GetInstance() { - static HostEventRecorder instance; - return instance; - } - - // If your string argument has a longer lifetime than the Event, - // use 'const char*'. e.g.: string literal, op name, etc. - // Do your best to avoid using 'std::string' as the argument type. - // It will cause deep-copy to harm performance. - template - void RecordEvent(Args &&... 
args) { - GetThreadLocalRecorder().RecordEvent(std::forward(args)...); - } - - // Poor performance, call it at the ending - HostEventSection GatherEvents(); - - void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { - const std::lock_guard guard(thread_recorders_lock_); - thread_recorders_[tid] = recorder; - } - - private: - HostEventRecorder() = default; - DISABLE_COPY_AND_ASSIGN(HostEventRecorder); - - ThreadEventRecorder &GetThreadLocalRecorder() { - static thread_local ThreadEventRecorder tls_recorder; - return tls_recorder; - } - - std::mutex thread_recorders_lock_; - std::unordered_map thread_recorders_; -}; - -ThreadEventRecorder::ThreadEventRecorder() { - thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid(); - HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); -} - -HostEventSection HostEventRecorder::GatherEvents() { - HostEventSection host_sec; - host_sec.thr_sections.reserve(thread_recorders_.size()); - for (auto &kv : thread_recorders_) { - host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); - } - return std::move(host_sec); -} - MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id, @@ -416,7 +137,11 @@ void RecordEvent::OriginalConstruct(const std::string &name, *name_ = e->name(); } -RecordEvent::~RecordEvent() { +void RecordEvent::End() { + if (UNLIKELY(finished_)) { + return; + } + finished_ = true; #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook && is_pushed_) { @@ -456,6 +181,15 @@ RecordEvent::~RecordEvent() { delete attr_; } +RecordInstantEvent::RecordInstantEvent(const char *name, const EventRole role) { + if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { + return; + } + auto start_end_ns = PosixInNsec(); + HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, + role); +} + void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, 
size_t size) { if (g_state == ProfilerState::kDisabled) return; @@ -740,8 +474,9 @@ std::string PrintHostEvents() { for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { oss << thr_evt_sec.thread_id << std::endl; for (const auto &evt : thr_evt_sec.events) { - oss << "{ " << evt.name << " | " << evt.start_ns << " | " << evt.end_ns - << " }" << std::endl; + oss << "{ " << evt.name << " | " << evt.start_ns << "ns | " << evt.end_ns + << "ns | " << (evt.end_ns - evt.start_ns) / 1000.000 << "us }" + << std::endl; } } return oss.str(); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 9d0bdf2358900e75f83631f5fdea5f09c1327b32..41cc3805f44daa78f900ab944be6a976fa1c8cc5 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -27,9 +27,9 @@ limitations under the License. */ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/event_tracing.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -127,43 +127,6 @@ struct MemEvenRecorder { DISABLE_COPY_AND_ASSIGN(MemEvenRecorder); }; -struct RecordEvent { - explicit RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary); - - explicit RecordEvent(const char* name, - const EventRole role = EventRole::kOrdinary); - - RecordEvent(const std::string& name, const EventRole role, - const std::string& attr); - - ~RecordEvent(); - - void OriginalConstruct(const std::string& name, const EventRole role, - const std::string& attr); - - bool is_enabled_{false}; - bool is_pushed_{false}; - // Event name - std::string* name_{nullptr}; - const char* shallow_copy_name_{nullptr}; - uint64_t start_ns_; - // Need to distinguish name by op type, block_id, program_id and perhaps - // 
different kernel invocations within an op. - // std::string full_name_; - EventRole role_{EventRole::kOrdinary}; - std::string* attr_{nullptr}; -}; - -/*class RecordRPCEvent { - public: - explicit RecordRPCEvent(const std::string& name); - ~RecordRPCEvent() {} - - private: - std::unique_ptr event_; -};*/ - struct RecordBlock { explicit RecordBlock(int block_id); ~RecordBlock();