diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d69dd74b062167ea294b525621253a03e9e71d76..88718d06e757498f7220abcfc8f7d1181f01e6f9 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/string/printf.h" @@ -140,6 +141,13 @@ class DeviceTracerImpl : public DeviceTracer { correlations_[id] = anno; } + void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) { + std::lock_guard l(trace_mu_); + cpu_records_.push_back( + CPURecord{anno, start_ns, end_ns, + std::hash{}(std::this_thread::get_id())}); + } + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id) { std::lock_guard l(trace_mu_); @@ -185,7 +193,6 @@ class DeviceTracerImpl : public DeviceTracer { proto::Profile profile_pb; profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); - std::map> event_times; for (const KernelRecord &r : kernel_records_) { if (correlations_.find(r.correlation_id) == correlations_.end()) { fprintf(stderr, "cannot relate a kernel activity\n"); @@ -197,7 +204,15 @@ class DeviceTracerImpl : public DeviceTracer { event->set_end_ns(r.end_ns); event->set_stream_id(r.stream_id); event->set_device_id(r.device_id); - event_times[event->name()].push_back(r.end_ns - r.start_ns); + } + + for (const CPURecord &r : cpu_records_) { + auto *event = profile_pb.add_events(); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_stream_id(r.thread_id); + event->set_device_id(-1); } std::string profile_str; google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str); @@ -242,6 +257,7 @@ class DeviceTracerImpl : public DeviceTracer { uint64_t start_ns_; uint64_t end_ns_; std::vector kernel_records_; + std::vector cpu_records_; std::unordered_map correlations_; CUpti_SubscriberHandle subscriber_; }; @@ -254,6 +270,8 @@ class DeviceTracerDummy : public DeviceTracer { void AddAnnotation(uint64_t id, const std::string &anno) {} + void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {} + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, uint32_t stream_id, uint32_t correlation_id) {} @@ -285,5 +303,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; } void ClearCurAnnotation() { cur_annotation = nullptr; } +const char *CurAnnotation() { return cur_annotation; } + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 4438b9c1c418035a0251330bb7a2e3078ea8268d..613af920756728a0f259b006e2076a6d948e0923 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -36,6 +36,12 @@ class DeviceTracer { uint32_t stream_id; uint32_t correlation_id; }; + struct CPURecord { + std::string name; + uint64_t start_ns; + uint64_t end_ns; + uint64_t thread_id; + }; virtual ~DeviceTracer() {} // Needs to be called once before use. @@ -48,6 +54,9 @@ class DeviceTracer { // human-readable annotations. virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; + virtual void AddCPURecords(const char* anno, uint64_t start_ns, + uint64_t end_ns) = 0; + // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. virtual void AddKernelRecords(uint64_t start, uint64_t end, @@ -67,6 +76,7 @@ DeviceTracer* GetDeviceTracer(); void SetCurAnnotation(const char* anno); // Clear the name after the operation is done. void ClearCurAnnotation(); - +// Current name of the operation being run in the thread. +const char* CurAnnotation(); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index a90a3ed590c0d43e45855d80859714c0ebe2c732..094f9224f719f6656fd0c44aa4a620730432ccac 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#include +#include #include #include #ifdef PADDLE_WITH_CUDA @@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() { .count(); } +inline uint64_t PosixInNsec() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); +} + Event::Event(EventKind kind, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx) : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { @@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } -RecordEvent::RecordEvent(const std::string& name, - const DeviceContext* dev_ctx) { +RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) + : start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; name_ = name; @@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name, RecordEvent::~RecordEvent() { if (g_state == ProfilerState::kDisabled) return; + DeviceTracer* tracer = GetDeviceTracer(); + if (tracer) { + tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec()); + } ClearCurAnnotation(); PopEvent(name_, dev_ctx_); } @@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key, Mark("_stop_profiler_", nullptr); g_state = ProfilerState::kDisabled; + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, sorted_key); + ResetProfiler(); DeviceTracer* tracer = GetDeviceTracer(); if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { tracer->Disable(); tracer->GenProfile(profile_path); } - - std::vector> all_events = GetAllEvents(); - ParseEvents(all_events, sorted_key); - ResetProfiler(); } void ParseEvents(std::vector>& events, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index cb9bb8a8a602df8e470c560ac960b19e5d038bdd..3542ce6cda87e3b013d60393e4ba93da61921940 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -108,6 +108,7 @@ struct RecordEvent { ~RecordEvent(); + uint64_t start_ns_; // The device context is used by Event to get the current cuda stream. const DeviceContext* dev_ctx_; // Event name diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index bdd86a0440d2b00eaee14195030456d0ad217f9a..d1cc8516dad0062fd85f1d3b9ce622b5bae82251 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -19,7 +19,8 @@ message Event { optional string name = 1; optional uint64 start_ns = 2; optional uint64 end_ns = 3; - optional uint32 device_id = 5; + // When positive, it represents gpu id. When -1, it represents CPU. + optional int32 device_id = 5; optional uint32 stream_id = 6; } diff --git a/tools/timeline.py b/tools/timeline.py index a6eb6bee5c147af6621e49be1af309f011cdd857..70d5ee3426a2f0b69c702b97b18f425e7ff923b1 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -124,15 +124,22 @@ class Timeline(object): if event.device_id not in self._devices: pid = self._allocate_pid() self._devices[event.device_id] = pid - self._chrome_trace.emit_pid("device:%s" % pid, pid) + if event.device_id >= 0: + self._chrome_trace.emit_pid("gpu:%s:stream:%d" % + (pid, event.stream_id), pid) + elif event.device_id == -1: + self._chrome_trace.emit_pid("cpu:thread_hash:%d" % + event.stream_id, pid) def _allocate_events(self): for event in self._profile_pb.events: pid = self._devices[event.device_id] args = {'name': event.name} - self._chrome_trace.emit_region( - event.start_ns, (event.end_ns - event.start_ns) / 1000000.0, - pid, 0, 'Op', event.name, args) + # TODO(panyx0718): Chrome tracing only handles ms. However, some + # ops takes micro-seconds. Hence, we keep the ns here. + self._chrome_trace.emit_region(event.start_ns, + (event.end_ns - event.start_ns) / + 1.0, pid, 0, 'Op', event.name, args) def generate_chrome_trace(self): self._allocate_pids()