From a6d30a860723190900e713d36d2517d0f3dec98d Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Mon, 23 Jul 2018 16:33:41 +0800
Subject: [PATCH] profiler support cpu

---
 paddle/fluid/platform/device_tracer.cc | 103 +++++++++++++++++++++++--
 paddle/fluid/platform/device_tracer.h  |   9 +++
 paddle/fluid/platform/profiler.cc      |   7 --
 python/paddle/fluid/profiler.py        |   4 +-
 4 files changed, 107 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d9e2afada..c376480e9 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -361,27 +361,116 @@ class DeviceTracerDummy : public DeviceTracer {
  public:
   DeviceTracerDummy() {}
 
-  void AddAnnotation(uint64_t id, const std::string &anno) {}
+  void AddAnnotation(uint64_t id, const std::string &anno) {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    correlations_[id] = anno;
+  }
 
   void AddCPURecords(const std::string &anno, uint64_t start_ns,
-                     uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
+                     uint64_t end_ns, int64_t device_id, int64_t thread_id) {
+    if (anno.empty()) {
+      VLOG(1) << "Empty timeline annotation.";
+      return;
+    }
+    std::lock_guard<std::mutex> l(trace_mu_);
+    cpu_records_.push_back(
+        CPURecord{anno, start_ns, end_ns, device_id, thread_id});
+  }
 
   void AddMemRecords(const std::string &name, uint64_t start_ns,
                      uint64_t end_ns, int64_t device_id, int64_t stream_id,
-                     uint32_t correlation_id, uint64_t bytes) {}
+                     uint32_t correlation_id, uint64_t bytes) {
+    // 0 means timestamp information could not be collected for the kernel.
+    if (start_ns == 0 || end_ns == 0) {
+      VLOG(3) << name << " cannot be traced";
+      return;
+    }
+    std::lock_guard<std::mutex> l(trace_mu_);
+    mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
+                                     stream_id, correlation_id, bytes});
+  }
 
   void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
                         int64_t stream_id, uint32_t correlation_id) {}
 
-  bool IsEnabled() { return false; }
+  bool IsEnabled() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    return enabled_;
+  }
 
-  void Enable() {}
+  void Enable() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    if (enabled_) {
+      return;
+    }
+    start_ns_ = PosixInNsec();
+    VLOG(3) << "start_ns_ = " << start_ns_;
+    enabled_ = true;
+  }
+
+  void Disable() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    end_ns_ = PosixInNsec();
+    VLOG(3) << "end_ns_ = " << end_ns_;
+    enabled_ = false;
+  }
 
   proto::Profile GenProfile(const std::string &profile_path) {
-    return proto::Profile();
+    std::lock_guard<std::mutex> l(trace_mu_);
+    proto::Profile profile_pb;
+    profile_pb.set_start_ns(start_ns_);
+    profile_pb.set_end_ns(end_ns_);
+    for (const KernelRecord &r : kernel_records_) {
+      if (correlations_.find(r.correlation_id) == correlations_.end()) {
+        fprintf(stderr, "cannot relate a kernel activity\n");
+        continue;
+      }
+      auto *event = profile_pb.add_events();
+      event->set_type(proto::Event::GPUKernel);
+      event->set_name(correlations_.at(r.correlation_id));
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_sub_device_id(r.stream_id);
+      event->set_device_id(r.device_id);
+    }
+
+    for (const CPURecord &r : cpu_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_type(proto::Event::CPU);
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_sub_device_id(r.thread_id);
+      event->set_device_id(r.device_id);
+    }
+    for (const MemRecord &r : mem_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_type(proto::Event::GPUKernel);
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_sub_device_id(r.stream_id);
+      event->set_device_id(r.device_id);
+      event->mutable_memcopy()->set_bytes(r.bytes);
+    }
+    std::ofstream profile_f;
+    profile_f.open(profile_path, std::ios::out | std::ios::trunc);
+    std::string profile_str;
+    profile_pb.SerializeToString(&profile_str);
+    profile_f << profile_str;
+    profile_f.close();
+    return profile_pb;
   }
 
-  void Disable() {}
+ private:
+  std::mutex trace_mu_;
+  bool enabled_ = false;
+  uint64_t start_ns_;
+  uint64_t end_ns_;
+  std::vector<KernelRecord> kernel_records_;
+  std::vector<MemRecord> mem_records_;
+  std::vector<CPURecord> cpu_records_;
+  std::unordered_map<uint32_t, std::string> correlations_;
 };
 
 void CreateTracer(DeviceTracer **t) {
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 0375c7439..83bc90911 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <sys/time.h>
+#include <time.h>
+#include <chrono>  // NOLINT
 #include <string>
 
 #include "paddle/fluid/platform/dynload/cupti.h"
@@ -25,6 +28,12 @@ namespace platform {
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
 
+inline uint64_t PosixInNsec() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
+}
+
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
 // 2. Collect cuda statistics: start/end ts, memory, etc.
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 01de9d704..dc3f388f8 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 #include <sys/time.h>
-#include <time.h>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() {
       .count();
 }
 
-inline uint64_t PosixInNsec() {
-  struct timeval tv;
-  gettimeofday(&tv, nullptr);
-  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
-}
-
 Event::Event(EventType type, std::string name, uint32_t thread_id,
              const DeviceContext* dev_ctx)
     : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 6a321ae02..a9f8a7f03 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -218,7 +218,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     """The profiler interface.
     Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    and GPU program. By default, it records the CPU and GPU operator kernels,
     if you want to profile other program, you can refer the profiling tutorial
     to add more records in C++ code.
 
@@ -231,7 +231,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     state (string) : The profiling state, which should be 'CPU' or 'GPU',
         telling the profiler to use CPU timer or GPU timer for profiling.
        Although users may have already specified the execution place
-        (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+        (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
         would not inherit this place.
     sorted_key (string) : If None, the profiling results will be printed
         in the order of first end time of events. Otherwise, the profiling
--
GitLab
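
Usage note (not part of the patch): a minimal sketch of the Python profiler
interface documented in the docstring above, profiling a CPU-only run. The
small fc network, the random feed data, and the sorted_key choice are
illustrative assumptions, not code from this change.

    import numpy
    import paddle.fluid as fluid
    import paddle.fluid.profiler as profiler

    # A tiny network, only to have some operators to record.
    data = fluid.layers.data(name='x', shape=[1], dtype='float32')
    out = fluid.layers.fc(input=data, size=10)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # state='CPU' selects the CPU timer; when the context exits, a summary
    # sorted by total time is printed and the raw records are written to
    # profile_path.
    with profiler.profiler('CPU', sorted_key='total',
                           profile_path='/tmp/profile'):
        for _ in range(10):
            exe.run(feed={'x': numpy.random.random((32, 1)).astype('float32')},
                    fetch_list=[out])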