diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d9e2afadaf8ec439d158e57c94d3e6e684bce116..c376480e94b4fb5d663945229231e0674c486547 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -361,27 +361,116 @@ class DeviceTracerDummy : public DeviceTracer { public: DeviceTracerDummy() {} - void AddAnnotation(uint64_t id, const std::string &anno) {} + void AddAnnotation(uint64_t id, const std::string &anno) { + std::lock_guard l(trace_mu_); + correlations_[id] = anno; + } void AddCPURecords(const std::string &anno, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t thread_id) {} + uint64_t end_ns, int64_t device_id, int64_t thread_id) { + if (anno.empty()) { + VLOG(1) << "Empty timeline annotation."; + return; + } + std::lock_guard l(trace_mu_); + cpu_records_.push_back( + CPURecord{anno, start_ns, end_ns, device_id, thread_id}); + } void AddMemRecords(const std::string &name, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t stream_id, - uint32_t correlation_id, uint64_t bytes) {} + uint32_t correlation_id, uint64_t bytes) { + // 0 means timestamp information could not be collected for the kernel. + if (start_ns == 0 || end_ns == 0) { + VLOG(3) << name << " cannot be traced"; + return; + } + std::lock_guard l(trace_mu_); + mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, + stream_id, correlation_id, bytes}); + } void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, int64_t stream_id, uint32_t correlation_id) {} - bool IsEnabled() { return false; } + bool IsEnabled() { + std::lock_guard l(trace_mu_); + return enabled_; + } - void Enable() {} + void Enable() { + std::lock_guard l(trace_mu_); + if (enabled_) { + return; + } + int64_t start_ns_ = PosixInNsec(); + VLOG(3) << "start_ns_ = " << start_ns_; + enabled_ = true; + } + + void Disable() { + std::lock_guard l(trace_mu_); + uint64_t end_ns_ = PosixInNsec(); + VLOG(3) << "end_ns_ = " << end_ns_; + enabled_ = false; + } proto::Profile GenProfile(const std::string &profile_path) { - return proto::Profile(); + std::lock_guard l(trace_mu_); + proto::Profile profile_pb; + profile_pb.set_start_ns(start_ns_); + profile_pb.set_end_ns(end_ns_); + for (const KernelRecord &r : kernel_records_) { + if (correlations_.find(r.correlation_id) == correlations_.end()) { + fprintf(stderr, "cannot relate a kernel activity\n"); + continue; + } + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::GPUKernel); + event->set_name(correlations_.at(r.correlation_id)); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.stream_id); + event->set_device_id(r.device_id); + } + + for (const CPURecord &r : cpu_records_) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::CPU); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.thread_id); + event->set_device_id(r.device_id); + } + for (const MemRecord &r : mem_records_) { + auto *event = profile_pb.add_events(); + event->set_type(proto::Event::GPUKernel); + event->set_name(r.name); + event->set_start_ns(r.start_ns); + event->set_end_ns(r.end_ns); + event->set_sub_device_id(r.stream_id); + event->set_device_id(r.device_id); + event->mutable_memcopy()->set_bytes(r.bytes); + } + std::ofstream profile_f; + profile_f.open(profile_path, std::ios::out | std::ios::trunc); + std::string profile_str; + profile_pb.SerializeToString(&profile_str); + profile_f << profile_str; + profile_f.close(); + return profile_pb; } - void Disable() {} + private: + std::mutex trace_mu_; + bool enabled_; + uint64_t start_ns_; + uint64_t end_ns_; + std::vector kernel_records_; + std::vector mem_records_; + std::vector cpu_records_; + std::unordered_map correlations_; }; void CreateTracer(DeviceTracer **t) { diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 0375c7439c29d4122e8ff6b58734dad4f504b7a2..83bc90911776cb1708b385ff546ee8fe66f2f94b 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" @@ -25,6 +28,12 @@ namespace platform { // WARN: Under Development. Don't depend on it yet. ////////////////////// +inline uint64_t PosixInNsec() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); +} + // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 2. Collect cuda statistics: start/end ts, memory, etc. diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 01de9d7041bf3eb40884e2a6295027cccfaebd2a..dc3f388f8928667fdd66ead5d706a4c0106c6781 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include -#include #include #include #include @@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() { .count(); } -inline uint64_t PosixInNsec() { - struct timeval tv; - gettimeofday(&tv, nullptr); - return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); -} - Event::Event(EventType type, std::string name, uint32_t thread_id, const DeviceContext* dev_ctx) : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) { diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 6a321ae024dcb50452bc4d96d7e7e70f590a42c6..a9f8a7f03b517f4c3455454fc3578d796bf11e94 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -218,7 +218,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """The profiler interface. Different from cuda_profiler, this profiler can be used to profile both CPU - and GPU program. By defalut, it records the CPU and GPU operator kernels, + and GPU program. By default, it records the CPU and GPU operator kernels, if you want to profile other program, you can refer the profiling tutorial to add more records in C++ code. @@ -231,7 +231,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): state (string) : The profiling state, which should be 'CPU' or 'GPU', telling the profiler to use CPU timer or GPU timer for profiling. Although users may have already specified the execution place - (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler + (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler would not inherit this place. sorted_key (string) : If None, the profiling results will be printed in the order of first end time of events. Otherwise, the profiling