提交 a6d30a86 编写于 作者: Q qiaolongfei

profiler support cpu

上级 95a2b5f5
...@@ -361,27 +361,116 @@ class DeviceTracerDummy : public DeviceTracer { ...@@ -361,27 +361,116 @@ class DeviceTracerDummy : public DeviceTracer {
public: public:
// Constructs the tracer; the body is intentionally empty.
DeviceTracerDummy() {
}
// Stores `anno` as the annotation for correlation id `id`, replacing any
// annotation previously stored under the same id.
void AddAnnotation(uint64_t id, const std::string &anno) {
  std::lock_guard<std::mutex> guard(trace_mu_);
  std::string &slot = correlations_[id];
  slot = anno;
}
void AddCPURecords(const std::string &anno, uint64_t start_ns, void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {} uint64_t end_ns, int64_t device_id, int64_t thread_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
}
void AddMemRecords(const std::string &name, uint64_t start_ns, void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t stream_id, uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) {} uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) {
VLOG(3) << name << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}
// Intentionally a no-op: all arguments are ignored and nothing is recorded.
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
                      int64_t stream_id, uint32_t correlation_id) {
}
// Returns the current value of the enabled flag, read under trace_mu_.
bool IsEnabled() {
  std::lock_guard<std::mutex> guard(trace_mu_);
  const bool is_on = enabled_;
  return is_on;
}
void Enable() {} void Enable() {
std::lock_guard<std::mutex> l(trace_mu_);
if (enabled_) {
return;
}
int64_t start_ns_ = PosixInNsec();
VLOG(3) << "start_ns_ = " << start_ns_;
enabled_ = true;
}
void Disable() {
std::lock_guard<std::mutex> l(trace_mu_);
uint64_t end_ns_ = PosixInNsec();
VLOG(3) << "end_ns_ = " << end_ns_;
enabled_ = false;
}
// Builds a proto::Profile from the recorded start/end timestamps and all
// kernel, CPU and memory records, writes its serialized form to
// `profile_path` (truncating any existing file), and returns the profile.
proto::Profile GenProfile(const std::string &profile_path) {
  std::lock_guard<std::mutex> guard(trace_mu_);

  proto::Profile profile_pb;
  profile_pb.set_start_ns(start_ns_);
  profile_pb.set_end_ns(end_ns_);

  // A kernel event is named after the annotation stored under its correlation
  // id; kernels with no such annotation are reported on stderr and skipped.
  for (const KernelRecord &kernel : kernel_records_) {
    const auto annotation = correlations_.find(kernel.correlation_id);
    if (annotation == correlations_.end()) {
      fprintf(stderr, "cannot relate a kernel activity\n");
      continue;
    }
    auto *kernel_event = profile_pb.add_events();
    kernel_event->set_type(proto::Event::GPUKernel);
    kernel_event->set_name(annotation->second);
    kernel_event->set_start_ns(kernel.start_ns);
    kernel_event->set_end_ns(kernel.end_ns);
    kernel_event->set_sub_device_id(kernel.stream_id);
    kernel_event->set_device_id(kernel.device_id);
  }

  // CPU events use the thread id as their sub-device id.
  for (const CPURecord &cpu : cpu_records_) {
    auto *cpu_event = profile_pb.add_events();
    cpu_event->set_type(proto::Event::CPU);
    cpu_event->set_name(cpu.name);
    cpu_event->set_start_ns(cpu.start_ns);
    cpu_event->set_end_ns(cpu.end_ns);
    cpu_event->set_sub_device_id(cpu.thread_id);
    cpu_event->set_device_id(cpu.device_id);
  }

  // Memory events are emitted with the GPUKernel type plus a memcopy payload
  // carrying the byte count.
  for (const MemRecord &mem : mem_records_) {
    auto *mem_event = profile_pb.add_events();
    mem_event->set_type(proto::Event::GPUKernel);
    mem_event->set_name(mem.name);
    mem_event->set_start_ns(mem.start_ns);
    mem_event->set_end_ns(mem.end_ns);
    mem_event->set_sub_device_id(mem.stream_id);
    mem_event->set_device_id(mem.device_id);
    mem_event->mutable_memcopy()->set_bytes(mem.bytes);
  }

  std::ofstream out(profile_path, std::ios::out | std::ios::trunc);
  std::string serialized;
  profile_pb.SerializeToString(&serialized);
  out << serialized;
  out.close();
  return profile_pb;
}
void Disable() {} private:
// Locked by every method that reads or writes the members below.
std::mutex trace_mu_;
// The scalar members carry default initializers because the constructor body
// is empty; without them IsEnabled()/Enable() would read an indeterminate
// value on a freshly constructed tracer.
bool enabled_{false};
uint64_t start_ns_{0};
uint64_t end_ns_{0};
std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
// Annotation text keyed by correlation id.
std::unordered_map<uint32_t, std::string> correlations_;
}; };
void CreateTracer(DeviceTracer **t) { void CreateTracer(DeviceTracer **t) {
......
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <sys/time.h>
#include <time.h>
#include <chrono> // NOLINT
#include <string> #include <string>
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
...@@ -25,6 +28,12 @@ namespace platform { ...@@ -25,6 +28,12 @@ namespace platform {
// WARN: Under Development. Don't depend on it yet. // WARN: Under Development. Don't depend on it yet.
////////////////////// //////////////////////
// Returns the time reported by gettimeofday() converted to nanoseconds.
// The underlying clock has microsecond resolution, so the result is always a
// multiple of 1000.
inline uint64_t PosixInNsec() {
  constexpr uint64_t kUsecPerSec = 1000000;
  constexpr uint64_t kNsecPerUsec = 1000;

  struct timeval now;
  gettimeofday(&now, nullptr);
  const uint64_t usec_total =
      static_cast<uint64_t>(now.tv_sec) * kUsecPerSec + now.tv_usec;
  return kNsecPerUsec * usec_total;
}
// DeviceTracer performs the following tasks: // DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc. // 2. Collect cuda statistics: start/end ts, memory, etc.
......
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <sys/time.h> #include <sys/time.h>
#include <time.h>
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() { ...@@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() {
.count(); .count();
} }
// Returns the time reported by gettimeofday() converted to nanoseconds.
// The underlying clock has microsecond resolution, so the result is always a
// multiple of 1000.
inline uint64_t PosixInNsec() {
  constexpr uint64_t kUsecPerSec = 1000000;
  constexpr uint64_t kNsecPerUsec = 1000;

  struct timeval now;
  gettimeofday(&now, nullptr);
  const uint64_t usec_total =
      static_cast<uint64_t>(now.tv_sec) * kUsecPerSec + now.tv_usec;
  return kNsecPerUsec * usec_total;
}
Event::Event(EventType type, std::string name, uint32_t thread_id, Event::Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) { : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
......
...@@ -218,7 +218,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): ...@@ -218,7 +218,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
def profiler(state, sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
"""The profiler interface. """The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU program. By defalut, it records the CPU and GPU operator kernels, and GPU program. By default, it records the CPU and GPU operator kernels,
if you want to profile other program, you can refer the profiling tutorial if you want to profile other program, you can refer the profiling tutorial
to add more records in C++ code. to add more records in C++ code.
...@@ -231,7 +231,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -231,7 +231,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
state (string) : The profiling state, which should be 'CPU' or 'GPU', state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling. telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the begining, for flexibility the profiler (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
would not inherit this place. would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling in the order of first end time of events. Otherwise, the profiling
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册