提交 55b2d3d0 编写于 作者: X Xin Pan

Add CPU time to the timeline.

上级 9fbe2e36
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <map> #include <map>
#include <mutex> #include <mutex>
#include <numeric> #include <numeric>
#include <thread>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -140,6 +141,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -140,6 +141,13 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno; correlations_[id] = anno;
} }
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) { uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
...@@ -185,7 +193,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -185,7 +193,6 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) { if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n"); fprintf(stderr, "cannot relate a kernel activity\n");
...@@ -197,7 +204,15 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -197,7 +204,15 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id); event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns); }
for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
} }
std::string profile_str; std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str); google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
...@@ -242,6 +257,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -242,6 +257,7 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t start_ns_; uint64_t start_ns_;
uint64_t end_ns_; uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_; std::vector<KernelRecord> kernel_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_; std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_; CUpti_SubscriberHandle subscriber_;
}; };
...@@ -254,6 +270,8 @@ class DeviceTracerDummy : public DeviceTracer { ...@@ -254,6 +270,8 @@ class DeviceTracerDummy : public DeviceTracer {
void AddAnnotation(uint64_t id, const std::string &anno) {} void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {} uint32_t stream_id, uint32_t correlation_id) {}
...@@ -285,5 +303,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; } ...@@ -285,5 +303,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void ClearCurAnnotation() { cur_annotation = nullptr; } void ClearCurAnnotation() { cur_annotation = nullptr; }
const char *CurAnnotation() { return cur_annotation; }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -36,6 +36,12 @@ class DeviceTracer { ...@@ -36,6 +36,12 @@ class DeviceTracer {
uint32_t stream_id; uint32_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
struct CPURecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint64_t thread_id;
};
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
...@@ -48,6 +54,9 @@ class DeviceTracer { ...@@ -48,6 +54,9 @@ class DeviceTracer {
// human-readable annotations. // human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end, virtual void AddKernelRecords(uint64_t start, uint64_t end,
...@@ -67,6 +76,7 @@ DeviceTracer* GetDeviceTracer(); ...@@ -67,6 +76,7 @@ DeviceTracer* GetDeviceTracer();
void SetCurAnnotation(const char* anno); void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done. // Clear the name after the operation is done.
void ClearCurAnnotation(); void ClearCurAnnotation();
// Current name of the operation being run in the thread.
const char* CurAnnotation();
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <iomanip> #include <iomanip>
#include <map> #include <map>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() { ...@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
.count(); .count();
} }
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventKind kind, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
...@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { ...@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
const DeviceContext* dev_ctx) { : start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx; dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
...@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name, ...@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
}
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(name_, dev_ctx_); PopEvent(name_, dev_ctx_);
} }
...@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
} }
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
} }
void ParseEvents(std::vector<std::vector<Event>>& events, void ParseEvents(std::vector<std::vector<Event>>& events,
......
...@@ -108,6 +108,7 @@ struct RecordEvent { ...@@ -108,6 +108,7 @@ struct RecordEvent {
~RecordEvent(); ~RecordEvent();
uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream. // The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_; const DeviceContext* dev_ctx_;
// Event name // Event name
......
...@@ -19,7 +19,8 @@ message Event { ...@@ -19,7 +19,8 @@ message Event {
optional string name = 1; optional string name = 1;
optional uint64 start_ns = 2; optional uint64 start_ns = 2;
optional uint64 end_ns = 3; optional uint64 end_ns = 3;
optional uint32 device_id = 5; // When positive, it represents gpu id. When -1, it represents CPU.
optional int32 device_id = 5;
optional uint32 stream_id = 6; optional uint32 stream_id = 6;
} }
......
...@@ -124,15 +124,22 @@ class Timeline(object): ...@@ -124,15 +124,22 @@ class Timeline(object):
if event.device_id not in self._devices: if event.device_id not in self._devices:
pid = self._allocate_pid() pid = self._allocate_pid()
self._devices[event.device_id] = pid self._devices[event.device_id] = pid
self._chrome_trace.emit_pid("device:%s" % pid, pid) if event.device_id >= 0:
self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
(pid, event.stream_id), pid)
elif event.device_id == -1:
self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
event.stream_id, pid)
def _allocate_events(self): def _allocate_events(self):
for event in self._profile_pb.events: for event in self._profile_pb.events:
pid = self._devices[event.device_id] pid = self._devices[event.device_id]
args = {'name': event.name} args = {'name': event.name}
self._chrome_trace.emit_region( # TODO(panyx0718): Chrome tracing only handles ms. However, some
event.start_ns, (event.end_ns - event.start_ns) / 1000000.0, # ops takes micro-seconds. Hence, we keep the ns here.
pid, 0, 'Op', event.name, args) self._chrome_trace.emit_region(event.start_ns,
(event.end_ns - event.start_ns) /
1.0, pid, 0, 'Op', event.name, args)
def generate_chrome_trace(self): def generate_chrome_trace(self):
self._allocate_pids() self._allocate_pids()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册