提交 55b2d3d0 编写于 作者: X Xin Pan

Add CPU time to the timeline.

上级 9fbe2e36
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <map>
#include <mutex>
#include <numeric>
#include <thread>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
......@@ -140,6 +141,13 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno;
}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_);
......@@ -185,7 +193,6 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
......@@ -197,7 +204,15 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns);
}
for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
}
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
......@@ -242,6 +257,7 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
......@@ -254,6 +270,8 @@ class DeviceTracerDummy : public DeviceTracer {
void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}
......@@ -285,5 +303,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void ClearCurAnnotation() { cur_annotation = nullptr; }
const char *CurAnnotation() { return cur_annotation; }
} // namespace platform
} // namespace paddle
......@@ -36,6 +36,12 @@ class DeviceTracer {
uint32_t stream_id;
uint32_t correlation_id;
};
struct CPURecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint64_t thread_id;
};
virtual ~DeviceTracer() {}
// Needs to be called once before use.
......@@ -48,6 +54,9 @@ class DeviceTracer {
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end,
......@@ -67,6 +76,7 @@ DeviceTracer* GetDeviceTracer();
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();
// Current name of the operation being run in the thread.
const char* CurAnnotation();
} // namespace platform
} // namespace paddle
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <map>
#ifdef PADDLE_WITH_CUDA
......@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
.count();
}
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
......@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
}
RecordEvent::RecordEvent(const std::string& name,
const DeviceContext* dev_ctx) {
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
: start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
name_ = name;
......@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
}
......@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable();
tracer->GenProfile(profile_path);
}
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
}
void ParseEvents(std::vector<std::vector<Event>>& events,
......
......@@ -108,6 +108,7 @@ struct RecordEvent {
~RecordEvent();
uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_;
// Event name
......
......@@ -19,7 +19,8 @@ message Event {
optional string name = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
optional uint32 device_id = 5;
// When positive, it represents gpu id. When -1, it represents CPU.
optional int32 device_id = 5;
optional uint32 stream_id = 6;
}
......
......@@ -124,15 +124,22 @@ class Timeline(object):
if event.device_id not in self._devices:
pid = self._allocate_pid()
self._devices[event.device_id] = pid
self._chrome_trace.emit_pid("device:%s" % pid, pid)
if event.device_id >= 0:
self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
(pid, event.stream_id), pid)
elif event.device_id == -1:
self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
event.stream_id, pid)
def _allocate_events(self):
for event in self._profile_pb.events:
pid = self._devices[event.device_id]
args = {'name': event.name}
self._chrome_trace.emit_region(
event.start_ns, (event.end_ns - event.start_ns) / 1000000.0,
pid, 0, 'Op', event.name, args)
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region(event.start_ns,
(event.end_ns - event.start_ns) /
1.0, pid, 0, 'Op', event.name, args)
def generate_chrome_trace(self):
self._allocate_pids()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册