未验证 提交 92974d41 编写于 作者: X Xin Pan 提交者: GitHub

Merge pull request #8679 from panyx0718/test2

Add CPU time and MemCopy to the timeline.
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <map> #include <map>
#include <mutex> #include <mutex>
#include <numeric> #include <numeric>
#include <thread>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -54,6 +55,36 @@ uint64_t kAlignSize = 8; ...@@ -54,6 +55,36 @@ uint64_t kAlignSize = 8;
} \ } \
} while (0) } while (0)
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
switch (kind) {
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
return "MEMCPY_HtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
return "MEMCPY_DtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
return "MEMCPY_HtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
return "MEMCPY_AtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
return "MEMCPY_AtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
return "MEMCPY_AtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
return "MEMCPY_DtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
return "MEMCPY_DtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
return "MEMCPY_HtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
return "MEMCPY_PtoP";
case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
return "MEMCPY_FORCE_INT";
default:
break;
}
return "MEMCPY";
}
void EnableActivity() { void EnableActivity() {
// Device activity record is created when CUDA initializes, so we // Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call. // want to enable it before cuInit() or any CUDA runtime call.
...@@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
kernel->correlationId); kernel->correlationId);
break; break;
} }
case CUPTI_ACTIVITY_KIND_MEMCPY: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
default: { break; } default: { break; }
} }
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
...@@ -140,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -140,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno; correlations_[id] = anno;
} }
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) { uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
...@@ -175,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -175,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
CUPTI_CALL( CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
enabled_ = true; enabled_ = true;
} }
...@@ -185,7 +249,6 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -185,7 +249,6 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) { if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n"); fprintf(stderr, "cannot relate a kernel activity\n");
...@@ -197,7 +260,24 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -197,7 +260,24 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id); event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns); }
for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
}
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
} }
std::string profile_str; std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str); google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
...@@ -242,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -242,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t start_ns_; uint64_t start_ns_;
uint64_t end_ns_; uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_; std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_; std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_; CUpti_SubscriberHandle subscriber_;
}; };
...@@ -254,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer { ...@@ -254,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {
void AddAnnotation(uint64_t id, const std::string &anno) {} void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {} uint32_t stream_id, uint32_t correlation_id) {}
...@@ -285,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; } ...@@ -285,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void ClearCurAnnotation() { cur_annotation = nullptr; } void ClearCurAnnotation() { cur_annotation = nullptr; }
const char *CurAnnotation() { return cur_annotation; }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -36,6 +36,21 @@ class DeviceTracer { ...@@ -36,6 +36,21 @@ class DeviceTracer {
uint32_t stream_id; uint32_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
struct CPURecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint64_t thread_id;
};
struct MemRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
uint32_t correlation_id;
uint64_t bytes;
};
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
...@@ -48,6 +63,14 @@ class DeviceTracer { ...@@ -48,6 +63,14 @@ class DeviceTracer {
// human-readable annotations. // human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id,
uint64_t bytes) = 0;
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end, virtual void AddKernelRecords(uint64_t start, uint64_t end,
...@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer(); ...@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
void SetCurAnnotation(const char* anno); void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done. // Clear the name after the operation is done.
void ClearCurAnnotation(); void ClearCurAnnotation();
// Current name of the operation being run in the thread.
const char* CurAnnotation();
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -74,7 +74,8 @@ extern void *cupti_dso_handle; ...@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
__macro(cuptiFinalize); \ __macro(cuptiFinalize); \
__macro(cuptiSubscribe); \ __macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \ __macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback); __macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <iomanip> #include <iomanip>
#include <map> #include <map>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() { ...@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
.count(); .count();
} }
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventKind kind, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
...@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { ...@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
const DeviceContext* dev_ctx) { : start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx; dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
...@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name, ...@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
}
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(name_, dev_ctx_); PopEvent(name_, dev_ctx_);
} }
...@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
} }
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
} }
void ParseEvents(std::vector<std::vector<Event>>& events, void ParseEvents(std::vector<std::vector<Event>>& events,
......
...@@ -108,6 +108,7 @@ struct RecordEvent { ...@@ -108,6 +108,7 @@ struct RecordEvent {
~RecordEvent(); ~RecordEvent();
uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream. // The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_; const DeviceContext* dev_ctx_;
// Event name // Event name
......
...@@ -15,12 +15,17 @@ limitations under the License. */ ...@@ -15,12 +15,17 @@ limitations under the License. */
syntax = "proto2"; syntax = "proto2";
package paddle.platform.proto; package paddle.platform.proto;
message MemCopy { optional uint64 bytes = 3; }
message Event { message Event {
optional string name = 1; optional string name = 1;
optional uint64 start_ns = 2; optional uint64 start_ns = 2;
optional uint64 end_ns = 3; optional uint64 end_ns = 3;
optional uint32 device_id = 5; // When positive, it represents gpu id. When -1, it represents CPU.
optional int64 device_id = 5;
optional uint32 stream_id = 6; optional uint32 stream_id = 6;
optional MemCopy memcopy = 7;
} }
message Profile { message Profile {
......
...@@ -124,15 +124,24 @@ class Timeline(object): ...@@ -124,15 +124,24 @@ class Timeline(object):
if event.device_id not in self._devices: if event.device_id not in self._devices:
pid = self._allocate_pid() pid = self._allocate_pid()
self._devices[event.device_id] = pid self._devices[event.device_id] = pid
self._chrome_trace.emit_pid("device:%s" % pid, pid) if event.device_id >= 0:
self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
(pid, event.stream_id), pid)
elif event.device_id == -1:
self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
event.stream_id, pid)
def _allocate_events(self): def _allocate_events(self):
for event in self._profile_pb.events: for event in self._profile_pb.events:
pid = self._devices[event.device_id] pid = self._devices[event.device_id]
args = {'name': event.name} args = {'name': event.name}
self._chrome_trace.emit_region( if event.memcopy.bytes > 0:
event.start_ns, (event.end_ns - event.start_ns) / 1000000.0, args = {'mem_bytes': event.memcopy.bytes}
pid, 0, 'Op', event.name, args) # TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region(event.start_ns,
(event.end_ns - event.start_ns) /
1.0, pid, 0, 'Op', event.name, args)
def generate_chrome_trace(self): def generate_chrome_trace(self):
self._allocate_pids() self._allocate_pids()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册