Merge pull request #8679 from panyx0718/test2

Add CPU time and MemCopy to the timeline.

Merge pull request #8679 from panyx0718/test2
Add CPU time and MemCopy to the timeline.
92974d41 · Xin Pan · GitHub · 0240bb77 · f3cbfc02 · 92974d41
7 changed file
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <map>
 #include <mutex>
 #include <numeric>
+#include <thread>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/string/printf.h"
@@ -54,6 +55,36 @@ uint64_t kAlignSize = 8;
    }                                                                      \
  } while (0)

+std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
+  switch (kind) {
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
+      return "MEMCPY_HtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
+      return "MEMCPY_DtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
+      return "MEMCPY_HtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
+      return "MEMCPY_AtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
+      return "MEMCPY_AtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
+      return "MEMCPY_AtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
+      return "MEMCPY_DtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
+      return "MEMCPY_DtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
+      return "MEMCPY_HtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
+      return "MEMCPY_PtoP";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
+      return "MEMCPY_FORCE_INT";
+    default:
+      break;
+  }
+  return "MEMCPY";
+}
+
 void EnableActivity() {
  // Device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call.
@@ -110,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                                     kernel->correlationId);
            break;
          }
+          case CUPTI_ACTIVITY_KIND_MEMCPY: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_MEMCPY2: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
          default: { break; }
        }
      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -140,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
    correlations_[id] = anno;
  }

+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    cpu_records_.push_back(
+        CPURecord{anno, start_ns, end_ns,
+                  std::hash<std::thread::id>{}(std::this_thread::get_id())});
+  }
+
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {
+    mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
+                                     stream_id, correlation_id, bytes});
+  }
+
  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                        uint32_t stream_id, uint32_t correlation_id) {
    std::lock_guard<std::mutex> l(trace_mu_);
@@ -175,7 +240,6 @@ class DeviceTracerImpl : public DeviceTracer {
    CUPTI_CALL(
        dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
                                     CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
-
    CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
    enabled_ = true;
  }
@@ -185,7 +249,6 @@ class DeviceTracerImpl : public DeviceTracer {
    proto::Profile profile_pb;
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
-    std::map<std::string, std::vector<uint64_t>> event_times;
    for (const KernelRecord &r : kernel_records_) {
      if (correlations_.find(r.correlation_id) == correlations_.end()) {
        fprintf(stderr, "cannot relate a kernel activity\n");
@@ -197,7 +260,24 @@ class DeviceTracerImpl : public DeviceTracer {
      event->set_end_ns(r.end_ns);
      event->set_stream_id(r.stream_id);
      event->set_device_id(r.device_id);
-      event_times[event->name()].push_back(r.end_ns - r.start_ns);
+    }
+
+    for (const CPURecord &r : cpu_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.thread_id);
+      event->set_device_id(-1);
+    }
+    for (const MemRecord &r : mem_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.stream_id);
+      event->set_device_id(r.device_id);
+      event->mutable_memcopy()->set_bytes(r.bytes);
    }
    std::string profile_str;
    google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
@@ -242,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
  uint64_t start_ns_;
  uint64_t end_ns_;
  std::vector<KernelRecord> kernel_records_;
+  std::vector<MemRecord> mem_records_;
+  std::vector<CPURecord> cpu_records_;
  std::unordered_map<uint32_t, std::string> correlations_;
  CUpti_SubscriberHandle subscriber_;
 };
@@ -254,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {

  void AddAnnotation(uint64_t id, const std::string &anno) {}

+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
+
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {}
+
  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                        uint32_t stream_id, uint32_t correlation_id) {}

@@ -285,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }

 void ClearCurAnnotation() { cur_annotation = nullptr; }

+const char *CurAnnotation() { return cur_annotation; }
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -36,6 +36,21 @@ class DeviceTracer {
    uint32_t stream_id;
    uint32_t correlation_id;
  };
+  struct CPURecord {
+    std::string name;
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint64_t thread_id;
+  };
+  struct MemRecord {
+    std::string name;
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint32_t device_id;
+    uint32_t stream_id;
+    uint32_t correlation_id;
+    uint64_t bytes;
+  };

  virtual ~DeviceTracer() {}
  // Needs to be called once before use.
@@ -48,6 +63,14 @@ class DeviceTracer {
  // human-readable annotations.
  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

+  virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
+                             uint64_t end_ns, uint32_t device_id,
+                             uint32_t stream_id, uint32_t correlation_id,
+                             uint64_t bytes) = 0;
+
+  virtual void AddCPURecords(const char* anno, uint64_t start_ns,
+                             uint64_t end_ns) = 0;
+
  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(uint64_t start, uint64_t end,
@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
 void SetCurAnnotation(const char* anno);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
-
+// Current name of the operation being run in the thread.
+const char* CurAnnotation();
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
  __macro(cuptiFinalize);                     \
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
-  __macro(cuptiEnableCallback);
+  __macro(cuptiEnableCallback);               \
+  __macro(cuptiEnableDomain);

 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);


--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
+#include <sys/time.h>
+#include <time.h>
 #include <iomanip>
 #include <map>
 #ifdef PADDLE_WITH_CUDA
@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
      .count();
 }

+inline uint64_t PosixInNsec() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
+}
+
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             const DeviceContext* dev_ctx)
    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }

-RecordEvent::RecordEvent(const std::string& name,
-                         const DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
+    : start_ns_(PosixInNsec()) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  name_ = name;
@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,

 RecordEvent::~RecordEvent() {
  if (g_state == ProfilerState::kDisabled) return;
+  DeviceTracer* tracer = GetDeviceTracer();
+  if (tracer) {
+    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
+  }
  ClearCurAnnotation();
  PopEvent(name_, dev_ctx_);
 }
@@ -207,15 +219,14 @@ void DisableProfiler(EventSortingKey sorted_key,
  Mark("_stop_profiler_", nullptr);
  g_state = ProfilerState::kDisabled;

+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
  DeviceTracer* tracer = GetDeviceTracer();
  if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
    tracer->Disable();
    tracer->GenProfile(profile_path);
  }
-
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, sorted_key);
-  ResetProfiler();
 }

 void ParseEvents(std::vector<std::vector<Event>>& events,

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -108,6 +108,7 @@ struct RecordEvent {

  ~RecordEvent();

+  uint64_t start_ns_;
  // The device context is used by Event to get the current cuda stream.
  const DeviceContext* dev_ctx_;
  // Event name

--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -15,12 +15,17 @@ limitations under the License. */
 syntax = "proto2";
 package paddle.platform.proto;

+message MemCopy { optional uint64 bytes = 3; }
+
 message Event {
  optional string name = 1;
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;
-  optional uint32 device_id = 5;
+  // When positive, it represents gpu id. When -1, it represents CPU.
+  optional int64 device_id = 5;
  optional uint32 stream_id = 6;
+
+  optional MemCopy memcopy = 7;
 }

 message Profile {

--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -124,15 +124,24 @@ class Timeline(object):
            if event.device_id not in self._devices:
                pid = self._allocate_pid()
                self._devices[event.device_id] = pid
-                self._chrome_trace.emit_pid("device:%s" % pid, pid)
+                if event.device_id >= 0:
+                    self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
+                                                (pid, event.stream_id), pid)
+                elif event.device_id == -1:
+                    self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
+                                                event.stream_id, pid)

    def _allocate_events(self):
        for event in self._profile_pb.events:
            pid = self._devices[event.device_id]
            args = {'name': event.name}
-            self._chrome_trace.emit_region(
-                event.start_ns, (event.end_ns - event.start_ns) / 1000000.0,
-                pid, 0, 'Op', event.name, args)
+            if event.memcopy.bytes > 0:
+                args = {'mem_bytes': event.memcopy.bytes}
+            # TODO(panyx0718): Chrome tracing only handles ms. However, some
+            # ops takes micro-seconds. Hence, we keep the ns here.
+            self._chrome_trace.emit_region(event.start_ns,
+                                           (event.end_ns - event.start_ns) /
+                                           1.0, pid, 0, 'Op', event.name, args)

    def generate_chrome_trace(self):
        self._allocate_pids()