device_tracer.cc 33.1 KB
Newer Older
1
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2 3 4 5 6 7 8 9 10 11 12 13

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
14

15
#include "paddle/phi/api/profiler/device_tracer.h"
16

17
#include <deque>
18
#include <forward_list>
X
Xin Pan 已提交
19
#include <fstream>
20 21 22 23
#include <mutex>  // NOLINT
#include <string>
#include <thread>  // NOLINT

24
#include "glog/logging.h"
25
#include "paddle/phi/core/enforce.h"
26

27 28
DECLARE_bool(enable_host_event_recorder_hook);

29
namespace phi {
30 31 32 33

// Used only by DeviceTracer
uint64_t GetThreadIdFromSystemThreadId(uint32_t id);

34
namespace {
X
Xin Pan 已提交
35
// Tracking the nested block stacks of each thread.
W
Wilber 已提交
36 37 38 39 40 41
#ifdef PADDLE_WITH_SW
// sw not supported thread_local
std::deque<int> block_id_stack;
std::deque<Event *> annotation_stack;
#else
// Tracking the nested event stacks.
X
Xin Pan 已提交
42 43
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
44
thread_local std::deque<Event *> annotation_stack;
W
Wilber 已提交
45
#endif
W
wangchaochaohu 已提交
46 47 48
// stack to strore event sunch as pe and so on
static std::deque<Event *> main_thread_annotation_stack{};
static std::deque<std::string> main_thread_annotation_stack_name{};
49

50 51
std::map<uint32_t, uint64_t> system_thread_id_map;
std::mutex system_thread_id_map_mutex;
52 53 54

std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr;
55 56 57 58 59

void PrintCuptiHint() {
  static bool showed = false;
  if (showed) return;
  showed = true;
T
tianshuo78520a 已提交
60
  LOG(WARNING) << "Invalid timestamp occurred. Please try increasing the "
61 62 63
                  "FLAGS_multiple_of_cupti_buffer_size.";
}

64 65 66 67
}  // namespace
#ifdef PADDLE_WITH_CUPTI

namespace {
68 69 70
// The experimental best performance is
// the same size with CUPTI device buffer size(8M)
uint64_t kBufSize = 1024 * 1024 * 8;
71
uint64_t kAlignSize = 8;
72 73
std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
    driver_cbid_str;
74 75 76 77 78 79

#define ALIGN_BUFFER(buffer, align)                                 \
  (((uintptr_t)(buffer) & ((align)-1))                              \
       ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \
       : (buffer))

80 81 82 83 84 85 86 87 88 89 90 91 92 93
#define CUPTI_CALL(call)                                           \
  do {                                                             \
    CUptiResult _status = call;                                    \
    if (_status != CUPTI_SUCCESS) {                                \
      const char *errstr;                                          \
      dynload::cuptiGetResultString(_status, &errstr);             \
      fprintf(stderr,                                              \
              "%s:%d: error: function %s failed with error %s.\n", \
              __FILE__,                                            \
              __LINE__,                                            \
              #call,                                               \
              errstr);                                             \
      exit(-1);                                                    \
    }                                                              \
94 95
  } while (0)

X
Xin Pan 已提交
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
  switch (kind) {
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
      return "MEMCPY_HtoD";
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
      return "MEMCPY_DtoH";
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
      return "MEMCPY_HtoA";
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
      return "MEMCPY_AtoH";
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
      return "MEMCPY_AtoA";
    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
      return "MEMCPY_AtoD";
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
      return "MEMCPY_DtoA";
    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
      return "MEMCPY_DtoD";
    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
      return "MEMCPY_HtoH";
    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
      return "MEMCPY_PtoP";
    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
      return "MEMCPY_FORCE_INT";
    default:
      break;
  }
  return "MEMCPY";
}

126 127 128 129 130 131 132 133 134 135 136 137 138 139
std::string DriverKind(CUpti_CallbackId cbid) {
  auto iter = driver_cbid_str.find(cbid);
  if (iter == driver_cbid_str.end())
    return "Driver API " + std::to_string(cbid);
  return iter->second;
}

std::string RuntimeKind(CUpti_CallbackId cbid) {
  auto iter = runtime_cbid_str.find(cbid);
  if (iter == runtime_cbid_str.end())
    return "Runtime API " + std::to_string(cbid);
  return iter->second;
}

140 141 142 143
void EnableActivity() {
  // Device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call.
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
144 145 146 147 148
  CUPTI_CALL(
      dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
149
  // We don't track these activities for now.
D
Dun 已提交
150
  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
151 152
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
153 154 155 156 157 158 159 160 161
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
}

void DisableActivity() {
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
162 163 164
  CUPTI_CALL(
      dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
165
  // Disable all other activity record kinds.
166
  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
167 168
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
D
Dun 已提交
169
  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
170 171 172
  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
  // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
173 174
}

175 176
void CUPTIAPI bufferRequested(uint8_t **buffer,
                              size_t *size,
177
                              size_t *maxNumRecords) {
178
  uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
179 180 181 182 183
  *size = kBufSize;
  *buffer = ALIGN_BUFFER(buf, kAlignSize);
  *maxNumRecords = 0;
}

184 185 186 187 188
void CUPTIAPI bufferCompleted(CUcontext ctx,
                              uint32_t streamId,
                              uint8_t *buffer,
                              size_t size,
                              size_t validSize) {
189 190 191
  static std::thread::id cupti_thread_id(0);
  if (cupti_thread_id == std::thread::id(0))
    cupti_thread_id = std::this_thread::get_id();
G
GaoWei8 已提交
192
  PADDLE_ENFORCE_EQ(
193 194
      std::this_thread::get_id(),
      cupti_thread_id,
195
      errors::PermissionDenied(
G
GaoWei8 已提交
196
          "Only one thread is allowed to call bufferCompleted()."));
197 198 199 200 201 202 203 204 205
  CUptiResult status;
  CUpti_Activity *record = NULL;
  if (validSize > 0) {
    do {
      status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record);
      if (status == CUPTI_SUCCESS) {
        switch (record->kind) {
          case CUPTI_ACTIVITY_KIND_KERNEL:
          case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
W
wangchaochaohu 已提交
206 207 208 209
#if CUDA_VERSION >= 9000
            auto *kernel =
                reinterpret_cast<const CUpti_ActivityKernel4 *>(record);
#else
210 211
            auto *kernel =
                reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
W
wangchaochaohu 已提交
212
#endif
213 214 215 216 217
            tracer->AddKernelRecords(kernel->name,
                                     kernel->start,
                                     kernel->end,
                                     kernel->deviceId,
                                     kernel->streamId,
218 219 220
                                     kernel->correlationId);
            break;
          }
X
Xin Pan 已提交
221 222 223 224 225 226
          case CUPTI_ACTIVITY_KIND_MEMCPY: {
            auto *memcpy =
                reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
            tracer->AddMemRecords(
                MemcpyKind(
                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
227 228 229 230 231 232
                memcpy->start,
                memcpy->end,
                memcpy->deviceId,
                memcpy->streamId,
                memcpy->correlationId,
                memcpy->bytes);
X
Xin Pan 已提交
233 234 235 236 237 238 239 240
            break;
          }
          case CUPTI_ACTIVITY_KIND_MEMCPY2: {
            auto *memcpy =
                reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
            tracer->AddMemRecords(
                MemcpyKind(
                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
241 242 243 244 245 246
                memcpy->start,
                memcpy->end,
                memcpy->deviceId,
                memcpy->streamId,
                memcpy->correlationId,
                memcpy->bytes);
X
Xin Pan 已提交
247 248
            break;
          }
D
Dun 已提交
249 250 251
          case CUPTI_ACTIVITY_KIND_MEMSET: {
            auto *memset =
                reinterpret_cast<const CUpti_ActivityMemset *>(record);
252 253 254 255 256
            tracer->AddKernelRecords("MEMSET",
                                     memset->start,
                                     memset->end,
                                     memset->deviceId,
                                     memset->streamId,
D
Dun 已提交
257 258 259
                                     memset->correlationId);
            break;
          }
260 261
          case CUPTI_ACTIVITY_KIND_DRIVER: {
            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
262 263 264
            if (api->start != 0 && api->end != 0) {
              // -1 device id represents ActiveKind api call
              tracer->AddActiveKindRecords(
265 266 267 268
                  DriverKind(api->cbid),
                  api->start,
                  api->end,
                  -1,
269 270 271
                  GetThreadIdFromSystemThreadId(api->threadId),
                  api->correlationId);
            }
272 273 274 275
            break;
          }
          case CUPTI_ACTIVITY_KIND_RUNTIME: {
            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
276 277 278
            if (api->start != 0 && api->end != 0) {
              // -1 device id represents ActiveKind api call
              tracer->AddActiveKindRecords(
279 280 281 282
                  RuntimeKind(api->cbid),
                  api->start,
                  api->end,
                  -1,
283 284 285
                  GetThreadIdFromSystemThreadId(api->threadId),
                  api->correlationId);
            }
286 287
            break;
          }
288 289 290
          default: {
            break;
          }
291 292 293 294 295 296 297 298 299 300 301 302 303 304
        }
      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
        // Seems not an error in this case.
        break;
      } else {
        CUPTI_CALL(status);
      }
    } while (1);

    size_t dropped;
    CUPTI_CALL(
        dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    if (dropped != 0) {
      fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
305
      PrintCuptiHint();
306 307 308 309
    }
  }
  free(buffer);
}
310 311 312

void initCuptiCbidStr();

313 314
}  // namespace

Q
qiaolongfei 已提交
315 316
#endif  // PADDLE_WITH_CUPTI

317 318
class DeviceTracerImpl : public DeviceTracer {
 public:
319 320 321 322 323
  DeviceTracerImpl() : enabled_(false) {
#ifdef PADDLE_WITH_CUPTI
    initCuptiCbidStr();
#endif
  }
324

325
  void AddAnnotation(uint32_t id, Event *event) {
W
Wilber 已提交
326 327 328 329
#ifdef PADDLE_WITH_SW
    std::forward_list<std::pair<uint32_t, Event *>> *local_correlations_pairs =
        nullptr;
#else
330 331
    thread_local std::forward_list<std::pair<uint32_t, Event *>>
        *local_correlations_pairs = nullptr;
W
Wilber 已提交
332
#endif
333 334 335 336 337 338
    if (local_correlations_pairs == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      correlations_pairs.emplace_front();
      local_correlations_pairs = &correlations_pairs.front();
    }
    local_correlations_pairs->push_front(std::make_pair(id, event));
339 340
  }

341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
  void AddAnnotations(const std::map<uint64_t, ThreadEvents> &thr_events) {
    for (auto &tmp : active_kind_records_) {
      for (const ActiveKindRecord &r : tmp) {
        auto iter = thr_events.find(r.thread_id);
        if (iter == thr_events.end()) {
          VLOG(10) << __func__ << " " << r.name
                   << " Missing tid: " << r.thread_id;
          continue;
        }
        const ThreadEvents &evts = iter->second;
        auto evt_iter = evts.upper_bound(r.end_ns);
        if (evt_iter == evts.end()) {
          VLOG(10) << __func__ << " Missing Record " << r.name
                   << " tid: " << r.thread_id << " end_ns: " << r.end_ns;
          continue;
        }
        if (evt_iter != evts.begin()) {
          auto prev_iter = std::prev(evt_iter);
          if (prev_iter->first >= r.end_ns) {
            evt_iter = prev_iter;
          } else {
            VLOG(10) << __func__ << " prev end_ns " << prev_iter->first
                     << " end_ns: " << r.end_ns;
          }
        }
        Event *evt = evt_iter->second.first;
        uint64_t start_ns = evt_iter->second.second;
        if (start_ns > r.start_ns) {
          VLOG(10) << __func__ << " Mismatch Record " << r.name
                   << " tid: " << r.thread_id << " start_ns: " << r.start_ns
                   << " end_ns: " << r.end_ns << ", event " << evt->name()
                   << " start_ns: " << start_ns;
          continue;
        }
        VLOG(10) << __func__ << " tid: " << r.thread_id << " Add correlation "
                 << r.correlation_id << "<->" << evt->name();
        AddAnnotation(r.correlation_id, evt);
      }
    }
  }

382 383 384 385 386
  void AddCPURecords(const std::string &anno,
                     uint64_t start_ns,
                     uint64_t end_ns,
                     int64_t device_id,
                     uint64_t thread_id) {
X
Xin Pan 已提交
387
    if (anno.empty()) {
M
minqiyang 已提交
388
      VLOG(1) << "Empty timeline annotation.";
389 390
      return;
    }
W
Wilber 已提交
391 392 393
#ifdef PADDLE_WITH_SW
    std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
#else
394
    thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
W
Wilber 已提交
395
#endif
396 397 398 399 400 401
    if (local_cpu_records_ == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      cpu_records_.emplace_front();
      local_cpu_records_ = &cpu_records_.front();
    }
    local_cpu_records_->push_front(
X
Xin Pan 已提交
402
        CPURecord{anno, start_ns, end_ns, device_id, thread_id});
X
Xin Pan 已提交
403 404
  }

405 406 407 408 409 410 411
  void AddMemRecords(const std::string &name,
                     uint64_t start_ns,
                     uint64_t end_ns,
                     int64_t device_id,
                     int64_t stream_id,
                     uint32_t correlation_id,
                     uint64_t bytes) {
X
Xin Pan 已提交
412
    // 0 means timestamp information could not be collected for the kernel.
413
    if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
M
minqiyang 已提交
414
      VLOG(3) << name << " cannot be traced";
415
      PrintCuptiHint();
X
Xin Pan 已提交
416 417
      return;
    }
418
    // NOTE(liangdun): lock is not needed, only one thread call this function.
419 420
    mem_records_.push_front(MemRecord{
        name, start_ns, end_ns, device_id, stream_id, correlation_id, bytes});
X
Xin Pan 已提交
421 422
  }

423 424 425 426 427 428 429
  void AddMemInfoRecord(uint64_t start_ns,
                        uint64_t end_ns,
                        size_t bytes,
                        const Place &place,
                        const std::string &alloc_in,
                        const std::string &free_in,
                        uint64_t thread_id) {
C
chengduo 已提交
430 431 432 433
    if (0 == start_ns || 0 == end_ns) {
      VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
      return;
    }
W
Wilber 已提交
434 435 436
#ifdef PADDLE_WITH_SW
    std::forward_list<MemInfoRecord> *local_mem_info_record = nullptr;
#else
C
chengduo 已提交
437 438
    thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
        nullptr;
W
Wilber 已提交
439
#endif
C
chengduo 已提交
440 441 442 443 444 445 446 447 448
    if (local_mem_info_record == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      mem_info_record_.emplace_front();
      local_mem_info_record = &mem_info_record_.front();
    }
    local_mem_info_record->emplace_front(MemInfoRecord{
        start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
  }

449 450 451 452 453 454
  void AddActiveKindRecords(const std::string &anno,
                            uint64_t start_ns,
                            uint64_t end_ns,
                            int64_t device_id,
                            uint64_t thread_id,
                            uint32_t correlation_id) {
455 456 457 458
    if (anno.empty()) {
      VLOG(1) << "Empty timeline annotation.";
      return;
    }
W
Wilber 已提交
459 460 461
#ifdef PADDLE_WITH_SW
    std::forward_list<ActiveKindRecord> *local_active_kind_records = nullptr;
#else
462 463
    thread_local std::forward_list<ActiveKindRecord>
        *local_active_kind_records = nullptr;
W
Wilber 已提交
464
#endif
465 466 467 468 469 470 471 472 473 474
    if (local_active_kind_records == nullptr) {
      std::lock_guard<std::mutex> l(trace_mu_);
      active_kind_records_.emplace_front();
      local_active_kind_records = &active_kind_records_.front();
    }
    //  lock is not needed, only one thread call this function.
    local_active_kind_records->push_front(ActiveKindRecord{
        anno, start_ns, end_ns, device_id, thread_id, correlation_id});
  }

475 476 477 478 479
  void AddKernelRecords(std::string name,
                        uint64_t start,
                        uint64_t end,
                        int64_t device_id,
                        int64_t stream_id,
Z
ZongwuYang 已提交
480
                        uint32_t correlation_id) {
X
Xin Pan 已提交
481
    // 0 means timestamp information could not be collected for the kernel.
482
    if (start == 0 || end == 0 || start == end) {
M
minqiyang 已提交
483
      VLOG(3) << correlation_id << " cannot be traced";
484
      PrintCuptiHint();
X
Xin Pan 已提交
485 486
      return;
    }
487 488
    // NOTE(liangdun): lock is not needed, only one thread call this function.
    kernel_records_.push_front(
Z
ZongwuYang 已提交
489
        KernelRecord{name, start, end, device_id, stream_id, correlation_id});
490 491 492 493 494 495 496 497 498 499 500 501
  }

  bool IsEnabled() {
    std::lock_guard<std::mutex> l(trace_mu_);
    return enabled_;
  }

  void Enable() {
    std::lock_guard<std::mutex> l(trace_mu_);
    if (enabled_) {
      return;
    }
Q
qiaolongfei 已提交
502 503

#ifdef PADDLE_WITH_CUPTI
504 505 506 507 508 509 510 511 512 513 514 515 516 517
    EnableActivity();

    // Register callbacks for buffer requests and completed by CUPTI.
    CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested,
                                                       bufferCompleted));

    CUptiResult ret;
    ret = dynload::cuptiSubscribe(
        &subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
    if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
      fprintf(stderr, "CUPTI subcriber limit reached.\n");
    } else if (ret != CUPTI_SUCCESS) {
      fprintf(stderr, "Failed to create CUPTI subscriber.\n");
    }
518
    const std::vector<int> runtime_cbids {
519
      CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
520
          CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020,
521
          CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
D
Dun 已提交
522 523
          CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020,
          CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020,
524 525 526 527 528 529 530 531
          CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
#if CUDA_VERSION >= 9000
          ,
          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
#endif
    };
532 533 534 535
    const std::vector<int> driver_cbids{CUPTI_DRIVER_TRACE_CBID_cuLaunch,
                                        CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid,
                                        CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel};
    for (auto cbid : runtime_cbids)
536 537
      CUPTI_CALL(dynload::cuptiEnableCallback(
          1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
538 539 540
    for (auto cbid : driver_cbids)
      CUPTI_CALL(dynload::cuptiEnableCallback(
          1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
541
    CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
Q
qiaolongfei 已提交
542
#endif  // PADDLE_WITH_CUPTI
543 544 545
    enabled_ = true;
  }

546 547 548 549 550 551 552 553 554 555 556
  void Reset() {
#ifdef PADDLE_WITH_CUPTI
    CUPTI_CALL(
        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
#endif
    std::lock_guard<std::mutex> l(trace_mu_);
    kernel_records_.clear();
    mem_records_.clear();
    correlations_.clear();
    for (auto &tmp : correlations_pairs) tmp.clear();
    for (auto &tmp : cpu_records_) tmp.clear();
C
chengduo 已提交
557
    for (auto &tmp : mem_info_record_) tmp.clear();
558
    for (auto &tmp : active_kind_records_) tmp.clear();
559 560 561 562 563 564 565 566 567 568 569
  }

  void GenEventKernelCudaElapsedTime() {
#ifdef PADDLE_WITH_CUPTI
    if (correlations_.empty())
      for (auto &tmp : correlations_pairs)
        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
    for (const KernelRecord &r : kernel_records_) {
      auto c = correlations_.find(r.correlation_id);
      if (c != correlations_.end() && c->second != nullptr) {
        Event *e = c->second;
570 571 572 573 574
        Event *parent = e->parent();
        while (parent) {
          parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
          parent = parent->parent();
        }
575 576 577 578 579 580 581
        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
      }
    }
    for (const auto &r : mem_records_) {
      auto c = correlations_.find(r.correlation_id);
      if (c != correlations_.end() && c->second != nullptr) {
        Event *e = c->second;
582 583 584 585 586
        Event *parent = e->parent();
        while (parent) {
          parent->AddCudaElapsedTime(r.start_ns, r.end_ns);
          parent = parent->parent();
        }
587 588 589 590 591 592
        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
      }
    }
#endif
  }

X
Xin Pan 已提交
593
  proto::Profile GenProfile(const std::string &profile_path) {
H
Huihuang Zheng 已提交
594 595 596 597 598 599 600 601 602 603
    proto::Profile profile_pb = this->GetProfile();
    std::ofstream profile_f;
    profile_f.open(profile_path,
                   std::ios::out | std::ios::trunc | std::ios::binary);
    profile_pb.SerializeToOstream(&profile_f);
    profile_f.close();
    return profile_pb;
  }

  proto::Profile GetProfile() {
604
    int miss = 0, find = 0;
605 606 607 608
    std::lock_guard<std::mutex> l(trace_mu_);
    proto::Profile profile_pb;
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
C
chengduo 已提交
609 610
    if (correlations_.empty()) {
      for (auto &tmp : correlations_pairs) {
611
        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
C
chengduo 已提交
612 613 614
      }
    }

615 616
    for (const KernelRecord &r : kernel_records_) {
      auto *event = profile_pb.add_events();
X
Xin Pan 已提交
617
      event->set_type(proto::Event::GPUKernel);
618 619 620
      auto c = correlations_.find(r.correlation_id);
      if (c != correlations_.end() && c->second != nullptr) {
        event->set_name(c->second->name());
Y
Yuang Liu 已提交
621
        event->set_detail_info(c->second->attr());
622
        find++;
Z
ZongwuYang 已提交
623
      } else {
624
        VLOG(10) << __func__ << " Missing Kernel Event: " + r.name;
625
        miss++;
Z
ZongwuYang 已提交
626 627
        event->set_name(r.name);
      }
628 629
      event->set_start_ns(r.start_ns);
      event->set_end_ns(r.end_ns);
X
Xin Pan 已提交
630
      event->set_sub_device_id(r.stream_id);
631
      event->set_device_id(r.device_id);
X
Xin Pan 已提交
632
    }
633 634
    VLOG(1) << __func__ << " KernelRecord event miss: " << miss
            << " find: " << find;
C
chengduo 已提交
635

636
    for (auto &tmp : cpu_records_) {
637 638 639 640 641 642 643 644 645
      for (const CPURecord &r : tmp) {
        auto *event = profile_pb.add_events();
        event->set_type(proto::Event::CPU);
        event->set_name(r.name);
        event->set_start_ns(r.start_ns);
        event->set_end_ns(r.end_ns);
        event->set_sub_device_id(r.thread_id);
        event->set_device_id(r.device_id);
      }
646
    }
C
chengduo 已提交
647

648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664
    for (auto &tmp : active_kind_records_) {
      for (const ActiveKindRecord &r : tmp) {
        auto *event = profile_pb.add_events();
        event->set_type(proto::Event::CPU);
        auto c = correlations_.find(r.correlation_id);
        if (c != correlations_.end() && c->second != nullptr) {
          event->set_name(c->second->name());
          event->set_detail_info(r.name);
        } else {
          event->set_name(r.name);
        }
        event->set_start_ns(r.start_ns);
        event->set_end_ns(r.end_ns);
        event->set_sub_device_id(r.thread_id);
        event->set_device_id(r.device_id);
      }
    }
665
    miss = find = 0;
X
Xin Pan 已提交
666 667
    for (const MemRecord &r : mem_records_) {
      auto *event = profile_pb.add_events();
X
Xin Pan 已提交
668
      event->set_type(proto::Event::GPUKernel);
669 670 671 672 673 674 675 676 677
      auto c = correlations_.find(r.correlation_id);
      if (c != correlations_.end() && c->second != nullptr) {
        event->set_name(c->second->name());
        event->set_detail_info(r.name);
        find++;
      } else {
        miss++;
        event->set_name(r.name);
      }
X
Xin Pan 已提交
678 679
      event->set_start_ns(r.start_ns);
      event->set_end_ns(r.end_ns);
X
Xin Pan 已提交
680
      event->set_sub_device_id(r.stream_id);
X
Xin Pan 已提交
681 682 683
      event->set_device_id(r.device_id);
      event->mutable_memcopy()->set_bytes(r.bytes);
    }
684 685
    VLOG(1) << __func__ << " MemRecord event miss: " << miss
            << " find: " << find;
C
chengduo 已提交
686 687 688 689 690

    for (auto &tmp : mem_info_record_) {
      for (const auto &r : tmp) {
        auto *event = profile_pb.add_mem_events();
        event->set_device_id(0);
691
        if (r.place.GetType() == phi::AllocationType::CPU) {
C
chengduo 已提交
692
          event->set_place(proto::MemEvent::CPUPlace);
693
        } else if (r.place.GetType() == phi::AllocationType::GPU) {
C
chengduo 已提交
694
          event->set_place(proto::MemEvent::CUDAPlace);
695
          event->set_device_id(r.place.GetDeviceId());
696
        } else if (r.place.GetType() == phi::AllocationType::GPUPINNED) {
C
chengduo 已提交
697
          event->set_place(proto::MemEvent::CUDAPinnedPlace);
698
        } else if (r.place.GetType() == phi::AllocationType::NPU) {
699
          event->set_place(proto::MemEvent::NPUPlace);
C
chengduo 已提交
700
        } else {
701 702
          PADDLE_THROW(
              errors::Unimplemented("The current place is not supported."));
C
chengduo 已提交
703 704 705 706 707 708 709 710 711
        }
        event->set_alloc_in(r.alloc_in);
        event->set_free_in(r.free_in);
        event->set_start_ns(r.start_ns);
        event->set_end_ns(r.end_ns);
        event->set_bytes(r.bytes);
        event->set_thread_id(r.thread_id);
      }
    }
712 713 714 715
    return profile_pb;
  }

  void Disable() {
Q
qiaolongfei 已提交
716
#ifdef PADDLE_WITH_CUPTI
717
    // flush might cause additional calls to DeviceTracker.
718 719
    CUPTI_CALL(
        dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
Q
qiaolongfei 已提交
720 721 722
#endif  // PADDLE_WITH_CUPTI
    std::lock_guard<std::mutex> l(trace_mu_);
#ifdef PADDLE_WITH_CUPTI
723
    DisableActivity();
724
    CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
725
    CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
Q
qiaolongfei 已提交
726
#endif  // PADDLE_WITH_CUPTI
727 728 729 730
    enabled_ = false;
  }

 private:
Q
qiaolongfei 已提交
731
#ifdef PADDLE_WITH_CUPTI
732 733 734 735
  static void CUPTIAPI ApiCallback(void *userdata,
                                   CUpti_CallbackDomain domain,
                                   CUpti_CallbackId cbid,
                                   const void *cbdata) {
736 737 738
    if (LIKELY(FLAGS_enable_host_event_recorder_hook)) {
      return;
    }
739
    auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
740 741 742 743
    DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
    if (cbInfo->callbackSite == CUPTI_API_ENTER) {
      Event *event = CurAnnotation();
      tracer->AddAnnotation(cbInfo->correlationId, event);
744 745 746 747
    }
  }
  CUpti_SubscriberHandle subscriber_;
#endif  // PADDLE_WITH_CUPTI
Q
qiaolongfei 已提交
748 749 750 751
  std::mutex trace_mu_;
  bool enabled_;
  uint64_t start_ns_;
  uint64_t end_ns_;
752 753 754
  std::forward_list<KernelRecord> kernel_records_;
  std::forward_list<MemRecord> mem_records_;
  std::forward_list<std::forward_list<CPURecord>> cpu_records_;
C
chengduo 已提交
755
  std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
756
  std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
757 758 759
  std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
      correlations_pairs;
  std::unordered_map<uint32_t, Event *> correlations_;
760 761
};

Q
qiaolongfei 已提交
762
void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
763 764 765 766 767 768

DeviceTracer *GetDeviceTracer() {
  std::call_once(tracer_once_flag, CreateTracer, &tracer);
  return tracer;
}

769 770 771 772 773 774
// In order to record PE time, we add main_thread_annotation_stack
// for all event between PE run, we treat it as PE's child Event,
// so when event is not in same thread of PE event, we need add
// father event(PE::run event) for this event
void SetCurAnnotation(Event *event) {
  if (!annotation_stack.empty()) {
775 776 777
    event->set_parent(annotation_stack.back());
    event->set_name(annotation_stack.back()->name() + "/" + event->name());
  }
778 779 780 781 782 783
  if (annotation_stack.empty() && !main_thread_annotation_stack.empty() &&
      main_thread_annotation_stack.back()->thread_id() != event->thread_id()) {
    event->set_parent(main_thread_annotation_stack.back());
    event->set_name(main_thread_annotation_stack.back()->name() + "/" +
                    event->name());
  }
784
  annotation_stack.push_back(event);
W
wangchaochaohu 已提交
785 786 787 788 789 790 791 792 793

  if (event->role() == EventRole::kSpecial) {
    std::string name = event->name();
    if (!main_thread_annotation_stack_name.empty()) {
      name = main_thread_annotation_stack_name.back() + "/" + event->name();
    }
    main_thread_annotation_stack_name.push_back(name);
    main_thread_annotation_stack.push_back(event);
  }
794
}
X
Xin Pan 已提交
795

W
wangchaochaohu 已提交
796
void ClearCurAnnotation() {
797
  if (!main_thread_annotation_stack.empty()) {
798 799 800 801 802 803
    std::string name = annotation_stack.back()->name();
    std::string main_name = main_thread_annotation_stack.back()->name();
    int main_name_len = main_name.length();
    int name_len = name.length();
    int prefix_len = main_name_len - name_len;

804 805 806
    if ((prefix_len > 0 && main_name.at(prefix_len - 1) == '/' &&
         name == main_name.substr(prefix_len, name_len)) ||
        (name == main_name)) {
807 808 809
      main_thread_annotation_stack_name.pop_back();
      main_thread_annotation_stack.pop_back();
    }
W
wangchaochaohu 已提交
810 811 812
  }
  annotation_stack.pop_back();
}
X
Xin Pan 已提交
813

814 815
Event *CurAnnotation() {
  if (annotation_stack.empty()) return nullptr;
X
Xin Pan 已提交
816 817
  return annotation_stack.back();
}
818

819
std::string CurAnnotationName() {
C
chengduo 已提交
820
  if (annotation_stack.empty()) return "Unknown";
821 822
  return annotation_stack.back()->name();
}
X
Xin Pan 已提交
823 824 825 826 827 828

void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }

void ClearCurBlock() { block_id_stack.pop_back(); }

int BlockDepth() { return block_id_stack.size(); }
829 830 831 832 833 834 835 836

uint32_t GetCurSystemThreadId() {
  std::stringstream ss;
  ss << std::this_thread::get_id();
  uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
  return id;
}

837 838
void RecoreCurThreadId(uint64_t id) {
  std::lock_guard<std::mutex> lock(system_thread_id_map_mutex);
839 840 841 842
  auto gid = GetCurSystemThreadId();
  system_thread_id_map[gid] = id;
}

843
uint64_t GetThreadIdFromSystemThreadId(uint32_t id) {
844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899
  auto it = system_thread_id_map.find(id);
  if (it != system_thread_id_map.end()) return it->second;
  // return origin id if no event is recorded in this thread.
  return static_cast<int32_t>(id);
}

#ifdef PADDLE_WITH_CUPTI
namespace {

void initCuptiCbidStr() {
  static bool called = false;
  if (called) return;
  called = true;
#define REGISTER_RUNTIME_CBID_STR(cbid) \
  runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid

  REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
  REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
  REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
  REGISTER_RUNTIME_CBID_STR(
      cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
  REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
  REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
  REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
C
chengduo 已提交
900 901
  REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
902
  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010);
903 904 905 906 907 908 909 910 911 912
#if CUDA_VERSION >= 9000
  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
#endif

#undef REGISTER_RUNTIME_CBID_STR
}
}  // namespace
#endif  // PADDLE_WITH_CUPTI

913
}  // namespace phi