profiler.cc 22.4 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
D
dangqingqing 已提交
2 3 4 5

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
6

D
dangqingqing 已提交
7 8 9 10 11 12 13 14
    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#include "paddle/fluid/platform/profiler.h"
16
#include <algorithm>
17
#include <iomanip>
18
#include <limits>
19
#include <map>
20
#include <mutex>  // NOLINT
21
#include <random>
22
#include <string>
C
chengduo 已提交
23 24
#include <vector>

25 26 27
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif  // PADDLE_WITH_CUDA
Y
Yancey1989 已提交
28

29
#include "glog/logging.h"
30 31
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
Y
Yancey1989 已提交
32
#include "paddle/fluid/platform/port.h"
33
#include "paddle/fluid/string/printf.h"
D
dangqingqing 已提交
34

G
gongweibao 已提交
35 36
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");

D
dangqingqing 已提交
37 38 39
namespace paddle {
namespace platform {

40 41
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
X
Xin Pan 已提交
42
std::mutex profiler_mu;
43

D
dangqingqing 已提交
44 45 46 47 48 49 50 51 52 53 54
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
C
chengduo 已提交
55
static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
D
dangqingqing 已提交
56
// The thread local event list only can be accessed by the specific thread
C
chengduo 已提交
57
static thread_local std::shared_ptr<EventList<Event>> g_event_list;
58

C
chengduo 已提交
59 60 61 62 63
static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
static std::mutex g_all_mem_event_lists_mutex;
static thread_local int32_t g_mem_thread_id;
static uint32_t g_mem_next_thread_id = 0;
64

D
dangqingqing 已提交
65 66 67 68 69 70 71 72 73
inline uint64_t GetTimeInNsec() {
  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                                 std::chrono::high_resolution_clock,
                                 std::chrono::steady_clock>::type;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             clock::now().time_since_epoch())
      .count();
}

74 75
Event::Event(EventType type, std::string name, uint32_t thread_id)
    : type_(type), name_(name), thread_id_(thread_id) {
D
dangqingqing 已提交
76 77 78
  cpu_ns_ = GetTimeInNsec();
}

C
chengduo 已提交
79
const EventType &Event::type() const { return type_; }
D
dangqingqing 已提交
80

C
chengduo 已提交
81
double Event::CpuElapsedMs(const Event &e) const {
82
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
D
dangqingqing 已提交
83 84
}

C
chengduo 已提交
85
double Event::CudaElapsedMs(const Event &e) const {
86 87
#ifdef PADDLE_WITH_CUPTI
  return gpu_ns_ / 1000000.0;
D
Dun Liang 已提交
88
#else
89
  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
D
Dun Liang 已提交
90
  return 0;
D
dangqingqing 已提交
91 92 93
#endif
}

C
chengduo 已提交
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
inline EventList<MemEvent> &GetMemEventList() {
  if (!g_mem_event_list) {
    g_mem_event_list = std::make_shared<EventList<MemEvent>>();
    std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
    g_mem_thread_id = g_mem_next_thread_id++;
    g_all_mem_event_lists.emplace_front(g_mem_event_list);
  }
  return *g_mem_event_list;
}

void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                  const Place &place, const std::string &annotation) {
  GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
                           place, g_mem_thread_id, annotation);
}

void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                 const Place &place, const std::string &annotation) {
  GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
                           g_mem_thread_id, annotation);
}

inline EventList<Event> &GetEventList() {
D
dangqingqing 已提交
117 118
  if (!g_event_list) {
    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
C
chengduo 已提交
119
    g_event_list = std::make_shared<EventList<Event>>();
D
dangqingqing 已提交
120 121
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
122
    RecoreCurThreadId(g_thread_id);
D
dangqingqing 已提交
123 124 125 126
  }
  return *g_event_list;
}

C
chengduo 已提交
127
void Mark(const std::string &name) {
128
  GetEventList().Record(EventType::kMark, name, g_thread_id);
129 130
}

C
chengduo 已提交
131
Event *PushEvent(const std::string &name) {
132
  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
133 134
}

C
chengduo 已提交
135
void PopEvent(const std::string &name) {
136
  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
D
dangqingqing 已提交
137 138
}

C
chengduo 已提交
139
RecordEvent::RecordEvent(const std::string &name)
X
Xin Pan 已提交
140
    : is_enabled_(false), start_ns_(PosixInNsec()) {
D
dangqingqing 已提交
141
  if (g_state == ProfilerState::kDisabled) return;
142
  // lock is not needed, the code below is thread-safe
Y
Yancey1989 已提交
143

X
Xin Pan 已提交
144
  is_enabled_ = true;
Y
Yibing Liu 已提交
145
  name_ = name;
C
chengduo 已提交
146
  Event *e = PushEvent(name_);
147
  // Maybe need the same push/pop behavior.
148
  SetCurAnnotation(e);
D
dangqingqing 已提交
149 150 151
}

RecordEvent::~RecordEvent() {
X
Xin Pan 已提交
152
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
153
  // lock is not needed, the code below is thread-safe
C
chengduo 已提交
154
  DeviceTracer *tracer = GetDeviceTracer();
X
Xin Pan 已提交
155
  if (tracer) {
156
    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
157
                          BlockDepth(), g_thread_id);
X
Xin Pan 已提交
158
  }
Y
Yibing Liu 已提交
159
  ClearCurAnnotation();
160
  PopEvent(name_);
D
dangqingqing 已提交
161
}
D
dangqingqing 已提交
162

C
chengduo 已提交
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
MemEvenRecorder MemEvenRecorder::recorder;

void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
                                    size_t size) {
  if (g_state == ProfilerState::kDisabled) return;
  std::lock_guard<std::mutex> guard(mtx_);
  auto &events = address_memevent_[place];
  PADDLE_ENFORCE(events.count(ptr) == 0, "");
  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
                          new MemEvenRecorder::RecordMemEvent(place, size)));
}

void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
  if (g_state == ProfilerState::kDisabled) return;
  std::lock_guard<std::mutex> guard(mtx_);
  auto &events = address_memevent_[place];
  auto iter = events.find(ptr);
  // The ptr maybe not in address_memevent
  if (iter != events.end()) {
    events.erase(iter);
  }
}

void MemEvenRecorder::Flush() {
  std::lock_guard<std::mutex> guard(mtx_);
  address_memevent_.clear();
}

MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
                                                size_t bytes)
    : place_(place),
      bytes_(bytes),
      start_ns_(PosixInNsec()),
      alloc_in_(CurAnnotationName()) {
  PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
}

MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
  DeviceTracer *tracer = GetDeviceTracer();
  end_ns_ = PosixInNsec();

  auto annotation_free = CurAnnotationName();
  if (tracer) {
    tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
                             annotation_free, g_mem_thread_id);
  }
  PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}

RecordRPCEvent::RecordRPCEvent(const std::string &name) {
G
gongweibao 已提交
213
  if (FLAGS_enable_rpc_profiler) {
214
    event_.reset(new platform::RecordEvent(name));
G
gongweibao 已提交
215 216 217
  }
}

X
Xin Pan 已提交
218 219
RecordBlock::RecordBlock(int block_id)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
220
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
221
  if (g_state == ProfilerState::kDisabled) return;
X
Xin Pan 已提交
222
  is_enabled_ = true;
X
Xin Pan 已提交
223 224 225 226 227
  SetCurBlock(block_id);
  name_ = string::Sprintf("block_%d", block_id);
}

RecordBlock::~RecordBlock() {
228
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
229
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
C
chengduo 已提交
230
  DeviceTracer *tracer = GetDeviceTracer();
X
Xin Pan 已提交
231 232 233 234
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
235
                          g_thread_id);
X
Xin Pan 已提交
236 237 238 239
  }
  ClearCurBlock();
}

240 241 242 243 244 245 246 247 248 249
void SynchronizeAllDevice() {
#ifdef PADDLE_WITH_CUDA
  int count = GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
    SetDeviceId(i);
    PADDLE_ENFORCE(cudaDeviceSynchronize());
  }
#endif
}

D
dangqingqing 已提交
250 251
void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
Q
Qiao Longfei 已提交
252
                 "Can't enable profiling, since the input state is ",
D
dangqingqing 已提交
253
                 "ProfilerState::kDisabled");
254
  SynchronizeAllDevice();
X
Xin Pan 已提交
255
  std::lock_guard<std::mutex> l(profiler_mu);
256 257
  if (state == g_state) {
    return;
258
  }
259
  g_state = state;
X
Xin Pan 已提交
260
  should_send_profile_state = true;
261
  GetDeviceTracer()->Enable();
D
dangqingqing 已提交
262
#ifdef PADDLE_WITH_CUDA
263 264
  if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
      g_state == ProfilerState::kCPU) {
265
    // Generate some dummy events first to reduce the startup overhead.
266 267
    DummyKernelAndEvent();
    GetDeviceTracer()->Reset();
D
dangqingqing 已提交
268 269 270
  }
#endif
  // Mark the profiling start.
271
  Mark("_start_profiler_");
D
dangqingqing 已提交
272 273
}

274
void ResetProfiler() {
275 276
  SynchronizeAllDevice();
  GetDeviceTracer()->Reset();
C
chengduo 已提交
277
  MemEvenRecorder::Instance().Flush();
D
dangqingqing 已提交
278
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
279 280 281 282
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
C
chengduo 已提交
283 284 285 286
  for (auto it = g_all_mem_event_lists.begin();
       it != g_all_mem_event_lists.end(); ++it) {
    (*it)->Clear();
  }
287 288 289 290 291
}

std::vector<std::vector<Event>> GetAllEvents() {
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  std::vector<std::vector<Event>> result;
D
dangqingqing 已提交
292 293 294
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
D
dangqingqing 已提交
295 296 297 298
  }
  return result;
}

C
chengduo 已提交
299 300 301 302 303 304 305 306 307
std::vector<std::vector<MemEvent>> GetMemEvents() {
  std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
  std::vector<std::vector<MemEvent>> result;
  for (auto &it : g_all_mem_event_lists) {
    result.emplace_back((*it).Reduce());
  }
  return result;
}

308 309 310 311 312 313 314
// The information of each event given in the profiling report
struct EventItem {
  std::string name;
  int calls;
  double total_time;
  double max_time;
  double ave_time;
C
chengduo 已提交
315 316 317
  double min_time;
  double cpu_time;
  double gpu_time;
Y
Yan Chunwei 已提交
318
  float ratio;
319 320 321
};

// Print results
C
chengduo 已提交
322 323
void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
                   const std::string &sorted_domain, const size_t name_width,
324
                   const size_t data_width, bool merge_thread) {
325 326 327 328 329 330 331 332 333 334 335 336
  // Output header information
  std::cout << "\n------------------------->"
            << "     Profiling Report     "
            << "<-------------------------\n\n";
  std::string place;
  if (g_state == ProfilerState::kCPU) {
    place = "CPU";
  } else if (g_state == ProfilerState::kCUDA) {
    place = "CUDA";
  } else if (g_state == ProfilerState::kAll) {
    place = "All";
  } else {
X
Xin Pan 已提交
337
    PADDLE_THROW("Invalid profiler state", g_state);
338
  }
339

340 341 342 343
  if (merge_thread) {
    std::cout << "Note! This Report merge all thread info into one."
              << std::endl;
  }
344 345 346 347 348 349 350
  std::cout << "Place: " << place << std::endl;
  std::cout << "Time unit: ms" << std::endl;
  std::cout << "Sorted by " << sorted_domain
            << " in descending order in the same thread\n\n";
  // Output events table
  std::cout.setf(std::ios::left);
  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
C
chengduo 已提交
351 352 353 354 355 356
            << "Calls" << std::setw(data_width) << "Total";
  if (g_state == ProfilerState::kAll) {
    std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
              << std::setw(data_width * 2) << "GPU Time (Ratio)";
  }
  std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
Y
Yan Chunwei 已提交
357 358
            << "Max." << std::setw(data_width) << "Ave."
            << std::setw(data_width) << "Ratio." << std::endl;
359 360
  for (size_t i = 0; i < events_table.size(); ++i) {
    for (size_t j = 0; j < events_table[i].size(); ++j) {
C
chengduo 已提交
361
      const EventItem &event_item = events_table[i][j];
362 363
      std::cout << std::setw(name_width) << event_item.name
                << std::setw(data_width) << event_item.calls
C
chengduo 已提交
364 365 366 367 368 369 370 371 372 373 374 375
                << std::setw(data_width) << event_item.total_time;
      if (g_state == ProfilerState::kAll) {
        std::cout << std::setw(data_width * 2)
                  << string::Sprintf(
                         "%f (%f)", event_item.cpu_time,
                         (event_item.cpu_time / event_item.total_time))
                  << std::setw(data_width * 2)
                  << string::Sprintf(
                         "%f (%f)", event_item.gpu_time,
                         (event_item.gpu_time / event_item.total_time));
      }
      std::cout << std::setw(data_width) << event_item.min_time
376
                << std::setw(data_width) << event_item.max_time
Y
Yan Chunwei 已提交
377
                << std::setw(data_width) << event_item.ave_time
378
                << std::setw(data_width) << event_item.ratio << std::endl;
379
    }
380
  }
381
  std::cout << std::endl;
382 383
}

384
// Parse the event list and output the profiling report
C
chengduo 已提交
385
void ParseEvents(const std::vector<std::vector<Event>> &events,
386
                 bool merge_thread,
387 388
                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
  if (g_state == ProfilerState::kDisabled) return;
389
  if (merge_thread && events.size() < 2) return;
390 391

  std::string sorted_domain;
C
chengduo 已提交
392
  std::function<bool(const EventItem &, const EventItem &)> sorted_func;
393 394 395
  switch (sorted_by) {
    case EventSortingKey::kCalls:
      sorted_domain = "number of calls";
C
chengduo 已提交
396
      sorted_func = [](const EventItem &a, const EventItem &b) {
397 398 399 400 401
        return a.calls > b.calls;
      };
      break;
    case EventSortingKey::kTotal:
      sorted_domain = "total time";
C
chengduo 已提交
402
      sorted_func = [](const EventItem &a, const EventItem &b) {
403 404 405 406 407
        return a.total_time > b.total_time;
      };
      break;
    case EventSortingKey::kMin:
      sorted_domain = "minimum time";
C
chengduo 已提交
408
      sorted_func = [](const EventItem &a, const EventItem &b) {
409 410 411 412 413
        return a.min_time > b.min_time;
      };
      break;
    case EventSortingKey::kMax:
      sorted_domain = "maximum time";
C
chengduo 已提交
414
      sorted_func = [](const EventItem &a, const EventItem &b) {
415 416 417 418 419
        return a.max_time > b.max_time;
      };
      break;
    case EventSortingKey::kAve:
      sorted_domain = "average time";
C
chengduo 已提交
420
      sorted_func = [](const EventItem &a, const EventItem &b) {
421 422 423
        return a.ave_time > b.ave_time;
      };
      break;
C
chengduo 已提交
424 425
    case EventSortingKey::kGPUTime:
      sorted_domain = "average time";
C
chengduo 已提交
426
      sorted_func = [](const EventItem &a, const EventItem &b) {
C
chengduo 已提交
427 428 429 430 431
        return a.gpu_time > b.gpu_time;
      };
      break;
    case EventSortingKey::kCPUTime:
      sorted_domain = "average time";
C
chengduo 已提交
432
      sorted_func = [](const EventItem &a, const EventItem &b) {
C
chengduo 已提交
433 434 435
        return a.cpu_time > b.cpu_time;
      };
      break;
436
    default:
437
      sorted_domain = "event first end time";
438 439
  }

C
chengduo 已提交
440
  const std::vector<std::vector<Event>> *analyze_events;
441 442 443
  std::vector<std::vector<Event>> merged_events_list;
  if (merge_thread) {
    std::vector<Event> merged_events;
Y
Yibing Liu 已提交
444 445
    for (size_t i = 0; i < events.size(); ++i) {
      for (size_t j = 0; j < events[i].size(); ++j) {
446 447 448 449 450 451 452 453 454
        merged_events.push_back(events[i][j]);
      }
    }
    merged_events_list.push_back(merged_events);
    analyze_events = &merged_events_list;
  } else {
    analyze_events = &events;
  }

455
  std::vector<std::vector<EventItem>> events_table;
Y
Yibing Liu 已提交
456
  size_t max_name_width = 0;
457 458
  for (size_t i = 0; i < (*analyze_events).size(); i++) {
    double total = 0.;  // the total time in one thread
459
    std::list<Event> pushed_events;
460 461 462
    std::vector<EventItem> event_items;
    std::unordered_map<std::string, int> event_idx;

463 464 465 466
    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
      if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
        pushed_events.push_back((*analyze_events)[i][j]);
      } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
467
        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
468
        while (rit != pushed_events.rend() &&
469
               rit->name() != (*analyze_events)[i][j].name()) {
470 471
          ++rit;
        }
472

473
        if (rit != pushed_events.rend()) {
C
chengduo 已提交
474
          double event_time = 0;
475 476 477 478
          double gpu_time = 0.0f;
#ifdef PADDLE_WITH_CUDA
          gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
#endif
C
chengduo 已提交
479 480 481 482 483 484 485 486 487
          double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
          if (g_state == ProfilerState::kCUDA) {
            event_time = gpu_time;
          } else if (g_state == ProfilerState::kCPU) {
            event_time = cpu_time;
          } else {
            event_time = gpu_time + cpu_time;
          }

Y
Yan Chunwei 已提交
488
          total += event_time;
489

490 491 492 493 494 495 496 497 498
          std::string event_name;
          if (merge_thread) {
            event_name = rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          } else {
            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
                         rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          }
499

500 501 502
          if (event_idx.find(event_name) == event_idx.end()) {
            event_idx[event_name] = event_items.size();
            EventItem event_item = {event_name, 1,          event_time,
Y
Yan Chunwei 已提交
503
                                    event_time, event_time, event_time,
504
                                    cpu_time,   gpu_time,   0.};
505
            event_items.push_back(event_item);
506
          } else {
507 508
            int index = event_idx[event_name];
            event_items[index].calls += 1;
509
            // total time
510
            event_items[index].total_time += event_time;
511
            // min time
512 513
            event_items[index].min_time =
                std::min(event_time, event_items[index].min_time);
514
            // max time
515 516
            event_items[index].max_time =
                std::max(event_time, event_items[index].max_time);
C
chengduo 已提交
517 518
            event_items[index].gpu_time += gpu_time;
            event_items[index].cpu_time += cpu_time;
519
          }
520

Y
Yibing Liu 已提交
521
          // remove the push marker from the list
522 523
          pushed_events.erase((++rit).base());
        } else {
524
          LOG(WARNING) << "Cannot find the push marker of event \'"
525
                       << (*analyze_events)[i][j].name()
526
                       << "\', which will be ignored in profiling report.";
527 528 529
        }
      }
    }
530
    // average time
C
chengduo 已提交
531
    for (auto &item : event_items) {
532
      item.ave_time = item.total_time / item.calls;
533
      item.ratio = item.total_time / total;
534 535 536
    }
    // sort
    if (sorted_by != EventSortingKey::kDefault) {
537
      std::sort(event_items.begin(), event_items.end(), sorted_func);
538
    }
539

540
    events_table.push_back(event_items);
Y
Yibing Liu 已提交
541
    // log warning if there are events with `push` but without `pop`
542 543
    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
    while (rit != pushed_events.rend()) {
Y
Yibing Liu 已提交
544 545
      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
                   << "\', which will be ignored in profiling report.";
546 547
      ++rit;
    }
548
  }
549 550

  // Print report
551 552
  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
                merge_thread);
553 554
}

C
chengduo 已提交
555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614
struct MemoryProfierReport {
  size_t alloc_times{0};
  size_t alloc_size{0};
  size_t free_times{0};
  size_t free_size{0};
};

// Print results
void PrintMemProfiler(
    const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
        &annotation_report,
    const size_t name_width, const size_t data_width) {
  // Output header information
  std::cout << "\n------------------------->"
            << "    Memory Profiling Report     "
            << "<-------------------------\n\n";

  // Output events table
  std::cout.setf(std::ios::left);
  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
            << "Alloc Calls" << std::setw(data_width) << "Size(MB)"
            << std::setw(data_width) << "Free Calls" << std::setw(data_width)
            << "Size(MB)" << std::endl;

  for (auto &tmp : annotation_report) {
    for (auto &e : tmp.second) {
      auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
      std::cout << std::setw(name_width) << event_name;
      std::cout << std::setw(data_width) << e.second.alloc_times;
      std::cout << std::setw(data_width)
                << e.second.alloc_size / (1024.0 * 1024.0);
      std::cout << std::setw(data_width) << e.second.free_times;
      std::cout << std::setw(data_width)
                << e.second.free_size / (1024.0 * 1024.0) << std::endl;
    }
  }
  std::cout << std::endl;
}

// parse memory events
void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
  if (g_state == ProfilerState::kDisabled) return;
  // place, annotation, alloc times,  alloc size
  std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
      annotation_report;

  for (auto &tmp : events) {
    for (auto &e : tmp) {
      if (e.type() == EventType::kPushRange) {
        annotation_report[e.place()][e.annotation()].alloc_times += 1;
        annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
      } else if (e.type() == EventType::kPopRange) {
        annotation_report[e.place()][e.annotation()].free_times += 1;
        annotation_report[e.place()][e.annotation()].free_size += e.bytes();
      }
    }
  }
  PrintMemProfiler(annotation_report, 55, 18);
}

615
void DisableProfiler(EventSortingKey sorted_key,
C
chengduo 已提交
616
                     const std::string &profile_path) {
617
  SynchronizeAllDevice();
C
chengduo 已提交
618 619
  MemEvenRecorder::Instance().Flush();

X
Xin Pan 已提交
620
  std::lock_guard<std::mutex> l(profiler_mu);
621
  if (g_state == ProfilerState::kDisabled) return;
622
  // Mark the profiling stop.
623
  Mark("_stop_profiler_");
624

C
chengduo 已提交
625
  DeviceTracer *tracer = GetDeviceTracer();
626
  if (tracer->IsEnabled()) {
627 628
    tracer->Disable();
    tracer->GenProfile(profile_path);
629
    tracer->GenEventKernelCudaElapsedTime();
630
  }
631 632 633 634

  std::vector<std::vector<Event>> all_events = GetAllEvents();
  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
C
chengduo 已提交
635 636 637 638 639
  if (VLOG_IS_ON(5)) {
    std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
    ParseMemEvents(all_mem_events);
  }

640
  ResetProfiler();
641
  g_state = ProfilerState::kDisabled;
X
Xin Pan 已提交
642
  should_send_profile_state = true;
643 644 645 646 647
}

bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
bool ShouldSendProfileState() { return should_send_profile_state; }

X
Xin Pan 已提交
648
void SetProfileListener() {
649 650 651
  std::mt19937 rng;
  rng.seed(std::random_device()());
  std::uniform_int_distribution<std::mt19937::result_type> dist6(
X
Xin Pan 已提交
652
      1, std::numeric_limits<int>::max());
653
  profiler_lister_id = dist6(rng);
654
}
655
int64_t ListenerId() { return profiler_lister_id; }
656

D
dangqingqing 已提交
657 658
}  // namespace platform
}  // namespace paddle