profiler.cc 16.5 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
D
dangqingqing 已提交
2 3 4 5

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
6

D
dangqingqing 已提交
7 8 9 10 11 12 13 14
    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15 16
#include "paddle/fluid/platform/profiler.h"

17
#include <algorithm>
18
#include <iomanip>
19
#include <limits>
20
#include <map>
21
#include <mutex>  // NOLINT
22
#include <random>
23
#include <string>
24 25 26
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif  // PADDLE_WITH_CUDA
Y
Yancey1989 已提交
27

28
#include "glog/logging.h"
29 30
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
Y
Yancey1989 已提交
31
#include "paddle/fluid/platform/port.h"
32
#include "paddle/fluid/string/printf.h"
D
dangqingqing 已提交
33

G
gongweibao 已提交
34 35
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");

D
dangqingqing 已提交
36 37 38
namespace paddle {
namespace platform {

39 40
struct EventList;

41 42
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
X
Xin Pan 已提交
43
std::mutex profiler_mu;
44

D
dangqingqing 已提交
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list;

60 61 62 63 64 65 66 67 68 69
struct EventList {
  constexpr static size_t kMB = 1024 * 1024;
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(Event);
  constexpr static size_t kEventAlign = alignof(Event);
  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  template <typename... Args>
70
  Event* Record(Args&&... args) {
71 72 73 74 75
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
76
    return &event_blocks.front().back();
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
  }

  std::vector<Event> Reduce() {
    std::vector<Event> result;
    for (auto& block : event_blocks) {
      result.insert(result.begin(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  void Clear() { event_blocks.clear(); }

  std::forward_list<std::vector<Event>> event_blocks;
};

D
dangqingqing 已提交
94 95 96 97 98 99 100 101 102
inline uint64_t GetTimeInNsec() {
  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                                 std::chrono::high_resolution_clock,
                                 std::chrono::steady_clock>::type;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             clock::now().time_since_epoch())
      .count();
}

103 104
Event::Event(EventType type, std::string name, uint32_t thread_id)
    : type_(type), name_(name), thread_id_(thread_id) {
D
dangqingqing 已提交
105 106 107
  cpu_ns_ = GetTimeInNsec();
}

108
const EventType& Event::type() const { return type_; }
D
dangqingqing 已提交
109

110 111
double Event::CpuElapsedMs(const Event& e) const {
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
D
dangqingqing 已提交
112 113
}

114
double Event::CudaElapsedMs(const Event& e) const {
115 116
#ifdef PADDLE_WITH_CUPTI
  return gpu_ns_ / 1000000.0;
D
Dun Liang 已提交
117
#else
D
Dun Liang 已提交
118 119
  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
  return 0;
D
dangqingqing 已提交
120 121 122 123 124 125 126 127 128
#endif
}

inline EventList& GetEventList() {
  if (!g_event_list) {
    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
    g_event_list = std::make_shared<EventList>();
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
129
    RecoreCurThreadId(g_thread_id);
D
dangqingqing 已提交
130 131 132 133
  }
  return *g_event_list;
}

134 135
void Mark(const std::string& name) {
  GetEventList().Record(EventType::kMark, name, g_thread_id);
136 137
}

138 139
Event* PushEvent(const std::string& name) {
  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
140 141
}

142 143
void PopEvent(const std::string& name) {
  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
D
dangqingqing 已提交
144 145
}

146
RecordEvent::RecordEvent(const std::string& name)
X
Xin Pan 已提交
147
    : is_enabled_(false), start_ns_(PosixInNsec()) {
D
dangqingqing 已提交
148
  if (g_state == ProfilerState::kDisabled) return;
149
  // lock is not needed, the code below is thread-safe
Y
Yancey1989 已提交
150

X
Xin Pan 已提交
151
  is_enabled_ = true;
Y
Yibing Liu 已提交
152
  name_ = name;
153
  Event* e = PushEvent(name_);
154
  // Maybe need the same push/pop behavior.
155
  SetCurAnnotation(e);
D
dangqingqing 已提交
156 157 158
}

RecordEvent::~RecordEvent() {
X
Xin Pan 已提交
159
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
160
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
161 162
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
163
    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
164
                          BlockDepth(), g_thread_id);
X
Xin Pan 已提交
165
  }
Y
Yibing Liu 已提交
166
  ClearCurAnnotation();
167
  PopEvent(name_);
D
dangqingqing 已提交
168
}
D
dangqingqing 已提交
169

170
RecordRPCEvent::RecordRPCEvent(const std::string& name) {
G
gongweibao 已提交
171
  if (FLAGS_enable_rpc_profiler) {
172
    event_.reset(new platform::RecordEvent(name));
G
gongweibao 已提交
173 174 175
  }
}

X
Xin Pan 已提交
176 177
RecordBlock::RecordBlock(int block_id)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
178
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
179
  if (g_state == ProfilerState::kDisabled) return;
X
Xin Pan 已提交
180
  is_enabled_ = true;
X
Xin Pan 已提交
181 182 183 184 185
  SetCurBlock(block_id);
  name_ = string::Sprintf("block_%d", block_id);
}

RecordBlock::~RecordBlock() {
186
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
187
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
X
Xin Pan 已提交
188 189 190 191 192
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
193
                          g_thread_id);
X
Xin Pan 已提交
194 195 196 197
  }
  ClearCurBlock();
}

198 199 200 201 202 203 204 205 206 207
void SynchronizeAllDevice() {
#ifdef PADDLE_WITH_CUDA
  int count = GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
    SetDeviceId(i);
    PADDLE_ENFORCE(cudaDeviceSynchronize());
  }
#endif
}

D
dangqingqing 已提交
208 209
void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
Q
Qiao Longfei 已提交
210
                 "Can't enable profiling, since the input state is ",
D
dangqingqing 已提交
211
                 "ProfilerState::kDisabled");
212
  SynchronizeAllDevice();
X
Xin Pan 已提交
213
  std::lock_guard<std::mutex> l(profiler_mu);
214 215
  if (state == g_state) {
    return;
216
  }
217
  g_state = state;
X
Xin Pan 已提交
218
  should_send_profile_state = true;
219
  GetDeviceTracer()->Enable();
D
dangqingqing 已提交
220
#ifdef PADDLE_WITH_CUDA
221 222
  if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
      g_state == ProfilerState::kCPU) {
223
    // Generate some dummy events first to reduce the startup overhead.
224 225
    DummyKernelAndEvent();
    GetDeviceTracer()->Reset();
D
dangqingqing 已提交
226 227 228
  }
#endif
  // Mark the profiling start.
229
  Mark("_start_profiler_");
D
dangqingqing 已提交
230 231
}

232
void ResetProfiler() {
233 234
  SynchronizeAllDevice();
  GetDeviceTracer()->Reset();
D
dangqingqing 已提交
235
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
236 237 238 239 240 241 242 243 244
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
}

std::vector<std::vector<Event>> GetAllEvents() {
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  std::vector<std::vector<Event>> result;
D
dangqingqing 已提交
245 246 247
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
D
dangqingqing 已提交
248 249 250 251
  }
  return result;
}

252 253 254 255 256 257 258 259
// The information of each event given in the profiling report
struct EventItem {
  std::string name;
  int calls;
  double total_time;
  double min_time;
  double max_time;
  double ave_time;
Y
Yan Chunwei 已提交
260
  float ratio;
261 262 263 264 265
};

// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
                   const std::string& sorted_domain, const size_t name_width,
266
                   const size_t data_width, bool merge_thread) {
267 268 269 270 271 272 273 274 275 276 277 278
  // Output header information
  std::cout << "\n------------------------->"
            << "     Profiling Report     "
            << "<-------------------------\n\n";
  std::string place;
  if (g_state == ProfilerState::kCPU) {
    place = "CPU";
  } else if (g_state == ProfilerState::kCUDA) {
    place = "CUDA";
  } else if (g_state == ProfilerState::kAll) {
    place = "All";
  } else {
X
Xin Pan 已提交
279
    PADDLE_THROW("Invalid profiler state", g_state);
280
  }
281

282 283 284 285
  if (merge_thread) {
    std::cout << "Note! This Report merge all thread info into one."
              << std::endl;
  }
286 287 288 289 290 291 292 293 294
  std::cout << "Place: " << place << std::endl;
  std::cout << "Time unit: ms" << std::endl;
  std::cout << "Sorted by " << sorted_domain
            << " in descending order in the same thread\n\n";
  // Output events table
  std::cout.setf(std::ios::left);
  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
            << "Calls" << std::setw(data_width) << "Total"
            << std::setw(data_width) << "Min." << std::setw(data_width)
Y
Yan Chunwei 已提交
295 296
            << "Max." << std::setw(data_width) << "Ave."
            << std::setw(data_width) << "Ratio." << std::endl;
297 298 299 300 301 302 303 304
  for (size_t i = 0; i < events_table.size(); ++i) {
    for (size_t j = 0; j < events_table[i].size(); ++j) {
      const EventItem& event_item = events_table[i][j];
      std::cout << std::setw(name_width) << event_item.name
                << std::setw(data_width) << event_item.calls
                << std::setw(data_width) << event_item.total_time
                << std::setw(data_width) << event_item.min_time
                << std::setw(data_width) << event_item.max_time
Y
Yan Chunwei 已提交
305
                << std::setw(data_width) << event_item.ave_time
306
                << std::setw(data_width) << event_item.ratio << std::endl;
307
    }
308
  }
309
  std::cout << std::endl;
310 311
}

312 313
// Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events,
314
                 bool merge_thread,
315 316
                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
  if (g_state == ProfilerState::kDisabled) return;
317
  if (merge_thread && events.size() < 2) return;
318 319

  std::string sorted_domain;
L
Luo Tao 已提交
320
  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
321 322 323
  switch (sorted_by) {
    case EventSortingKey::kCalls:
      sorted_domain = "number of calls";
L
Luo Tao 已提交
324
      sorted_func = [](const EventItem& a, const EventItem& b) {
325 326 327 328 329
        return a.calls > b.calls;
      };
      break;
    case EventSortingKey::kTotal:
      sorted_domain = "total time";
L
Luo Tao 已提交
330
      sorted_func = [](const EventItem& a, const EventItem& b) {
331 332 333 334 335
        return a.total_time > b.total_time;
      };
      break;
    case EventSortingKey::kMin:
      sorted_domain = "minimum time";
L
Luo Tao 已提交
336
      sorted_func = [](const EventItem& a, const EventItem& b) {
337 338 339 340 341
        return a.min_time > b.min_time;
      };
      break;
    case EventSortingKey::kMax:
      sorted_domain = "maximum time";
L
Luo Tao 已提交
342
      sorted_func = [](const EventItem& a, const EventItem& b) {
343 344 345 346 347
        return a.max_time > b.max_time;
      };
      break;
    case EventSortingKey::kAve:
      sorted_domain = "average time";
L
Luo Tao 已提交
348
      sorted_func = [](const EventItem& a, const EventItem& b) {
349 350 351 352
        return a.ave_time > b.ave_time;
      };
      break;
    default:
353
      sorted_domain = "event first end time";
354 355
  }

356 357 358 359
  const std::vector<std::vector<Event>>* analyze_events;
  std::vector<std::vector<Event>> merged_events_list;
  if (merge_thread) {
    std::vector<Event> merged_events;
Y
Yibing Liu 已提交
360 361
    for (size_t i = 0; i < events.size(); ++i) {
      for (size_t j = 0; j < events[i].size(); ++j) {
362 363 364 365 366 367 368 369 370
        merged_events.push_back(events[i][j]);
      }
    }
    merged_events_list.push_back(merged_events);
    analyze_events = &merged_events_list;
  } else {
    analyze_events = &events;
  }

371
  std::vector<std::vector<EventItem>> events_table;
Y
Yibing Liu 已提交
372
  size_t max_name_width = 0;
373 374
  for (size_t i = 0; i < (*analyze_events).size(); i++) {
    double total = 0.;  // the total time in one thread
375
    std::list<Event> pushed_events;
376 377 378
    std::vector<EventItem> event_items;
    std::unordered_map<std::string, int> event_idx;

379 380 381 382
    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
      if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
        pushed_events.push_back((*analyze_events)[i][j]);
      } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
383
        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
384
        while (rit != pushed_events.rend() &&
385
               rit->name() != (*analyze_events)[i][j].name()) {
386 387
          ++rit;
        }
388

389
        if (rit != pushed_events.rend()) {
390 391
          double event_time = (g_state == ProfilerState::kCUDA ||
                               g_state == ProfilerState::kAll)
392 393
                                  ? rit->CudaElapsedMs((*analyze_events)[i][j])
                                  : rit->CpuElapsedMs((*analyze_events)[i][j]);
Y
Yan Chunwei 已提交
394
          total += event_time;
395

396 397 398 399 400 401 402 403 404
          std::string event_name;
          if (merge_thread) {
            event_name = rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          } else {
            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
                         rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          }
405

406 407 408
          if (event_idx.find(event_name) == event_idx.end()) {
            event_idx[event_name] = event_items.size();
            EventItem event_item = {event_name, 1,          event_time,
Y
Yan Chunwei 已提交
409 410
                                    event_time, event_time, event_time,
                                    0.};
411
            event_items.push_back(event_item);
412
          } else {
413 414
            int index = event_idx[event_name];
            event_items[index].calls += 1;
415
            // total time
416
            event_items[index].total_time += event_time;
417
            // min time
418 419
            event_items[index].min_time =
                std::min(event_time, event_items[index].min_time);
420
            // max time
421 422
            event_items[index].max_time =
                std::max(event_time, event_items[index].max_time);
423
          }
424

Y
Yibing Liu 已提交
425
          // remove the push marker from the list
426 427
          pushed_events.erase((++rit).base());
        } else {
428
          LOG(WARNING) << "Cannot find the push marker of event \'"
429
                       << (*analyze_events)[i][j].name()
430
                       << "\', which will be ignored in profiling report.";
431 432 433
        }
      }
    }
434 435 436
    // average time
    for (auto& item : event_items) {
      item.ave_time = item.total_time / item.calls;
437
      item.ratio = item.total_time / total;
438 439 440
    }
    // sort
    if (sorted_by != EventSortingKey::kDefault) {
441
      std::sort(event_items.begin(), event_items.end(), sorted_func);
442
    }
443

444
    events_table.push_back(event_items);
Y
Yibing Liu 已提交
445
    // log warning if there are events with `push` but without `pop`
446 447
    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
    while (rit != pushed_events.rend()) {
Y
Yibing Liu 已提交
448 449
      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
                   << "\', which will be ignored in profiling report.";
450 451
      ++rit;
    }
452
  }
453 454

  // Print report
455 456
  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
                merge_thread);
457 458
}

459 460
void DisableProfiler(EventSortingKey sorted_key,
                     const std::string& profile_path) {
461
  SynchronizeAllDevice();
X
Xin Pan 已提交
462
  std::lock_guard<std::mutex> l(profiler_mu);
463
  if (g_state == ProfilerState::kDisabled) return;
464
  // Mark the profiling stop.
465
  Mark("_stop_profiler_");
466 467

  DeviceTracer* tracer = GetDeviceTracer();
468
  if (tracer->IsEnabled()) {
469 470
    tracer->Disable();
    tracer->GenProfile(profile_path);
471
    tracer->GenEventKernelCudaElapsedTime();
472
  }
473 474 475 476 477

  std::vector<std::vector<Event>> all_events = GetAllEvents();
  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
  ResetProfiler();
478
  g_state = ProfilerState::kDisabled;
X
Xin Pan 已提交
479
  should_send_profile_state = true;
480 481 482 483 484
}

bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
bool ShouldSendProfileState() { return should_send_profile_state; }

X
Xin Pan 已提交
485
void SetProfileListener() {
486 487 488
  std::mt19937 rng;
  rng.seed(std::random_device()());
  std::uniform_int_distribution<std::mt19937::result_type> dist6(
X
Xin Pan 已提交
489
      1, std::numeric_limits<int>::max());
490
  profiler_lister_id = dist6(rng);
491
}
492
int64_t ListenerId() { return profiler_lister_id; }
493

D
dangqingqing 已提交
494 495
}  // namespace platform
}  // namespace paddle