profiler.cc 17.4 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
D
dangqingqing 已提交
2 3 4 5

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
6

D
dangqingqing 已提交
7 8 9 10 11 12 13 14
    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#include <algorithm>
16
#include <iomanip>
17
#include <limits>
18
#include <map>
19
#include <mutex>  // NOLINT
20
#include <random>
21
#include <string>
22 23 24
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif  // PADDLE_WITH_CUDA
Y
Yancey1989 已提交
25

26
#include "glog/logging.h"
27 28
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
Y
Yancey1989 已提交
29 30
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
31
#include "paddle/fluid/string/printf.h"
D
dangqingqing 已提交
32

G
gongweibao 已提交
33 34
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");

D
dangqingqing 已提交
35 36 37
namespace paddle {
namespace platform {

38 39
struct EventList;

40 41
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
X
Xin Pan 已提交
42
std::mutex profiler_mu;
43

D
dangqingqing 已提交
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list;

59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
struct EventList {
  constexpr static size_t kMB = 1024 * 1024;
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(Event);
  constexpr static size_t kEventAlign = alignof(Event);
  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  template <typename... Args>
  void Record(Args&&... args) {
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  std::vector<Event> Reduce() {
    std::vector<Event> result;
    for (auto& block : event_blocks) {
      result.insert(result.begin(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  void Clear() { event_blocks.clear(); }

  std::forward_list<std::vector<Event>> event_blocks;
};

D
dangqingqing 已提交
92 93 94 95 96 97 98 99 100
inline uint64_t GetTimeInNsec() {
  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                                 std::chrono::high_resolution_clock,
                                 std::chrono::steady_clock>::type;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             clock::now().time_since_epoch())
      .count();
}

101
Event::Event(EventType type, std::string name, uint32_t thread_id,
D
dangqingqing 已提交
102
             const DeviceContext* dev_ctx)
103
    : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
D
dangqingqing 已提交
104
#ifdef PADDLE_WITH_CUDA
D
dangqingqing 已提交
105 106 107
  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
  if (has_cuda_) {
    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
Q
qiaolongfei 已提交
108 109
    PADDLE_ENFORCE(cudaSetDevice(
        boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
D
dangqingqing 已提交
110 111 112 113 114 115 116 117 118
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    auto stream = cuda_dev_ctx->stream();
    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
  }
#endif
  cpu_ns_ = GetTimeInNsec();
}

119
const EventType& Event::type() const { return type_; }
D
dangqingqing 已提交
120

121 122
double Event::CpuElapsedMs(const Event& e) const {
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
D
dangqingqing 已提交
123 124
}

125
double Event::CudaElapsedMs(const Event& e) const {
D
dangqingqing 已提交
126
#ifdef PADDLE_WITH_CUDA
127
  if (!has_cuda_) return 0.0;
D
dangqingqing 已提交
128 129 130 131 132 133
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
  PADDLE_ENFORCE(cudaEventSynchronize(event_));
  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
  float ms;
  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
134
  return ms;
D
dangqingqing 已提交
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
#else
  PADDLE_THROW("CUDA is not enabled");
#endif
}

#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
  auto original_device = GetCurrentDeviceId();
  int count = GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
    SetDeviceId(i);
    func(i);
  }
  SetDeviceId(original_device);
}
#endif

inline EventList& GetEventList() {
  if (!g_event_list) {
    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
    g_event_list = std::make_shared<EventList>();
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
  }
  return *g_event_list;
}

D
dangqingqing 已提交
162
void Mark(const std::string& name, const DeviceContext* dev_ctx) {
163
  GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
164 165
}

D
dangqingqing 已提交
166
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
167
  GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
168 169
}

D
dangqingqing 已提交
170
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
171
  GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
D
dangqingqing 已提交
172 173
}

X
Xin Pan 已提交
174
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
X
Xin Pan 已提交
175
    : is_enabled_(false), start_ns_(PosixInNsec()) {
D
dangqingqing 已提交
176
  if (g_state == ProfilerState::kDisabled) return;
Y
Yancey1989 已提交
177 178
  std::lock_guard<std::mutex> l(profiler_mu);

X
Xin Pan 已提交
179
  is_enabled_ = true;
D
dangqingqing 已提交
180
  dev_ctx_ = dev_ctx;
Y
Yibing Liu 已提交
181
  name_ = name;
182
  PushEvent(name_, dev_ctx_);
183
  // Maybe need the same push/pop behavior.
X
Xin Pan 已提交
184
  SetCurAnnotation(name_);
D
dangqingqing 已提交
185 186 187
}

RecordEvent::~RecordEvent() {
X
Xin Pan 已提交
188
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
Y
Yancey1989 已提交
189
  std::lock_guard<std::mutex> l(profiler_mu);
X
Xin Pan 已提交
190 191
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
X
Xin Pan 已提交
192
    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
193
                          BlockDepth(), g_thread_id);
X
Xin Pan 已提交
194
  }
Y
Yibing Liu 已提交
195
  ClearCurAnnotation();
196
  PopEvent(name_, dev_ctx_);
D
dangqingqing 已提交
197
}
D
dangqingqing 已提交
198

G
gongweibao 已提交
199 200 201 202 203 204 205
RecordRPCEvent::RecordRPCEvent(const std::string& name,
                               const DeviceContext* dev_ctx) {
  if (FLAGS_enable_rpc_profiler) {
    event_.reset(new platform::RecordEvent(name, dev_ctx));
  }
}

X
Xin Pan 已提交
206 207
RecordBlock::RecordBlock(int block_id)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
Q
qiaolongfei 已提交
208
  std::lock_guard<std::mutex> l(profiler_mu);
X
Xin Pan 已提交
209
  if (g_state == ProfilerState::kDisabled) return;
X
Xin Pan 已提交
210
  is_enabled_ = true;
X
Xin Pan 已提交
211 212 213 214 215
  SetCurBlock(block_id);
  name_ = string::Sprintf("block_%d", block_id);
}

RecordBlock::~RecordBlock() {
Q
qiaolongfei 已提交
216
  std::lock_guard<std::mutex> l(profiler_mu);
X
Xin Pan 已提交
217
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
X
Xin Pan 已提交
218 219 220 221 222
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
223
                          g_thread_id);
X
Xin Pan 已提交
224 225 226 227
  }
  ClearCurBlock();
}

D
dangqingqing 已提交
228 229
void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
Q
Qiao Longfei 已提交
230
                 "Can't enable profiling, since the input state is ",
D
dangqingqing 已提交
231
                 "ProfilerState::kDisabled");
X
Xin Pan 已提交
232 233

  std::lock_guard<std::mutex> l(profiler_mu);
234 235
  if (state == g_state) {
    return;
236
  }
237
  g_state = state;
X
Xin Pan 已提交
238
  should_send_profile_state = true;
239
  GetDeviceTracer()->Enable();
D
dangqingqing 已提交
240
#ifdef PADDLE_WITH_CUDA
D
dangqingqing 已提交
241
  if (g_state == ProfilerState::kCUDA) {
242
    // Generate some dummy events first to reduce the startup overhead.
D
dangqingqing 已提交
243 244
    for (int i = 0; i < 5; i++) {
      ForEachDevice([](int d) {
D
dangqingqing 已提交
245
        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
D
dangqingqing 已提交
246 247
        Mark("_cuda_startup_", dev_ctx);
        dev_ctx->Wait();
D
dangqingqing 已提交
248
        delete dev_ctx;
D
dangqingqing 已提交
249 250 251 252 253
      });
    }
  }
#endif
  // Mark the profiling start.
D
dangqingqing 已提交
254
  Mark("_start_profiler_", nullptr);
D
dangqingqing 已提交
255 256
}

257
void ResetProfiler() {
D
dangqingqing 已提交
258
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
259 260 261 262 263 264 265 266 267
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
}

std::vector<std::vector<Event>> GetAllEvents() {
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  std::vector<std::vector<Event>> result;
D
dangqingqing 已提交
268 269 270
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
D
dangqingqing 已提交
271 272 273 274
  }
  return result;
}

275 276 277 278 279 280 281 282
// The information of each event given in the profiling report
struct EventItem {
  std::string name;
  int calls;
  double total_time;
  double min_time;
  double max_time;
  double ave_time;
Y
Yan Chunwei 已提交
283
  float ratio;
284 285 286 287 288
};

// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
                   const std::string& sorted_domain, const size_t name_width,
289
                   const size_t data_width, bool merge_thread) {
290 291 292 293 294 295 296 297 298 299 300 301
  // Output header information
  std::cout << "\n------------------------->"
            << "     Profiling Report     "
            << "<-------------------------\n\n";
  std::string place;
  if (g_state == ProfilerState::kCPU) {
    place = "CPU";
  } else if (g_state == ProfilerState::kCUDA) {
    place = "CUDA";
  } else if (g_state == ProfilerState::kAll) {
    place = "All";
  } else {
X
Xin Pan 已提交
302
    PADDLE_THROW("Invalid profiler state", g_state);
303
  }
304

305 306 307 308
  if (merge_thread) {
    std::cout << "Note! This Report merge all thread info into one."
              << std::endl;
  }
309 310 311 312 313 314 315 316 317
  std::cout << "Place: " << place << std::endl;
  std::cout << "Time unit: ms" << std::endl;
  std::cout << "Sorted by " << sorted_domain
            << " in descending order in the same thread\n\n";
  // Output events table
  std::cout.setf(std::ios::left);
  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
            << "Calls" << std::setw(data_width) << "Total"
            << std::setw(data_width) << "Min." << std::setw(data_width)
Y
Yan Chunwei 已提交
318 319
            << "Max." << std::setw(data_width) << "Ave."
            << std::setw(data_width) << "Ratio." << std::endl;
320 321 322 323 324 325 326 327
  for (size_t i = 0; i < events_table.size(); ++i) {
    for (size_t j = 0; j < events_table[i].size(); ++j) {
      const EventItem& event_item = events_table[i][j];
      std::cout << std::setw(name_width) << event_item.name
                << std::setw(data_width) << event_item.calls
                << std::setw(data_width) << event_item.total_time
                << std::setw(data_width) << event_item.min_time
                << std::setw(data_width) << event_item.max_time
Y
Yan Chunwei 已提交
328
                << std::setw(data_width) << event_item.ave_time
329
                << std::setw(data_width) << event_item.ratio << std::endl;
330
    }
331
  }
332
  std::cout << std::endl;
333 334
}

335 336
// Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events,
337
                 bool merge_thread,
338 339
                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
  if (g_state == ProfilerState::kDisabled) return;
340
  if (merge_thread && events.size() < 2) return;
341 342

  std::string sorted_domain;
L
Luo Tao 已提交
343
  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
344 345 346
  switch (sorted_by) {
    case EventSortingKey::kCalls:
      sorted_domain = "number of calls";
L
Luo Tao 已提交
347
      sorted_func = [](const EventItem& a, const EventItem& b) {
348 349 350 351 352
        return a.calls > b.calls;
      };
      break;
    case EventSortingKey::kTotal:
      sorted_domain = "total time";
L
Luo Tao 已提交
353
      sorted_func = [](const EventItem& a, const EventItem& b) {
354 355 356 357 358
        return a.total_time > b.total_time;
      };
      break;
    case EventSortingKey::kMin:
      sorted_domain = "minimum time";
L
Luo Tao 已提交
359
      sorted_func = [](const EventItem& a, const EventItem& b) {
360 361 362 363 364
        return a.min_time > b.min_time;
      };
      break;
    case EventSortingKey::kMax:
      sorted_domain = "maximum time";
L
Luo Tao 已提交
365
      sorted_func = [](const EventItem& a, const EventItem& b) {
366 367 368 369 370
        return a.max_time > b.max_time;
      };
      break;
    case EventSortingKey::kAve:
      sorted_domain = "average time";
L
Luo Tao 已提交
371
      sorted_func = [](const EventItem& a, const EventItem& b) {
372 373 374 375
        return a.ave_time > b.ave_time;
      };
      break;
    default:
376
      sorted_domain = "event first end time";
377 378
  }

379 380 381 382
  const std::vector<std::vector<Event>>* analyze_events;
  std::vector<std::vector<Event>> merged_events_list;
  if (merge_thread) {
    std::vector<Event> merged_events;
Y
Yibing Liu 已提交
383 384
    for (size_t i = 0; i < events.size(); ++i) {
      for (size_t j = 0; j < events[i].size(); ++j) {
385 386 387 388 389 390 391 392 393
        merged_events.push_back(events[i][j]);
      }
    }
    merged_events_list.push_back(merged_events);
    analyze_events = &merged_events_list;
  } else {
    analyze_events = &events;
  }

394
  std::vector<std::vector<EventItem>> events_table;
Y
Yibing Liu 已提交
395
  size_t max_name_width = 0;
396 397
  for (size_t i = 0; i < (*analyze_events).size(); i++) {
    double total = 0.;  // the total time in one thread
398
    std::list<Event> pushed_events;
399 400 401
    std::vector<EventItem> event_items;
    std::unordered_map<std::string, int> event_idx;

402 403 404 405
    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
      if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
        pushed_events.push_back((*analyze_events)[i][j]);
      } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
406
        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
407
        while (rit != pushed_events.rend() &&
408
               rit->name() != (*analyze_events)[i][j].name()) {
409 410
          ++rit;
        }
411

412
        if (rit != pushed_events.rend()) {
413 414
          double event_time = (g_state == ProfilerState::kCUDA ||
                               g_state == ProfilerState::kAll)
415 416
                                  ? rit->CudaElapsedMs((*analyze_events)[i][j])
                                  : rit->CpuElapsedMs((*analyze_events)[i][j]);
Y
Yan Chunwei 已提交
417
          total += event_time;
418

419 420 421 422 423 424 425 426 427
          std::string event_name;
          if (merge_thread) {
            event_name = rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          } else {
            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
                         rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          }
428

429 430 431
          if (event_idx.find(event_name) == event_idx.end()) {
            event_idx[event_name] = event_items.size();
            EventItem event_item = {event_name, 1,          event_time,
Y
Yan Chunwei 已提交
432 433
                                    event_time, event_time, event_time,
                                    0.};
434
            event_items.push_back(event_item);
435
          } else {
436 437
            int index = event_idx[event_name];
            event_items[index].calls += 1;
438
            // total time
439
            event_items[index].total_time += event_time;
440
            // min time
441 442
            event_items[index].min_time =
                std::min(event_time, event_items[index].min_time);
443
            // max time
444 445
            event_items[index].max_time =
                std::max(event_time, event_items[index].max_time);
446
          }
447

Y
Yibing Liu 已提交
448
          // remove the push marker from the list
449 450
          pushed_events.erase((++rit).base());
        } else {
451
          LOG(WARNING) << "Cannot find the push marker of event \'"
452
                       << (*analyze_events)[i][j].name()
453
                       << "\', which will be ignored in profiling report.";
454 455 456
        }
      }
    }
457 458 459
    // average time
    for (auto& item : event_items) {
      item.ave_time = item.total_time / item.calls;
460
      item.ratio = item.total_time / total;
461 462 463
    }
    // sort
    if (sorted_by != EventSortingKey::kDefault) {
464
      std::sort(event_items.begin(), event_items.end(), sorted_func);
465
    }
466

467
    events_table.push_back(event_items);
Y
Yibing Liu 已提交
468
    // log warning if there are events with `push` but without `pop`
469 470
    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
    while (rit != pushed_events.rend()) {
Y
Yibing Liu 已提交
471 472
      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
                   << "\', which will be ignored in profiling report.";
473 474
      ++rit;
    }
475
  }
476 477

  // Print report
478 479
  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
                merge_thread);
480 481
}

482 483
void DisableProfiler(EventSortingKey sorted_key,
                     const std::string& profile_path) {
X
Xin Pan 已提交
484
  std::lock_guard<std::mutex> l(profiler_mu);
485
  if (g_state == ProfilerState::kDisabled) return;
486 487 488 489
  // Mark the profiling stop.
  Mark("_stop_profiler_", nullptr);

  std::vector<std::vector<Event>> all_events = GetAllEvents();
490 491
  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
492 493
  ResetProfiler();
  DeviceTracer* tracer = GetDeviceTracer();
494
  if (tracer->IsEnabled()) {
495 496
    tracer->Disable();
    tracer->GenProfile(profile_path);
497
  }
498
  g_state = ProfilerState::kDisabled;
X
Xin Pan 已提交
499
  should_send_profile_state = true;
500 501 502 503 504
}

bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
bool ShouldSendProfileState() { return should_send_profile_state; }

X
Xin Pan 已提交
505
void SetProfileListener() {
506 507 508
  std::mt19937 rng;
  rng.seed(std::random_device()());
  std::uniform_int_distribution<std::mt19937::result_type> dist6(
X
Xin Pan 已提交
509
      1, std::numeric_limits<int>::max());
510
  profiler_lister_id = dist6(rng);
511
}
512
int64_t ListenerId() { return profiler_lister_id; }
513

D
dangqingqing 已提交
514 515
}  // namespace platform
}  // namespace paddle