profiler.cc 17.4 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
D
dangqingqing 已提交
2 3 4 5

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
6

D
dangqingqing 已提交
7 8 9 10 11 12 13 14
    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Y
Yi Wang 已提交
15
#include "paddle/fluid/platform/profiler.h"
16

X
Xin Pan 已提交
17
#include <sys/time.h>
18
#include <algorithm>
19
#include <iomanip>
20
#include <limits>
21
#include <map>
22
#include <mutex>  // NOLINT
23
#include <random>
24
#include <string>
25 26 27
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif  // PADDLE_WITH_CUDA
28
#include "glog/logging.h"
29 30 31
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/string/printf.h"
D
dangqingqing 已提交
32

G
gongweibao 已提交
33 34
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");

D
dangqingqing 已提交
35 36 37
namespace paddle {
namespace platform {

38 39
struct EventList;

40 41
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
X
Xin Pan 已提交
42
std::mutex profiler_mu;
43

D
dangqingqing 已提交
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list;

59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
struct EventList {
  constexpr static size_t kMB = 1024 * 1024;
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(Event);
  constexpr static size_t kEventAlign = alignof(Event);
  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  template <typename... Args>
  void Record(Args&&... args) {
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  std::vector<Event> Reduce() {
    std::vector<Event> result;
    for (auto& block : event_blocks) {
      result.insert(result.begin(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  void Clear() { event_blocks.clear(); }

  std::forward_list<std::vector<Event>> event_blocks;
};

D
dangqingqing 已提交
92 93 94 95 96 97 98 99 100
inline uint64_t GetTimeInNsec() {
  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                                 std::chrono::high_resolution_clock,
                                 std::chrono::steady_clock>::type;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             clock::now().time_since_epoch())
      .count();
}

101
Event::Event(EventType type, std::string name, uint32_t thread_id,
D
dangqingqing 已提交
102
             const DeviceContext* dev_ctx)
103
    : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
D
dangqingqing 已提交
104
#ifdef PADDLE_WITH_CUDA
D
dangqingqing 已提交
105 106 107
  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
  if (has_cuda_) {
    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
Q
qiaolongfei 已提交
108 109
    PADDLE_ENFORCE(cudaSetDevice(
        boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
D
dangqingqing 已提交
110 111 112 113 114 115 116 117 118
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    auto stream = cuda_dev_ctx->stream();
    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
  }
#endif
  cpu_ns_ = GetTimeInNsec();
}

119
const EventType& Event::type() const { return type_; }
D
dangqingqing 已提交
120

121 122
double Event::CpuElapsedMs(const Event& e) const {
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
D
dangqingqing 已提交
123 124
}

125
double Event::CudaElapsedMs(const Event& e) const {
D
dangqingqing 已提交
126
#ifdef PADDLE_WITH_CUDA
127
  if (!has_cuda_) return 0.0;
D
dangqingqing 已提交
128 129 130 131 132 133
  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
  PADDLE_ENFORCE(e.device() == device());
  PADDLE_ENFORCE(cudaEventSynchronize(event_));
  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
  float ms;
  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
134
  return ms;
D
dangqingqing 已提交
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
#else
  PADDLE_THROW("CUDA is not enabled");
#endif
}

#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
  auto original_device = GetCurrentDeviceId();
  int count = GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
    SetDeviceId(i);
    func(i);
  }
  SetDeviceId(original_device);
}
#endif

inline EventList& GetEventList() {
  if (!g_event_list) {
    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
    g_event_list = std::make_shared<EventList>();
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
  }
  return *g_event_list;
}

D
dangqingqing 已提交
162
void Mark(const std::string& name, const DeviceContext* dev_ctx) {
163
  GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
164 165
}

D
dangqingqing 已提交
166
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
167
  GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
168 169
}

D
dangqingqing 已提交
170
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
171
  GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
D
dangqingqing 已提交
172 173
}

X
Xin Pan 已提交
174
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
X
Xin Pan 已提交
175
    : is_enabled_(false), start_ns_(PosixInNsec()) {
Q
qiaolongfei 已提交
176
  std::lock_guard<std::mutex> l(profiler_mu);
D
dangqingqing 已提交
177
  if (g_state == ProfilerState::kDisabled) return;
X
Xin Pan 已提交
178
  is_enabled_ = true;
D
dangqingqing 已提交
179
  dev_ctx_ = dev_ctx;
Y
Yibing Liu 已提交
180
  name_ = name;
181
  PushEvent(name_, dev_ctx_);
182
  // Maybe need the same push/pop behavior.
X
Xin Pan 已提交
183
  SetCurAnnotation(name_);
D
dangqingqing 已提交
184 185 186
}

RecordEvent::~RecordEvent() {
Q
qiaolongfei 已提交
187
  std::lock_guard<std::mutex> l(profiler_mu);
X
Xin Pan 已提交
188
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
X
Xin Pan 已提交
189 190
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
X
Xin Pan 已提交
191
    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
192
                          BlockDepth(), g_thread_id);
X
Xin Pan 已提交
193
  }
Y
Yibing Liu 已提交
194
  ClearCurAnnotation();
195
  PopEvent(name_, dev_ctx_);
D
dangqingqing 已提交
196
}
D
dangqingqing 已提交
197

G
gongweibao 已提交
198 199 200 201 202 203 204
RecordRPCEvent::RecordRPCEvent(const std::string& name,
                               const DeviceContext* dev_ctx) {
  if (FLAGS_enable_rpc_profiler) {
    event_.reset(new platform::RecordEvent(name, dev_ctx));
  }
}

X
Xin Pan 已提交
205 206
RecordBlock::RecordBlock(int block_id)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
Q
qiaolongfei 已提交
207
  std::lock_guard<std::mutex> l(profiler_mu);
X
Xin Pan 已提交
208
  if (g_state == ProfilerState::kDisabled) return;
X
Xin Pan 已提交
209
  is_enabled_ = true;
X
Xin Pan 已提交
210 211 212 213 214
  SetCurBlock(block_id);
  name_ = string::Sprintf("block_%d", block_id);
}

RecordBlock::~RecordBlock() {
Q
qiaolongfei 已提交
215
  std::lock_guard<std::mutex> l(profiler_mu);
X
Xin Pan 已提交
216
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
X
Xin Pan 已提交
217 218 219 220 221
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
222
                          g_thread_id);
X
Xin Pan 已提交
223 224 225 226
  }
  ClearCurBlock();
}

D
dangqingqing 已提交
227 228 229 230
void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                 "Can't enbale profling, since the input state is ",
                 "ProfilerState::kDisabled");
X
Xin Pan 已提交
231 232

  std::lock_guard<std::mutex> l(profiler_mu);
233 234
  if (state == g_state) {
    return;
235
  }
236
  g_state = state;
X
Xin Pan 已提交
237
  should_send_profile_state = true;
238
  GetDeviceTracer()->Enable();
D
dangqingqing 已提交
239
#ifdef PADDLE_WITH_CUDA
D
dangqingqing 已提交
240
  if (g_state == ProfilerState::kCUDA) {
241
    // Generate some dummy events first to reduce the startup overhead.
D
dangqingqing 已提交
242 243
    for (int i = 0; i < 5; i++) {
      ForEachDevice([](int d) {
D
dangqingqing 已提交
244
        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
D
dangqingqing 已提交
245 246
        Mark("_cuda_startup_", dev_ctx);
        dev_ctx->Wait();
D
dangqingqing 已提交
247
        delete dev_ctx;
D
dangqingqing 已提交
248 249 250 251 252
      });
    }
  }
#endif
  // Mark the profiling start.
D
dangqingqing 已提交
253
  Mark("_start_profiler_", nullptr);
D
dangqingqing 已提交
254 255
}

256
void ResetProfiler() {
D
dangqingqing 已提交
257
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
258 259 260 261 262 263 264 265 266
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
}

std::vector<std::vector<Event>> GetAllEvents() {
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  std::vector<std::vector<Event>> result;
D
dangqingqing 已提交
267 268 269
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
D
dangqingqing 已提交
270 271 272 273
  }
  return result;
}

274 275 276 277 278 279 280 281
// The information of each event given in the profiling report
struct EventItem {
  std::string name;
  int calls;
  double total_time;
  double min_time;
  double max_time;
  double ave_time;
Y
Yan Chunwei 已提交
282
  float ratio;
283 284 285 286 287
};

// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
                   const std::string& sorted_domain, const size_t name_width,
288
                   const size_t data_width, bool merge_thread) {
289 290 291 292 293 294 295 296 297 298 299 300
  // Output header information
  std::cout << "\n------------------------->"
            << "     Profiling Report     "
            << "<-------------------------\n\n";
  std::string place;
  if (g_state == ProfilerState::kCPU) {
    place = "CPU";
  } else if (g_state == ProfilerState::kCUDA) {
    place = "CUDA";
  } else if (g_state == ProfilerState::kAll) {
    place = "All";
  } else {
X
Xin Pan 已提交
301
    PADDLE_THROW("Invalid profiler state", g_state);
302
  }
303

304 305 306 307
  if (merge_thread) {
    std::cout << "Note! This Report merge all thread info into one."
              << std::endl;
  }
308 309 310 311 312 313 314 315 316
  std::cout << "Place: " << place << std::endl;
  std::cout << "Time unit: ms" << std::endl;
  std::cout << "Sorted by " << sorted_domain
            << " in descending order in the same thread\n\n";
  // Output events table
  std::cout.setf(std::ios::left);
  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
            << "Calls" << std::setw(data_width) << "Total"
            << std::setw(data_width) << "Min." << std::setw(data_width)
Y
Yan Chunwei 已提交
317 318
            << "Max." << std::setw(data_width) << "Ave."
            << std::setw(data_width) << "Ratio." << std::endl;
319 320 321 322 323 324 325 326
  for (size_t i = 0; i < events_table.size(); ++i) {
    for (size_t j = 0; j < events_table[i].size(); ++j) {
      const EventItem& event_item = events_table[i][j];
      std::cout << std::setw(name_width) << event_item.name
                << std::setw(data_width) << event_item.calls
                << std::setw(data_width) << event_item.total_time
                << std::setw(data_width) << event_item.min_time
                << std::setw(data_width) << event_item.max_time
Y
Yan Chunwei 已提交
327
                << std::setw(data_width) << event_item.ave_time
328
                << std::setw(data_width) << event_item.ratio << std::endl;
329
    }
330
  }
331
  std::cout << std::endl;
332 333
}

334 335
// Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events,
336
                 bool merge_thread,
337 338
                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
  if (g_state == ProfilerState::kDisabled) return;
339
  if (merge_thread && events.size() < 2) return;
340 341

  std::string sorted_domain;
L
Luo Tao 已提交
342
  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
343 344 345
  switch (sorted_by) {
    case EventSortingKey::kCalls:
      sorted_domain = "number of calls";
L
Luo Tao 已提交
346
      sorted_func = [](const EventItem& a, const EventItem& b) {
347 348 349 350 351
        return a.calls > b.calls;
      };
      break;
    case EventSortingKey::kTotal:
      sorted_domain = "total time";
L
Luo Tao 已提交
352
      sorted_func = [](const EventItem& a, const EventItem& b) {
353 354 355 356 357
        return a.total_time > b.total_time;
      };
      break;
    case EventSortingKey::kMin:
      sorted_domain = "minimum time";
L
Luo Tao 已提交
358
      sorted_func = [](const EventItem& a, const EventItem& b) {
359 360 361 362 363
        return a.min_time > b.min_time;
      };
      break;
    case EventSortingKey::kMax:
      sorted_domain = "maximum time";
L
Luo Tao 已提交
364
      sorted_func = [](const EventItem& a, const EventItem& b) {
365 366 367 368 369
        return a.max_time > b.max_time;
      };
      break;
    case EventSortingKey::kAve:
      sorted_domain = "average time";
L
Luo Tao 已提交
370
      sorted_func = [](const EventItem& a, const EventItem& b) {
371 372 373 374
        return a.ave_time > b.ave_time;
      };
      break;
    default:
375
      sorted_domain = "event first end time";
376 377
  }

378 379 380 381
  const std::vector<std::vector<Event>>* analyze_events;
  std::vector<std::vector<Event>> merged_events_list;
  if (merge_thread) {
    std::vector<Event> merged_events;
Y
Yibing Liu 已提交
382 383
    for (size_t i = 0; i < events.size(); ++i) {
      for (size_t j = 0; j < events[i].size(); ++j) {
384 385 386 387 388 389 390 391 392
        merged_events.push_back(events[i][j]);
      }
    }
    merged_events_list.push_back(merged_events);
    analyze_events = &merged_events_list;
  } else {
    analyze_events = &events;
  }

393
  std::vector<std::vector<EventItem>> events_table;
Y
Yibing Liu 已提交
394
  size_t max_name_width = 0;
395 396
  for (size_t i = 0; i < (*analyze_events).size(); i++) {
    double total = 0.;  // the total time in one thread
397
    std::list<Event> pushed_events;
398 399 400
    std::vector<EventItem> event_items;
    std::unordered_map<std::string, int> event_idx;

401 402 403 404
    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
      if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
        pushed_events.push_back((*analyze_events)[i][j]);
      } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
405
        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
406
        while (rit != pushed_events.rend() &&
407
               rit->name() != (*analyze_events)[i][j].name()) {
408 409
          ++rit;
        }
410

411
        if (rit != pushed_events.rend()) {
412 413
          double event_time = (g_state == ProfilerState::kCUDA ||
                               g_state == ProfilerState::kAll)
414 415
                                  ? rit->CudaElapsedMs((*analyze_events)[i][j])
                                  : rit->CpuElapsedMs((*analyze_events)[i][j]);
Y
Yan Chunwei 已提交
416
          total += event_time;
417

418 419 420 421 422 423 424 425 426
          std::string event_name;
          if (merge_thread) {
            event_name = rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          } else {
            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
                         rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          }
427

428 429 430
          if (event_idx.find(event_name) == event_idx.end()) {
            event_idx[event_name] = event_items.size();
            EventItem event_item = {event_name, 1,          event_time,
Y
Yan Chunwei 已提交
431 432
                                    event_time, event_time, event_time,
                                    0.};
433
            event_items.push_back(event_item);
434
          } else {
435 436
            int index = event_idx[event_name];
            event_items[index].calls += 1;
437
            // total time
438
            event_items[index].total_time += event_time;
439
            // min time
440 441
            event_items[index].min_time =
                std::min(event_time, event_items[index].min_time);
442
            // max time
443 444
            event_items[index].max_time =
                std::max(event_time, event_items[index].max_time);
445
          }
446

Y
Yibing Liu 已提交
447
          // remove the push marker from the list
448 449
          pushed_events.erase((++rit).base());
        } else {
450
          LOG(WARNING) << "Cannot find the push marker of event \'"
451
                       << (*analyze_events)[i][j].name()
452
                       << "\', which will be ignored in profiling report.";
453 454 455
        }
      }
    }
456 457 458
    // average time
    for (auto& item : event_items) {
      item.ave_time = item.total_time / item.calls;
459
      item.ratio = item.total_time / total;
460 461 462
    }
    // sort
    if (sorted_by != EventSortingKey::kDefault) {
463
      std::sort(event_items.begin(), event_items.end(), sorted_func);
464
    }
465

466
    events_table.push_back(event_items);
Y
Yibing Liu 已提交
467
    // log warning if there are events with `push` but without `pop`
468 469
    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
    while (rit != pushed_events.rend()) {
Y
Yibing Liu 已提交
470 471
      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
                   << "\', which will be ignored in profiling report.";
472 473
      ++rit;
    }
474
  }
475 476

  // Print report
477 478
  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
                merge_thread);
479 480
}

481 482
void DisableProfiler(EventSortingKey sorted_key,
                     const std::string& profile_path) {
X
Xin Pan 已提交
483
  std::lock_guard<std::mutex> l(profiler_mu);
484
  if (g_state == ProfilerState::kDisabled) return;
485 486 487 488
  // Mark the profiling stop.
  Mark("_stop_profiler_", nullptr);

  std::vector<std::vector<Event>> all_events = GetAllEvents();
489 490
  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
491 492
  ResetProfiler();
  DeviceTracer* tracer = GetDeviceTracer();
493
  if (tracer->IsEnabled()) {
494 495
    tracer->Disable();
    tracer->GenProfile(profile_path);
496
  }
497
  g_state = ProfilerState::kDisabled;
X
Xin Pan 已提交
498
  should_send_profile_state = true;
499 500 501 502 503
}

bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
bool ShouldSendProfileState() { return should_send_profile_state; }

X
Xin Pan 已提交
504
void SetProfileListener() {
505 506 507
  std::mt19937 rng;
  rng.seed(std::random_device()());
  std::uniform_int_distribution<std::mt19937::result_type> dist6(
X
Xin Pan 已提交
508
      1, std::numeric_limits<int>::max());
509
  profiler_lister_id = dist6(rng);
510
}
511
int64_t ListenerId() { return profiler_lister_id; }
512

D
dangqingqing 已提交
513 514
}  // namespace platform
}  // namespace paddle