profiler.cc 17.8 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
D
dangqingqing 已提交
2 3 4 5

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
6

D
dangqingqing 已提交
7 8 9 10 11 12 13 14
    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15 16
#include "paddle/fluid/platform/profiler.h"

17
#include <algorithm>
18
#include <iomanip>
19
#include <limits>
20
#include <map>
21
#include <mutex>  // NOLINT
22
#include <random>
23
#include <string>
24 25 26
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif  // PADDLE_WITH_CUDA
Y
Yancey1989 已提交
27

28
#include "glog/logging.h"
29 30
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
Y
Yancey1989 已提交
31
#include "paddle/fluid/platform/port.h"
32
#include "paddle/fluid/string/printf.h"
D
dangqingqing 已提交
33

G
gongweibao 已提交
34 35
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");

D
dangqingqing 已提交
36 37 38
namespace paddle {
namespace platform {

39 40
struct EventList;

41 42
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
X
Xin Pan 已提交
43
std::mutex profiler_mu;
44

D
dangqingqing 已提交
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
// The thread local event list only can be accessed by the specific thread
// The thread index of each thread
static thread_local int32_t g_thread_id;
// The g_next_thread_id is a global counter for threads, by the g_thread_id and
// g_next_thread_id, we can know how many threads have created EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list;

60 61 62 63 64 65 66 67 68 69
struct EventList {
  constexpr static size_t kMB = 1024 * 1024;
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(Event);
  constexpr static size_t kEventAlign = alignof(Event);
  constexpr static size_t kNumBlock =
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  template <typename... Args>
70
  Event* Record(Args&&... args) {
71 72 73 74 75
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
76
    return &event_blocks.front().back();
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
  }

  std::vector<Event> Reduce() {
    std::vector<Event> result;
    for (auto& block : event_blocks) {
      result.insert(result.begin(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  void Clear() { event_blocks.clear(); }

  std::forward_list<std::vector<Event>> event_blocks;
};

D
dangqingqing 已提交
94 95 96 97 98 99 100 101 102
inline uint64_t GetTimeInNsec() {
  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                                 std::chrono::high_resolution_clock,
                                 std::chrono::steady_clock>::type;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             clock::now().time_since_epoch())
      .count();
}

103 104
Event::Event(EventType type, std::string name, uint32_t thread_id)
    : type_(type), name_(name), thread_id_(thread_id) {
D
dangqingqing 已提交
105 106 107
  cpu_ns_ = GetTimeInNsec();
}

108
const EventType& Event::type() const { return type_; }
D
dangqingqing 已提交
109

110 111
double Event::CpuElapsedMs(const Event& e) const {
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
D
dangqingqing 已提交
112 113
}

114
double Event::CudaElapsedMs(const Event& e) const {
115 116
#ifdef PADDLE_WITH_CUPTI
  return gpu_ns_ / 1000000.0;
D
Dun Liang 已提交
117
#else
D
Dun Liang 已提交
118 119
  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
  return 0;
D
dangqingqing 已提交
120 121 122 123 124 125 126 127 128
#endif
}

inline EventList& GetEventList() {
  if (!g_event_list) {
    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
    g_event_list = std::make_shared<EventList>();
    g_thread_id = g_next_thread_id++;
    g_all_event_lists.emplace_front(g_event_list);
129
    RecoreCurThreadId(g_thread_id);
D
dangqingqing 已提交
130 131 132 133
  }
  return *g_event_list;
}

134 135
void Mark(const std::string& name) {
  GetEventList().Record(EventType::kMark, name, g_thread_id);
136 137
}

138 139
Event* PushEvent(const std::string& name) {
  return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
140 141
}

142 143
void PopEvent(const std::string& name) {
  GetEventList().Record(EventType::kPopRange, name, g_thread_id);
D
dangqingqing 已提交
144 145
}

146
RecordEvent::RecordEvent(const std::string& name)
X
Xin Pan 已提交
147
    : is_enabled_(false), start_ns_(PosixInNsec()) {
D
dangqingqing 已提交
148
  if (g_state == ProfilerState::kDisabled) return;
149
  // lock is not needed, the code below is thread-safe
Y
Yancey1989 已提交
150

X
Xin Pan 已提交
151
  is_enabled_ = true;
Y
Yibing Liu 已提交
152
  name_ = name;
153
  Event* e = PushEvent(name_);
154
  // Maybe need the same push/pop behavior.
155
  SetCurAnnotation(e);
D
dangqingqing 已提交
156 157 158
}

RecordEvent::~RecordEvent() {
X
Xin Pan 已提交
159
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
160
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
161 162
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
163
    tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
164
                          BlockDepth(), g_thread_id);
X
Xin Pan 已提交
165
  }
Y
Yibing Liu 已提交
166
  ClearCurAnnotation();
167
  PopEvent(name_);
D
dangqingqing 已提交
168
}
D
dangqingqing 已提交
169

170
RecordRPCEvent::RecordRPCEvent(const std::string& name) {
G
gongweibao 已提交
171
  if (FLAGS_enable_rpc_profiler) {
172
    event_.reset(new platform::RecordEvent(name));
G
gongweibao 已提交
173 174 175
  }
}

X
Xin Pan 已提交
176 177
RecordBlock::RecordBlock(int block_id)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
178
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
179
  if (g_state == ProfilerState::kDisabled) return;
X
Xin Pan 已提交
180
  is_enabled_ = true;
X
Xin Pan 已提交
181 182 183 184 185
  SetCurBlock(block_id);
  name_ = string::Sprintf("block_%d", block_id);
}

RecordBlock::~RecordBlock() {
186
  // lock is not needed, the code below is thread-safe
X
Xin Pan 已提交
187
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
X
Xin Pan 已提交
188 189 190 191 192
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
193
                          g_thread_id);
X
Xin Pan 已提交
194 195 196 197
  }
  ClearCurBlock();
}

198 199 200 201 202 203 204 205 206 207
void SynchronizeAllDevice() {
#ifdef PADDLE_WITH_CUDA
  int count = GetCUDADeviceCount();
  for (int i = 0; i < count; i++) {
    SetDeviceId(i);
    PADDLE_ENFORCE(cudaDeviceSynchronize());
  }
#endif
}

D
dangqingqing 已提交
208 209
void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
Q
Qiao Longfei 已提交
210
                 "Can't enable profiling, since the input state is ",
D
dangqingqing 已提交
211
                 "ProfilerState::kDisabled");
212
  SynchronizeAllDevice();
X
Xin Pan 已提交
213
  std::lock_guard<std::mutex> l(profiler_mu);
214 215
  if (state == g_state) {
    return;
216
  }
217
  g_state = state;
X
Xin Pan 已提交
218
  should_send_profile_state = true;
219
  GetDeviceTracer()->Enable();
D
dangqingqing 已提交
220
#ifdef PADDLE_WITH_CUDA
221 222
  if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
      g_state == ProfilerState::kCPU) {
223
    // Generate some dummy events first to reduce the startup overhead.
224 225
    DummyKernelAndEvent();
    GetDeviceTracer()->Reset();
D
dangqingqing 已提交
226 227 228
  }
#endif
  // Mark the profiling start.
229
  Mark("_start_profiler_");
D
dangqingqing 已提交
230 231
}

232
void ResetProfiler() {
233 234
  SynchronizeAllDevice();
  GetDeviceTracer()->Reset();
D
dangqingqing 已提交
235
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
236 237 238 239 240 241 242 243 244
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
}

std::vector<std::vector<Event>> GetAllEvents() {
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  std::vector<std::vector<Event>> result;
D
dangqingqing 已提交
245 246 247
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
D
dangqingqing 已提交
248 249 250 251
  }
  return result;
}

252 253 254 255 256 257 258
// The information of each event given in the profiling report
struct EventItem {
  std::string name;
  int calls;
  double total_time;
  double max_time;
  double ave_time;
C
chengduo 已提交
259 260 261
  double min_time;
  double cpu_time;
  double gpu_time;
Y
Yan Chunwei 已提交
262
  float ratio;
263 264 265 266 267
};

// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
                   const std::string& sorted_domain, const size_t name_width,
268
                   const size_t data_width, bool merge_thread) {
269 270 271 272 273 274 275 276 277 278 279 280
  // Output header information
  std::cout << "\n------------------------->"
            << "     Profiling Report     "
            << "<-------------------------\n\n";
  std::string place;
  if (g_state == ProfilerState::kCPU) {
    place = "CPU";
  } else if (g_state == ProfilerState::kCUDA) {
    place = "CUDA";
  } else if (g_state == ProfilerState::kAll) {
    place = "All";
  } else {
X
Xin Pan 已提交
281
    PADDLE_THROW("Invalid profiler state", g_state);
282
  }
283

284 285 286 287
  if (merge_thread) {
    std::cout << "Note! This Report merge all thread info into one."
              << std::endl;
  }
288 289 290 291 292 293 294
  std::cout << "Place: " << place << std::endl;
  std::cout << "Time unit: ms" << std::endl;
  std::cout << "Sorted by " << sorted_domain
            << " in descending order in the same thread\n\n";
  // Output events table
  std::cout.setf(std::ios::left);
  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
C
chengduo 已提交
295 296 297 298 299 300
            << "Calls" << std::setw(data_width) << "Total";
  if (g_state == ProfilerState::kAll) {
    std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
              << std::setw(data_width * 2) << "GPU Time (Ratio)";
  }
  std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
Y
Yan Chunwei 已提交
301 302
            << "Max." << std::setw(data_width) << "Ave."
            << std::setw(data_width) << "Ratio." << std::endl;
303 304 305 306 307
  for (size_t i = 0; i < events_table.size(); ++i) {
    for (size_t j = 0; j < events_table[i].size(); ++j) {
      const EventItem& event_item = events_table[i][j];
      std::cout << std::setw(name_width) << event_item.name
                << std::setw(data_width) << event_item.calls
C
chengduo 已提交
308 309 310 311 312 313 314 315 316 317 318 319
                << std::setw(data_width) << event_item.total_time;
      if (g_state == ProfilerState::kAll) {
        std::cout << std::setw(data_width * 2)
                  << string::Sprintf(
                         "%f (%f)", event_item.cpu_time,
                         (event_item.cpu_time / event_item.total_time))
                  << std::setw(data_width * 2)
                  << string::Sprintf(
                         "%f (%f)", event_item.gpu_time,
                         (event_item.gpu_time / event_item.total_time));
      }
      std::cout << std::setw(data_width) << event_item.min_time
320
                << std::setw(data_width) << event_item.max_time
Y
Yan Chunwei 已提交
321
                << std::setw(data_width) << event_item.ave_time
322
                << std::setw(data_width) << event_item.ratio << std::endl;
323
    }
324
  }
325
  std::cout << std::endl;
326 327
}

328 329
// Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events,
330
                 bool merge_thread,
331 332
                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
  if (g_state == ProfilerState::kDisabled) return;
333
  if (merge_thread && events.size() < 2) return;
334 335

  std::string sorted_domain;
L
Luo Tao 已提交
336
  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
337 338 339
  switch (sorted_by) {
    case EventSortingKey::kCalls:
      sorted_domain = "number of calls";
L
Luo Tao 已提交
340
      sorted_func = [](const EventItem& a, const EventItem& b) {
341 342 343 344 345
        return a.calls > b.calls;
      };
      break;
    case EventSortingKey::kTotal:
      sorted_domain = "total time";
L
Luo Tao 已提交
346
      sorted_func = [](const EventItem& a, const EventItem& b) {
347 348 349 350 351
        return a.total_time > b.total_time;
      };
      break;
    case EventSortingKey::kMin:
      sorted_domain = "minimum time";
L
Luo Tao 已提交
352
      sorted_func = [](const EventItem& a, const EventItem& b) {
353 354 355 356 357
        return a.min_time > b.min_time;
      };
      break;
    case EventSortingKey::kMax:
      sorted_domain = "maximum time";
L
Luo Tao 已提交
358
      sorted_func = [](const EventItem& a, const EventItem& b) {
359 360 361 362 363
        return a.max_time > b.max_time;
      };
      break;
    case EventSortingKey::kAve:
      sorted_domain = "average time";
L
Luo Tao 已提交
364
      sorted_func = [](const EventItem& a, const EventItem& b) {
365 366 367
        return a.ave_time > b.ave_time;
      };
      break;
C
chengduo 已提交
368 369 370 371 372 373 374 375 376 377 378 379
    case EventSortingKey::kGPUTime:
      sorted_domain = "average time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.gpu_time > b.gpu_time;
      };
      break;
    case EventSortingKey::kCPUTime:
      sorted_domain = "average time";
      sorted_func = [](const EventItem& a, const EventItem& b) {
        return a.cpu_time > b.cpu_time;
      };
      break;
380
    default:
381
      sorted_domain = "event first end time";
382 383
  }

384 385 386 387
  const std::vector<std::vector<Event>>* analyze_events;
  std::vector<std::vector<Event>> merged_events_list;
  if (merge_thread) {
    std::vector<Event> merged_events;
Y
Yibing Liu 已提交
388 389
    for (size_t i = 0; i < events.size(); ++i) {
      for (size_t j = 0; j < events[i].size(); ++j) {
390 391 392 393 394 395 396 397 398
        merged_events.push_back(events[i][j]);
      }
    }
    merged_events_list.push_back(merged_events);
    analyze_events = &merged_events_list;
  } else {
    analyze_events = &events;
  }

399
  std::vector<std::vector<EventItem>> events_table;
Y
Yibing Liu 已提交
400
  size_t max_name_width = 0;
401 402
  for (size_t i = 0; i < (*analyze_events).size(); i++) {
    double total = 0.;  // the total time in one thread
403
    std::list<Event> pushed_events;
404 405 406
    std::vector<EventItem> event_items;
    std::unordered_map<std::string, int> event_idx;

407 408 409 410
    for (size_t j = 0; j < (*analyze_events)[i].size(); j++) {
      if ((*analyze_events)[i][j].type() == EventType::kPushRange) {
        pushed_events.push_back((*analyze_events)[i][j]);
      } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) {
411
        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
412
        while (rit != pushed_events.rend() &&
413
               rit->name() != (*analyze_events)[i][j].name()) {
414 415
          ++rit;
        }
416

417
        if (rit != pushed_events.rend()) {
C
chengduo 已提交
418 419 420 421 422 423 424 425 426 427 428
          double event_time = 0;
          double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
          double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
          if (g_state == ProfilerState::kCUDA) {
            event_time = gpu_time;
          } else if (g_state == ProfilerState::kCPU) {
            event_time = cpu_time;
          } else {
            event_time = gpu_time + cpu_time;
          }

Y
Yan Chunwei 已提交
429
          total += event_time;
430

431 432 433 434 435 436 437 438 439
          std::string event_name;
          if (merge_thread) {
            event_name = rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          } else {
            event_name = "thread" + std::to_string(rit->thread_id()) + "::" +
                         rit->name();
            max_name_width = std::max(max_name_width, event_name.size());
          }
440

441 442 443
          if (event_idx.find(event_name) == event_idx.end()) {
            event_idx[event_name] = event_items.size();
            EventItem event_item = {event_name, 1,          event_time,
Y
Yan Chunwei 已提交
444
                                    event_time, event_time, event_time,
C
chengduo 已提交
445
                                    gpu_time,   cpu_time,   0.};
446
            event_items.push_back(event_item);
447
          } else {
448 449
            int index = event_idx[event_name];
            event_items[index].calls += 1;
450
            // total time
451
            event_items[index].total_time += event_time;
452
            // min time
453 454
            event_items[index].min_time =
                std::min(event_time, event_items[index].min_time);
455
            // max time
456 457
            event_items[index].max_time =
                std::max(event_time, event_items[index].max_time);
C
chengduo 已提交
458 459
            event_items[index].gpu_time += gpu_time;
            event_items[index].cpu_time += cpu_time;
460
          }
461

Y
Yibing Liu 已提交
462
          // remove the push marker from the list
463 464
          pushed_events.erase((++rit).base());
        } else {
465
          LOG(WARNING) << "Cannot find the push marker of event \'"
466
                       << (*analyze_events)[i][j].name()
467
                       << "\', which will be ignored in profiling report.";
468 469 470
        }
      }
    }
471 472 473
    // average time
    for (auto& item : event_items) {
      item.ave_time = item.total_time / item.calls;
474
      item.ratio = item.total_time / total;
475 476 477
    }
    // sort
    if (sorted_by != EventSortingKey::kDefault) {
478
      std::sort(event_items.begin(), event_items.end(), sorted_func);
479
    }
480

481
    events_table.push_back(event_items);
Y
Yibing Liu 已提交
482
    // log warning if there are events with `push` but without `pop`
483 484
    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
    while (rit != pushed_events.rend()) {
Y
Yibing Liu 已提交
485 486
      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
                   << "\', which will be ignored in profiling report.";
487 488
      ++rit;
    }
489
  }
490 491

  // Print report
492 493
  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12,
                merge_thread);
494 495
}

496 497
void DisableProfiler(EventSortingKey sorted_key,
                     const std::string& profile_path) {
498
  SynchronizeAllDevice();
X
Xin Pan 已提交
499
  std::lock_guard<std::mutex> l(profiler_mu);
500
  if (g_state == ProfilerState::kDisabled) return;
501
  // Mark the profiling stop.
502
  Mark("_stop_profiler_");
503 504

  DeviceTracer* tracer = GetDeviceTracer();
505
  if (tracer->IsEnabled()) {
506 507
    tracer->Disable();
    tracer->GenProfile(profile_path);
508
    tracer->GenEventKernelCudaElapsedTime();
509
  }
510 511 512 513 514

  std::vector<std::vector<Event>> all_events = GetAllEvents();
  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
  ResetProfiler();
515
  g_state = ProfilerState::kDisabled;
X
Xin Pan 已提交
516
  should_send_profile_state = true;
517 518 519 520 521
}

bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
bool ShouldSendProfileState() { return should_send_profile_state; }

X
Xin Pan 已提交
522
void SetProfileListener() {
523 524 525
  std::mt19937 rng;
  rng.seed(std::random_device()());
  std::uniform_int_distribution<std::mt19937::result_type> dist6(
X
Xin Pan 已提交
526
      1, std::numeric_limits<int>::max());
527
  profiler_lister_id = dist6(rng);
528
}
529
int64_t ListenerId() { return profiler_lister_id; }
530

D
dangqingqing 已提交
531 532
}  // namespace platform
}  // namespace paddle