profiler.h 4.3 KB
Newer Older
D
dangqingqing 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <forward_list>
#include <list>
#include <mutex>
#include <vector>
#include "paddle/platform/device_context.h"

namespace paddle {
namespace platform {

// Kind tag stored in every Event.
// NOTE(review): the names suggest a one-off mark plus the opening/closing
// ends of a range, matching Mark()/PushEvent()/PopEvent() - confirm against
// the implementation file.
enum EventKind {
  kMark,
  kPushRange,
  kPopRange
};

class Event {
 public:
D
dangqingqing 已提交
29 30
  // The DeviceContext is used to get the cuda stream.
  // If CPU profiling mode, can pass nullptr.
D
dangqingqing 已提交
31
  Event(EventKind kind, std::string name, uint32_t thread_id,
D
dangqingqing 已提交
32
        DeviceContext* dev_ctx);
D
dangqingqing 已提交
33

D
dangqingqing 已提交
34
  std::string kind() const;
D
dangqingqing 已提交
35
  std::string name() const { return name_; }
36
  uint32_t thread_id() const { return thread_id_; }
D
dangqingqing 已提交
37 38 39 40 41 42 43
  bool has_cuda() const { return has_cuda_; }

#ifdef PADDLE_WITH_CUDA
  cudaEvent_t event() const { return event_; }
  int device() const { return device_; }
#endif

44 45
  double CpuElapsedMs(const Event& e) const;
  double CudaElapsedMs(const Event& e) const;
D
dangqingqing 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59

 private:
  EventKind kind_;
  std::string name_;
  uint32_t thread_id_;
  int64_t cpu_ns_;
  bool has_cuda_;
#ifdef PADDLE_WITH_CUDA
  cudaEvent_t event_ = nullptr;
  int device_ = -1;
#endif
};

struct EventList {
D
dangqingqing 已提交
60 61 62 63 64
  constexpr static size_t kMB = 1024 * 1024;
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(Event);
  constexpr static size_t kEventAlign = alignof(Event);
  constexpr static size_t kNumBlock =
D
dangqingqing 已提交
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  template <typename... Args>
  void Record(Args&&... args) {
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  std::vector<Event> Reduce() {
    std::vector<Event> result;
    for (auto& block : event_blocks) {
      result.insert(result.begin(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

87 88
  void Clear() { event_blocks.clear(); }

D
dangqingqing 已提交
89 90 91 92
  std::forward_list<std::vector<Event>> event_blocks;
};

// Profiling mode passed to EnableProfiler().
enum ProfilerState {
  kDisabled,  // disabled state
  kCPU,       // CPU profiling state
  kCUDA,      // GPU profiling state
};

D
dangqingqing 已提交
98
// Declaration only; the definition is not in this header.
// NOTE(review): presumably records a kMark event called `name`; `dev_ctx`
// looks like the same context Event uses to reach the cuda stream and may be
// nullptr in CPU mode - confirm against the implementation.
void Mark(const std::string& name, DeviceContext* dev_ctx);
D
dangqingqing 已提交
99

Y
Yibing Liu 已提交
100
// Declaration only; the definition is not in this header.
// NOTE(review): presumably records the start of a range named `name`
// (kPushRange), to be matched by a later PopEvent() - confirm against the
// implementation.
void PushEvent(const std::string& name, DeviceContext* dev_ctx);
101

Y
Yibing Liu 已提交
102
// Declaration only; the definition is not in this header.
// NOTE(review): presumably records the end of the range named `name`
// (kPopRange) opened by an earlier PushEvent() - confirm against the
// implementation.
void PopEvent(const std::string& name, DeviceContext* dev_ctx);
103

D
dangqingqing 已提交
104
struct RecordEvent {
D
dangqingqing 已提交
105
  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
D
dangqingqing 已提交
106

D
dangqingqing 已提交
107 108 109 110
  ~RecordEvent();

  // The device context is used by Event to get the current cuda stream.
  DeviceContext* dev_ctx_;
Y
Yibing Liu 已提交
111
  // Event name
112
  std::string name_;
D
dangqingqing 已提交
113 114
};

D
dangqingqing 已提交
115 116
// Return the event list of all threads. Asummed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
117
std::vector<std::vector<Event>> GetAllEvents();
D
dangqingqing 已提交
118

119 120 121 122 123 124 125 126 127 128 129 130 131
// The information of each event given in the profiling report.
// NOTE(review): the *_time fields presumably share the unit of
// Event::CpuElapsedMs()/CudaElapsedMs() (milliseconds) - confirm where the
// items are filled in.
struct EventItem {
  std::string name;   // event name
  int calls;          // call count
  double total_time;  // total time
  double min_time;    // minimum time
  double max_time;    // maximum time
  double ave_time;    // average time
};

// Keys by which the profiling report can be sorted. Apart from kDefault, the
// enumerator names line up with the EventItem fields calls, total_time,
// min_time, max_time and ave_time.
enum EventSortingKey {
  kDefault,
  kCalls,
  kTotal,
  kMin,
  kMax,
  kAve
};

132 133 134 135 136 137 138 139
// Enable the profiling function.
// `state` is one of the ProfilerState values (kDisabled, kCPU, kCUDA).
void EnableProfiler(ProfilerState state);

// Clear g_all_event_lists, which holds the event lists of all threads.
// (g_all_event_lists is not declared in this header.)
void ResetProfiler();

// Declaration only; the definition is not in this header.
// NOTE(review): presumably stops profiling and reports the collected events
// ordered by `sorted_key` - confirm against the implementation.
void DisableProfiler(EventSortingKey sorted_key);

140 141 142
// Parse the event list and output the profiling report.
// The first argument is taken by non-const reference; its layout presumably
// matches the result of GetAllEvents() (outer index = thread) - TODO confirm.
// `sorted_by` selects the sorting key and defaults to kDefault.
void ParseEvents(std::vector<std::vector<Event>>&,
                 EventSortingKey sorted_by = EventSortingKey::kDefault);
143

144
// Print results
145 146 147 148
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
                   std::string& sorted_domain, const size_t name_width,
                   const size_t data_width);

D
dangqingqing 已提交
149 150
}  // namespace platform
}  // namespace paddle