profiler.h 4.2 KB
Newer Older
D
dangqingqing 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <cstddef>
#include <cstdint>
#include <forward_list>
#include <iterator>
#include <list>
#include <mutex>
#include <string>
#include <utility>
#include <vector>
#include "paddle/platform/device_context.h"

namespace paddle {
namespace platform {

// Tag stored in each Event: a mark, a range push, or a range pop.
enum EventKind {
  kMark = 0,
  kPushRange = 1,
  kPopRange = 2
};

class Event {
 public:
D
dangqingqing 已提交
29 30
  // The DeviceContext is used to get the cuda stream.
  // If CPU profiling mode, can pass nullptr.
D
dangqingqing 已提交
31
  Event(EventKind kind, std::string name, uint32_t thread_id,
D
dangqingqing 已提交
32
        DeviceContext* dev_ctx);
D
dangqingqing 已提交
33

D
dangqingqing 已提交
34
  std::string kind() const;
D
dangqingqing 已提交
35
  std::string name() const { return name_; }
36
  uint32_t thread_id() const { return thread_id_; }
D
dangqingqing 已提交
37 38 39 40 41 42 43
  bool has_cuda() const { return has_cuda_; }

#ifdef PADDLE_WITH_CUDA
  cudaEvent_t event() const { return event_; }
  int device() const { return device_; }
#endif

44 45
  double CpuElapsedMs(const Event& e) const;
  double CudaElapsedMs(const Event& e) const;
D
dangqingqing 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59

 private:
  EventKind kind_;
  std::string name_;
  uint32_t thread_id_;
  int64_t cpu_ns_;
  bool has_cuda_;
#ifdef PADDLE_WITH_CUDA
  cudaEvent_t event_ = nullptr;
  int device_ = -1;
#endif
};

struct EventList {
D
dangqingqing 已提交
60 61 62 63 64
  constexpr static size_t kMB = 1024 * 1024;
  constexpr static size_t kEventBlockSize = 16 * kMB;
  constexpr static size_t kEventSize = sizeof(Event);
  constexpr static size_t kEventAlign = alignof(Event);
  constexpr static size_t kNumBlock =
D
dangqingqing 已提交
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
      kEventBlockSize /
      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);

  template <typename... Args>
  void Record(Args&&... args) {
    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
      event_blocks.emplace_front();
      event_blocks.front().reserve(kNumBlock);
    }
    event_blocks.front().emplace_back(std::forward<Args>(args)...);
  }

  std::vector<Event> Reduce() {
    std::vector<Event> result;
    for (auto& block : event_blocks) {
      result.insert(result.begin(), std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
    return result;
  }

  std::forward_list<std::vector<Event>> event_blocks;
};

// Profiler mode, passed to EnableProfiler().
enum ProfilerState {
  kDisabled,  // disabled state
  kCPU,       // CPU profiling state
  kCUDA,      // GPU profiling state
};

D
dangqingqing 已提交
96
// Free-function entry points for recording events; all are defined out of
// line. `name` is the event name and `dev_ctx` is the device context.
// NOTE(review): presumably these record kMark / kPushRange / kPopRange
// events respectively, and `dev_ctx` may be nullptr in CPU profiling mode as
// documented on Event's constructor - confirm against the implementation.
void Mark(const std::string& name, DeviceContext* dev_ctx);

void PushEvent(const std::string& name, DeviceContext* dev_ctx);

void PopEvent(const std::string& name, DeviceContext* dev_ctx);
101

D
dangqingqing 已提交
102
struct RecordEvent {
D
dangqingqing 已提交
103
  explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
D
dangqingqing 已提交
104

D
dangqingqing 已提交
105 106 107 108
  ~RecordEvent();

  // The device context is used by Event to get the current cuda stream.
  DeviceContext* dev_ctx_;
Y
Yibing Liu 已提交
109
  // Event name
110
  std::string name_;
D
dangqingqing 已提交
111 112
};

D
dangqingqing 已提交
113
// Enable the profiling function.
D
dangqingqing 已提交
114
void EnableProfiler(ProfilerState state);
D
dangqingqing 已提交
115 116 117

// Return the event list of all threads. Asummed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
D
dangqingqing 已提交
118 119
std::vector<std::vector<Event>> DisableProfiler();

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
// The information of each event given in the profiling report
// (one row per event name; the rows are passed to PrintProfilingReport).
struct EventItem {
  // Event name.
  std::string name;
  // NOTE(review): presumably how many times the event was recorded - confirm
  // against ParseEvents.
  int calls;
  // Timing statistics for the event. NOTE(review): presumably total,
  // minimum, maximum and average elapsed time; the unit is not visible in
  // this header - confirm against ParseEvents.
  double total_time;
  double min_time;
  double max_time;
  double ave_time;
};

// Keys by which the profiling report can be sorted.
enum EventSortingKey {
  kDefault = 0,
  kCalls = 1,
  kTotal = 2,
  kMin = 3,
  kMax = 4,
  kAve = 5
};

// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
                 EventSortingKey sorted_by = EventSortingKey::kDefault);
136

137 138 139 140
// Print results
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
                          EventSortingKey sorted_by, const size_t name_width,
                          const size_t data_width);
D
dangqingqing 已提交
141 142
}  // namespace platform
}  // namespace paddle