profiler.cc 10.0 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
D
dangqingqing 已提交
2 3 4 5

licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
6

D
dangqingqing 已提交
7 8 9 10 11 12 13 14
    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#include <mutex>  // NOLINT
16
#include <random>
17
#include <string>
Y
Yancey1989 已提交
18

19
#include "paddle/fluid/platform/device_tracer.h"
W
wangchaochaohu 已提交
20 21 22
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler_helper.h"
23 24 25
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
D
dangqingqing 已提交
26

G
gongweibao 已提交
27 28
// Command-line flag consulted by RecordRPCEvent: only when it is true does
// constructing a RecordRPCEvent create an underlying RecordEvent.
// Defaults to false.
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");

D
dangqingqing 已提交
29 30 31
namespace paddle {
namespace platform {

W
wangchaochaohu 已提交
32
// Out-of-class definition of the static MemEvenRecorder::recorder member.
// NOTE(review): presumably this is the object returned by
// MemEvenRecorder::Instance() -- confirm against the header.
MemEvenRecorder MemEvenRecorder::recorder;
D
dangqingqing 已提交
33

34 35 36
// Builds an event from its type, name, owning thread id and role, then
// stamps it with the current value of GetTimeInNsec().
Event::Event(EventType type, std::string name, uint32_t thread_id,
             EventRole role)
    : type_(type),
      name_(name),
      thread_id_(thread_id),
      role_(role) {
  // Timestamp is taken after the other members have been initialised.
  cpu_ns_ = GetTimeInNsec();
}

C
chengduo 已提交
40
// Accessor for the event type given at construction.
const EventType &Event::type() const {
  return type_;
}
D
dangqingqing 已提交
41

C
chengduo 已提交
42
double Event::CpuElapsedMs(const Event &e) const {
43
  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
D
dangqingqing 已提交
44 45
}

C
chengduo 已提交
46
double Event::CudaElapsedMs(const Event &e) const {
47 48
#ifdef PADDLE_WITH_CUPTI
  return gpu_ns_ / 1000000.0;
D
Dun Liang 已提交
49
#else
D
Dun Liang 已提交
50 51
  LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
  return 0;
D
dangqingqing 已提交
52 53 54
#endif
}

55
// Opens a profiling range named `name` with the given role.
//
// On non-Windows CUDA builds, an NVTX range is pushed first whenever the
// nvprof hook is enabled, independently of the profiler state. The event
// itself is only recorded when the profiler is enabled and `name` is
// non-empty; otherwise the constructor returns without touching the event
// list.
RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
#if !defined(_WIN32) && defined(PADDLE_WITH_CUDA)
  if (g_enable_nvprof_hook) {
    dynload::nvtxRangePushA(name.c_str());
    is_pushed_ = true;  // remembered so the destructor pops the NVTX range
  }
#endif
  const bool profiler_off = (g_state == ProfilerState::kDisabled);
  if (profiler_off || name.empty()) {
    return;
  }

  // Capture the start time and bookkeeping needed by the destructor.
  start_ns_ = PosixInNsec();
  role_ = role;
  is_enabled_ = true;

  // lock is not needed, the code below is thread-safe
  // Maybe need the same push/pop behavior.
  Event *pushed_event = PushEvent(name, role);
  SetCurAnnotation(pushed_event);
  name_ = pushed_event->name();
}

// Closes the range opened by the constructor.
//
// Pops the NVTX range if one was pushed (non-Windows CUDA builds). If the
// profiler is enabled and this object recorded a push event, it reports the
// CPU interval to the device tracer (when one exists), clears the current
// annotation and records the matching pop event.
RecordEvent::~RecordEvent() {
#if !defined(_WIN32) && defined(PADDLE_WITH_CUDA)
  if (g_enable_nvprof_hook && is_pushed_) {
    dynload::nvtxRangePop();
  }
#endif
  if (!is_enabled_ || g_state == ProfilerState::kDisabled) {
    return;
  }
  // lock is not needed, the code below is thread-safe
  DeviceTracer *device_tracer = GetDeviceTracer();
  if (device_tracer != nullptr) {
    device_tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
                                 BlockDepth(), g_thread_id);
  }
  ClearCurAnnotation();
  PopEvent(name_, role_);
}
D
dangqingqing 已提交
95

C
chengduo 已提交
96 97 98 99 100
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
                                    size_t size) {
  if (g_state == ProfilerState::kDisabled) return;
  std::lock_guard<std::mutex> guard(mtx_);
  auto &events = address_memevent_[place];
G
GaoWei8 已提交
101 102 103
  PADDLE_ENFORCE_EQ(events.count(ptr), 0,
                    platform::errors::InvalidArgument(
                        "The Place can't exist in the stage of PushMemRecord"));
C
chengduo 已提交
104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
  events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
                          new MemEvenRecorder::RecordMemEvent(place, size)));
}

// Removes the record for address `ptr` on `place`, if one exists.
//
// Does nothing while the profiler is disabled. An unknown `ptr` is tolerated
// and silently ignored.
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
  if (g_state == ProfilerState::kDisabled) {
    return;
  }
  std::lock_guard<std::mutex> lock(mtx_);
  auto &place_events = address_memevent_[place];
  auto found = place_events.find(ptr);
  // The ptr maybe not in address_memevent
  if (found == place_events.end()) {
    return;
  }
  place_events.erase(found);
}

// Drops every tracked memory record for every place while holding mtx_.
void MemEvenRecorder::Flush() {
  std::lock_guard<std::mutex> lock(mtx_);
  address_memevent_.clear();
}

// Records the start of a memory allocation of `bytes` bytes on `place`.
//
// Captures the start timestamp and the current annotation name, then emits a
// push-range memory event. The end timestamp is not known yet (the
// destructor fills it in), so it is reported as 0 here.
MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
                                                size_t bytes)
    : place_(place),
      bytes_(bytes),
      start_ns_(PosixInNsec()),
      alloc_in_(CurAnnotationName()) {
  // The initializer list does not set end_ns_; assign it explicitly so that
  // PushMemEvent never reads an indeterminate value.
  end_ns_ = 0;
  PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
}

// Records the end of the allocation's lifetime.
//
// Stamps end_ns_, reports the whole [start_ns_, end_ns_] interval to the
// device tracer when one exists, and emits the pop-range memory event tagged
// with the annotation that is current at destruction time.
MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
  DeviceTracer *device_tracer = GetDeviceTracer();
  end_ns_ = PosixInNsec();

  const auto freed_in = CurAnnotationName();
  if (device_tracer != nullptr) {
    device_tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_,
                                    alloc_in_, freed_in, g_mem_thread_id);
  }
  PopMemEvent(start_ns_, end_ns_, bytes_, place_, freed_in);
}

// Creates the wrapped RecordEvent for `name` only when the
// enable_rpc_profiler flag is set; otherwise event_ is left untouched.
RecordRPCEvent::RecordRPCEvent(const std::string &name) {
  if (!FLAGS_enable_rpc_profiler) {
    return;
  }
  event_.reset(new platform::RecordEvent(name));
}

X
Xin Pan 已提交
151 152
// Starts timing block `block_id`.
//
// The start timestamp is always captured. The block is only marked enabled,
// made the current block and given its "block_<id>" name when the profiler
// is not disabled.
RecordBlock::RecordBlock(int block_id)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
  // lock is not needed, the code below is thread-safe
  if (g_state != ProfilerState::kDisabled) {
    is_enabled_ = true;
    SetCurBlock(block_id);
    name_ = string::Sprintf("block_%d", block_id);
  }
}

// Finishes timing the block: if the profiler is enabled and this block was
// enabled at construction, reports its CPU interval to the device tracer
// (when one exists) and clears the current block.
RecordBlock::~RecordBlock() {
  // lock is not needed, the code below is thread-safe
  const bool inactive = (g_state == ProfilerState::kDisabled) || !is_enabled_;
  if (inactive) {
    return;
  }
  DeviceTracer *device_tracer = GetDeviceTracer();
  if (device_tracer != nullptr) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane. and distinguish the using thread_id.
    device_tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
                                 g_thread_id);
  }
  ClearCurBlock();
}

W
wangchaochaohu 已提交
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                  const Place &place, const std::string &annotation) {
  GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
                           place, g_mem_thread_id, annotation);
}

void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                 const Place &place, const std::string &annotation) {
  GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
                           g_mem_thread_id, annotation);
}

void Mark(const std::string &name) {
  GetEventList().Record(EventType::kMark, name, g_thread_id);
}

189 190
// Records a kPushRange event called `name` with `role` for g_thread_id and
// returns the pointer handed back by the event list.
Event *PushEvent(const std::string &name, const EventRole role) {
  Event *recorded =
      GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
  return recorded;
}

W
wangchaochaohu 已提交
193 194
void PopEvent(const std::string &name, const EventRole role) {
  GetEventList().Record(EventType::kPopRange, name, g_thread_id, role);
W
wangchaochaohu 已提交
195
}
D
dangqingqing 已提交
196
// Switches the profiler into `state`.
//
// `state` must not be ProfilerState::kDisabled (enforced). After
// synchronizing all devices, and under profiler_mu, the call is a no-op if
// the profiler is already in `state`. Otherwise it updates the global state,
// flags that the new state should be sent, enables the device tracer and
// records the "_start_profiler_" mark. On CUDA/HIP builds it additionally
// runs a dummy kernel/event and resets the tracer for the kCUDA, kAll and
// kCPU states.
void EnableProfiler(ProfilerState state) {
  // The two literals are concatenated by the compiler, so the first one must
  // end with a space to keep the message readable.
  PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
                    platform::errors::InvalidArgument(
                        "Can't enable profiling, since the input state is "
                        "ProfilerState::kDisabled"));
  SynchronizeAllDevice();
  std::lock_guard<std::mutex> l(profiler_mu);
  if (state == g_state) {
    return;
  }
  g_state = state;
  should_send_profile_state = true;
  GetDeviceTracer()->Enable();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
      g_state == ProfilerState::kCPU) {
    // Generate some dummy events first to reduce the startup overhead.
    DummyKernelAndEvent();
    GetDeviceTracer()->Reset();
  }
#endif
  // Mark the profiling start.
  Mark("_start_profiler_");
}

221
void ResetProfiler() {
222 223
  SynchronizeAllDevice();
  GetDeviceTracer()->Reset();
C
chengduo 已提交
224
  MemEvenRecorder::Instance().Flush();
D
dangqingqing 已提交
225
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
226 227 228 229
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
C
chengduo 已提交
230 231 232 233
  for (auto it = g_all_mem_event_lists.begin();
       it != g_all_mem_event_lists.end(); ++it) {
    (*it)->Clear();
  }
234 235
}

236
void DisableProfiler(EventSortingKey sorted_key,
C
chengduo 已提交
237
                     const std::string &profile_path) {
238
  SynchronizeAllDevice();
C
chengduo 已提交
239 240
  MemEvenRecorder::Instance().Flush();

X
Xin Pan 已提交
241
  std::lock_guard<std::mutex> l(profiler_mu);
242
  if (g_state == ProfilerState::kDisabled) return;
243
  // Mark the profiling stop.
244
  Mark("_stop_profiler_");
245
  DealWithShowName();
246

C
chengduo 已提交
247
  DeviceTracer *tracer = GetDeviceTracer();
248
  if (tracer->IsEnabled()) {
249
    tracer->Disable();
250
    tracer->GenEventKernelCudaElapsedTime();
251
    tracer->GenProfile(profile_path);
252
  }
253 254

  std::vector<std::vector<Event>> all_events = GetAllEvents();
255

256 257
  ParseEvents(all_events, true, sorted_key);
  ParseEvents(all_events, false, sorted_key);
C
chengduo 已提交
258 259 260 261 262
  if (VLOG_IS_ON(5)) {
    std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
    ParseMemEvents(all_mem_events);
  }

263
  ResetProfiler();
264
  g_state = ProfilerState::kDisabled;
265
  g_tracer_option = TracerOption::kDefault;
X
Xin Pan 已提交
266
  should_send_profile_state = true;
267 268
}

W
wangchaochaohu 已提交
269 270 271 272 273 274 275 276 277 278
std::vector<std::vector<Event>> GetAllEvents() {
  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
  std::vector<std::vector<Event>> result;
  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
       ++it) {
    result.emplace_back((*it)->Reduce());
  }
  return result;
}

279 280
// True whenever the global profiler state is anything other than kDisabled.
bool IsProfileEnabled() {
  const bool disabled = (g_state == ProfilerState::kDisabled);
  return !disabled;
}

W
wangchaochaohu 已提交
281
// Returns the should_send_profile_state flag, which EnableProfiler and
// DisableProfiler set to true whenever they change the profiler state.
bool ShouldSendProfileState() {
  return should_send_profile_state;
}
282

283 284
// Builds the detailed profiler label for an operator.
//
// Returns "" unless the tracer option is kAllOpDetail and profiling is
// enabled. Otherwise returns "<type_name>%<first>%", where <first> is the
// first element of the first non-empty entry of `name_map` in iteration
// order (or nothing if every entry is empty).
std::string OpName(const framework::VariableNameMap &name_map,
                   const std::string &type_name) {
  if (platform::GetTracerOption() != platform::TracerOption::kAllOpDetail ||
      !IsProfileEnabled())
    return "";

  std::string ret = type_name + "%";
  for (const auto &entry : name_map) {
    // Bind by reference: only the first element is read, so copying the
    // whole mapped container on every iteration is unnecessary.
    const auto &names = entry.second;
    if (!names.empty()) {
      ret += names[0];
      break;
    }
  }
  ret += "%";

  return ret;
}

// Stores `option` as the global tracer option while holding profiler_mu.
void SetTracerOption(TracerOption option) {
  std::lock_guard<std::mutex> state_lock(profiler_mu);
  g_tracer_option = option;
}

// Returns the current global tracer option. The value is read directly,
// without taking profiler_mu.
platform::TracerOption GetTracerOption() {
  return g_tracer_option;
}
W
wangchaochaohu 已提交
308 309 310 311 312 313 314 315 316 317 318

// Assigns profiler_lister_id a random value drawn uniformly from
// [1, std::numeric_limits<int>::max()], using a Mersenne Twister seeded from
// std::random_device.
void SetProfileListener() {
  std::mt19937 generator;
  generator.seed(std::random_device()());
  std::uniform_int_distribution<std::mt19937::result_type> id_range(
      1, std::numeric_limits<int>::max());
  profiler_lister_id = id_range(generator);
}

// Returns the id most recently stored in profiler_lister_id.
int64_t ListenerId() {
  return profiler_lister_id;
}

319 320 321 322 323 324 325
// Synchronizes all devices, then turns on the nvprof hook so that
// RecordEvent pushes/pops NVTX ranges.
void NvprofEnableRecordEvent() {
  SynchronizeAllDevice();
  // Enabled only after the synchronization above has completed.
  g_enable_nvprof_hook = true;
}

// Turns the nvprof hook off; no device synchronization is performed.
void NvprofDisableRecordEvent() {
  g_enable_nvprof_hook = false;
}

D
dangqingqing 已提交
326 327
}  // namespace platform
}  // namespace paddle