/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <map>
#include <memory>
#include <unordered_map>

#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/extra_info.h"

namespace paddle {
namespace platform {

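// Python-facing view of a device-side event (kernel launch, memory copy, or
// memory set) recorded by the profiler.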
struct DevicePythonNode {
  DevicePythonNode() = default;
  ~DevicePythonNode() = default;
  // record name
  std::string name;
  // record type, one of TracerEventType
  TracerEventType type;
  // start timestamp of the record
  uint64_t start_ns;
  // end timestamp of the record
  uint64_t end_ns;
  // device id
  uint64_t device_id;
  // context id
  uint64_t context_id;
  // stream id
  uint64_t stream_id;
  // correlation id, used to correlate async activities that happened on the
  // device
  uint32_t correlation_id;
  // The X-dimension block size for the kernel.
  uint32_t block_x;
  // The Y-dimension block size for the kernel.
  uint32_t block_y;
  // The Z-dimension block size for the kernel.
  uint32_t block_z;
  // The X-dimension grid size for the kernel.
  uint32_t grid_x;
  // The Y-dimension grid size for the kernel.
  uint32_t grid_y;
  // The Z-dimension grid size for the kernel.
  uint32_t grid_z;
  // The shared memory size (dynamic + static) used by the kernel, in bytes.
  uint64_t shared_memory;
  // The number of registers required for each thread executing the kernel.
  uint32_t registers_per_thread;
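  // The number of blocks per streaming multiprocessor (SM) for the kernel.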
  float blocks_per_sm;
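  // The number of warps per SM for the kernel.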
  float warps_per_sm;
  // theoretical occupancy of the kernel
  float occupancy;
  // The number of bytes transferred by the memory copy.
  uint64_t num_bytes;
  // The value written to memory by the memory set.
  uint32_t value;
};

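// Python-facing view of a memory event (allocation, free, or other memory
// manipulation) recorded by the profiler.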
struct MemPythonNode {
  MemPythonNode() = default;
  ~MemPythonNode() = default;

  // timestamp of the record
  uint64_t timestamp_ns;
  // memory address of the allocation or free
  uint64_t addr;
  // memory manipulation type
  TracerMemEventType type;
  // process id of the record
  uint64_t process_id;
  // thread id of the record
  uint64_t thread_id;
  // bytes changed by this manipulation: positive for an allocation, negative
  // for a free
  int64_t increase_bytes;
  // place of the memory, e.g. a CPU or GPU place
  std::string place;
  // current total allocated memory
  uint64_t current_allocated;
  // current total reserved memory
  uint64_t current_reserved;
  // peak allocated memory
  uint64_t peak_allocated;
  // peak reserved memory
  uint64_t peak_reserved;
};

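// Python-facing node of a host-side event tree; each node links to its child
// host events and to the runtime, device, and memory events under it.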
struct HostPythonNode {
  HostPythonNode() = default;
  ~HostPythonNode();
  // record name
  std::string name;
  // record type, one of TracerEventType
  TracerEventType type;
  // start timestamp of the record
  uint64_t start_ns;
  // end timestamp of the record
  uint64_t end_ns;
  // process id of the record
  uint64_t process_id;
  // thread id of the record
  uint64_t thread_id;
  // correlation id, used to correlate async activities that happened on the
  // device
  uint32_t correlation_id;
  // input shapes
  std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
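  // input dtypes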
  std::map<std::string, std::vector<std::string>> dtypes;
  // call stack
  std::string callstack;
  // children node
  std::vector<HostPythonNode*> children_node_ptrs;
  // runtime node
  std::vector<HostPythonNode*> runtime_node_ptrs;
  // device node
  std::vector<DevicePythonNode*> device_node_ptrs;
  // mem node
  std::vector<MemPythonNode*> mem_node_ptrs;
};

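// Aggregated profiling result exposed to Python: one HostPythonNode tree per
// thread, plus extra string info and, on CUDA/HIP builds, the per-device
// properties.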
class ProfilerResult {
 public:
  ProfilerResult() : tree_(nullptr) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  explicit ProfilerResult(
      std::unique_ptr<NodeTrees> tree,
      const ExtraInfo& extra_info,
      const std::map<uint32_t, gpuDeviceProp> device_property_map);
#endif
  explicit ProfilerResult(std::unique_ptr<NodeTrees> tree,
                          const ExtraInfo& extra_info);

  ~ProfilerResult();
  std::map<uint64_t, HostPythonNode*> GetData() {
    return thread_event_trees_map_;
  }
  std::unordered_map<std::string, std::string> GetExtraInfo() {
    return extra_info_.GetExtraInfo();
  }

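  // Dumps the result to file_name in the given format (defaults to "json").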
  void Save(const std::string& file_name,
            const std::string format = "json");

  std::shared_ptr<NodeTrees> GetNodeTrees() { return tree_; }

  void SetVersion(const std::string& version) { version_ = version; }

  void SetSpanIndx(uint32_t span_indx) { span_indx_ = span_indx; }

  std::string GetVersion() { return version_; }
  uint32_t GetSpanIndx() { return span_indx_; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::map<uint32_t, gpuDeviceProp> GetDeviceProperty() {
    return device_property_map_;
  }
#endif

 private:
  std::map<uint64_t, HostPythonNode*> thread_event_trees_map_;
  std::shared_ptr<NodeTrees> tree_;
  ExtraInfo extra_info_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::map<uint32_t, gpuDeviceProp> device_property_map_;
#endif
  std::string version_;
  uint32_t span_indx_;
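
  // Copies a HostTraceEventNode tree into the Python-facing node types.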
  HostPythonNode* CopyTree(HostTraceEventNode* root);
};

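// Reloads a ProfilerResult that was previously written with
// ProfilerResult::Save. A minimal usage sketch (the file name is
// illustrative):
//
//   auto result = LoadProfilerResult("profile.json");
//   for (auto& item : result->GetData()) {
//     // item.first: thread id; item.second: root HostPythonNode of that
//     // thread's event tree
//   }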
std::unique_ptr<ProfilerResult> LoadProfilerResult(std::string filename);

}  // namespace platform
}  // namespace paddle