event_node.cc 16.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/profiler/event_node.h"

14
#include <climits>
15

16 17 18 19 20
#include <algorithm>
#include <deque>
#include <set>
#include <stack>

C
chenjian 已提交
21 22
#include "paddle/fluid/platform/profiler/utils.h"

23 24 25 26 27
namespace paddle {
namespace platform {

HostTraceEventNode::~HostTraceEventNode() {
  // delete all runtime nodes and recursive delete children
28 29
  for (auto& runtime_node_ptr : runtime_node_ptrs_) {
    delete runtime_node_ptr;
30
  }
31 32
  for (auto& child : children_) {
    delete child;
33 34 35 36 37
  }
}

CudaRuntimeTraceEventNode::~CudaRuntimeTraceEventNode() {
  // delete all device nodes
38 39
  for (auto& device_node_ptr : device_node_ptrs_) {
    delete device_node_ptr;
40 41 42 43 44
  }
}

NodeTrees::~NodeTrees() {
  // delete all root nodes
45 46
  for (auto& item : thread_event_trees_map_) {
    delete item.second;
47 48 49 50 51
  }
}

void NodeTrees::BuildTrees(
    const std::vector<HostTraceEventNode*>& host_event_nodes,
C
chenjian 已提交
52 53 54 55
    const std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
    const std::vector<DeviceTraceEventNode*>& device_event_nodes,
    const std::vector<MemTraceEventNode*>& mem_event_nodes,
    const std::vector<OperatorSupplementEventNode*>& op_supplement_events) {
56
  // separate Host Event Nodes into different threads
57 58 59 60 61 62
  std::map<uint64_t, std::vector<HostTraceEventNode*>>
      thread2host_event_nodes;  // used to store HostTraceEventNodes per thread
  std::map<uint64_t, std::vector<CudaRuntimeTraceEventNode*>>
      thread2runtime_event_nodes;  // used to store CudaRuntimeTraceEventNode
                                   // per
                                   // thread
C
chenjian 已提交
63 64 65 66 67 68 69 70 71
  std::map<uint64_t, std::vector<MemTraceEventNode*>>
      thread2mem_event_nodes;  // used to store MemTraceEventNode
                               // per
                               // thread
  std::map<uint64_t, std::vector<OperatorSupplementEventNode*>>
      thread2op_supplement_event_nodes;  // used to store
                                         // OperatorSupplementEventNode
                                         // per
                                         // thread
72 73 74 75
  std::map<uint32_t, CudaRuntimeTraceEventNode*>
      correlation_id2runtime_event_node;  // used to store the relation between
                                          // correlation id and runtime node
  // construct thread2host_event_nodes
76 77 78
  for (auto host_event_node : host_event_nodes) {
    thread2host_event_nodes[host_event_node->ThreadId()].push_back(
        host_event_node);
79 80 81
  }
  // construct thread2runtime_event_nodes and
  // correlation_id2runtime_event_node
82 83 84 85 86
  for (auto runtime_event_node : runtime_event_nodes) {
    thread2runtime_event_nodes[runtime_event_node->ThreadId()].push_back(
        runtime_event_node);
    correlation_id2runtime_event_node[runtime_event_node->CorrelationId()] =
        runtime_event_node;
87 88 89
  }
  // associate CudaRuntimeTraceEventNode and DeviceTraceEventNode
  // construct correlation_id2device_event_nodes
90 91 92
  for (auto device_event_node : device_event_nodes) {
    auto dst_iter = correlation_id2runtime_event_node.find(
        device_event_node->CorrelationId());
C
chenjian 已提交
93 94 95
    if (dst_iter == correlation_id2runtime_event_node.end()) {
      continue;
    }
96
    dst_iter->second->AddDeviceTraceEventNode(device_event_node);
97
  }
C
chenjian 已提交
98
  // construct thread2mem_event_nodes
99 100 101
  for (auto mem_event_node : mem_event_nodes) {
    thread2mem_event_nodes[mem_event_node->ThreadId()].push_back(
        mem_event_node);
C
chenjian 已提交
102 103
  }
  // construct thread2op_supplement_event_nodes
104 105 106
  for (auto op_supplement_event : op_supplement_events) {
    thread2op_supplement_event_nodes[op_supplement_event->ThreadId()].push_back(
        op_supplement_event);
C
chenjian 已提交
107
  }
108 109 110 111 112
  // sort host event nodes and runtime event nodes according to start_ns and
  // end_ns
  // the smaller start_ns is, the further ahead position is.
  // when start_ns of two nodes are equal, the one with bigger end_ns should be
  // ahead.
113 114 115
  for (auto& event_node : thread2host_event_nodes) {
    std::sort(event_node.second.begin(),
              event_node.second.end(),
116 117 118 119 120 121 122 123 124 125 126
              [](HostTraceEventNode* node1, HostTraceEventNode* node2) {
                if (node1->StartNs() < node2->StartNs()) {
                  return true;
                }
                if ((node1->StartNs() == node2->StartNs()) &&
                    (node1->EndNs() > node2->EndNs())) {
                  return true;
                }
                return false;
              });
  }
127
  for (auto& event_node : thread2runtime_event_nodes) {
128
    std::sort(
129 130
        event_node.second.begin(),
        event_node.second.end(),
131 132 133 134 135 136 137 138 139 140 141
        [](CudaRuntimeTraceEventNode* node1, CudaRuntimeTraceEventNode* node2) {
          if (node1->StartNs() < node2->StartNs()) {
            return true;
          }
          if ((node1->StartNs() == node2->StartNs()) &&
              (node1->EndNs() > node2->EndNs())) {
            return true;
          }
          return false;
        });
  }
C
chenjian 已提交
142
  // sort mem event nodes and operator supplement event nodes
143 144 145
  for (auto& event_node : thread2mem_event_nodes) {
    std::sort(event_node.second.begin(),
              event_node.second.end(),
C
chenjian 已提交
146 147 148 149 150 151 152 153
              [](MemTraceEventNode* node1, MemTraceEventNode* node2) {
                if (node1->TimeStampNs() <= node2->TimeStampNs()) {
                  return true;
                }
                return false;
              });
  }

154 155 156
  for (auto& event_node : thread2op_supplement_event_nodes) {
    std::sort(event_node.second.begin(),
              event_node.second.end(),
C
chenjian 已提交
157 158 159 160 161 162 163 164
              [](OperatorSupplementEventNode* node1,
                 OperatorSupplementEventNode* node2) {
                if (node1->TimeStampNs() <= node2->TimeStampNs()) {
                  return true;
                }
                return false;
              });
  }
165 166 167

  // construct trees
  std::set<uint64_t> thread_set;
168 169
  for (auto& event_node : thread2host_event_nodes) {
    thread_set.insert(event_node.first);
170
  }
171 172
  for (auto& event_node : thread2runtime_event_nodes) {
    thread_set.insert(event_node.first);
173
  }
174 175
  for (auto& event_node : thread2mem_event_nodes) {
    thread_set.insert(event_node.first);
C
chenjian 已提交
176
  }
177 178
  for (auto& event_node : thread2op_supplement_event_nodes) {
    thread_set.insert(event_node.first);
C
chenjian 已提交
179
  }
180

181 182 183 184 185 186
  for (auto item : thread_set) {
    thread_event_trees_map_[item] =
        BuildTreeRelationship(thread2host_event_nodes[item],
                              thread2runtime_event_nodes[item],
                              thread2mem_event_nodes[item],
                              thread2op_supplement_event_nodes[item]);
187 188 189 190 191
  }
}

HostTraceEventNode* NodeTrees::BuildTreeRelationship(
    std::vector<HostTraceEventNode*> host_event_nodes,
C
chenjian 已提交
192 193 194
    std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
    std::vector<MemTraceEventNode*> mem_event_nodes,
    std::vector<OperatorSupplementEventNode*> op_supplement_events) {
195 196 197
  // a stack used for analyse relationship
  auto node_stack = std::vector<HostTraceEventNode*>();
  // root node, top level
198 199 200 201 202 203 204
  auto root_node =
      new HostTraceEventNode(HostTraceEvent(std::string("root node"),
                                            TracerEventType::UserDefined,
                                            0,
                                            ULLONG_MAX,
                                            0,
                                            0));
205 206 207
  // push root node into node_stack
  node_stack.push_back(root_node);
  // handle host_event_nodes
208
  for (auto& host_event_node : host_event_nodes) {
209 210
    while (true) {
      auto stack_top_node = node_stack.back();
211
      if (host_event_node->StartNs() < stack_top_node->EndNs()) {
212 213
        // current node is the child of stack_top_node
        PADDLE_ENFORCE_LE(
214
            host_event_node->EndNs(),
215
            stack_top_node->EndNs(),
216 217
            platform::errors::Fatal(
                "should not have time range intersection within one thread"));
218 219
        stack_top_node->AddChild(host_event_node);
        node_stack.push_back(host_event_node);
220 221 222 223 224 225 226 227 228 229
        break;
      } else {
        node_stack.pop_back();
        // insert runtime node
        // select runtime nodes which time range within stack_top_node
        std::vector<CudaRuntimeTraceEventNode*>::iterator firstposition;
        std::vector<CudaRuntimeTraceEventNode*>::iterator lastposition =
            runtime_event_nodes.end();
        bool hasenter = false;
        for (auto runtimenode = runtime_event_nodes.begin();
230 231
             runtimenode != runtime_event_nodes.end();
             ++runtimenode) {
232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
          if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
              ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
            if (!hasenter) {
              firstposition = runtimenode;
              hasenter = true;
            }
            stack_top_node->AddCudaRuntimeNode(*runtimenode);
          } else {
            // from this runtime node, not within stack_top_node, erase the
            // nodes from runtime_event_nodes
            if ((*runtimenode)->StartNs() > stack_top_node->EndNs()) {
              lastposition = runtimenode;
              break;
            }
          }
        }
        if (hasenter) {
          runtime_event_nodes.erase(firstposition, lastposition);
        }
      }
    }
  }
  // to insert left runtimenode into host_event_nodes
  while (!node_stack.empty()) {
    auto stack_top_node = node_stack.back();
    // insert runtime node
    // select runtime nodes which time range within stack_top_node
    std::vector<CudaRuntimeTraceEventNode*>::iterator firstposition;
    std::vector<CudaRuntimeTraceEventNode*>::iterator lastposition =
        runtime_event_nodes.end();
    bool hasenter = false;
    for (auto runtimenode = runtime_event_nodes.begin();
264 265
         runtimenode != runtime_event_nodes.end();
         ++runtimenode) {
266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
      if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
          ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
        if (!hasenter) {
          firstposition = runtimenode;
          hasenter = true;
        }
        stack_top_node->AddCudaRuntimeNode(*runtimenode);
      } else {
        // from this runtime node, not within stack_top_node, erase the
        // nodes from runtime_event_nodes
        if ((*runtimenode)->StartNs() > stack_top_node->EndNs()) {
          lastposition = runtimenode;
          break;
        }
      }
    }
    if (hasenter) {
      runtime_event_nodes.erase(firstposition, lastposition);
    }
    node_stack.pop_back();
  }
C
chenjian 已提交
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304

  // build relationship between host event node and mem event node
  // First, post-order traverse the tree. Then, insert the memory and op
  // supplement node into correct host nodes.
  auto stack = std::stack<HostTraceEventNode*>();
  auto flag_stack = std::stack<int32_t>();
  auto post_order_nodes = std::vector<HostTraceEventNode*>();
  stack.push(root_node);
  flag_stack.push(0);
  while (!stack.empty()) {
    auto current_node = stack.top();
    stack.pop();
    auto flag = flag_stack.top();
    flag_stack.pop();
    if (flag == 0) {
      stack.push(current_node);
      flag_stack.push(1);
      for (auto child = current_node->GetChildren().rbegin();
305 306
           child != current_node->GetChildren().rend();
           ++child) {
C
chenjian 已提交
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
        stack.push(*child);
        flag_stack.push(0);
      }
    } else {
      post_order_nodes.push_back(current_node);
    }
  }

  for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
    bool hasenter = false;
    std::vector<MemTraceEventNode*>::iterator firstposition;
    std::vector<MemTraceEventNode*>::iterator lastposition =
        mem_event_nodes.end();
    for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end();
         ++mem_it) {
      if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() &&
          (*mem_it)->TimeStampNs() <= (*it)->EndNs()) {
        (*it)->AddMemNode(*mem_it);
        if (!hasenter) {
          firstposition = mem_it;
          hasenter = true;
        }
      } else {
        if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) {
          lastposition = mem_it;
          break;
        }
      }
    }
    if (hasenter) {
      mem_event_nodes.erase(firstposition, lastposition);
    }
  }

  // build relationship between host event node and op supplement node
  for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
    int op_supplement_count = 0;
    bool hasenter = false;
    std::vector<OperatorSupplementEventNode*>::iterator firstposition;
    std::vector<OperatorSupplementEventNode*>::iterator lastposition =
        op_supplement_events.end();
    for (auto op_supplement_it = op_supplement_events.begin();
349 350
         op_supplement_it < op_supplement_events.end();
         ++op_supplement_it) {
C
chenjian 已提交
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370
      if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() &&
          (*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) {
        if (!hasenter) {
          firstposition = op_supplement_it;
          hasenter = true;
        }
        (*it)->SetOperatorSupplementNode(*op_supplement_it);
        op_supplement_count += 1;
      } else {
        if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) {
          lastposition = op_supplement_it;
          break;
        }
      }
    }
    if (hasenter) {
      op_supplement_events.erase(firstposition, lastposition);
    }
  }

371 372 373 374 375 376 377 378 379
  return root_node;
}

std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
    bool bfs) const {
  // traverse the tree, provide two methods: bfs(breadth first search) or
  // dfs(depth first search)
  std::map<uint64_t, std::vector<HostTraceEventNode*>> thread2host_event_nodes;
  if (bfs == true) {
380
    for (auto item : thread_event_trees_map_) {
381
      auto deque = std::deque<HostTraceEventNode*>();
382 383
      uint64_t thread_id = item.first;
      auto root_node = item.second;
384 385 386 387 388
      deque.push_back(root_node);
      while (!deque.empty()) {
        auto current_node = deque.front();
        deque.pop_front();
        thread2host_event_nodes[thread_id].push_back(current_node);
389 390
        for (auto child : current_node->GetChildren()) {
          deque.push_back(child);
391 392 393 394 395
        }
      }
    }

  } else {
396
    for (auto item : thread_event_trees_map_) {
397
      auto stack = std::stack<HostTraceEventNode*>();
398 399
      uint64_t thread_id = item.first;
      auto root_node = item.second;
400 401 402 403 404
      stack.push(root_node);
      while (!stack.empty()) {
        auto current_node = stack.top();
        stack.pop();
        thread2host_event_nodes[thread_id].push_back(current_node);
C
chenjian 已提交
405
        for (auto child = current_node->GetChildren().rbegin();
406 407
             child != current_node->GetChildren().rend();
             ++child) {
408 409 410 411 412 413 414 415 416 417 418 419 420
          stack.push(*child);
        }
      }
    }
  }
  return thread2host_event_nodes;
}

void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); }

void NodeTrees::HandleTrees(
    std::function<void(HostTraceEventNode*)> host_event_node_handle,
    std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle,
C
chenjian 已提交
421 422 423 424
    std::function<void(DeviceTraceEventNode*)> device_event_node_handle,
    std::function<void(MemTraceEventNode*)> mem_event_node_handle,
    std::function<void(OperatorSupplementEventNode*)>
        op_supplement_node_handle) {
425 426 427
  // using different user-defined function to handle different nodes
  const std::map<uint64_t, std::vector<HostTraceEventNode*>>
      thread2host_event_nodes = Traverse(true);
428 429 430
  for (const auto& event_node : thread2host_event_nodes) {
    for (auto hostnode = event_node.second.begin();
         hostnode != event_node.second.end();
431
         ++hostnode) {
432
      if (hostnode != event_node.second.begin()) {  // skip root node
433 434
        host_event_node_handle(*hostnode);
      }
435 436 437 438
      for (auto event_node : (*hostnode)->GetRuntimeTraceEventNodes()) {
        runtime_event_node_handle(event_node);
        for (auto devicenode = event_node->GetDeviceTraceEventNodes().begin();
             devicenode != event_node->GetDeviceTraceEventNodes().end();
439 440 441 442
             ++devicenode) {
          device_event_node_handle(*devicenode);
        }
      }
443 444
      for (auto event_node : (*hostnode)->GetMemTraceEventNodes()) {
        mem_event_node_handle(event_node);
C
chenjian 已提交
445 446 447 448 449
      }
      if ((*hostnode)->GetOperatorSupplementEventNode()) {
        op_supplement_node_handle(
            (*hostnode)->GetOperatorSupplementEventNode());
      }
450 451 452 453 454
    }
  }
}
}  // namespace platform
}  // namespace paddle