/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/event_node.h" #include #include #include #include #include #include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { HostTraceEventNode::~HostTraceEventNode() { // delete all runtime nodes and recursive delete children for (auto& runtime_node_ptr : runtime_node_ptrs_) { delete runtime_node_ptr; } for (auto& child : children_) { delete child; } } CudaRuntimeTraceEventNode::~CudaRuntimeTraceEventNode() { // delete all device nodes for (auto& device_node_ptr : device_node_ptrs_) { delete device_node_ptr; } } NodeTrees::~NodeTrees() { // delete all root nodes for (auto& item : thread_event_trees_map_) { delete item.second; } } void NodeTrees::BuildTrees( const std::vector& host_event_nodes, const std::vector& runtime_event_nodes, const std::vector& device_event_nodes, const std::vector& mem_event_nodes, const std::vector& op_supplement_events) { // separate Host Event Nodes into different threads std::map> thread2host_event_nodes; // used to store HostTraceEventNodes per thread std::map> thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode // per // thread std::map> thread2mem_event_nodes; // used to store MemTraceEventNode // per // thread std::map> thread2op_supplement_event_nodes; // used to store // OperatorSupplementEventNode // per // thread std::map correlation_id2runtime_event_node; // used to store the relation between // correlation id and runtime node // construct thread2host_event_nodes for (auto host_event_node : host_event_nodes) { thread2host_event_nodes[host_event_node->ThreadId()].push_back( host_event_node); } // construct thread2runtime_event_nodes and // correlation_id2runtime_event_node for (auto runtime_event_node : runtime_event_nodes) { thread2runtime_event_nodes[runtime_event_node->ThreadId()].push_back( runtime_event_node); correlation_id2runtime_event_node[runtime_event_node->CorrelationId()] = runtime_event_node; } // associate CudaRuntimeTraceEventNode and DeviceTraceEventNode // construct correlation_id2device_event_nodes for (auto device_event_node : device_event_nodes) { auto dst_iter = correlation_id2runtime_event_node.find( device_event_node->CorrelationId()); if (dst_iter == correlation_id2runtime_event_node.end()) { continue; } dst_iter->second->AddDeviceTraceEventNode(device_event_node); } // construct thread2mem_event_nodes for (auto mem_event_node : mem_event_nodes) { thread2mem_event_nodes[mem_event_node->ThreadId()].push_back( mem_event_node); } // construct thread2op_supplement_event_nodes for (auto op_supplement_event : op_supplement_events) { thread2op_supplement_event_nodes[op_supplement_event->ThreadId()].push_back( op_supplement_event); } // sort host event nodes and runtime event nodes according to start_ns and // end_ns // the smaller start_ns is, the further ahead position is. // when start_ns of two nodes are equal, the one with bigger end_ns should be // ahead. for (auto& event_node : thread2host_event_nodes) { std::sort(event_node.second.begin(), event_node.second.end(), [](HostTraceEventNode* node1, HostTraceEventNode* node2) { if (node1->StartNs() < node2->StartNs()) { return true; } if ((node1->StartNs() == node2->StartNs()) && (node1->EndNs() > node2->EndNs())) { return true; } return false; }); } for (auto& event_node : thread2runtime_event_nodes) { std::sort( event_node.second.begin(), event_node.second.end(), [](CudaRuntimeTraceEventNode* node1, CudaRuntimeTraceEventNode* node2) { if (node1->StartNs() < node2->StartNs()) { return true; } if ((node1->StartNs() == node2->StartNs()) && (node1->EndNs() > node2->EndNs())) { return true; } return false; }); } // sort mem event nodes and operator supplement event nodes for (auto& event_node : thread2mem_event_nodes) { std::sort(event_node.second.begin(), event_node.second.end(), [](MemTraceEventNode* node1, MemTraceEventNode* node2) { if (node1->TimeStampNs() <= node2->TimeStampNs()) { return true; } return false; }); } for (auto& event_node : thread2op_supplement_event_nodes) { std::sort(event_node.second.begin(), event_node.second.end(), [](OperatorSupplementEventNode* node1, OperatorSupplementEventNode* node2) { if (node1->TimeStampNs() <= node2->TimeStampNs()) { return true; } return false; }); } // construct trees std::set thread_set; for (auto& event_node : thread2host_event_nodes) { thread_set.insert(event_node.first); } for (auto& event_node : thread2runtime_event_nodes) { thread_set.insert(event_node.first); } for (auto& event_node : thread2mem_event_nodes) { thread_set.insert(event_node.first); } for (auto& event_node : thread2op_supplement_event_nodes) { thread_set.insert(event_node.first); } for (auto item : thread_set) { thread_event_trees_map_[item] = BuildTreeRelationship(thread2host_event_nodes[item], thread2runtime_event_nodes[item], thread2mem_event_nodes[item], thread2op_supplement_event_nodes[item]); } } HostTraceEventNode* NodeTrees::BuildTreeRelationship( std::vector host_event_nodes, std::vector runtime_event_nodes, std::vector mem_event_nodes, std::vector op_supplement_events) { // a stack used for analyse relationship auto node_stack = std::vector(); // root node, top level auto root_node = new HostTraceEventNode(HostTraceEvent(std::string("root node"), TracerEventType::UserDefined, 0, ULLONG_MAX, 0, 0)); // push root node into node_stack node_stack.push_back(root_node); // handle host_event_nodes for (auto& host_event_node : host_event_nodes) { while (true) { auto stack_top_node = node_stack.back(); if (host_event_node->StartNs() < stack_top_node->EndNs()) { // current node is the child of stack_top_node PADDLE_ENFORCE_LE( host_event_node->EndNs(), stack_top_node->EndNs(), platform::errors::Fatal( "should not have time range intersection within one thread")); stack_top_node->AddChild(host_event_node); node_stack.push_back(host_event_node); break; } else { node_stack.pop_back(); // insert runtime node // select runtime nodes which time range within stack_top_node std::vector::iterator firstposition; std::vector::iterator lastposition = runtime_event_nodes.end(); bool hasenter = false; for (auto runtimenode = runtime_event_nodes.begin(); runtimenode != runtime_event_nodes.end(); ++runtimenode) { if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) && ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) { if (!hasenter) { firstposition = runtimenode; hasenter = true; } stack_top_node->AddCudaRuntimeNode(*runtimenode); } else { // from this runtime node, not within stack_top_node, erase the // nodes from runtime_event_nodes if ((*runtimenode)->StartNs() > stack_top_node->EndNs()) { lastposition = runtimenode; break; } } } if (hasenter) { runtime_event_nodes.erase(firstposition, lastposition); } } } } // to insert left runtimenode into host_event_nodes while (!node_stack.empty()) { auto stack_top_node = node_stack.back(); // insert runtime node // select runtime nodes which time range within stack_top_node std::vector::iterator firstposition; std::vector::iterator lastposition = runtime_event_nodes.end(); bool hasenter = false; for (auto runtimenode = runtime_event_nodes.begin(); runtimenode != runtime_event_nodes.end(); ++runtimenode) { if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) && ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) { if (!hasenter) { firstposition = runtimenode; hasenter = true; } stack_top_node->AddCudaRuntimeNode(*runtimenode); } else { // from this runtime node, not within stack_top_node, erase the // nodes from runtime_event_nodes if ((*runtimenode)->StartNs() > stack_top_node->EndNs()) { lastposition = runtimenode; break; } } } if (hasenter) { runtime_event_nodes.erase(firstposition, lastposition); } node_stack.pop_back(); } // build relationship between host event node and mem event node // First, post-order traverse the tree. Then, insert the memory and op // supplement node into correct host nodes. auto stack = std::stack(); auto flag_stack = std::stack(); auto post_order_nodes = std::vector(); stack.push(root_node); flag_stack.push(0); while (!stack.empty()) { auto current_node = stack.top(); stack.pop(); auto flag = flag_stack.top(); flag_stack.pop(); if (flag == 0) { stack.push(current_node); flag_stack.push(1); for (auto child = current_node->GetChildren().rbegin(); child != current_node->GetChildren().rend(); ++child) { stack.push(*child); flag_stack.push(0); } } else { post_order_nodes.push_back(current_node); } } for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) { bool hasenter = false; std::vector::iterator firstposition; std::vector::iterator lastposition = mem_event_nodes.end(); for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end(); ++mem_it) { if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() && (*mem_it)->TimeStampNs() <= (*it)->EndNs()) { (*it)->AddMemNode(*mem_it); if (!hasenter) { firstposition = mem_it; hasenter = true; } } else { if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) { lastposition = mem_it; break; } } } if (hasenter) { mem_event_nodes.erase(firstposition, lastposition); } } // build relationship between host event node and op supplement node for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) { int op_supplement_count = 0; bool hasenter = false; std::vector::iterator firstposition; std::vector::iterator lastposition = op_supplement_events.end(); for (auto op_supplement_it = op_supplement_events.begin(); op_supplement_it < op_supplement_events.end(); ++op_supplement_it) { if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() && (*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) { if (!hasenter) { firstposition = op_supplement_it; hasenter = true; } (*it)->SetOperatorSupplementNode(*op_supplement_it); op_supplement_count += 1; } else { if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) { lastposition = op_supplement_it; break; } } } if (hasenter) { op_supplement_events.erase(firstposition, lastposition); } } return root_node; } std::map> NodeTrees::Traverse( bool bfs) const { // traverse the tree, provide two methods: bfs(breadth first search) or // dfs(depth first search) std::map> thread2host_event_nodes; if (bfs == true) { for (auto item : thread_event_trees_map_) { auto deque = std::deque(); uint64_t thread_id = item.first; auto root_node = item.second; deque.push_back(root_node); while (!deque.empty()) { auto current_node = deque.front(); deque.pop_front(); thread2host_event_nodes[thread_id].push_back(current_node); for (auto child : current_node->GetChildren()) { deque.push_back(child); } } } } else { for (auto item : thread_event_trees_map_) { auto stack = std::stack(); uint64_t thread_id = item.first; auto root_node = item.second; stack.push(root_node); while (!stack.empty()) { auto current_node = stack.top(); stack.pop(); thread2host_event_nodes[thread_id].push_back(current_node); for (auto child = current_node->GetChildren().rbegin(); child != current_node->GetChildren().rend(); ++child) { stack.push(*child); } } } } return thread2host_event_nodes; } void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); } void NodeTrees::HandleTrees( std::function host_event_node_handle, std::function runtime_event_node_handle, std::function device_event_node_handle, std::function mem_event_node_handle, std::function op_supplement_node_handle) { // using different user-defined function to handle different nodes const std::map> thread2host_event_nodes = Traverse(true); for (const auto& event_node : thread2host_event_nodes) { for (auto hostnode = event_node.second.begin(); hostnode != event_node.second.end(); ++hostnode) { if (hostnode != event_node.second.begin()) { // skip root node host_event_node_handle(*hostnode); } for (auto event_node : (*hostnode)->GetRuntimeTraceEventNodes()) { runtime_event_node_handle(event_node); for (auto devicenode = event_node->GetDeviceTraceEventNodes().begin(); devicenode != event_node->GetDeviceTraceEventNodes().end(); ++devicenode) { device_event_node_handle(*devicenode); } } for (auto event_node : (*hostnode)->GetMemTraceEventNodes()) { mem_event_node_handle(event_node); } if ((*hostnode)->GetOperatorSupplementEventNode()) { op_supplement_node_handle( (*hostnode)->GetOperatorSupplementEventNode()); } } } } } // namespace platform } // namespace paddle