/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/profiler/event_node.h"

#include <algorithm>
#include <climits>
#include <deque>
#include <set>
#include <stack>

#include "paddle/fluid/platform/profiler/utils.h"

namespace paddle {
namespace platform {

HostTraceEventNode::~HostTraceEventNode() {
  // delete all runtime nodes and recursively delete children
  for (auto it = runtime_node_ptrs_.begin(); it != runtime_node_ptrs_.end();
       ++it) {
    delete *it;
  }
  for (auto it = children_.begin(); it != children_.end(); ++it) {
    delete *it;
  }
}

CudaRuntimeTraceEventNode::~CudaRuntimeTraceEventNode() {
  // delete all device nodes
  for (auto it = device_node_ptrs_.begin(); it != device_node_ptrs_.end();
       ++it) {
    delete *it;
  }
}

NodeTrees::~NodeTrees() {
  // delete all root nodes
  for (auto it = thread_event_trees_map_.begin();
       it != thread_event_trees_map_.end();
       ++it) {
    delete it->second;
  }
}

void NodeTrees::BuildTrees(
    const std::vector<HostTraceEventNode*>& host_event_nodes,
    const std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
    const std::vector<DeviceTraceEventNode*>& device_event_nodes,
    const std::vector<MemTraceEventNode*>& mem_event_nodes,
    const std::vector<OperatorSupplementEventNode*>& op_supplement_events) {
  // separate event nodes into different threads
  std::map<uint64_t, std::vector<HostTraceEventNode*>>
      thread2host_event_nodes;  // HostTraceEventNodes per thread
  std::map<uint64_t, std::vector<CudaRuntimeTraceEventNode*>>
      thread2runtime_event_nodes;  // CudaRuntimeTraceEventNodes per thread
  std::map<uint64_t, std::vector<MemTraceEventNode*>>
      thread2mem_event_nodes;  // MemTraceEventNodes per thread
  std::map<uint64_t, std::vector<OperatorSupplementEventNode*>>
      thread2op_supplement_event_nodes;  // OperatorSupplementEventNodes per
                                         // thread
  std::map<uint32_t, CudaRuntimeTraceEventNode*>
      correlation_id2runtime_event_node;  // relation between correlation id
                                          // and runtime node
  // construct thread2host_event_nodes
  for (auto it = host_event_nodes.begin(); it != host_event_nodes.end();
       ++it) {
    thread2host_event_nodes[(*it)->ThreadId()].push_back(*it);
  }
  // construct thread2runtime_event_nodes and
  // correlation_id2runtime_event_node
  for (auto it = runtime_event_nodes.begin(); it != runtime_event_nodes.end();
       ++it) {
    thread2runtime_event_nodes[(*it)->ThreadId()].push_back(*it);
    correlation_id2runtime_event_node[(*it)->CorrelationId()] = *it;
  }
  // associate each DeviceTraceEventNode with its CudaRuntimeTraceEventNode
  // via the correlation id
  for (auto it = device_event_nodes.begin(); it != device_event_nodes.end();
       ++it) {
    auto dst_iter =
        correlation_id2runtime_event_node.find((*it)->CorrelationId());
    PADDLE_ENFORCE_NE(
        dst_iter,
        correlation_id2runtime_event_node.end(),
        platform::errors::NotFound("Unknown device events, "
                                   "no corresponding cuda runtime events"));
    dst_iter->second->AddDeviceTraceEventNode(*it);
  }
  // construct thread2mem_event_nodes
  for (auto it = mem_event_nodes.begin(); it != mem_event_nodes.end(); ++it) {
    thread2mem_event_nodes[(*it)->ThreadId()].push_back(*it);
  }
  // construct thread2op_supplement_event_nodes
  for (auto it = op_supplement_events.begin();
       it != op_supplement_events.end();
       ++it) {
    thread2op_supplement_event_nodes[(*it)->ThreadId()].push_back(*it);
  }
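  // Within each thread the nodes are sorted so that an enclosing (parent)
  // event always precedes the events nested inside it; BuildTreeRelationship
  // below relies on this ordering to build each per-thread tree with a single
  // stack pass.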
  // sort host event nodes and runtime event nodes according to start_ns and
  // end_ns:
  // the smaller start_ns is, the further ahead the position is.
  // when the start_ns of two nodes are equal, the one with the bigger end_ns
  // should be ahead.
  for (auto it = thread2host_event_nodes.begin();
       it != thread2host_event_nodes.end();
       ++it) {
    std::sort(it->second.begin(),
              it->second.end(),
              [](HostTraceEventNode* node1, HostTraceEventNode* node2) {
                if (node1->StartNs() < node2->StartNs()) {
                  return true;
                }
                if ((node1->StartNs() == node2->StartNs()) &&
                    (node1->EndNs() > node2->EndNs())) {
                  return true;
                }
                return false;
              });
  }
  for (auto it = thread2runtime_event_nodes.begin();
       it != thread2runtime_event_nodes.end();
       ++it) {
    std::sort(it->second.begin(),
              it->second.end(),
              [](CudaRuntimeTraceEventNode* node1,
                 CudaRuntimeTraceEventNode* node2) {
                if (node1->StartNs() < node2->StartNs()) {
                  return true;
                }
                if ((node1->StartNs() == node2->StartNs()) &&
                    (node1->EndNs() > node2->EndNs())) {
                  return true;
                }
                return false;
              });
  }
  // sort mem event nodes and operator supplement event nodes by timestamp
  for (auto it = thread2mem_event_nodes.begin();
       it != thread2mem_event_nodes.end();
       ++it) {
    std::sort(it->second.begin(),
              it->second.end(),
              [](MemTraceEventNode* node1, MemTraceEventNode* node2) {
                return node1->TimeStampNs() < node2->TimeStampNs();
              });
  }
  for (auto it = thread2op_supplement_event_nodes.begin();
       it != thread2op_supplement_event_nodes.end();
       ++it) {
    std::sort(it->second.begin(),
              it->second.end(),
              [](OperatorSupplementEventNode* node1,
                 OperatorSupplementEventNode* node2) {
                return node1->TimeStampNs() < node2->TimeStampNs();
              });
  }
  // construct trees
  std::set<uint64_t> thread_set;
  for (auto it = thread2host_event_nodes.begin();
       it != thread2host_event_nodes.end();
       ++it) {
    thread_set.insert(it->first);
  }
  for (auto it = thread2runtime_event_nodes.begin();
       it != thread2runtime_event_nodes.end();
       ++it) {
    thread_set.insert(it->first);
  }
  for (auto it = thread2mem_event_nodes.begin();
       it != thread2mem_event_nodes.end();
       ++it) {
    thread_set.insert(it->first);
  }
  for (auto it = thread2op_supplement_event_nodes.begin();
       it != thread2op_supplement_event_nodes.end();
       ++it) {
    thread_set.insert(it->first);
  }
  for (auto it = thread_set.begin(); it != thread_set.end(); ++it) {
    thread_event_trees_map_[*it] =
        BuildTreeRelationship(thread2host_event_nodes[*it],
                              thread2runtime_event_nodes[*it],
                              thread2mem_event_nodes[*it],
                              thread2op_supplement_event_nodes[*it]);
  }
}

HostTraceEventNode* NodeTrees::BuildTreeRelationship(
    std::vector<HostTraceEventNode*> host_event_nodes,
    std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
    std::vector<MemTraceEventNode*> mem_event_nodes,
    std::vector<OperatorSupplementEventNode*> op_supplement_events) {
  // a stack used for analysing the nesting relationship
  auto node_stack = std::vector<HostTraceEventNode*>();
  // root node, top level
  auto root_node = new HostTraceEventNode(
      HostTraceEvent(std::string("root node"),
                     TracerEventType::UserDefined,
                     0,
                     ULLONG_MAX,
                     0,
                     0));
  // push root node into node_stack
  node_stack.push_back(root_node);
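  // Because the host events are sorted so that a parent always precedes its
  // children, node_stack holds the chain of currently "open" events: a node
  // that starts before the stack top ends is nested inside it and becomes a
  // child; otherwise the stack top is closed, popped, and the runtime events
  // that fall inside its time range are attached to it.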
  // handle host_event_nodes
  for (auto it = host_event_nodes.begin(); it != host_event_nodes.end();
       ++it) {
    while (true) {
      auto stack_top_node = node_stack.back();
      if ((*it)->StartNs() < stack_top_node->EndNs()) {
        // current node is a child of stack_top_node
        PADDLE_ENFORCE_LE(
            (*it)->EndNs(),
            stack_top_node->EndNs(),
            platform::errors::Fatal(
                "should not have time range intersection within one thread"));
        stack_top_node->AddChild(*it);
        node_stack.push_back(*it);
        break;
      } else {
        node_stack.pop_back();
        // insert runtime nodes whose time range lies within stack_top_node
        std::vector<CudaRuntimeTraceEventNode*>::iterator firstposition;
        std::vector<CudaRuntimeTraceEventNode*>::iterator lastposition =
            runtime_event_nodes.end();
        bool hasenter = false;
        for (auto runtimenode = runtime_event_nodes.begin();
             runtimenode != runtime_event_nodes.end();
             ++runtimenode) {
          if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
              ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
            if (!hasenter) {
              firstposition = runtimenode;
              hasenter = true;
            }
            stack_top_node->AddCudaRuntimeNode(*runtimenode);
          } else {
            // from this runtime node on, nodes are not within stack_top_node;
            // erase the consumed nodes from runtime_event_nodes
            if ((*runtimenode)->StartNs() > stack_top_node->EndNs()) {
              lastposition = runtimenode;
              break;
            }
          }
        }
        if (hasenter) {
          runtime_event_nodes.erase(firstposition, lastposition);
        }
      }
    }
  }
  // insert the remaining runtime nodes into the host nodes left on the stack
  while (!node_stack.empty()) {
    auto stack_top_node = node_stack.back();
    // insert runtime nodes whose time range lies within stack_top_node
    std::vector<CudaRuntimeTraceEventNode*>::iterator firstposition;
    std::vector<CudaRuntimeTraceEventNode*>::iterator lastposition =
        runtime_event_nodes.end();
    bool hasenter = false;
    for (auto runtimenode = runtime_event_nodes.begin();
         runtimenode != runtime_event_nodes.end();
         ++runtimenode) {
      if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
          ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
        if (!hasenter) {
          firstposition = runtimenode;
          hasenter = true;
        }
        stack_top_node->AddCudaRuntimeNode(*runtimenode);
      } else {
        // from this runtime node on, nodes are not within stack_top_node;
        // erase the consumed nodes from runtime_event_nodes
        if ((*runtimenode)->StartNs() > stack_top_node->EndNs()) {
          lastposition = runtimenode;
          break;
        }
      }
    }
    if (hasenter) {
      runtime_event_nodes.erase(firstposition, lastposition);
    }
    node_stack.pop_back();
  }
  // build the relationship between host event nodes and mem event nodes.
  // First, post-order traverse the tree. Then, insert the memory and op
  // supplement nodes into the correct host nodes.
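  // Iterative post-order traversal with an explicit stack: a node is pushed
  // with flag 0 (children not yet expanded) and emitted only when popped with
  // flag 1, so children appear before their parents in post_order_nodes.
  // Matching children first attaches each mem / op supplement event to the
  // innermost host event that contains it.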
  auto stack = std::stack<HostTraceEventNode*>();
  auto flag_stack = std::stack<int>();
  auto post_order_nodes = std::vector<HostTraceEventNode*>();
  stack.push(root_node);
  flag_stack.push(0);
  while (!stack.empty()) {
    auto current_node = stack.top();
    stack.pop();
    auto flag = flag_stack.top();
    flag_stack.pop();
    if (flag == 0) {
      stack.push(current_node);
      flag_stack.push(1);
      for (auto child = current_node->GetChildren().rbegin();
           child != current_node->GetChildren().rend();
           ++child) {
        stack.push(*child);
        flag_stack.push(0);
      }
    } else {
      post_order_nodes.push_back(current_node);
    }
  }
  for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
    bool hasenter = false;
    std::vector<MemTraceEventNode*>::iterator firstposition;
    std::vector<MemTraceEventNode*>::iterator lastposition =
        mem_event_nodes.end();
    for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end();
         ++mem_it) {
      if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() &&
          (*mem_it)->TimeStampNs() <= (*it)->EndNs()) {
        (*it)->AddMemNode(*mem_it);
        if (!hasenter) {
          firstposition = mem_it;
          hasenter = true;
        }
      } else {
        if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) {
          lastposition = mem_it;
          break;
        }
      }
    }
    if (hasenter) {
      mem_event_nodes.erase(firstposition, lastposition);
    }
  }
  // build the relationship between host event nodes and op supplement nodes
  for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
    int op_supplement_count = 0;
    bool hasenter = false;
    std::vector<OperatorSupplementEventNode*>::iterator firstposition;
    std::vector<OperatorSupplementEventNode*>::iterator lastposition =
        op_supplement_events.end();
    for (auto op_supplement_it = op_supplement_events.begin();
         op_supplement_it < op_supplement_events.end();
         ++op_supplement_it) {
      if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() &&
          (*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) {
        if (!hasenter) {
          firstposition = op_supplement_it;
          hasenter = true;
        }
        (*it)->SetOperatorSupplementNode(*op_supplement_it);
        PADDLE_ENFORCE_EQ((*it)->Type(),
                          TracerEventType::Operator,
                          platform::errors::PreconditionNotMet(
                              "Operator supplement events should be embraced "
                              "by event of type TracerEventType::Operator, "
                              "but got type TracerEventType::%s",
                              StringTracerEventType((*it)->Type())));
        op_supplement_count += 1;
      } else {
        if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) {
          PADDLE_ENFORCE_LE(op_supplement_count,
                            1,
                            platform::errors::PreconditionNotMet(
                                "One event of TracerEventType::Operator has no "
                                "more than 1 op supplement event, but got %d.",
                                op_supplement_count));
          lastposition = op_supplement_it;
          break;
        }
      }
    }
    if (hasenter) {
      op_supplement_events.erase(firstposition, lastposition);
    }
  }
  return root_node;
}
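// Illustrative usage (not part of this file): given an already-built
// NodeTrees instance `trees`, a consumer can walk the per-thread trees
// breadth-first or depth-first:
//   auto thread2nodes = trees.Traverse(/*bfs=*/true);
//   for (const auto& kv : thread2nodes) {
//     // kv.first is the thread id, kv.second holds the host event nodes in
//     // traversal order, starting with the synthetic "root node".
//   }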
std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
    bool bfs) const {
  // traverse the tree, providing two methods: bfs (breadth first search) and
  // dfs (depth first search)
  std::map<uint64_t, std::vector<HostTraceEventNode*>> thread2host_event_nodes;
  if (bfs == true) {
    for (auto it = thread_event_trees_map_.begin();
         it != thread_event_trees_map_.end();
         ++it) {
      auto deque = std::deque<HostTraceEventNode*>();
      uint64_t thread_id = it->first;
      auto root_node = it->second;
      deque.push_back(root_node);
      while (!deque.empty()) {
        auto current_node = deque.front();
        deque.pop_front();
        thread2host_event_nodes[thread_id].push_back(current_node);
        for (auto child = current_node->GetChildren().begin();
             child != current_node->GetChildren().end();
             ++child) {
          deque.push_back(*child);
        }
      }
    }
  } else {
    for (auto it = thread_event_trees_map_.begin();
         it != thread_event_trees_map_.end();
         ++it) {
      auto stack = std::stack<HostTraceEventNode*>();
      uint64_t thread_id = it->first;
      auto root_node = it->second;
      stack.push(root_node);
      while (!stack.empty()) {
        auto current_node = stack.top();
        stack.pop();
        thread2host_event_nodes[thread_id].push_back(current_node);
        for (auto child = current_node->GetChildren().rbegin();
             child != current_node->GetChildren().rend();
             ++child) {
          stack.push(*child);
        }
      }
    }
  }
  return thread2host_event_nodes;
}

void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); }

void NodeTrees::HandleTrees(
    std::function<void(HostTraceEventNode*)> host_event_node_handle,
    std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle,
    std::function<void(DeviceTraceEventNode*)> device_event_node_handle,
    std::function<void(MemTraceEventNode*)> mem_event_node_handle,
    std::function<void(OperatorSupplementEventNode*)>
        op_supplement_node_handle) {
  // use different user-defined functions to handle different kinds of nodes
  const std::map<uint64_t, std::vector<HostTraceEventNode*>>
      thread2host_event_nodes = Traverse(true);
  for (auto it = thread2host_event_nodes.begin();
       it != thread2host_event_nodes.end();
       ++it) {
    for (auto hostnode = it->second.begin(); hostnode != it->second.end();
         ++hostnode) {
      if (hostnode != it->second.begin()) {  // skip root node
        host_event_node_handle(*hostnode);
      }
      for (auto runtimenode =
               (*hostnode)->GetRuntimeTraceEventNodes().begin();
           runtimenode != (*hostnode)->GetRuntimeTraceEventNodes().end();
           ++runtimenode) {
        runtime_event_node_handle(*runtimenode);
        for (auto devicenode =
                 (*runtimenode)->GetDeviceTraceEventNodes().begin();
             devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end();
             ++devicenode) {
          device_event_node_handle(*devicenode);
        }
      }
      for (auto memeventnode = (*hostnode)->GetMemTraceEventNodes().begin();
           memeventnode != (*hostnode)->GetMemTraceEventNodes().end();
           ++memeventnode) {
        mem_event_node_handle(*memeventnode);
      }
      if ((*hostnode)->GetOperatorSupplementEventNode()) {
        op_supplement_node_handle(
            (*hostnode)->GetOperatorSupplementEventNode());
      }
    }
  }
}

}  // namespace platform
}  // namespace paddle