From 40d2b7c6fe92890d77a400c51cf933f18b62275c Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 16 Feb 2022 09:45:47 +0800 Subject: [PATCH] Add profiler tree dump code (#39519) * add profiler tree dump code * add CMakeLists item * reduce some contents for pr --- paddle/fluid/platform/profiler/CMakeLists.txt | 1 + .../platform/profiler/dump/CMakeLists.txt | 4 + .../profiler/dump/deserialization_reader.cc | 218 ++++++++++++++ .../profiler/dump/deserialization_reader.h | 45 +++ .../platform/profiler/dump/nodetree.proto | 181 ++++++++++++ .../profiler/dump/serialization_logger.cc | 265 ++++++++++++++++++ .../profiler/dump/serialization_logger.h | 49 ++++ .../dump/test_serialization_logger.cc | 174 ++++++++++++ paddle/fluid/platform/profiler/utils.h | 46 +++ 9 files changed, 983 insertions(+) create mode 100644 paddle/fluid/platform/profiler/dump/CMakeLists.txt create mode 100644 paddle/fluid/platform/profiler/dump/deserialization_reader.cc create mode 100644 paddle/fluid/platform/profiler/dump/deserialization_reader.h create mode 100644 paddle/fluid/platform/profiler/dump/nodetree.proto create mode 100644 paddle/fluid/platform/profiler/dump/serialization_logger.cc create mode 100755 paddle/fluid/platform/profiler/dump/serialization_logger.h create mode 100644 paddle/fluid/platform/profiler/dump/test_serialization_logger.cc create mode 100644 paddle/fluid/platform/profiler/utils.h diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 626847f046..320e989bd9 100644 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -5,3 +5,4 @@ cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) +add_subdirectory(dump) diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt new file mode 100644 index 0000000000..e25333f7a8 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt @@ -0,0 +1,4 @@ +proto_library(nodetreeproto SRCS nodetree.proto) +cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node) +cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node) +cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node) diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc new file mode 100644 index 0000000000..d1049a7dc1 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" + +#include + +namespace paddle { +namespace platform { + +DeserializationReader::DeserializationReader(const std::string& filename) + : filename_(filename) { + OpenFile(); + node_trees_proto_ = new NodeTreesProto(); +} + +DeserializationReader::DeserializationReader(const char* filename) + : filename_(filename) { + OpenFile(); + node_trees_proto_ = new NodeTreesProto(); +} + +void DeserializationReader::OpenFile() { + input_file_stream_.open(filename_, std::ifstream::in | std::ifstream::binary); + if (!input_file_stream_) { + VLOG(2) << "Unable to open file for writing profiling data." << std::endl; + } else { + VLOG(0) << "Read profiling data from " << filename_ << std::endl; + } +} + +std::unique_ptr DeserializationReader::Parse() { + if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) { + VLOG(2) << "Unable to load node trees in protobuf." << std::endl; + return nullptr; + } + std::map thread_event_trees_map; + for (int node_tree_index = 0; + node_tree_index < node_trees_proto_->thread_trees_size(); + node_tree_index++) { + // handle one thread tree + std::map index_node_map; + std::map child_parent_map; + const ThreadNodeTreeProto& thread_node_tree_proto = + node_trees_proto_->thread_trees(node_tree_index); + uint64_t current_threadid = thread_node_tree_proto.thread_id(); + for (int host_node_index = 0; + host_node_index < thread_node_tree_proto.host_nodes_size(); + host_node_index++) { + // handle host node + const HostTraceEventNodeProto& host_node_proto = + thread_node_tree_proto.host_nodes(host_node_index); + HostTraceEventNode* host_node = + RestoreHostTraceEventNode(host_node_proto); + index_node_map[host_node_proto.id()] = host_node; + child_parent_map[host_node_proto.id()] = host_node_proto.parentid(); + // handle runtime node + for (int runtime_node_index = 0; + runtime_node_index < host_node_proto.runtime_nodes_size(); + runtime_node_index++) { + const CudaRuntimeTraceEventNodeProto& runtime_node_proto = + host_node_proto.runtime_nodes(runtime_node_index); + CudaRuntimeTraceEventNode* runtime_node = + RestoreCudaRuntimeTraceEventNode(runtime_node_proto); + host_node->AddCudaRuntimeNode(runtime_node); // insert into host_node + // handle device node + for (int device_node_index = 0; + device_node_index < runtime_node_proto.device_nodes_size(); + device_node_index++) { + const DeviceTraceEventNodeProto& device_node_proto = + runtime_node_proto.device_nodes(device_node_index); + DeviceTraceEventNode* device_node = + RestoreDeviceTraceEventNode(device_node_proto); + runtime_node->AddDeviceTraceEventNode( + device_node); // insert into runtime_node + } + } + } + // restore parent-child relationship + for (auto it = child_parent_map.begin(); it != child_parent_map.end(); + it++) { + if (it->second != -1) { // not root node + index_node_map[it->second]->AddChild(index_node_map[it->first]); + } else { + thread_event_trees_map[current_threadid] = + index_node_map[it->first]; // root node + } + } + } + // restore NodeTrees object + return std::unique_ptr(new NodeTrees(thread_event_trees_map)); +} + +DeserializationReader::~DeserializationReader() { + delete node_trees_proto_; + input_file_stream_.close(); +} + +DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode( + const DeviceTraceEventNodeProto& device_node_proto) { + const DeviceTraceEventProto& device_event_proto = + device_node_proto.device_event(); + DeviceTraceEvent device_event; + device_event.name = device_event_proto.name(); + device_event.type = static_cast(device_event_proto.type()); + device_event.start_ns = device_event_proto.start_ns(); + device_event.end_ns = device_event_proto.end_ns(); + device_event.device_id = device_event_proto.device_id(); + device_event.context_id = device_event_proto.context_id(); + device_event.stream_id = device_event_proto.stream_id(); + device_event.correlation_id = device_event_proto.correlation_id(); + switch (device_event.type) { + case TracerEventType::Kernel: + device_event.kernel_info = HandleKernelEventInfoProto(device_event_proto); + break; + + case TracerEventType::Memcpy: + device_event.memcpy_info = HandleMemcpyEventInfoProto(device_event_proto); + break; + + case TracerEventType::Memset: + device_event.memset_info = HandleMemsetEventInfoProto(device_event_proto); + break; + default: + break; + } + return new DeviceTraceEventNode(device_event); +} + +CudaRuntimeTraceEventNode* +DeserializationReader::RestoreCudaRuntimeTraceEventNode( + const CudaRuntimeTraceEventNodeProto& runtime_node_proto) { + const CudaRuntimeTraceEventProto& runtime_event_proto = + runtime_node_proto.runtime_trace_event(); + RuntimeTraceEvent runtime_event; + runtime_event.name = runtime_event_proto.name(); + runtime_event.start_ns = runtime_event_proto.start_ns(); + runtime_event.end_ns = runtime_event_proto.end_ns(); + runtime_event.process_id = runtime_event_proto.process_id(); + runtime_event.thread_id = runtime_event_proto.thread_id(); + runtime_event.correlation_id = runtime_event_proto.correlation_id(); + runtime_event.callback_id = runtime_event_proto.callback_id(); + return new CudaRuntimeTraceEventNode(runtime_event); +} + +HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( + const HostTraceEventNodeProto& host_node_proto) { + const HostTraceEventProto& host_event_proto = + host_node_proto.host_trace_event(); + HostTraceEvent host_event; + host_event.name = host_event_proto.name(); + host_event.type = static_cast(host_event_proto.type()); + host_event.start_ns = host_event_proto.start_ns(); + host_event.end_ns = host_event_proto.end_ns(); + host_event.process_id = host_event_proto.process_id(); + host_event.thread_id = host_event_proto.thread_id(); + return new HostTraceEventNode(host_event); +} + +KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const KernelEventInfoProto& kernel_info_proto = + device_event_proto.kernel_info(); + KernelEventInfo kernel_info; + kernel_info.block_x = kernel_info_proto.block_x(); + kernel_info.block_y = kernel_info_proto.block_y(); + kernel_info.block_z = kernel_info_proto.block_z(); + kernel_info.grid_x = kernel_info_proto.grid_x(); + kernel_info.grid_y = kernel_info_proto.grid_y(); + kernel_info.grid_z = kernel_info_proto.grid_z(); + kernel_info.dynamic_shared_memory = kernel_info_proto.dynamic_shared_memory(); + kernel_info.static_shared_memory = kernel_info_proto.static_shared_memory(); + kernel_info.registers_per_thread = kernel_info_proto.registers_per_thread(); + kernel_info.local_memory_per_thread = + kernel_info_proto.local_memory_per_thread(); + kernel_info.local_memory_total = kernel_info_proto.local_memory_total(); + kernel_info.queued = kernel_info_proto.queued(); + kernel_info.submitted = kernel_info_proto.submitted(); + kernel_info.completed = kernel_info_proto.completed(); + return kernel_info; +} + +MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const MemcpyEventInfoProto& memcpy_info_proto = + device_event_proto.memcpy_info(); + MemcpyEventInfo memcpy_info; + memcpy_info.num_bytes = memcpy_info_proto.num_bytes(); + std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(), + kMemKindMaxLen - 1); + std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(), + kMemKindMaxLen - 1); + std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(), + kMemKindMaxLen - 1); + return memcpy_info; +} + +MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto( + const DeviceTraceEventProto& device_event_proto) { + const MemsetEventInfoProto& memset_info_proto = + device_event_proto.memset_info(); + MemsetEventInfo memset_info; + memset_info.num_bytes = memset_info_proto.num_bytes(); + std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(), + kMemKindMaxLen - 1); + memset_info.value = memset_info_proto.value(); + return memset_info; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h new file mode 100644 index 0000000000..1ad2dabf22 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" +#include "paddle/fluid/platform/profiler/event_node.h" + +namespace paddle { +namespace platform { + +class DeserializationReader { + public: + explicit DeserializationReader(const std::string& filename); + explicit DeserializationReader(const char* filename); + ~DeserializationReader(); + std::unique_ptr Parse(); + + private: + void OpenFile(); + DeviceTraceEventNode* RestoreDeviceTraceEventNode( + const DeviceTraceEventNodeProto&); + CudaRuntimeTraceEventNode* RestoreCudaRuntimeTraceEventNode( + const CudaRuntimeTraceEventNodeProto&); + HostTraceEventNode* RestoreHostTraceEventNode(const HostTraceEventNodeProto&); + KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); + MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); + MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + std::string filename_; + std::ifstream input_file_stream_; + NodeTreesProto* node_trees_proto_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto new file mode 100644 index 0000000000..37dac0e597 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.platform; + +enum TracerEventTypeProto { + // Used to mark operator record + Operator = 0; + // Used to mark dataloader record + Dataloader = 1; + // Used to mark profile step record + ProfileStep = 2; + // Used to mark cuda runtime record returned by cupti + CudaRuntime = 3; + // Used to mark kernel computation record returned by cupti + Kernel = 4; + // Used to mark memcpy record returned by cupti + Memcpy = 5; + // Used to mark memset record returned by cupti + Memset = 6; + // Used to mark record defined by user + UserDefined = 7; + // A flag to denote the number of current types + NumTypes = 8; +} + +message KernelEventInfoProto { + // The X-dimension block size for the kernel. + required uint32 block_x = 1; + // The Y-dimension block size for the kernel. + required uint32 block_y = 2; + // The Z-dimension grid size for the kernel. + required uint32 block_z = 3; + // X-dimension of a grid. + required uint32 grid_x = 4; + // Y-dimension of a grid. + required uint32 grid_y = 5; + // Z-dimension of a grid. + required uint32 grid_z = 6; + // The dynamic shared memory reserved for the kernel, in bytes. + required uint32 dynamic_shared_memory = 7; + // The static shared memory allocated for the kernel, in bytes. + required uint32 static_shared_memory = 8; + // The number of registers required for each thread executing the kernel. + required uint32 registers_per_thread = 9; + // The amount of local memory reserved for each thread, in bytes. + required uint32 local_memory_per_thread = 10; + // The total amount of local memory reserved for the kernel, in bytes. + required uint32 local_memory_total = 11; + // The timestamp when the kernel is queued up in the command buffer, in ns. + // This timestamp is not collected by default. Use API + // cuptiActivityEnableLatencyTimestamps() to enable collection. + required uint64 queued = 12; + // The timestamp when the command buffer containing the kernel launch is + // submitted to the GPU, in ns. + // This timestamp is not collected by default. Use API + // cuptiActivityEnableLatencyTimestamps() to enable collection. + required uint64 submitted = 13; + // The completed timestamp for the kernel execution, in ns. + required uint64 completed = 14; +} + +message MemcpyEventInfoProto { + // The number of bytes transferred by the memory copy. + required uint64 num_bytes = 1; + // The kind of the memory copy. + // Each kind represents the source and destination targets of a memory copy. + // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind + required string copy_kind = 2; + // The source memory kind read by the memory copy. + // Each kind represents the type of the memory accessed by a memory + // operation/copy. Refer to CUpti_ActivityMemoryKind + required string src_kind = 3; + // The destination memory kind read by the memory copy. + required string dst_kind = 4; +} + +message MemsetEventInfoProto { + // The number of bytes being set by the memory set. + required uint64 num_bytes = 1; + // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind + required string memory_kind = 2; + // the value being assigned to memory by the memory set. + required uint32 value = 3; +} + +message HostTraceEventProto { + required string name = 1; + required TracerEventTypeProto type = 2; + // start timestamp of the record + required uint64 start_ns = 3; + // end timestamp of the record + required uint64 end_ns = 4; + // process id of the record + required uint64 process_id = 5; + // thread id of the record + required uint64 thread_id = 6; +} + +message CudaRuntimeTraceEventProto { + // record name + required string name = 1; + // start timestamp of the record + required uint64 start_ns = 2; + // end timestamp of the record + required uint64 end_ns = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // correlation id, used for correlating async activities happened on device + required uint32 correlation_id = 6; + // callback id, used to identify which cuda runtime api is called + required uint32 callback_id = 7; +} + +message DeviceTraceEventProto { + // record name + required string name = 1; + // record type, one of TracerEventType + required TracerEventTypeProto type = 2; + // start timestamp of the record + required uint64 start_ns = 3; + // end timestamp of the record + required uint64 end_ns = 4; + // device id + required uint64 device_id = 5; + // context id + required uint64 context_id = 6; + // stream id + required uint64 stream_id = 7; + // correlation id, used for correlating async activities happened on device + required uint32 correlation_id = 8; + // union, specific device record type has different detail information + oneof detail_info { + // used for TracerEventType::Kernel + KernelEventInfoProto kernel_info = 9; + // used for TracerEventType::Memcpy + MemcpyEventInfoProto memcpy_info = 10; + // used for TracerEventType::Memset + MemsetEventInfoProto memset_info = 11; + } +} + +message DeviceTraceEventNodeProto { + required DeviceTraceEventProto device_event = 1; +} + +message CudaRuntimeTraceEventNodeProto { + required CudaRuntimeTraceEventProto runtime_trace_event = 1; + repeated DeviceTraceEventNodeProto device_nodes = 2; +} + +message HostTraceEventNodeProto { + required int64 id = 1; + required int64 parentid = 2; + required HostTraceEventProto host_trace_event = 3; + repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; +} + +message ThreadNodeTreeProto { + required uint64 thread_id = 1; + repeated HostTraceEventNodeProto host_nodes = 2; +} + +message NodeTreesProto { + required string version = 1; + repeated ThreadNodeTreeProto thread_trees = 2; +} diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc new file mode 100644 index 0000000000..d9ed84bd43 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -0,0 +1,265 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "glog/logging.h" + +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/utils.h" + +namespace paddle { +namespace platform { + +static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; +static const char* version = "1.0.0"; + +static std::string DefaultFileName() { + auto pid = GetProcessId(); + return string_format(std::string(kDefaultFilename), pid, + GetStringFormatLocalTime().c_str()); +} + +void SerializationLogger::OpenFile() { + output_file_stream_.open(filename_, std::ofstream::out | + std::ofstream::trunc | + std::ofstream::binary); + if (!output_file_stream_) { + LOG(WARNING) << "Unable to open file for writing profiling data." + << std::endl; + } else { + LOG(INFO) << "writing profiling data to " << filename_ << std::endl; + } + node_trees_proto_ = new NodeTreesProto(); + node_trees_proto_->set_version(std::string(version)); +} + +void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { + // dump the whole tree into file + const std::map> + thread2host_event_nodes = node_trees.Traverse(true); + + for (auto it = thread2host_event_nodes.begin(); + it != thread2host_event_nodes.end(); ++it) { + // 1. order every node an index, every node a parent + std::map node_index_map; + std::map node_parent_map; + int64_t index = 0; + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + node_index_map[(*hostnode)] = index; // order each node + index++; + } + node_parent_map[(*(it->second.begin()))] = -1; // root's parent set as -1 + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + for (auto childnode = (*hostnode)->GetChildren().begin(); + childnode != (*hostnode)->GetChildren().end(); ++childnode) { + node_parent_map[(*childnode)] = + node_index_map[(*hostnode)]; // mark each node's parent + } + } + + // 2. serialize host node, runtime node and device node + current_thread_node_tree_proto_ = + node_trees_proto_->add_thread_trees(); // add ThreadNodeTreeProto + current_thread_node_tree_proto_->set_thread_id(it->first); + for (auto hostnode = it->second.begin(); hostnode != it->second.end(); + ++hostnode) { + HostTraceEventNodeProto* host_node_proto = + current_thread_node_tree_proto_ + ->add_host_nodes(); // add HostTraceEventNodeProto + host_node_proto->set_id(node_index_map[(*hostnode)]); + host_node_proto->set_parentid(node_parent_map[(*hostnode)]); + current_host_trace_event_node_proto_ = + host_node_proto; // set current HostTraceEventNodeProto + (*hostnode)->LogMe(this); // fill detail information + + for (auto runtimenode = (*hostnode)->GetRuntimeTraceEventNodes().begin(); + runtimenode != (*hostnode)->GetRuntimeTraceEventNodes().end(); + ++runtimenode) { + CudaRuntimeTraceEventNodeProto* runtime_node_proto = + current_host_trace_event_node_proto_ + ->add_runtime_nodes(); // add CudaRuntimeTraceEventNodeProto + current_runtime_trace_event_node_proto_ = + runtime_node_proto; // set current CudaRuntimeTraceEventNodeProto + (*runtimenode)->LogMe(this); // fill detail information + for (auto devicenode = + (*runtimenode)->GetDeviceTraceEventNodes().begin(); + devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end(); + ++devicenode) { + DeviceTraceEventNodeProto* device_node_proto = + current_runtime_trace_event_node_proto_ + ->add_device_nodes(); // add DeviceTraceEventNodeProto + current_device_trace_event_node_proto_ = + device_node_proto; // set current DeviceTraceEventNodeProto + (*devicenode)->LogMe(this); // fill detail information + } + } + } + } +} + +void SerializationLogger::LogHostTraceEventNode( + const HostTraceEventNode& host_node) { + HostTraceEventProto* host_trace_event = new HostTraceEventProto(); + host_trace_event->set_name(host_node.Name()); + host_trace_event->set_type( + static_cast(host_node.Type())); + host_trace_event->set_start_ns(host_node.StartNs()); + host_trace_event->set_end_ns(host_node.EndNs()); + host_trace_event->set_process_id(host_node.ProcessId()); + host_trace_event->set_thread_id(host_node.ThreadId()); + current_host_trace_event_node_proto_->set_allocated_host_trace_event( + host_trace_event); +} + +void SerializationLogger::LogRuntimeTraceEventNode( + const CudaRuntimeTraceEventNode& runtime_node) { + CudaRuntimeTraceEventProto* runtime_trace_event = + new CudaRuntimeTraceEventProto(); + runtime_trace_event->set_name(runtime_node.Name()); + runtime_trace_event->set_start_ns(runtime_node.StartNs()); + runtime_trace_event->set_end_ns(runtime_node.EndNs()); + runtime_trace_event->set_process_id(runtime_node.ProcessId()); + runtime_trace_event->set_thread_id(runtime_node.ThreadId()); + runtime_trace_event->set_correlation_id(runtime_node.CorrelationId()); + runtime_trace_event->set_callback_id(runtime_node.CallbackId()); + current_runtime_trace_event_node_proto_->set_allocated_runtime_trace_event( + runtime_trace_event); +} + +void SerializationLogger::LogDeviceTraceEventNode( + const DeviceTraceEventNode& device_node) { + switch (device_node.Type()) { + case TracerEventType::Kernel: + HandleTypeKernel(device_node); + break; + case TracerEventType::Memcpy: + HandleTypeMemcpy(device_node); + break; + case TracerEventType::Memset: + HandleTypeMemset(device_node); + break; + default: + break; + } +} + +void SerializationLogger::HandleTypeKernel( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto(); + KernelEventInfoProto* kernel_info = new KernelEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill KernelEventInfoProto + KernelEventInfo info = device_node.KernelInfo(); + kernel_info->set_block_x(info.block_x); + kernel_info->set_block_y(info.block_y); + kernel_info->set_block_z(info.block_z); + kernel_info->set_grid_x(info.grid_x); + kernel_info->set_grid_y(info.grid_y); + kernel_info->set_grid_z(info.grid_z); + kernel_info->set_dynamic_shared_memory(info.dynamic_shared_memory); + kernel_info->set_static_shared_memory(info.static_shared_memory); + kernel_info->set_registers_per_thread(info.registers_per_thread); + kernel_info->set_local_memory_per_thread(info.local_memory_per_thread); + kernel_info->set_local_memory_total(info.local_memory_total); + kernel_info->set_queued(info.queued); + kernel_info->set_submitted(info.submitted); + kernel_info->set_completed(info.completed); + // binding + device_trace_event->set_allocated_kernel_info(kernel_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +void SerializationLogger::HandleTypeMemcpy( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto(); + MemcpyEventInfoProto* memcpy_info = new MemcpyEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill MemcpyEventInfoProto + MemcpyEventInfo info = device_node.MemcpyInfo(); + memcpy_info->set_num_bytes(info.num_bytes); + memcpy_info->set_copy_kind(std::string(info.copy_kind)); + memcpy_info->set_src_kind(std::string(info.src_kind)); + memcpy_info->set_dst_kind(std::string(info.dst_kind)); + // binding + device_trace_event->set_allocated_memcpy_info(memcpy_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +void SerializationLogger::HandleTypeMemset( + const DeviceTraceEventNode& device_node) { + DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto(); + MemsetEventInfoProto* memset_info = new MemsetEventInfoProto(); + // fill DeviceTraceEventProto + device_trace_event->set_name(device_node.Name()); + device_trace_event->set_type( + static_cast(device_node.Type())); + device_trace_event->set_start_ns(device_node.StartNs()); + device_trace_event->set_end_ns(device_node.EndNs()); + device_trace_event->set_device_id(device_node.DeviceId()); + device_trace_event->set_context_id(device_node.ContextId()); + device_trace_event->set_stream_id(device_node.StreamId()); + device_trace_event->set_correlation_id(device_node.CorrelationId()); + // fill MemsetEventInfoProto + MemsetEventInfo info = device_node.MemsetInfo(); + memset_info->set_num_bytes(info.num_bytes); + memset_info->set_memory_kind(std::string(info.memory_kind)); + memset_info->set_value(info.value); + // binding + device_trace_event->set_allocated_memset_info(memset_info); + current_device_trace_event_node_proto_->set_allocated_device_event( + device_trace_event); +} + +SerializationLogger::SerializationLogger(const std::string& filename) { + filename_ = filename.empty() ? DefaultFileName() : filename; + OpenFile(); +} + +SerializationLogger::SerializationLogger(const char* filename_cstr) { + std::string filename(filename_cstr); + filename_ = filename.empty() ? DefaultFileName() : filename; + OpenFile(); +} + +SerializationLogger::~SerializationLogger() { + if (!output_file_stream_) { + delete node_trees_proto_; + return; + } + node_trees_proto_->SerializeToOstream(&output_file_stream_); + delete node_trees_proto_; + output_file_stream_.close(); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h new file mode 100755 index 0000000000..1295be95d4 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" +#include "paddle/fluid/platform/profiler/output_logger.h" + +namespace paddle { +namespace platform { + +// Dump a NodeTrees into a profobuf file. +// A SerializationLogger object can only dump a NodeTrees object, +// creates a file in the constructor and closes the file in the destructor. +class SerializationLogger : public BaseLogger { + public: + explicit SerializationLogger(const std::string& filename); + explicit SerializationLogger(const char* filename); + ~SerializationLogger(); + std::string filename() { return filename_; } + void LogDeviceTraceEventNode(const DeviceTraceEventNode&) override; + void LogHostTraceEventNode(const HostTraceEventNode&) override; + void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; + void LogNodeTrees(const NodeTrees&) override; + + private: + void OpenFile(); + void HandleTypeKernel(const DeviceTraceEventNode&); + void HandleTypeMemset(const DeviceTraceEventNode&); + void HandleTypeMemcpy(const DeviceTraceEventNode&); + std::string filename_; + std::ofstream output_file_stream_; + NodeTreesProto* node_trees_proto_; + ThreadNodeTreeProto* current_thread_node_tree_proto_; + HostTraceEventNodeProto* current_host_trace_event_node_proto_; + CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; + DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc new file mode 100644 index 0000000000..2fe9626ec7 --- /dev/null +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/event_node.h" + +using paddle::platform::SerializationLogger; +using paddle::platform::DeserializationReader; +using paddle::platform::NodeTrees; +using paddle::platform::HostTraceEventNode; +using paddle::platform::CudaRuntimeTraceEventNode; +using paddle::platform::DeviceTraceEventNode; +using paddle::platform::HostTraceEvent; +using paddle::platform::RuntimeTraceEvent; +using paddle::platform::DeviceTraceEvent; +using paddle::platform::TracerEventType; +using paddle::platform::KernelEventInfo; +using paddle::platform::MemcpyEventInfo; +using paddle::platform::MemsetEventInfo; + +TEST(SerializationLoggerTest, dump_case0) { + std::list host_events; + std::list runtime_events; + std::list device_events; + host_events.push_back(HostTraceEvent(std::string("dataloader#1"), + TracerEventType::Dataloader, 1000, 10000, + 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); + host_events.push_back(HostTraceEvent( + std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, + 17000, 10, 10, 1, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, + 35000, 10, 10, 2, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, + 37000, 10, 11, 3, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, + 19000, 10, 10, 4, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, + 39000, 10, 11, 5, 0)); + device_events.push_back( + DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, + 55000, 0, 10, 10, 1, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, + 95000, 0, 10, 10, 2, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, + 65000, 0, 10, 11, 3, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, + 59000, 0, 10, 10, 4, MemcpyEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, + 69000, 0, 10, 11, 5, MemsetEventInfo())); + SerializationLogger logger("test_serialization_logger_case0.pb"); + NodeTrees tree(host_events, runtime_events, device_events); + std::map> nodes = + tree.Traverse(true); + EXPECT_EQ(nodes[10].size(), 4u); + EXPECT_EQ(nodes[11].size(), 2u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 3u); + } + if ((*it)->Name() == "op1") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "op3") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + tree.LogMe(&logger); +} + +TEST(SerializationLoggerTest, dump_case1) { + std::list host_events; + std::list runtime_events; + std::list device_events; + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, + 17000, 10, 10, 1, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, + 35000, 10, 10, 2, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, + 37000, 10, 11, 3, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, + 19000, 10, 10, 4, 0)); + runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, + 39000, 10, 11, 5, 0)); + device_events.push_back( + DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, + 55000, 0, 10, 10, 1, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, + 95000, 0, 10, 10, 2, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, + 65000, 0, 10, 11, 3, KernelEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, + 59000, 0, 10, 10, 4, MemcpyEventInfo())); + device_events.push_back( + DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, + 69000, 0, 10, 11, 5, MemsetEventInfo())); + SerializationLogger logger("test_serialization_logger_case1.pb"); + NodeTrees tree(host_events, runtime_events, device_events); + std::map> nodes = + tree.Traverse(true); + EXPECT_EQ(nodes[10].size(), 1u); + EXPECT_EQ(nodes[11].size(), 1u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + tree.LogMe(&logger); +} + +TEST(DeserializationReaderTest, restore_case0) { + DeserializationReader reader("test_serialization_logger_case0.pb"); + std::unique_ptr tree = reader.Parse(); + std::map> nodes = + tree->Traverse(true); + EXPECT_EQ(nodes[10].size(), 4u); + EXPECT_EQ(nodes[11].size(), 2u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 3u); + } + if ((*it)->Name() == "op1") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "op3") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } +} diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h new file mode 100644 index 0000000000..04014b972c --- /dev/null +++ b/paddle/fluid/platform/profiler/utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/platform/os_info.h" + +namespace paddle { +namespace platform { + +template +std::string string_format(const std::string& format, Args... args) { + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + + 1; // Extra space for '\0' + PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( + "Error during profiler data formatting.")); + auto size = static_cast(size_s); + auto buf = std::make_unique(size); + std::snprintf(buf.get(), size, format.c_str(), args...); + return std::string(buf.get(), size - 1); // exclude the '\0' +} + +static std::string GetStringFormatLocalTime() { + std::time_t rawtime; + std::tm* timeinfo; + char buf[100]; + std::time(&rawtime); + timeinfo = std::localtime(&rawtime); + std::strftime(buf, 100, "%F-%X", timeinfo); + return std::string(buf); +} + +static int64_t nsToUs(int64_t ns) { return ns / 1000; } + +} // namespace platform +} // namespace paddle -- GitLab