未验证 提交 40d2b7c6 编写于 作者: C chenjian 提交者: GitHub

Add profiler tree dump code (#39519)

* add profiler tree dump code

* add CMakeLists item

* reduce some contents for pr
上级 3581c075
......@@ -5,3 +5,4 @@ cc_library(event_node SRCS event_node.cc DEPS enforce)
cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger)
cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node)
add_subdirectory(dump)
proto_library(nodetreeproto SRCS nodetree.proto)
cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node)
cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node)
cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
#include <cstring>
namespace paddle {
namespace platform {
DeserializationReader::DeserializationReader(const std::string& filename)
: filename_(filename) {
OpenFile();
node_trees_proto_ = new NodeTreesProto();
}
DeserializationReader::DeserializationReader(const char* filename)
: filename_(filename) {
OpenFile();
node_trees_proto_ = new NodeTreesProto();
}
void DeserializationReader::OpenFile() {
input_file_stream_.open(filename_, std::ifstream::in | std::ifstream::binary);
if (!input_file_stream_) {
VLOG(2) << "Unable to open file for writing profiling data." << std::endl;
} else {
VLOG(0) << "Read profiling data from " << filename_ << std::endl;
}
}
std::unique_ptr<NodeTrees> DeserializationReader::Parse() {
if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) {
VLOG(2) << "Unable to load node trees in protobuf." << std::endl;
return nullptr;
}
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map;
for (int node_tree_index = 0;
node_tree_index < node_trees_proto_->thread_trees_size();
node_tree_index++) {
// handle one thread tree
std::map<int64_t, HostTraceEventNode*> index_node_map;
std::map<int64_t, int64_t> child_parent_map;
const ThreadNodeTreeProto& thread_node_tree_proto =
node_trees_proto_->thread_trees(node_tree_index);
uint64_t current_threadid = thread_node_tree_proto.thread_id();
for (int host_node_index = 0;
host_node_index < thread_node_tree_proto.host_nodes_size();
host_node_index++) {
// handle host node
const HostTraceEventNodeProto& host_node_proto =
thread_node_tree_proto.host_nodes(host_node_index);
HostTraceEventNode* host_node =
RestoreHostTraceEventNode(host_node_proto);
index_node_map[host_node_proto.id()] = host_node;
child_parent_map[host_node_proto.id()] = host_node_proto.parentid();
// handle runtime node
for (int runtime_node_index = 0;
runtime_node_index < host_node_proto.runtime_nodes_size();
runtime_node_index++) {
const CudaRuntimeTraceEventNodeProto& runtime_node_proto =
host_node_proto.runtime_nodes(runtime_node_index);
CudaRuntimeTraceEventNode* runtime_node =
RestoreCudaRuntimeTraceEventNode(runtime_node_proto);
host_node->AddCudaRuntimeNode(runtime_node); // insert into host_node
// handle device node
for (int device_node_index = 0;
device_node_index < runtime_node_proto.device_nodes_size();
device_node_index++) {
const DeviceTraceEventNodeProto& device_node_proto =
runtime_node_proto.device_nodes(device_node_index);
DeviceTraceEventNode* device_node =
RestoreDeviceTraceEventNode(device_node_proto);
runtime_node->AddDeviceTraceEventNode(
device_node); // insert into runtime_node
}
}
}
// restore parent-child relationship
for (auto it = child_parent_map.begin(); it != child_parent_map.end();
it++) {
if (it->second != -1) { // not root node
index_node_map[it->second]->AddChild(index_node_map[it->first]);
} else {
thread_event_trees_map[current_threadid] =
index_node_map[it->first]; // root node
}
}
}
// restore NodeTrees object
return std::unique_ptr<NodeTrees>(new NodeTrees(thread_event_trees_map));
}
DeserializationReader::~DeserializationReader() {
delete node_trees_proto_;
input_file_stream_.close();
}
DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode(
const DeviceTraceEventNodeProto& device_node_proto) {
const DeviceTraceEventProto& device_event_proto =
device_node_proto.device_event();
DeviceTraceEvent device_event;
device_event.name = device_event_proto.name();
device_event.type = static_cast<TracerEventType>(device_event_proto.type());
device_event.start_ns = device_event_proto.start_ns();
device_event.end_ns = device_event_proto.end_ns();
device_event.device_id = device_event_proto.device_id();
device_event.context_id = device_event_proto.context_id();
device_event.stream_id = device_event_proto.stream_id();
device_event.correlation_id = device_event_proto.correlation_id();
switch (device_event.type) {
case TracerEventType::Kernel:
device_event.kernel_info = HandleKernelEventInfoProto(device_event_proto);
break;
case TracerEventType::Memcpy:
device_event.memcpy_info = HandleMemcpyEventInfoProto(device_event_proto);
break;
case TracerEventType::Memset:
device_event.memset_info = HandleMemsetEventInfoProto(device_event_proto);
break;
default:
break;
}
return new DeviceTraceEventNode(device_event);
}
CudaRuntimeTraceEventNode*
DeserializationReader::RestoreCudaRuntimeTraceEventNode(
const CudaRuntimeTraceEventNodeProto& runtime_node_proto) {
const CudaRuntimeTraceEventProto& runtime_event_proto =
runtime_node_proto.runtime_trace_event();
RuntimeTraceEvent runtime_event;
runtime_event.name = runtime_event_proto.name();
runtime_event.start_ns = runtime_event_proto.start_ns();
runtime_event.end_ns = runtime_event_proto.end_ns();
runtime_event.process_id = runtime_event_proto.process_id();
runtime_event.thread_id = runtime_event_proto.thread_id();
runtime_event.correlation_id = runtime_event_proto.correlation_id();
runtime_event.callback_id = runtime_event_proto.callback_id();
return new CudaRuntimeTraceEventNode(runtime_event);
}
HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode(
const HostTraceEventNodeProto& host_node_proto) {
const HostTraceEventProto& host_event_proto =
host_node_proto.host_trace_event();
HostTraceEvent host_event;
host_event.name = host_event_proto.name();
host_event.type = static_cast<TracerEventType>(host_event_proto.type());
host_event.start_ns = host_event_proto.start_ns();
host_event.end_ns = host_event_proto.end_ns();
host_event.process_id = host_event_proto.process_id();
host_event.thread_id = host_event_proto.thread_id();
return new HostTraceEventNode(host_event);
}
KernelEventInfo DeserializationReader::HandleKernelEventInfoProto(
const DeviceTraceEventProto& device_event_proto) {
const KernelEventInfoProto& kernel_info_proto =
device_event_proto.kernel_info();
KernelEventInfo kernel_info;
kernel_info.block_x = kernel_info_proto.block_x();
kernel_info.block_y = kernel_info_proto.block_y();
kernel_info.block_z = kernel_info_proto.block_z();
kernel_info.grid_x = kernel_info_proto.grid_x();
kernel_info.grid_y = kernel_info_proto.grid_y();
kernel_info.grid_z = kernel_info_proto.grid_z();
kernel_info.dynamic_shared_memory = kernel_info_proto.dynamic_shared_memory();
kernel_info.static_shared_memory = kernel_info_proto.static_shared_memory();
kernel_info.registers_per_thread = kernel_info_proto.registers_per_thread();
kernel_info.local_memory_per_thread =
kernel_info_proto.local_memory_per_thread();
kernel_info.local_memory_total = kernel_info_proto.local_memory_total();
kernel_info.queued = kernel_info_proto.queued();
kernel_info.submitted = kernel_info_proto.submitted();
kernel_info.completed = kernel_info_proto.completed();
return kernel_info;
}
MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto(
const DeviceTraceEventProto& device_event_proto) {
const MemcpyEventInfoProto& memcpy_info_proto =
device_event_proto.memcpy_info();
MemcpyEventInfo memcpy_info;
memcpy_info.num_bytes = memcpy_info_proto.num_bytes();
std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(),
kMemKindMaxLen - 1);
std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(),
kMemKindMaxLen - 1);
std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(),
kMemKindMaxLen - 1);
return memcpy_info;
}
MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto(
const DeviceTraceEventProto& device_event_proto) {
const MemsetEventInfoProto& memset_info_proto =
device_event_proto.memset_info();
MemsetEventInfo memset_info;
memset_info.num_bytes = memset_info_proto.num_bytes();
std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(),
kMemKindMaxLen - 1);
memset_info.value = memset_info_proto.value();
return memset_info;
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
#include "paddle/fluid/platform/profiler/event_node.h"
namespace paddle {
namespace platform {
class DeserializationReader {
public:
explicit DeserializationReader(const std::string& filename);
explicit DeserializationReader(const char* filename);
~DeserializationReader();
std::unique_ptr<NodeTrees> Parse();
private:
void OpenFile();
DeviceTraceEventNode* RestoreDeviceTraceEventNode(
const DeviceTraceEventNodeProto&);
CudaRuntimeTraceEventNode* RestoreCudaRuntimeTraceEventNode(
const CudaRuntimeTraceEventNodeProto&);
HostTraceEventNode* RestoreHostTraceEventNode(const HostTraceEventNodeProto&);
KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&);
MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&);
MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&);
std::string filename_;
std::ifstream input_file_stream_;
NodeTreesProto* node_trees_proto_;
};
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package paddle.platform;
enum TracerEventTypeProto {
// Used to mark operator record
Operator = 0;
// Used to mark dataloader record
Dataloader = 1;
// Used to mark profile step record
ProfileStep = 2;
// Used to mark cuda runtime record returned by cupti
CudaRuntime = 3;
// Used to mark kernel computation record returned by cupti
Kernel = 4;
// Used to mark memcpy record returned by cupti
Memcpy = 5;
// Used to mark memset record returned by cupti
Memset = 6;
// Used to mark record defined by user
UserDefined = 7;
// A flag to denote the number of current types
NumTypes = 8;
}
message KernelEventInfoProto {
// The X-dimension block size for the kernel.
required uint32 block_x = 1;
// The Y-dimension block size for the kernel.
required uint32 block_y = 2;
// The Z-dimension grid size for the kernel.
required uint32 block_z = 3;
// X-dimension of a grid.
required uint32 grid_x = 4;
// Y-dimension of a grid.
required uint32 grid_y = 5;
// Z-dimension of a grid.
required uint32 grid_z = 6;
// The dynamic shared memory reserved for the kernel, in bytes.
required uint32 dynamic_shared_memory = 7;
// The static shared memory allocated for the kernel, in bytes.
required uint32 static_shared_memory = 8;
// The number of registers required for each thread executing the kernel.
required uint32 registers_per_thread = 9;
// The amount of local memory reserved for each thread, in bytes.
required uint32 local_memory_per_thread = 10;
// The total amount of local memory reserved for the kernel, in bytes.
required uint32 local_memory_total = 11;
// The timestamp when the kernel is queued up in the command buffer, in ns.
// This timestamp is not collected by default. Use API
// cuptiActivityEnableLatencyTimestamps() to enable collection.
required uint64 queued = 12;
// The timestamp when the command buffer containing the kernel launch is
// submitted to the GPU, in ns.
// This timestamp is not collected by default. Use API
// cuptiActivityEnableLatencyTimestamps() to enable collection.
required uint64 submitted = 13;
// The completed timestamp for the kernel execution, in ns.
required uint64 completed = 14;
}
message MemcpyEventInfoProto {
// The number of bytes transferred by the memory copy.
required uint64 num_bytes = 1;
// The kind of the memory copy.
// Each kind represents the source and destination targets of a memory copy.
// Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind
required string copy_kind = 2;
// The source memory kind read by the memory copy.
// Each kind represents the type of the memory accessed by a memory
// operation/copy. Refer to CUpti_ActivityMemoryKind
required string src_kind = 3;
// The destination memory kind read by the memory copy.
required string dst_kind = 4;
}
message MemsetEventInfoProto {
// The number of bytes being set by the memory set.
required uint64 num_bytes = 1;
// The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind
required string memory_kind = 2;
// the value being assigned to memory by the memory set.
required uint32 value = 3;
}
message HostTraceEventProto {
required string name = 1;
required TracerEventTypeProto type = 2;
// start timestamp of the record
required uint64 start_ns = 3;
// end timestamp of the record
required uint64 end_ns = 4;
// process id of the record
required uint64 process_id = 5;
// thread id of the record
required uint64 thread_id = 6;
}
message CudaRuntimeTraceEventProto {
// record name
required string name = 1;
// start timestamp of the record
required uint64 start_ns = 2;
// end timestamp of the record
required uint64 end_ns = 3;
// process id of the record
required uint64 process_id = 4;
// thread id of the record
required uint64 thread_id = 5;
// correlation id, used for correlating async activities happened on device
required uint32 correlation_id = 6;
// callback id, used to identify which cuda runtime api is called
required uint32 callback_id = 7;
}
message DeviceTraceEventProto {
// record name
required string name = 1;
// record type, one of TracerEventType
required TracerEventTypeProto type = 2;
// start timestamp of the record
required uint64 start_ns = 3;
// end timestamp of the record
required uint64 end_ns = 4;
// device id
required uint64 device_id = 5;
// context id
required uint64 context_id = 6;
// stream id
required uint64 stream_id = 7;
// correlation id, used for correlating async activities happened on device
required uint32 correlation_id = 8;
// union, specific device record type has different detail information
oneof detail_info {
// used for TracerEventType::Kernel
KernelEventInfoProto kernel_info = 9;
// used for TracerEventType::Memcpy
MemcpyEventInfoProto memcpy_info = 10;
// used for TracerEventType::Memset
MemsetEventInfoProto memset_info = 11;
}
}
message DeviceTraceEventNodeProto {
required DeviceTraceEventProto device_event = 1;
}
message CudaRuntimeTraceEventNodeProto {
required CudaRuntimeTraceEventProto runtime_trace_event = 1;
repeated DeviceTraceEventNodeProto device_nodes = 2;
}
message HostTraceEventNodeProto {
required int64 id = 1;
required int64 parentid = 2;
required HostTraceEventProto host_trace_event = 3;
repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4;
}
message ThreadNodeTreeProto {
required uint64 thread_id = 1;
repeated HostTraceEventNodeProto host_nodes = 2;
}
message NodeTreesProto {
required string version = 1;
repeated ThreadNodeTreeProto thread_trees = 2;
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
namespace platform {
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb";
static const char* version = "1.0.0";
static std::string DefaultFileName() {
auto pid = GetProcessId();
return string_format(std::string(kDefaultFilename), pid,
GetStringFormatLocalTime().c_str());
}
void SerializationLogger::OpenFile() {
output_file_stream_.open(filename_, std::ofstream::out |
std::ofstream::trunc |
std::ofstream::binary);
if (!output_file_stream_) {
LOG(WARNING) << "Unable to open file for writing profiling data."
<< std::endl;
} else {
LOG(INFO) << "writing profiling data to " << filename_ << std::endl;
}
node_trees_proto_ = new NodeTreesProto();
node_trees_proto_->set_version(std::string(version));
}
void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) {
// dump the whole tree into file
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = node_trees.Traverse(true);
for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
// 1. order every node an index, every node a parent
std::map<HostTraceEventNode*, int64_t> node_index_map;
std::map<HostTraceEventNode*, int64_t> node_parent_map;
int64_t index = 0;
for (auto hostnode = it->second.begin(); hostnode != it->second.end();
++hostnode) {
node_index_map[(*hostnode)] = index; // order each node
index++;
}
node_parent_map[(*(it->second.begin()))] = -1; // root's parent set as -1
for (auto hostnode = it->second.begin(); hostnode != it->second.end();
++hostnode) {
for (auto childnode = (*hostnode)->GetChildren().begin();
childnode != (*hostnode)->GetChildren().end(); ++childnode) {
node_parent_map[(*childnode)] =
node_index_map[(*hostnode)]; // mark each node's parent
}
}
// 2. serialize host node, runtime node and device node
current_thread_node_tree_proto_ =
node_trees_proto_->add_thread_trees(); // add ThreadNodeTreeProto
current_thread_node_tree_proto_->set_thread_id(it->first);
for (auto hostnode = it->second.begin(); hostnode != it->second.end();
++hostnode) {
HostTraceEventNodeProto* host_node_proto =
current_thread_node_tree_proto_
->add_host_nodes(); // add HostTraceEventNodeProto
host_node_proto->set_id(node_index_map[(*hostnode)]);
host_node_proto->set_parentid(node_parent_map[(*hostnode)]);
current_host_trace_event_node_proto_ =
host_node_proto; // set current HostTraceEventNodeProto
(*hostnode)->LogMe(this); // fill detail information
for (auto runtimenode = (*hostnode)->GetRuntimeTraceEventNodes().begin();
runtimenode != (*hostnode)->GetRuntimeTraceEventNodes().end();
++runtimenode) {
CudaRuntimeTraceEventNodeProto* runtime_node_proto =
current_host_trace_event_node_proto_
->add_runtime_nodes(); // add CudaRuntimeTraceEventNodeProto
current_runtime_trace_event_node_proto_ =
runtime_node_proto; // set current CudaRuntimeTraceEventNodeProto
(*runtimenode)->LogMe(this); // fill detail information
for (auto devicenode =
(*runtimenode)->GetDeviceTraceEventNodes().begin();
devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end();
++devicenode) {
DeviceTraceEventNodeProto* device_node_proto =
current_runtime_trace_event_node_proto_
->add_device_nodes(); // add DeviceTraceEventNodeProto
current_device_trace_event_node_proto_ =
device_node_proto; // set current DeviceTraceEventNodeProto
(*devicenode)->LogMe(this); // fill detail information
}
}
}
}
}
void SerializationLogger::LogHostTraceEventNode(
const HostTraceEventNode& host_node) {
HostTraceEventProto* host_trace_event = new HostTraceEventProto();
host_trace_event->set_name(host_node.Name());
host_trace_event->set_type(
static_cast<TracerEventTypeProto>(host_node.Type()));
host_trace_event->set_start_ns(host_node.StartNs());
host_trace_event->set_end_ns(host_node.EndNs());
host_trace_event->set_process_id(host_node.ProcessId());
host_trace_event->set_thread_id(host_node.ThreadId());
current_host_trace_event_node_proto_->set_allocated_host_trace_event(
host_trace_event);
}
void SerializationLogger::LogRuntimeTraceEventNode(
const CudaRuntimeTraceEventNode& runtime_node) {
CudaRuntimeTraceEventProto* runtime_trace_event =
new CudaRuntimeTraceEventProto();
runtime_trace_event->set_name(runtime_node.Name());
runtime_trace_event->set_start_ns(runtime_node.StartNs());
runtime_trace_event->set_end_ns(runtime_node.EndNs());
runtime_trace_event->set_process_id(runtime_node.ProcessId());
runtime_trace_event->set_thread_id(runtime_node.ThreadId());
runtime_trace_event->set_correlation_id(runtime_node.CorrelationId());
runtime_trace_event->set_callback_id(runtime_node.CallbackId());
current_runtime_trace_event_node_proto_->set_allocated_runtime_trace_event(
runtime_trace_event);
}
void SerializationLogger::LogDeviceTraceEventNode(
const DeviceTraceEventNode& device_node) {
switch (device_node.Type()) {
case TracerEventType::Kernel:
HandleTypeKernel(device_node);
break;
case TracerEventType::Memcpy:
HandleTypeMemcpy(device_node);
break;
case TracerEventType::Memset:
HandleTypeMemset(device_node);
break;
default:
break;
}
}
void SerializationLogger::HandleTypeKernel(
const DeviceTraceEventNode& device_node) {
DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto();
KernelEventInfoProto* kernel_info = new KernelEventInfoProto();
// fill DeviceTraceEventProto
device_trace_event->set_name(device_node.Name());
device_trace_event->set_type(
static_cast<TracerEventTypeProto>(device_node.Type()));
device_trace_event->set_start_ns(device_node.StartNs());
device_trace_event->set_end_ns(device_node.EndNs());
device_trace_event->set_device_id(device_node.DeviceId());
device_trace_event->set_context_id(device_node.ContextId());
device_trace_event->set_stream_id(device_node.StreamId());
device_trace_event->set_correlation_id(device_node.CorrelationId());
// fill KernelEventInfoProto
KernelEventInfo info = device_node.KernelInfo();
kernel_info->set_block_x(info.block_x);
kernel_info->set_block_y(info.block_y);
kernel_info->set_block_z(info.block_z);
kernel_info->set_grid_x(info.grid_x);
kernel_info->set_grid_y(info.grid_y);
kernel_info->set_grid_z(info.grid_z);
kernel_info->set_dynamic_shared_memory(info.dynamic_shared_memory);
kernel_info->set_static_shared_memory(info.static_shared_memory);
kernel_info->set_registers_per_thread(info.registers_per_thread);
kernel_info->set_local_memory_per_thread(info.local_memory_per_thread);
kernel_info->set_local_memory_total(info.local_memory_total);
kernel_info->set_queued(info.queued);
kernel_info->set_submitted(info.submitted);
kernel_info->set_completed(info.completed);
// binding
device_trace_event->set_allocated_kernel_info(kernel_info);
current_device_trace_event_node_proto_->set_allocated_device_event(
device_trace_event);
}
void SerializationLogger::HandleTypeMemcpy(
const DeviceTraceEventNode& device_node) {
DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto();
MemcpyEventInfoProto* memcpy_info = new MemcpyEventInfoProto();
// fill DeviceTraceEventProto
device_trace_event->set_name(device_node.Name());
device_trace_event->set_type(
static_cast<TracerEventTypeProto>(device_node.Type()));
device_trace_event->set_start_ns(device_node.StartNs());
device_trace_event->set_end_ns(device_node.EndNs());
device_trace_event->set_device_id(device_node.DeviceId());
device_trace_event->set_context_id(device_node.ContextId());
device_trace_event->set_stream_id(device_node.StreamId());
device_trace_event->set_correlation_id(device_node.CorrelationId());
// fill MemcpyEventInfoProto
MemcpyEventInfo info = device_node.MemcpyInfo();
memcpy_info->set_num_bytes(info.num_bytes);
memcpy_info->set_copy_kind(std::string(info.copy_kind));
memcpy_info->set_src_kind(std::string(info.src_kind));
memcpy_info->set_dst_kind(std::string(info.dst_kind));
// binding
device_trace_event->set_allocated_memcpy_info(memcpy_info);
current_device_trace_event_node_proto_->set_allocated_device_event(
device_trace_event);
}
void SerializationLogger::HandleTypeMemset(
const DeviceTraceEventNode& device_node) {
DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto();
MemsetEventInfoProto* memset_info = new MemsetEventInfoProto();
// fill DeviceTraceEventProto
device_trace_event->set_name(device_node.Name());
device_trace_event->set_type(
static_cast<TracerEventTypeProto>(device_node.Type()));
device_trace_event->set_start_ns(device_node.StartNs());
device_trace_event->set_end_ns(device_node.EndNs());
device_trace_event->set_device_id(device_node.DeviceId());
device_trace_event->set_context_id(device_node.ContextId());
device_trace_event->set_stream_id(device_node.StreamId());
device_trace_event->set_correlation_id(device_node.CorrelationId());
// fill MemsetEventInfoProto
MemsetEventInfo info = device_node.MemsetInfo();
memset_info->set_num_bytes(info.num_bytes);
memset_info->set_memory_kind(std::string(info.memory_kind));
memset_info->set_value(info.value);
// binding
device_trace_event->set_allocated_memset_info(memset_info);
current_device_trace_event_node_proto_->set_allocated_device_event(
device_trace_event);
}
SerializationLogger::SerializationLogger(const std::string& filename) {
filename_ = filename.empty() ? DefaultFileName() : filename;
OpenFile();
}
SerializationLogger::SerializationLogger(const char* filename_cstr) {
std::string filename(filename_cstr);
filename_ = filename.empty() ? DefaultFileName() : filename;
OpenFile();
}
SerializationLogger::~SerializationLogger() {
if (!output_file_stream_) {
delete node_trees_proto_;
return;
}
node_trees_proto_->SerializeToOstream(&output_file_stream_);
delete node_trees_proto_;
output_file_stream_.close();
}
} // namespace platform
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
namespace paddle {
namespace platform {
// Dump a NodeTrees into a profobuf file.
// A SerializationLogger object can only dump a NodeTrees object,
// creates a file in the constructor and closes the file in the destructor.
class SerializationLogger : public BaseLogger {
public:
explicit SerializationLogger(const std::string& filename);
explicit SerializationLogger(const char* filename);
~SerializationLogger();
std::string filename() { return filename_; }
void LogDeviceTraceEventNode(const DeviceTraceEventNode&) override;
void LogHostTraceEventNode(const HostTraceEventNode&) override;
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
private:
void OpenFile();
void HandleTypeKernel(const DeviceTraceEventNode&);
void HandleTypeMemset(const DeviceTraceEventNode&);
void HandleTypeMemcpy(const DeviceTraceEventNode&);
std::string filename_;
std::ofstream output_file_stream_;
NodeTreesProto* node_trees_proto_;
ThreadNodeTreeProto* current_thread_node_tree_proto_;
HostTraceEventNodeProto* current_host_trace_event_node_proto_;
CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_;
DeviceTraceEventNodeProto* current_device_trace_event_node_proto_;
};
} // namespace platform
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
#include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
#include "paddle/fluid/platform/profiler/event_node.h"
using paddle::platform::SerializationLogger;
using paddle::platform::DeserializationReader;
using paddle::platform::NodeTrees;
using paddle::platform::HostTraceEventNode;
using paddle::platform::CudaRuntimeTraceEventNode;
using paddle::platform::DeviceTraceEventNode;
using paddle::platform::HostTraceEvent;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo;
TEST(SerializationLoggerTest, dump_case0) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000,
10, 10));
host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
39000, 10, 11, 5, 0));
device_events.push_back(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
55000, 0, 10, 10, 1, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
95000, 0, 10, 10, 2, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
65000, 0, 10, 11, 3, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
59000, 0, 10, 10, 4, MemcpyEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case0.pb");
NodeTrees tree(host_events, runtime_events, device_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u);
EXPECT_EQ(nodes[11].size(), 2u);
std::vector<HostTraceEventNode*> thread1_nodes = nodes[10];
std::vector<HostTraceEventNode*> thread2_nodes = nodes[11];
for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) {
if ((*it)->Name() == "root node") {
EXPECT_EQ((*it)->GetChildren().size(), 3u);
}
if ((*it)->Name() == "op1") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
}
}
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
if ((*it)->Name() == "op3") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
}
}
tree.LogMe(&logger);
}
TEST(SerializationLoggerTest, dump_case1) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
39000, 10, 11, 5, 0));
device_events.push_back(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
55000, 0, 10, 10, 1, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
95000, 0, 10, 10, 2, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
65000, 0, 10, 11, 3, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
59000, 0, 10, 10, 4, MemcpyEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case1.pb");
NodeTrees tree(host_events, runtime_events, device_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u);
EXPECT_EQ(nodes[11].size(), 1u);
std::vector<HostTraceEventNode*> thread1_nodes = nodes[10];
std::vector<HostTraceEventNode*> thread2_nodes = nodes[11];
for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) {
if ((*it)->Name() == "root node") {
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u);
}
}
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
if ((*it)->Name() == "root node") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
}
}
tree.LogMe(&logger);
}
TEST(DeserializationReaderTest, restore_case0) {
DeserializationReader reader("test_serialization_logger_case0.pb");
std::unique_ptr<NodeTrees> tree = reader.Parse();
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree->Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u);
EXPECT_EQ(nodes[11].size(), 2u);
std::vector<HostTraceEventNode*> thread1_nodes = nodes[10];
std::vector<HostTraceEventNode*> thread2_nodes = nodes[11];
for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) {
if ((*it)->Name() == "root node") {
EXPECT_EQ((*it)->GetChildren().size(), 3u);
}
if ((*it)->Name() == "op1") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
}
}
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
if ((*it)->Name() == "op3") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
}
}
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/os_info.h"
namespace paddle {
namespace platform {
template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) +
1; // Extra space for '\0'
PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal(
"Error during profiler data formatting."));
auto size = static_cast<size_t>(size_s);
auto buf = std::make_unique<char[]>(size);
std::snprintf(buf.get(), size, format.c_str(), args...);
return std::string(buf.get(), size - 1); // exclude the '\0'
}
static std::string GetStringFormatLocalTime() {
std::time_t rawtime;
std::tm* timeinfo;
char buf[100];
std::time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::strftime(buf, 100, "%F-%X", timeinfo);
return std::string(buf);
}
static int64_t nsToUs(int64_t ns) { return ns / 1000; }
} // namespace platform
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册