未验证 提交 94271bc2 编写于 作者: C chenjian 提交者: GitHub

add new field for event node (#43223) (#44245)

* add new field for event node

* fix

* fix bug

* fix bug

* fix clang

* fix clang format

* fix code format
上级 3cc6ae69
......@@ -27,24 +27,16 @@ limitations under the License. */
namespace paddle {
namespace platform {
static const char* kSchemaVersion = "1.0.0";
static const char* kSchemaVersion = "1.0.1";
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
static uint32_t span_indx = 0;
static std::string DefaultFileName() {
auto pid = GetProcessId();
return string_format(std::string(kDefaultFilename), pid,
GetStringFormatLocalTime().c_str());
return string_format(
std::string(kDefaultFilename), pid, GetStringFormatLocalTime().c_str());
}
const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_,
std::ofstream::out | std::ofstream::trunc);
......@@ -81,7 +73,8 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
// find the earliest time in current timeline
start_time_ = std::numeric_limits<uint64_t>::max();
for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
it != thread2host_event_nodes.end();
++it) {
if (it->second.begin() + 1 != it->second.end()) {
if ((*(it->second.begin() + 1))->StartNs() < start_time_) {
start_time_ = (*(it->second.begin() + 1))->StartNs();
......@@ -99,7 +92,8 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
}
for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
it != thread2host_event_nodes.end();
++it) {
for (auto hostnode = it->second.begin(); hostnode != it->second.end();
++hostnode) {
if (hostnode != it->second.begin()) { // skip root node
......@@ -116,8 +110,45 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
(*devicenode)->LogMe(this);
}
}
for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin();
memnode != (*hostnode)->GetMemTraceEventNodes().end();
++memnode) {
(*memnode)->LogMe(this);
}
}
}
}
void ChromeTracingLogger::LogMemTraceEventNode(
const MemTraceEventNode& mem_node) {
if (!output_file_stream_) {
return;
}
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "[memory]", "pid": %lld, "tid": "%lld",
"ts": %lld,
"ph": "i", "cat": "%s",
"args": {
"place": "%s",
"addr": "%llu",
"current_allocated": %llu,
"current_reserved": %llu,
"increase_bytes": %lld
}
},
)JSON"),
mem_node.ProcessId(),
mem_node.ThreadId(),
mem_node.TimeStampNs(),
StringTracerMemEventType(mem_node.Type()),
mem_node.Place().c_str(),
mem_node.Addr(),
mem_node.CurrentAllocated(),
mem_node.CurrentReserved(),
mem_node.IncreaseBytes());
}
void ChromeTracingLogger::LogHostTraceEventNode(
......@@ -132,6 +163,16 @@ void ChromeTracingLogger::LogHostTraceEventNode(
} else {
dur_display = string_format(std::string("%.3f us"), dur * 1000);
}
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> input_dtypes;
std::string callstack;
OperatorSupplementEventNode* op_supplement_node =
host_node.GetOperatorSupplementEventNode();
if (op_supplement_node != nullptr) {
input_shapes = op_supplement_node->InputShapes();
input_dtypes = op_supplement_node->Dtypes();
callstack = op_supplement_node->CallStack();
}
switch (host_node.Type()) {
case TracerEventType::ProfileStep:
case TracerEventType::Forward:
......@@ -156,13 +197,58 @@ void ChromeTracingLogger::LogHostTraceEventNode(
}
},
)JSON"),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
host_node.Name().c_str(),
dur_display.c_str(),
host_node.ProcessId(),
host_node.ThreadId(),
nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_));
break;
case TracerEventType::Operator:
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "thread_state_runnable",
"args": {
"start_time": "%.3f us",
"end_time": "%.3f us",
"input_shapes": %s,
"input_dtypes": %s,
"callstack": "%s"
}
},
)JSON"),
host_node.Name().c_str(),
dur_display.c_str(),
host_node.ProcessId(),
host_node.ThreadId(),
nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_),
json_dict(input_shapes).c_str(),
json_dict(input_dtypes).c_str(),
callstack.c_str());
break;
case TracerEventType::CudaRuntime:
case TracerEventType::Kernel:
case TracerEventType::Memcpy:
case TracerEventType::Memset:
case TracerEventType::UserDefined:
case TracerEventType::OperatorInner:
case TracerEventType::Communication:
case TracerEventType::MluRuntime:
case TracerEventType::NumTypes:
default:
output_file_stream_ << string_format(
std::string(
......@@ -178,10 +264,13 @@ void ChromeTracingLogger::LogHostTraceEventNode(
}
},
)JSON"),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
host_node.Name().c_str(),
dur_display.c_str(),
host_node.ProcessId(),
host_node.ThreadId(),
nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_));
break;
......@@ -217,10 +306,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
}
},
)JSON"),
runtime_node.Name().c_str(), dur_display.c_str(),
runtime_node.ProcessId(), runtime_node.ThreadId(),
nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()),
categary_name_[static_cast<int>(runtime_node.Type())],
runtime_node.Name().c_str(),
dur_display.c_str(),
runtime_node.ProcessId(),
runtime_node.ThreadId(),
nsToUs(runtime_node.StartNs()),
nsToUsFloat(runtime_node.Duration()),
StringTracerEventType(runtime_node.Type()),
runtime_node.CorrelationId(),
nsToUsFloat(runtime_node.StartNs(), start_time_),
nsToUsFloat(runtime_node.EndNs(), start_time_));
......@@ -235,7 +327,8 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
"ph": "s", "cat": "async"
},
)JSON"),
runtime_node.CorrelationId(), runtime_node.ProcessId(),
runtime_node.CorrelationId(),
runtime_node.ProcessId(),
runtime_node.ThreadId(),
nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1));
pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()});
......@@ -260,8 +353,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
break;
}
if (nsToUs(device_node.Duration()) == 0) {
output_file_stream_ << string_format(
std::string(
output_file_stream_ << string_format(std::string(
R"JSON(
{
"name": "launch", "id": %d, "pid": %lld, "tid": %lld,
......@@ -269,8 +361,10 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
"ph": "f", "cat": "async"
},
)JSON"),
device_node.CorrelationId(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()));
device_node.CorrelationId(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()));
deviceid_streamid_set_.insert(
{device_node.DeviceId(), device_node.StreamId()});
} else {
......@@ -283,7 +377,8 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
"ph": "f", "cat": "async", "bp": "e"
},
)JSON"),
device_node.CorrelationId(), device_node.DeviceId(),
device_node.CorrelationId(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1));
deviceid_streamid_set_.insert(
......@@ -304,13 +399,17 @@ void ChromeTracingLogger::HandleTypeKernel(
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
kernel_info.grid_z) /
device_property.multiProcessorCount;
warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y *
kernel_info.block_z) /
warps_per_sm =
blocks_per_sm *
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
threads_per_warp;
occupancy = CalculateEstOccupancy(
device_node.DeviceId(), kernel_info.registers_per_thread,
kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory,
kernel_info.block_x, kernel_info.block_y, kernel_info.block_z,
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory,
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
blocks_per_sm);
#endif
float dur = nsToMsFloat(device_node.Duration());
......@@ -343,18 +442,30 @@ void ChromeTracingLogger::HandleTypeKernel(
}
},
)JSON"),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
device_node.Name().c_str(),
dur_display.c_str(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
device_node.CorrelationId(), kernel_info.registers_per_thread,
nsToUsFloat(device_node.EndNs(), start_time_),
device_node.DeviceId(),
device_node.ContextId(),
device_node.StreamId(),
device_node.CorrelationId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory,
blocks_per_sm, warps_per_sm, kernel_info.grid_x, kernel_info.grid_y,
kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y,
kernel_info.block_z, occupancy * 100);
blocks_per_sm,
warps_per_sm,
kernel_info.grid_x,
kernel_info.grid_y,
kernel_info.grid_z,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
occupancy * 100);
}
void ChromeTracingLogger::HandleTypeMemcpy(
......@@ -387,13 +498,19 @@ void ChromeTracingLogger::HandleTypeMemcpy(
}
},
)JSON"),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
device_node.Name().c_str(),
dur_display.c_str(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(),
device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth);
nsToUsFloat(device_node.EndNs(), start_time_),
device_node.StreamId(),
device_node.CorrelationId(),
memcpy_info.num_bytes,
memory_bandwidth);
}
void ChromeTracingLogger::HandleTypeMemset(
......@@ -423,14 +540,21 @@ void ChromeTracingLogger::HandleTypeMemset(
}
},
)JSON"),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
device_node.Name().c_str(),
dur_display.c_str(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
device_node.CorrelationId(), memset_info.num_bytes, memset_info.value);
nsToUsFloat(device_node.EndNs(), start_time_),
device_node.DeviceId(),
device_node.ContextId(),
device_node.StreamId(),
device_node.CorrelationId(),
memset_info.num_bytes,
memset_info.value);
}
void ChromeTracingLogger::StartLog() {
......@@ -441,7 +565,8 @@ void ChromeTracingLogger::StartLog() {
"displayTimeUnit": "ms",
"span_indx": "%d",
)JSON"),
kSchemaVersion, span_indx++);
kSchemaVersion,
span_indx++);
// add device property information
#if defined(PADDLE_WITH_CUDA)
output_file_stream_ << std::string(R"JSON(
......@@ -463,12 +588,17 @@ void ChromeTracingLogger::StartLog() {
"smCount": %d, "sharedMemPerBlockOptin": %d
},
)JSON"),
device_ids[index], device_property.name, device_property.totalGlobalMem,
device_property.major, device_property.minor,
device_ids[index],
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock, device_property.regsPerMultiprocessor,
device_property.warpSize, device_property.sharedMemPerBlock,
device_property.regsPerBlock,
device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin);
......@@ -488,12 +618,17 @@ void ChromeTracingLogger::StartLog() {
"smCount": %d, "sharedMemPerBlockOptin": %d
}],
)JSON"),
device_ids[device_ids.size() - 1], device_property.name,
device_property.totalGlobalMem, device_property.major,
device_property.minor, device_property.maxThreadsPerBlock,
device_ids[device_ids.size() - 1],
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock, device_property.regsPerMultiprocessor,
device_property.warpSize, device_property.sharedMemPerBlock,
device_property.regsPerBlock,
device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin);
......@@ -522,12 +657,14 @@ void ChromeTracingLogger::LogMetaInfo(
output_file_stream_ << string_format(std::string(R"JSON(
"%s": "%s",
)JSON"),
kv.first.c_str(), kv.second.c_str());
kv.first.c_str(),
kv.second.c_str());
} else {
output_file_stream_ << string_format(std::string(R"JSON(
"%s": "%s"
)JSON"),
kv.first.c_str(), kv.second.c_str());
kv.first.c_str(),
kv.second.c_str());
}
count--;
}
......@@ -591,13 +728,29 @@ void ChromeTracingLogger::RefineDisplayName(
}
},
)JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second,
(*it).first, (*it).first, (*it).second, (*it).second,
(*it).first,
(*it).second,
(*it).first,
(*it).first,
(*it).second,
(*it).first,
(*it).first,
(*it).second,
(*it).second,
extra_info[string_format(std::string("%lld"), (*it).second)].c_str(),
(*it).first, (*it).second, (*it).second,
(*it).first,
(*it).second,
(*it).second,
extra_info[string_format(std::string("%lld"), (*it).second)].c_str(),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second,
(*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1);
(*it).first,
(*it).second,
(*it).first,
(*it).first,
(*it).second,
(*it).second * 2,
(*it).first,
(*it).second,
(*it).second * 2 + 1);
}
#ifdef PADDLE_WITH_MLU
......@@ -607,9 +760,9 @@ void ChromeTracingLogger::RefineDisplayName(
#endif
for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); ++it) {
output_file_stream_ << string_format(
std::string(
it != deviceid_streamid_set_.end();
++it) {
output_file_stream_ << string_format(std::string(
R"JSON(
{
"name": "process_name", "pid": %lld, "tid": %lld,
......@@ -640,9 +793,19 @@ void ChromeTracingLogger::RefineDisplayName(
}
},
)JSON"),
(*it).first, (*it).second, (*it).first, device_type.c_str(),
(*it).first, (*it).second, (*it).second, (*it).first, (*it).second,
(*it).first + 0x10000000, (*it).first, (*it).second, (*it).second);
(*it).first,
(*it).second,
(*it).first,
device_type.c_str(),
(*it).first,
(*it).second,
(*it).second,
(*it).first,
(*it).second,
(*it).first + 0x10000000,
(*it).first,
(*it).second,
(*it).second);
}
}
......
......@@ -36,6 +36,7 @@ class ChromeTracingLogger : public BaseLogger {
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogMetaInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
private:
void OpenFile();
......
......@@ -19,61 +19,102 @@
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/event_python.h"
using paddle::platform::SerializationLogger;
using paddle::platform::DeserializationReader;
using paddle::platform::NodeTrees;
using paddle::platform::HostTraceEventNode;
using paddle::platform::CudaRuntimeTraceEventNode;
using paddle::platform::DeserializationReader;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::DeviceTraceEventNode;
using paddle::platform::HostTraceEvent;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::HostTraceEventNode;
using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::ProfilerResult;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::SerializationLogger;
using paddle::platform::TracerEventType;
TEST(SerializationLoggerTest, dump_case0) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000,
10, 10));
TracerEventType::Dataloader,
1000,
10000,
10,
10));
host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
39000, 10, 11, 5, 0));
device_events.push_back(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
55000, 0, 10, 10, 1, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
95000, 0, 10, 10, 2, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
65000, 0, 10, 11, 3, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
59000, 0, 10, 10, 4, MemcpyEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
TracerEventType::Kernel,
40000,
55000,
0,
10,
10,
1,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
TracerEventType::Kernel,
70000,
95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case0.pb");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u);
......@@ -102,33 +143,69 @@ TEST(SerializationLoggerTest, dump_case1) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
39000, 10, 11, 5, 0));
device_events.push_back(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
55000, 0, 10, 10, 1, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
95000, 0, 10, 10, 2, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
65000, 0, 10, 11, 3, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
59000, 0, 10, 10, 4, MemcpyEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
TracerEventType::Kernel,
40000,
55000,
0,
10,
10,
1,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
TracerEventType::Kernel,
70000,
95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case1.pb");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u);
......
......@@ -17,6 +17,8 @@ limitations under the License. */
#include <set>
#include <stack>
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
namespace platform {
......@@ -42,22 +44,34 @@ CudaRuntimeTraceEventNode::~CudaRuntimeTraceEventNode() {
NodeTrees::~NodeTrees() {
// delete all root nodes
for (auto it = thread_event_trees_map_.begin();
it != thread_event_trees_map_.end(); ++it) {
it != thread_event_trees_map_.end();
++it) {
delete it->second;
}
}
void NodeTrees::BuildTrees(
const std::vector<HostTraceEventNode*>& host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
const std::vector<DeviceTraceEventNode*>& device_event_nodes) {
// seperate Host Event Nodes into different threads
const std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
const std::vector<DeviceTraceEventNode*>& device_event_nodes,
const std::vector<MemTraceEventNode*>& mem_event_nodes,
const std::vector<OperatorSupplementEventNode*>& op_supplement_events) {
// separate Host Event Nodes into different threads
std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes; // used to store HostTraceEventNodes per thread
std::map<uint64_t, std::vector<CudaRuntimeTraceEventNode*>>
thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode
// per
// thread
std::map<uint64_t, std::vector<MemTraceEventNode*>>
thread2mem_event_nodes; // used to store MemTraceEventNode
// per
// thread
std::map<uint64_t, std::vector<OperatorSupplementEventNode*>>
thread2op_supplement_event_nodes; // used to store
// OperatorSupplementEventNode
// per
// thread
std::map<uint32_t, CudaRuntimeTraceEventNode*>
correlation_id2runtime_event_node; // used to store the relation between
// correlation id and runtime node
......@@ -79,19 +93,31 @@ void NodeTrees::BuildTrees(
auto dst_iter =
correlation_id2runtime_event_node.find((*it)->CorrelationId());
PADDLE_ENFORCE_NE(
dst_iter, correlation_id2runtime_event_node.end(),
dst_iter,
correlation_id2runtime_event_node.end(),
platform::errors::NotFound("Unknown device events, "
"no corresponding cuda runtime events"));
dst_iter->second->AddDeviceTraceEventNode(*it);
}
// construct thread2mem_event_nodes
for (auto it = mem_event_nodes.begin(); it != mem_event_nodes.end(); ++it) {
thread2mem_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// construct thread2op_supplement_event_nodes
for (auto it = op_supplement_events.begin(); it != op_supplement_events.end();
++it) {
thread2op_supplement_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// sort host event nodes and runtime event nodes according to start_ns and
// end_ns
// the smaller start_ns is, the further ahead position is.
// when start_ns of two nodes are equal, the one with bigger end_ns should be
// ahead.
for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
std::sort(it->second.begin(), it->second.end(),
it != thread2host_event_nodes.end();
++it) {
std::sort(it->second.begin(),
it->second.end(),
[](HostTraceEventNode* node1, HostTraceEventNode* node2) {
if (node1->StartNs() < node2->StartNs()) {
return true;
......@@ -104,9 +130,11 @@ void NodeTrees::BuildTrees(
});
}
for (auto it = thread2runtime_event_nodes.begin();
it != thread2runtime_event_nodes.end(); ++it) {
it != thread2runtime_event_nodes.end();
++it) {
std::sort(
it->second.begin(), it->second.end(),
it->second.begin(),
it->second.end(),
[](CudaRuntimeTraceEventNode* node1, CudaRuntimeTraceEventNode* node2) {
if (node1->StartNs() < node2->StartNs()) {
return true;
......@@ -118,34 +146,82 @@ void NodeTrees::BuildTrees(
return false;
});
}
// sort mem event nodes and operator supplement event nodes
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end();
++it) {
std::sort(it->second.begin(),
it->second.end(),
[](MemTraceEventNode* node1, MemTraceEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end();
++it) {
std::sort(it->second.begin(),
it->second.end(),
[](OperatorSupplementEventNode* node1,
OperatorSupplementEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
// construct trees
std::set<uint64_t> thread_set;
for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
it != thread2host_event_nodes.end();
++it) {
thread_set.insert(it->first);
}
for (auto it = thread2runtime_event_nodes.begin();
it != thread2runtime_event_nodes.end(); ++it) {
it != thread2runtime_event_nodes.end();
++it) {
thread_set.insert(it->first);
}
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end();
++it) {
thread_set.insert(it->first);
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end();
++it) {
thread_set.insert(it->first);
}
for (auto it = thread_set.begin(); it != thread_set.end(); ++it) {
thread_event_trees_map_[*it] = BuildTreeRelationship(
thread2host_event_nodes[*it], thread2runtime_event_nodes[*it]);
thread_event_trees_map_[*it] =
BuildTreeRelationship(thread2host_event_nodes[*it],
thread2runtime_event_nodes[*it],
thread2mem_event_nodes[*it],
thread2op_supplement_event_nodes[*it]);
}
}
HostTraceEventNode* NodeTrees::BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes) {
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_events) {
// a stack used for analyse relationship
auto node_stack = std::vector<HostTraceEventNode*>();
// root node, top level
auto root_node = new HostTraceEventNode(
HostTraceEvent(std::string("root node"), TracerEventType::UserDefined, 0,
ULLONG_MAX, 0, 0));
auto root_node =
new HostTraceEventNode(HostTraceEvent(std::string("root node"),
TracerEventType::UserDefined,
0,
ULLONG_MAX,
0,
0));
// push root node into node_stack
node_stack.push_back(root_node);
// handle host_event_nodes
......@@ -155,7 +231,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
if ((*it)->StartNs() < stack_top_node->EndNs()) {
// current node is the child of stack_top_node
PADDLE_ENFORCE_LE(
(*it)->EndNs(), stack_top_node->EndNs(),
(*it)->EndNs(),
stack_top_node->EndNs(),
platform::errors::Fatal(
"should not have time range intersection within one thread"));
stack_top_node->AddChild(*it);
......@@ -170,7 +247,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
runtime_event_nodes.end();
bool hasenter = false;
for (auto runtimenode = runtime_event_nodes.begin();
runtimenode != runtime_event_nodes.end(); ++runtimenode) {
runtimenode != runtime_event_nodes.end();
++runtimenode) {
if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
if (!hasenter) {
......@@ -203,7 +281,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
runtime_event_nodes.end();
bool hasenter = false;
for (auto runtimenode = runtime_event_nodes.begin();
runtimenode != runtime_event_nodes.end(); ++runtimenode) {
runtimenode != runtime_event_nodes.end();
++runtimenode) {
if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
if (!hasenter) {
......@@ -225,6 +304,103 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
}
node_stack.pop_back();
}
// build relationship between host event node and mem event node
// First, post-order traverse the tree. Then, insert the memory and op
// supplement node into correct host nodes.
auto stack = std::stack<HostTraceEventNode*>();
auto flag_stack = std::stack<int32_t>();
auto post_order_nodes = std::vector<HostTraceEventNode*>();
stack.push(root_node);
flag_stack.push(0);
while (!stack.empty()) {
auto current_node = stack.top();
stack.pop();
auto flag = flag_stack.top();
flag_stack.pop();
if (flag == 0) {
stack.push(current_node);
flag_stack.push(1);
for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().rend();
++child) {
stack.push(*child);
flag_stack.push(0);
}
} else {
post_order_nodes.push_back(current_node);
}
}
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
bool hasenter = false;
std::vector<MemTraceEventNode*>::iterator firstposition;
std::vector<MemTraceEventNode*>::iterator lastposition =
mem_event_nodes.end();
for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end();
++mem_it) {
if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() &&
(*mem_it)->TimeStampNs() <= (*it)->EndNs()) {
(*it)->AddMemNode(*mem_it);
if (!hasenter) {
firstposition = mem_it;
hasenter = true;
}
} else {
if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) {
lastposition = mem_it;
break;
}
}
}
if (hasenter) {
mem_event_nodes.erase(firstposition, lastposition);
}
}
// build relationship between host event node and op supplement node
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
int op_supplement_count = 0;
bool hasenter = false;
std::vector<OperatorSupplementEventNode*>::iterator firstposition;
std::vector<OperatorSupplementEventNode*>::iterator lastposition =
op_supplement_events.end();
for (auto op_supplement_it = op_supplement_events.begin();
op_supplement_it < op_supplement_events.end();
++op_supplement_it) {
if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() &&
(*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) {
if (!hasenter) {
firstposition = op_supplement_it;
hasenter = true;
}
(*it)->SetOperatorSupplementNode(*op_supplement_it);
PADDLE_ENFORCE_EQ((*it)->Type(),
TracerEventType::Operator,
platform::errors::PreconditionNotMet(
"Operator supplement events should be embraced "
"by event of type TracerEventType::Operator, "
"but got type TracerEventType::%s",
StringTracerEventType((*it)->Type())));
op_supplement_count += 1;
} else {
if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) {
PADDLE_ENFORCE_LE(op_supplement_count,
1,
platform::errors::PreconditionNotMet(
"One event of TracerEventType::Operator has no "
"more than 1 op supplement event, but got %d.",
op_supplement_count));
lastposition = op_supplement_it;
break;
}
}
}
if (hasenter) {
op_supplement_events.erase(firstposition, lastposition);
}
}
return root_node;
}
......@@ -235,7 +411,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
std::map<uint64_t, std::vector<HostTraceEventNode*>> thread2host_event_nodes;
if (bfs == true) {
for (auto it = thread_event_trees_map_.begin();
it != thread_event_trees_map_.end(); ++it) {
it != thread_event_trees_map_.end();
++it) {
auto deque = std::deque<HostTraceEventNode*>();
uint64_t thread_id = it->first;
auto root_node = it->second;
......@@ -245,7 +422,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
deque.pop_front();
thread2host_event_nodes[thread_id].push_back(current_node);
for (auto child = current_node->GetChildren().begin();
child != current_node->GetChildren().end(); ++child) {
child != current_node->GetChildren().end();
++child) {
deque.push_back(*child);
}
}
......@@ -253,7 +431,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
} else {
for (auto it = thread_event_trees_map_.begin();
it != thread_event_trees_map_.end(); ++it) {
it != thread_event_trees_map_.end();
++it) {
auto stack = std::stack<HostTraceEventNode*>();
uint64_t thread_id = it->first;
auto root_node = it->second;
......@@ -262,8 +441,9 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
auto current_node = stack.top();
stack.pop();
thread2host_event_nodes[thread_id].push_back(current_node);
for (auto child = current_node->GetChildren().begin();
child != current_node->GetChildren().end(); ++child) {
for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().rend();
++child) {
stack.push(*child);
}
}
......@@ -277,12 +457,16 @@ void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); }
void NodeTrees::HandleTrees(
std::function<void(HostTraceEventNode*)> host_event_node_handle,
std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle,
std::function<void(DeviceTraceEventNode*)> device_event_node_handle) {
std::function<void(DeviceTraceEventNode*)> device_event_node_handle,
std::function<void(MemTraceEventNode*)> mem_event_node_handle,
std::function<void(OperatorSupplementEventNode*)>
op_supplement_node_handle) {
// using different user-defined function to handle different nodes
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = Traverse(true);
for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) {
it != thread2host_event_nodes.end();
++it) {
for (auto hostnode = it->second.begin(); hostnode != it->second.end();
++hostnode) {
if (hostnode != it->second.begin()) { // skip root node
......@@ -299,6 +483,15 @@ void NodeTrees::HandleTrees(
device_event_node_handle(*devicenode);
}
}
for (auto memeventnode = (*hostnode)->GetMemTraceEventNodes().begin();
memeventnode != (*hostnode)->GetMemTraceEventNodes().end();
++memeventnode) {
mem_event_node_handle(*memeventnode);
}
if ((*hostnode)->GetOperatorSupplementEventNode()) {
op_supplement_node_handle(
(*hostnode)->GetOperatorSupplementEventNode());
}
}
}
}
......
......@@ -21,12 +21,67 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
class MemTraceEventNode {
public:
// constructor
explicit MemTraceEventNode(const MemTraceEvent& mem_event)
: mem_event_(mem_event) {}
// destructor
~MemTraceEventNode();
// getter
TracerMemEventType Type() const { return mem_event_.type; }
uint64_t Addr() const { return mem_event_.addr; }
uint64_t TimeStampNs() const { return mem_event_.timestamp_ns; }
uint64_t ProcessId() const { return mem_event_.process_id; }
uint64_t ThreadId() const { return mem_event_.thread_id; }
int64_t IncreaseBytes() const { return mem_event_.increase_bytes; }
std::string Place() const { return mem_event_.place; }
uint64_t CurrentAllocated() const { return mem_event_.current_allocated; }
uint64_t CurrentReserved() const { return mem_event_.current_reserved; }
// member function
void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); }
private:
// data
MemTraceEvent mem_event_;
};
class OperatorSupplementEventNode {
public:
// constructor
explicit OperatorSupplementEventNode(
const OperatorSupplementEvent& op_supplement_event)
: op_supplement_event_(op_supplement_event) {}
// destructor
~OperatorSupplementEventNode() {}
// getter
std::string Name() const { return op_supplement_event_.op_type; }
uint64_t TimeStampNs() const { return op_supplement_event_.timestamp_ns; }
std::map<std::string, std::vector<std::vector<int64_t>>>& InputShapes() {
return op_supplement_event_.input_shapes;
}
std::map<std::string, std::vector<std::string>>& Dtypes() {
return op_supplement_event_.dtypes;
}
std::string CallStack() { return op_supplement_event_.callstack; }
uint64_t ProcessId() const { return op_supplement_event_.process_id; }
uint64_t ThreadId() const { return op_supplement_event_.thread_id; }
private:
// data
OperatorSupplementEvent op_supplement_event_;
};
class DeviceTraceEventNode {
public:
// constructor
......@@ -48,7 +103,8 @@ class DeviceTraceEventNode {
uint32_t CorrelationId() const { return device_event_.correlation_id; }
KernelEventInfo KernelInfo() const {
PADDLE_ENFORCE_EQ(
device_event_.type, TracerEventType::Kernel,
device_event_.type,
TracerEventType::Kernel,
platform::errors::Unavailable(
"Can not kernel_info, "
"TracerEventType in node must be TracerEventType::Kernel."));
......@@ -56,7 +112,8 @@ class DeviceTraceEventNode {
}
MemcpyEventInfo MemcpyInfo() const {
PADDLE_ENFORCE_EQ(
device_event_.type, TracerEventType::Memcpy,
device_event_.type,
TracerEventType::Memcpy,
platform::errors::Unavailable(
"Can not get memcpy_info, "
"TracerEventType in node must be TracerEventType::Memcpy."));
......@@ -64,7 +121,8 @@ class DeviceTraceEventNode {
}
MemsetEventInfo MemsetInfo() const {
PADDLE_ENFORCE_EQ(
device_event_.type, TracerEventType::Memset,
device_event_.type,
TracerEventType::Memset,
platform::errors::Unavailable(
"Can not get memset_info, "
"TracerEventType in node must be TracerEventType::Memset."));
......@@ -139,10 +197,25 @@ class HostTraceEventNode {
void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) {
runtime_node_ptrs_.push_back(node);
}
std::vector<HostTraceEventNode*>& GetChildren() { return children_; }
std::vector<CudaRuntimeTraceEventNode*>& GetRuntimeTraceEventNodes() {
void AddMemNode(MemTraceEventNode* node) { mem_node_ptrs_.push_back(node); }
void SetOperatorSupplementNode(OperatorSupplementEventNode* node) {
op_supplement_node_ptr_ = node;
}
const std::vector<HostTraceEventNode*>& GetChildren() const {
return children_;
}
const std::vector<CudaRuntimeTraceEventNode*>& GetRuntimeTraceEventNodes()
const {
return runtime_node_ptrs_;
}
const std::vector<MemTraceEventNode*>& GetMemTraceEventNodes() const {
return mem_node_ptrs_;
}
OperatorSupplementEventNode* GetOperatorSupplementEventNode() const {
return op_supplement_node_ptr_;
}
void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); }
private:
......@@ -152,6 +225,9 @@ class HostTraceEventNode {
std::vector<CudaRuntimeTraceEventNode*> runtime_node_ptrs_;
// host events called by this
std::vector<HostTraceEventNode*> children_;
// memory events happened in this event period
std::vector<MemTraceEventNode*> mem_node_ptrs_;
OperatorSupplementEventNode* op_supplement_node_ptr_ = nullptr;
};
class NodeTrees {
......@@ -159,10 +235,14 @@ class NodeTrees {
// constructor
NodeTrees(const std::list<HostTraceEvent>& host_events,
const std::list<RuntimeTraceEvent>& runtime_events,
const std::list<DeviceTraceEvent>& device_events) {
const std::list<DeviceTraceEvent>& device_events,
const std::list<MemTraceEvent>& mem_events,
const std::list<OperatorSupplementEvent>& op_supplement_events) {
std::vector<HostTraceEventNode*> host_event_nodes;
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes;
std::vector<DeviceTraceEventNode*> device_event_nodes;
std::vector<MemTraceEventNode*> mem_event_nodes;
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes;
// encapsulate event into nodes
for (auto it = host_events.begin(); it != host_events.end(); ++it) {
host_event_nodes.push_back(new HostTraceEventNode(*it));
......@@ -173,8 +253,20 @@ class NodeTrees {
for (auto it = device_events.begin(); it != device_events.end(); ++it) {
device_event_nodes.push_back(new DeviceTraceEventNode(*it));
}
for (auto it = mem_events.begin(); it != mem_events.end(); ++it) {
mem_event_nodes.push_back(new MemTraceEventNode(*it));
}
for (auto it = op_supplement_events.begin();
it != op_supplement_events.end();
++it) {
op_supplement_event_nodes.push_back(new OperatorSupplementEventNode(*it));
}
// build tree
BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes);
BuildTrees(host_event_nodes,
runtime_event_nodes,
device_event_nodes,
mem_event_nodes,
op_supplement_event_nodes);
}
explicit NodeTrees(
......@@ -187,8 +279,10 @@ class NodeTrees {
void LogMe(BaseLogger* logger);
void HandleTrees(std::function<void(HostTraceEventNode*)>,
std::function<void(CudaRuntimeTraceEventNode*)>,
std::function<void(DeviceTraceEventNode*)>);
std::map<uint64_t, HostTraceEventNode*> GetNodeTrees() {
std::function<void(DeviceTraceEventNode*)>,
std::function<void(MemTraceEventNode*)>,
std::function<void(OperatorSupplementEventNode*)>);
const std::map<uint64_t, HostTraceEventNode*>& GetNodeTrees() const {
return thread_event_trees_map_;
}
std::map<uint64_t, std::vector<HostTraceEventNode*>> Traverse(bool bfs) const;
......@@ -196,11 +290,15 @@ class NodeTrees {
private:
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map_;
void BuildTrees(const std::vector<HostTraceEventNode*>&,
std::vector<CudaRuntimeTraceEventNode*>&,
const std::vector<DeviceTraceEventNode*>&);
const std::vector<CudaRuntimeTraceEventNode*>&,
const std::vector<DeviceTraceEventNode*>&,
const std::vector<MemTraceEventNode*>&,
const std::vector<OperatorSupplementEventNode*>&);
HostTraceEventNode* BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes);
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes);
};
} // namespace platform
......
......@@ -24,6 +24,7 @@ class DeviceTraceEventNode; // forward declaration
class HostTraceEventNode; // forward declaration
class CudaRuntimeTraceEventNode; // forward declaration
class NodeTrees; // forward declaration
class MemTraceEventNode; // forward declaration
class BaseLogger {
public:
......@@ -33,6 +34,7 @@ class BaseLogger {
virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
virtual void LogNodeTrees(const NodeTrees&) {}
virtual void LogMemTraceEventNode(const MemTraceEventNode&) {}
};
} // namespace platform
......
......@@ -100,9 +100,12 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
tracer.Get().StopTracing();
tracer.Get().CollectTraceData(&collector);
}
std::unique_ptr<NodeTrees> tree(new NodeTrees(collector.HostEvents(),
std::unique_ptr<NodeTrees> tree(
new NodeTrees(collector.HostEvents(),
collector.RuntimeEvents(),
collector.DeviceEvents()));
collector.DeviceEvents(),
collector.MemEvents(),
collector.OperatorSupplementEvents()));
cpu_utilization_.RecordEndTimeInfo();
ExtraInfo extrainfo;
extrainfo.AddExtraInfo(std::string("System Cpu Utilization"),
......@@ -115,7 +118,8 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
collector.ThreadNames();
for (const auto& kv : thread_names) {
extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first),
std::string("%s"), kv.second.c_str());
std::string("%s"),
kv.second.c_str());
}
return std::unique_ptr<ProfilerResult>(
new platform::ProfilerResult(std::move(tree), extrainfo));
......
......@@ -18,57 +18,120 @@
#include "paddle/fluid/platform/profiler/event_node.h"
using paddle::platform::ChromeTracingLogger;
using paddle::platform::NodeTrees;
using paddle::platform::HostTraceEventNode;
using paddle::platform::CudaRuntimeTraceEventNode;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::DeviceTraceEventNode;
using paddle::platform::HostTraceEvent;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::HostTraceEventNode;
using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::MemTraceEventNode;
using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::OperatorSupplementEventNode;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::TracerMemEventType;
TEST(NodeTreesTest, LogMe_case0) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000,
10, 10));
TracerEventType::Dataloader,
1000,
10000,
10,
10));
host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
39000, 10, 11, 5, 0));
device_events.push_back(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
55000, 0, 10, 10, 1, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
95000, 0, 10, 10, 2, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
65000, 0, 10, 11, 3, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
59000, 0, 10, 10, 4, MemcpyEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
mem_events.push_back(MemTraceEvent(11500,
0x1000,
TracerMemEventType::Allocate,
10,
10,
50,
"GPU:0",
50,
50));
mem_events.push_back(MemTraceEvent(
11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50));
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
input_shapes[std::string("X")].push_back(std::vector<int64_t>{1, 2, 3});
input_shapes[std::string("X")].push_back(std::vector<int64_t>{4, 5, 6, 7});
dtypes[std::string("X")].push_back(std::string("int8"));
dtypes[std::string("X")].push_back(std::string("float32"));
op_supplement_events.push_back(OperatorSupplementEvent(
11600, "op1", input_shapes, dtypes, "op1()", 10, 10));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
TracerEventType::Kernel,
40000,
55000,
0,
10,
10,
1,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
TracerEventType::Kernel,
70000,
95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u);
......@@ -82,6 +145,8 @@ TEST(NodeTreesTest, LogMe_case0) {
if ((*it)->Name() == "op1") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u);
EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr);
}
}
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
......@@ -91,39 +156,76 @@ TEST(NodeTreesTest, LogMe_case0) {
}
}
tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, LogMe_case1) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
39000, 10, 11, 5, 0));
device_events.push_back(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
55000, 0, 10, 10, 1, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
95000, 0, 10, 10, 2, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
65000, 0, 10, 11, 3, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
59000, 0, 10, 10, 4, MemcpyEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
TracerEventType::Kernel,
40000,
55000,
0,
10,
10,
1,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
TracerEventType::Kernel,
70000,
95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u);
......@@ -142,35 +244,79 @@ TEST(NodeTreesTest, LogMe_case1) {
}
}
tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, HandleTrees_case0) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 10000, 100000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 30000, 70000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
25000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 35000,
45000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 10000,
55000, 10, 11, 3, 0));
device_events.push_back(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
55000, 0, 10, 10, 1, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
95000, 0, 10, 10, 2, KernelEventInfo()));
device_events.push_back(
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
75000, 0, 10, 11, 3, KernelEventInfo()));
mem_events.push_back(MemTraceEvent(11500,
0x1000,
TracerMemEventType::Allocate,
10,
10,
50,
"GPU:0",
50,
50));
mem_events.push_back(MemTraceEvent(
11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50));
op_supplement_events.push_back(OperatorSupplementEvent(
11600,
"op1",
std::map<std::string, std::vector<std::vector<int64_t>>>(),
std::map<std::string, std::vector<std::string>>(),
"op1()",
10,
10));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch1"), 15000, 25000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch2"), 35000, 45000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch3"), 10000, 55000, 10, 11, 3, 0));
device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
TracerEventType::Kernel,
40000,
55000,
0,
10,
10,
1,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
TracerEventType::Kernel,
70000,
95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
75000,
0,
10,
11,
3,
KernelEventInfo()));
ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 3u);
......@@ -194,10 +340,20 @@ TEST(NodeTreesTest, HandleTrees_case0) {
}
std::function<void(HostTraceEventNode*)> host_event_node_handle(
[&](HostTraceEventNode* a) { logger.LogHostTraceEventNode(*a); });
std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle([&](
CudaRuntimeTraceEventNode* a) { logger.LogRuntimeTraceEventNode(*a); });
std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle(
[&](CudaRuntimeTraceEventNode* a) {
logger.LogRuntimeTraceEventNode(*a);
});
std::function<void(DeviceTraceEventNode*)> device_event_node_handle(
[&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); });
tree.HandleTrees(host_event_node_handle, runtime_event_node_handle,
device_event_node_handle);
std::function<void(MemTraceEventNode*)> mem_event_node_handle(
[&](MemTraceEventNode* a) { logger.LogMemTraceEventNode(*a); });
std::function<void(OperatorSupplementEventNode*)>
op_supplement_event_node_handle([&](OperatorSupplementEventNode* a) {});
tree.HandleTrees(host_event_node_handle,
runtime_event_node_handle,
device_event_node_handle,
mem_event_node_handle,
op_supplement_event_node_handle);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
}
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <map>
#include <string>
#include <vector>
namespace paddle {
namespace platform {
......@@ -56,6 +58,15 @@ enum class TracerEventType {
NumTypes
};
enum class TracerMemEventType {
// Used to mark memory allocation
Allocate = 0,
// Used to mark memory free
Free = 1,
// A flag to denote the number of current types
NumTypes
};
struct KernelEventInfo {
// The X-dimension block size for the kernel.
uint32_t block_x;
......@@ -118,10 +129,46 @@ struct MemsetEventInfo {
uint32_t value;
};
struct OperatorSupplementEvent {
OperatorSupplementEvent() = default;
OperatorSupplementEvent(
uint64_t timestamp_ns,
const std::string& op_type,
const std::map<std::string, std::vector<std::vector<int64_t>>>&
input_shapes,
const std::map<std::string, std::vector<std::string>>& dtypes,
const std::string& callstack,
uint64_t process_id,
uint64_t thread_id)
: timestamp_ns(timestamp_ns),
op_type(op_type),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack),
process_id(process_id),
thread_id(thread_id) {}
// timestamp of the record
uint64_t timestamp_ns;
// op type name
std::string op_type;
// input shapes
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
// call stack
std::string callstack;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
};
struct HostTraceEvent {
HostTraceEvent() = default;
HostTraceEvent(const std::string& name, TracerEventType type,
uint64_t start_ns, uint64_t end_ns, uint64_t process_id,
HostTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id)
: name(name),
type(type),
......@@ -145,9 +192,13 @@ struct HostTraceEvent {
struct RuntimeTraceEvent {
RuntimeTraceEvent() = default;
RuntimeTraceEvent(const std::string& name, uint64_t start_ns, uint64_t end_ns,
uint64_t process_id, uint64_t thread_id,
uint32_t correlation_id, uint32_t callback_id)
RuntimeTraceEvent(const std::string& name,
uint64_t start_ns,
uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id,
uint32_t correlation_id,
uint32_t callback_id)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
......@@ -176,10 +227,15 @@ struct RuntimeTraceEvent {
struct DeviceTraceEvent {
DeviceTraceEvent() = default;
DeviceTraceEvent(const std::string& name, TracerEventType type,
uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
uint64_t context_id, uint64_t stream_id,
uint32_t correlation_id, const KernelEventInfo& kernel_info)
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const KernelEventInfo& kernel_info)
: name(name),
type(type),
start_ns(start_ns),
......@@ -189,10 +245,15 @@ struct DeviceTraceEvent {
stream_id(stream_id),
correlation_id(correlation_id),
kernel_info(kernel_info) {}
DeviceTraceEvent(const std::string& name, TracerEventType type,
uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
uint64_t context_id, uint64_t stream_id,
uint32_t correlation_id, const MemcpyEventInfo& memcpy_info)
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemcpyEventInfo& memcpy_info)
: name(name),
type(type),
start_ns(start_ns),
......@@ -202,10 +263,15 @@ struct DeviceTraceEvent {
stream_id(stream_id),
correlation_id(correlation_id),
memcpy_info(memcpy_info) {}
DeviceTraceEvent(const std::string& name, TracerEventType type,
uint64_t start_ns, uint64_t end_ns, uint64_t device_id,
uint64_t context_id, uint64_t stream_id,
uint32_t correlation_id, const MemsetEventInfo& memset_info)
DeviceTraceEvent(const std::string& name,
TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemsetEventInfo& memset_info)
: name(name),
type(type),
start_ns(start_ns),
......@@ -242,5 +308,47 @@ struct DeviceTraceEvent {
};
};
struct MemTraceEvent {
MemTraceEvent() = default;
MemTraceEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
uint64_t process_id,
uint64_t thread_id,
int64_t increase_bytes,
const std::string& place,
uint64_t current_allocated,
uint64_t current_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
process_id(process_id),
thread_id(thread_id),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved) {}
// timestamp of the record
uint64_t timestamp_ns;
// memory addr of allocation or free
uint64_t addr;
// memory manipulation type
TracerMemEventType type;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// increase bytes after this manipulation, allocation for sign +, free for
// sign -
int64_t increase_bytes;
// place
std::string place;
// current total allocated memory
uint64_t current_allocated;
// current total reserved memory
uint64_t current_reserved;
};
} // namespace platform
} // namespace paddle
......@@ -38,6 +38,12 @@ class TraceEventCollector {
thread_names_[tid] = name;
}
void AddMemEvent(MemTraceEvent&& event) { mem_events_.push_back(event); }
void AddOperatorSupplementEvent(OperatorSupplementEvent&& event) {
op_supplement_events_.push_back(event);
}
const std::list<HostTraceEvent>& HostEvents() const { return host_events_; }
const std::list<RuntimeTraceEvent>& RuntimeEvents() const {
......@@ -48,6 +54,12 @@ class TraceEventCollector {
return device_events_;
}
const std::list<MemTraceEvent>& MemEvents() const { return mem_events_; }
const std::list<OperatorSupplementEvent>& OperatorSupplementEvents() const {
return op_supplement_events_;
}
const std::unordered_map<uint64_t, std::string>& ThreadNames() const {
return thread_names_;
}
......@@ -57,6 +69,8 @@ class TraceEventCollector {
host_events_.clear();
runtime_events_.clear();
device_events_.clear();
mem_events_.clear();
op_supplement_events_.clear();
}
private:
......@@ -64,6 +78,8 @@ class TraceEventCollector {
std::list<HostTraceEvent> host_events_;
std::list<RuntimeTraceEvent> runtime_events_;
std::list<DeviceTraceEvent> device_events_;
std::list<MemTraceEvent> mem_events_;
std::list<OperatorSupplementEvent> op_supplement_events_;
};
} // namespace platform
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler/utils.h"
#include <sstream>
#include <vector>
#include "glog/logging.h"
......@@ -21,11 +22,35 @@ limitations under the License. */
namespace paddle {
namespace platform {
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << "\"" << (*it) << "\""
<< ",";
} else {
res_stream << "\"" << (*it) << "\"";
}
count--;
}
res_stream << "]";
return res_stream.str();
}
#ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
float CalculateEstOccupancy(uint32_t DeviceId,
uint16_t RegistersPerThread,
int32_t StaticSharedMemory,
int32_t DynamicSharedMemory, int32_t BlockX,
int32_t BlockY, int32_t BlockZ, float BlocksPerSm) {
int32_t DynamicSharedMemory,
int32_t BlockX,
int32_t BlockY,
int32_t BlockZ,
float BlocksPerSm) {
float occupancy = 0.0;
std::vector<int> device_ids = GetSelectedDevices();
if (DeviceId < device_ids.size()) {
......@@ -42,8 +67,12 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
size_t dynamicSmemSize = DynamicSharedMemory;
cudaOccResult occ_result;
cudaOccDeviceProp prop(device_property);
cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
&occ_result, &prop, &occFuncAttr, &occDeviceState, blockSize,
cudaOccError status =
cudaOccMaxActiveBlocksPerMultiprocessor(&occ_result,
&prop,
&occFuncAttr,
&occDeviceState,
blockSize,
dynamicSmemSize);
if (status == CUDA_OCC_SUCCESS) {
if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) {
......@@ -61,5 +90,30 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
}
#endif
const char* StringTracerMemEventType(TracerMemEventType type) {
static const char* categary_name_[] = {"Allocate", "Free"};
return categary_name_[static_cast<int>(type)];
}
const char* StringTracerEventType(TracerEventType type) {
static const char* categary_name_[] = {"Operator",
"Dataloader",
"ProfileStep",
"CudaRuntime",
"Kernel",
"Memcpy",
"Memset",
"UserDefined",
"OperatorInner",
"Forward",
"Backward",
"Optimization",
"Communication",
"PythonOp",
"PythonUserDefined",
"MluRuntime"};
return categary_name_[static_cast<int>(type)];
}
} // namespace platform
} // namespace paddle
......@@ -14,10 +14,15 @@ limitations under the License. */
#pragma once
#include <ctime>
#include <map>
#include <ostream>
#include <string>
#include <vector>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
......@@ -26,14 +31,74 @@ template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) +
1; // Extra space for '\0'
PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal(
"Error during profiler data formatting."));
PADDLE_ENFORCE_GE(
size_s,
0,
platform::errors::Fatal("Error during profiler data formatting."));
auto size = static_cast<size_t>(size_s);
auto buf = std::make_unique<char[]>(size);
std::snprintf(buf.get(), size, format.c_str(), args...);
return std::string(buf.get(), size - 1); // exclude the '\0'
}
template <typename basic_type>
std::string json_vector(const std::vector<basic_type> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << (*it) << ",";
} else {
res_stream << (*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <typename basic_type>
std::string json_vector(
const std::vector<std::vector<basic_type>> shape_vector) {
std::ostringstream res_stream;
auto count = shape_vector.size();
res_stream << "[";
for (auto it = shape_vector.begin(); it != shape_vector.end(); it++) {
if (count > 1) {
res_stream << json_vector(*it) << ",";
} else {
res_stream << json_vector(*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector);
template <typename type>
std::string json_dict(const std::map<std::string, std::vector<type>> data_map) {
std::ostringstream res_stream;
auto count = data_map.size();
res_stream << "{";
for (auto it = data_map.begin(); it != data_map.end(); it++) {
if (count > 1) {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second) << ",";
} else {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second);
}
count--;
}
res_stream << "}";
return res_stream.str();
}
static std::string GetStringFormatLocalTime() {
std::time_t rawtime;
std::tm* timeinfo;
......@@ -48,6 +113,10 @@ static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) {
return (end_ns - start_ns) / 1000;
}
const char* StringTracerMemEventType(TracerMemEventType type);
const char* StringTracerEventType(TracerEventType type);
static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
return static_cast<float>(end_ns - start_ns) / 1000;
}
......@@ -56,10 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
}
#ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread,
float CalculateEstOccupancy(uint32_t deviceId,
uint16_t registersPerThread,
int32_t staticSharedMemory,
int32_t dynamicSharedMemory, int32_t blockX,
int32_t blockY, int32_t blockZ, float blocksPerSm);
int32_t dynamicSharedMemory,
int32_t blockX,
int32_t blockY,
int32_t blockZ,
float blocksPerSm);
#endif
} // namespace platform
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册