未验证 提交 94271bc2 编写于 作者: C chenjian 提交者: GitHub

add new field for event node (#43223) (#44245)

* add new field for event node

* fix

* fix bug

* fix bug

* fix clang

* fix clang format

* fix code format
上级 3cc6ae69
...@@ -27,24 +27,16 @@ limitations under the License. */ ...@@ -27,24 +27,16 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
static const char* kSchemaVersion = "1.0.0"; static const char* kSchemaVersion = "1.0.1";
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json"; static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
static uint32_t span_indx = 0; static uint32_t span_indx = 0;
static std::string DefaultFileName() { static std::string DefaultFileName() {
auto pid = GetProcessId(); auto pid = GetProcessId();
return string_format(std::string(kDefaultFilename), pid, return string_format(
GetStringFormatLocalTime().c_str()); std::string(kDefaultFilename), pid, GetStringFormatLocalTime().c_str());
} }
const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
void ChromeTracingLogger::OpenFile() { void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_, output_file_stream_.open(filename_,
std::ofstream::out | std::ofstream::trunc); std::ofstream::out | std::ofstream::trunc);
...@@ -81,7 +73,8 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { ...@@ -81,7 +73,8 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
// find the earliest time in current timeline // find the earliest time in current timeline
start_time_ = std::numeric_limits<uint64_t>::max(); start_time_ = std::numeric_limits<uint64_t>::max();
for (auto it = thread2host_event_nodes.begin(); for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) { it != thread2host_event_nodes.end();
++it) {
if (it->second.begin() + 1 != it->second.end()) { if (it->second.begin() + 1 != it->second.end()) {
if ((*(it->second.begin() + 1))->StartNs() < start_time_) { if ((*(it->second.begin() + 1))->StartNs() < start_time_) {
start_time_ = (*(it->second.begin() + 1))->StartNs(); start_time_ = (*(it->second.begin() + 1))->StartNs();
...@@ -99,7 +92,8 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { ...@@ -99,7 +92,8 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
} }
for (auto it = thread2host_event_nodes.begin(); for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) { it != thread2host_event_nodes.end();
++it) {
for (auto hostnode = it->second.begin(); hostnode != it->second.end(); for (auto hostnode = it->second.begin(); hostnode != it->second.end();
++hostnode) { ++hostnode) {
if (hostnode != it->second.begin()) { // skip root node if (hostnode != it->second.begin()) { // skip root node
...@@ -116,8 +110,45 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { ...@@ -116,8 +110,45 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
(*devicenode)->LogMe(this); (*devicenode)->LogMe(this);
} }
} }
for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin();
memnode != (*hostnode)->GetMemTraceEventNodes().end();
++memnode) {
(*memnode)->LogMe(this);
}
}
}
}
void ChromeTracingLogger::LogMemTraceEventNode(
const MemTraceEventNode& mem_node) {
if (!output_file_stream_) {
return;
} }
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "[memory]", "pid": %lld, "tid": "%lld",
"ts": %lld,
"ph": "i", "cat": "%s",
"args": {
"place": "%s",
"addr": "%llu",
"current_allocated": %llu,
"current_reserved": %llu,
"increase_bytes": %lld
} }
},
)JSON"),
mem_node.ProcessId(),
mem_node.ThreadId(),
mem_node.TimeStampNs(),
StringTracerMemEventType(mem_node.Type()),
mem_node.Place().c_str(),
mem_node.Addr(),
mem_node.CurrentAllocated(),
mem_node.CurrentReserved(),
mem_node.IncreaseBytes());
} }
void ChromeTracingLogger::LogHostTraceEventNode( void ChromeTracingLogger::LogHostTraceEventNode(
...@@ -132,6 +163,16 @@ void ChromeTracingLogger::LogHostTraceEventNode( ...@@ -132,6 +163,16 @@ void ChromeTracingLogger::LogHostTraceEventNode(
} else { } else {
dur_display = string_format(std::string("%.3f us"), dur * 1000); dur_display = string_format(std::string("%.3f us"), dur * 1000);
} }
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> input_dtypes;
std::string callstack;
OperatorSupplementEventNode* op_supplement_node =
host_node.GetOperatorSupplementEventNode();
if (op_supplement_node != nullptr) {
input_shapes = op_supplement_node->InputShapes();
input_dtypes = op_supplement_node->Dtypes();
callstack = op_supplement_node->CallStack();
}
switch (host_node.Type()) { switch (host_node.Type()) {
case TracerEventType::ProfileStep: case TracerEventType::ProfileStep:
case TracerEventType::Forward: case TracerEventType::Forward:
...@@ -156,13 +197,58 @@ void ChromeTracingLogger::LogHostTraceEventNode( ...@@ -156,13 +197,58 @@ void ChromeTracingLogger::LogHostTraceEventNode(
} }
}, },
)JSON"), )JSON"),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), host_node.Name().c_str(),
host_node.ThreadId(), nsToUs(host_node.StartNs()), dur_display.c_str(),
host_node.ProcessId(),
host_node.ThreadId(),
nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()), nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())], StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_)); nsToUsFloat(host_node.EndNs(), start_time_));
break; break;
case TracerEventType::Operator:
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "thread_state_runnable",
"args": {
"start_time": "%.3f us",
"end_time": "%.3f us",
"input_shapes": %s,
"input_dtypes": %s,
"callstack": "%s"
}
},
)JSON"),
host_node.Name().c_str(),
dur_display.c_str(),
host_node.ProcessId(),
host_node.ThreadId(),
nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_),
json_dict(input_shapes).c_str(),
json_dict(input_dtypes).c_str(),
callstack.c_str());
break;
case TracerEventType::CudaRuntime:
case TracerEventType::Kernel:
case TracerEventType::Memcpy:
case TracerEventType::Memset:
case TracerEventType::UserDefined:
case TracerEventType::OperatorInner:
case TracerEventType::Communication:
case TracerEventType::MluRuntime:
case TracerEventType::NumTypes:
default: default:
output_file_stream_ << string_format( output_file_stream_ << string_format(
std::string( std::string(
...@@ -178,10 +264,13 @@ void ChromeTracingLogger::LogHostTraceEventNode( ...@@ -178,10 +264,13 @@ void ChromeTracingLogger::LogHostTraceEventNode(
} }
}, },
)JSON"), )JSON"),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), host_node.Name().c_str(),
host_node.ThreadId(), nsToUs(host_node.StartNs()), dur_display.c_str(),
host_node.ProcessId(),
host_node.ThreadId(),
nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()), nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())], StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_)); nsToUsFloat(host_node.EndNs(), start_time_));
break; break;
...@@ -217,10 +306,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( ...@@ -217,10 +306,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
} }
}, },
)JSON"), )JSON"),
runtime_node.Name().c_str(), dur_display.c_str(), runtime_node.Name().c_str(),
runtime_node.ProcessId(), runtime_node.ThreadId(), dur_display.c_str(),
nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), runtime_node.ProcessId(),
categary_name_[static_cast<int>(runtime_node.Type())], runtime_node.ThreadId(),
nsToUs(runtime_node.StartNs()),
nsToUsFloat(runtime_node.Duration()),
StringTracerEventType(runtime_node.Type()),
runtime_node.CorrelationId(), runtime_node.CorrelationId(),
nsToUsFloat(runtime_node.StartNs(), start_time_), nsToUsFloat(runtime_node.StartNs(), start_time_),
nsToUsFloat(runtime_node.EndNs(), start_time_)); nsToUsFloat(runtime_node.EndNs(), start_time_));
...@@ -235,7 +327,8 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( ...@@ -235,7 +327,8 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
"ph": "s", "cat": "async" "ph": "s", "cat": "async"
}, },
)JSON"), )JSON"),
runtime_node.CorrelationId(), runtime_node.ProcessId(), runtime_node.CorrelationId(),
runtime_node.ProcessId(),
runtime_node.ThreadId(), runtime_node.ThreadId(),
nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1)); nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1));
pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()});
...@@ -260,8 +353,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( ...@@ -260,8 +353,7 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
break; break;
} }
if (nsToUs(device_node.Duration()) == 0) { if (nsToUs(device_node.Duration()) == 0) {
output_file_stream_ << string_format( output_file_stream_ << string_format(std::string(
std::string(
R"JSON( R"JSON(
{ {
"name": "launch", "id": %d, "pid": %lld, "tid": %lld, "name": "launch", "id": %d, "pid": %lld, "tid": %lld,
...@@ -269,8 +361,10 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( ...@@ -269,8 +361,10 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
"ph": "f", "cat": "async" "ph": "f", "cat": "async"
}, },
)JSON"), )JSON"),
device_node.CorrelationId(), device_node.DeviceId(), device_node.CorrelationId(),
device_node.StreamId(), nsToUs(device_node.StartNs())); device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()));
deviceid_streamid_set_.insert( deviceid_streamid_set_.insert(
{device_node.DeviceId(), device_node.StreamId()}); {device_node.DeviceId(), device_node.StreamId()});
} else { } else {
...@@ -283,7 +377,8 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( ...@@ -283,7 +377,8 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
"ph": "f", "cat": "async", "bp": "e" "ph": "f", "cat": "async", "bp": "e"
}, },
)JSON"), )JSON"),
device_node.CorrelationId(), device_node.DeviceId(), device_node.CorrelationId(),
device_node.DeviceId(),
device_node.StreamId(), device_node.StreamId(),
nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1)); nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1));
deviceid_streamid_set_.insert( deviceid_streamid_set_.insert(
...@@ -304,13 +399,17 @@ void ChromeTracingLogger::HandleTypeKernel( ...@@ -304,13 +399,17 @@ void ChromeTracingLogger::HandleTypeKernel(
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y * blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
kernel_info.grid_z) / kernel_info.grid_z) /
device_property.multiProcessorCount; device_property.multiProcessorCount;
warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * warps_per_sm =
kernel_info.block_z) / blocks_per_sm *
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
threads_per_warp; threads_per_warp;
occupancy = CalculateEstOccupancy( occupancy = CalculateEstOccupancy(device_node.DeviceId(),
device_node.DeviceId(), kernel_info.registers_per_thread, kernel_info.registers_per_thread,
kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory, kernel_info.static_shared_memory,
kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
blocks_per_sm); blocks_per_sm);
#endif #endif
float dur = nsToMsFloat(device_node.Duration()); float dur = nsToMsFloat(device_node.Duration());
...@@ -343,18 +442,30 @@ void ChromeTracingLogger::HandleTypeKernel( ...@@ -343,18 +442,30 @@ void ChromeTracingLogger::HandleTypeKernel(
} }
}, },
)JSON"), )JSON"),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.Name().c_str(),
device_node.StreamId(), nsToUs(device_node.StartNs()), dur_display.c_str(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()), nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())], StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), nsToUsFloat(device_node.EndNs(), start_time_),
device_node.ContextId(), device_node.StreamId(), device_node.DeviceId(),
device_node.CorrelationId(), kernel_info.registers_per_thread, device_node.ContextId(),
device_node.StreamId(),
device_node.CorrelationId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory,
blocks_per_sm, warps_per_sm, kernel_info.grid_x, kernel_info.grid_y, blocks_per_sm,
kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y, warps_per_sm,
kernel_info.block_z, occupancy * 100); kernel_info.grid_x,
kernel_info.grid_y,
kernel_info.grid_z,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
occupancy * 100);
} }
void ChromeTracingLogger::HandleTypeMemcpy( void ChromeTracingLogger::HandleTypeMemcpy(
...@@ -387,13 +498,19 @@ void ChromeTracingLogger::HandleTypeMemcpy( ...@@ -387,13 +498,19 @@ void ChromeTracingLogger::HandleTypeMemcpy(
} }
}, },
)JSON"), )JSON"),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.Name().c_str(),
device_node.StreamId(), nsToUs(device_node.StartNs()), dur_display.c_str(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()), nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())], StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), nsToUsFloat(device_node.EndNs(), start_time_),
device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); device_node.StreamId(),
device_node.CorrelationId(),
memcpy_info.num_bytes,
memory_bandwidth);
} }
void ChromeTracingLogger::HandleTypeMemset( void ChromeTracingLogger::HandleTypeMemset(
...@@ -423,14 +540,21 @@ void ChromeTracingLogger::HandleTypeMemset( ...@@ -423,14 +540,21 @@ void ChromeTracingLogger::HandleTypeMemset(
} }
}, },
)JSON"), )JSON"),
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.Name().c_str(),
device_node.StreamId(), nsToUs(device_node.StartNs()), dur_display.c_str(),
device_node.DeviceId(),
device_node.StreamId(),
nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()), nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())], StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), nsToUsFloat(device_node.EndNs(), start_time_),
device_node.ContextId(), device_node.StreamId(), device_node.DeviceId(),
device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); device_node.ContextId(),
device_node.StreamId(),
device_node.CorrelationId(),
memset_info.num_bytes,
memset_info.value);
} }
void ChromeTracingLogger::StartLog() { void ChromeTracingLogger::StartLog() {
...@@ -441,7 +565,8 @@ void ChromeTracingLogger::StartLog() { ...@@ -441,7 +565,8 @@ void ChromeTracingLogger::StartLog() {
"displayTimeUnit": "ms", "displayTimeUnit": "ms",
"span_indx": "%d", "span_indx": "%d",
)JSON"), )JSON"),
kSchemaVersion, span_indx++); kSchemaVersion,
span_indx++);
// add device property information // add device property information
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
output_file_stream_ << std::string(R"JSON( output_file_stream_ << std::string(R"JSON(
...@@ -463,12 +588,17 @@ void ChromeTracingLogger::StartLog() { ...@@ -463,12 +588,17 @@ void ChromeTracingLogger::StartLog() {
"smCount": %d, "sharedMemPerBlockOptin": %d "smCount": %d, "sharedMemPerBlockOptin": %d
}, },
)JSON"), )JSON"),
device_ids[index], device_property.name, device_property.totalGlobalMem, device_ids[index],
device_property.major, device_property.minor, device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock, device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor, device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock, device_property.regsPerMultiprocessor, device_property.regsPerBlock,
device_property.warpSize, device_property.sharedMemPerBlock, device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor, device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount, device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin); device_property.sharedMemPerBlockOptin);
...@@ -488,12 +618,17 @@ void ChromeTracingLogger::StartLog() { ...@@ -488,12 +618,17 @@ void ChromeTracingLogger::StartLog() {
"smCount": %d, "sharedMemPerBlockOptin": %d "smCount": %d, "sharedMemPerBlockOptin": %d
}], }],
)JSON"), )JSON"),
device_ids[device_ids.size() - 1], device_property.name, device_ids[device_ids.size() - 1],
device_property.totalGlobalMem, device_property.major, device_property.name,
device_property.minor, device_property.maxThreadsPerBlock, device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor, device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock, device_property.regsPerMultiprocessor, device_property.regsPerBlock,
device_property.warpSize, device_property.sharedMemPerBlock, device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor, device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount, device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin); device_property.sharedMemPerBlockOptin);
...@@ -522,12 +657,14 @@ void ChromeTracingLogger::LogMetaInfo( ...@@ -522,12 +657,14 @@ void ChromeTracingLogger::LogMetaInfo(
output_file_stream_ << string_format(std::string(R"JSON( output_file_stream_ << string_format(std::string(R"JSON(
"%s": "%s", "%s": "%s",
)JSON"), )JSON"),
kv.first.c_str(), kv.second.c_str()); kv.first.c_str(),
kv.second.c_str());
} else { } else {
output_file_stream_ << string_format(std::string(R"JSON( output_file_stream_ << string_format(std::string(R"JSON(
"%s": "%s" "%s": "%s"
)JSON"), )JSON"),
kv.first.c_str(), kv.second.c_str()); kv.first.c_str(),
kv.second.c_str());
} }
count--; count--;
} }
...@@ -591,13 +728,29 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -591,13 +728,29 @@ void ChromeTracingLogger::RefineDisplayName(
} }
}, },
)JSON"), )JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second, (*it).first,
(*it).first, (*it).first, (*it).second, (*it).second, (*it).second,
(*it).first,
(*it).first,
(*it).second,
(*it).first,
(*it).first,
(*it).second,
(*it).second,
extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), extra_info[string_format(std::string("%lld"), (*it).second)].c_str(),
(*it).first, (*it).second, (*it).second, (*it).first,
(*it).second,
(*it).second,
extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), extra_info[string_format(std::string("%lld"), (*it).second)].c_str(),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second, (*it).first,
(*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); (*it).second,
(*it).first,
(*it).first,
(*it).second,
(*it).second * 2,
(*it).first,
(*it).second,
(*it).second * 2 + 1);
} }
#ifdef PADDLE_WITH_MLU #ifdef PADDLE_WITH_MLU
...@@ -607,9 +760,9 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -607,9 +760,9 @@ void ChromeTracingLogger::RefineDisplayName(
#endif #endif
for (auto it = deviceid_streamid_set_.begin(); for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); ++it) { it != deviceid_streamid_set_.end();
output_file_stream_ << string_format( ++it) {
std::string( output_file_stream_ << string_format(std::string(
R"JSON( R"JSON(
{ {
"name": "process_name", "pid": %lld, "tid": %lld, "name": "process_name", "pid": %lld, "tid": %lld,
...@@ -640,9 +793,19 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -640,9 +793,19 @@ void ChromeTracingLogger::RefineDisplayName(
} }
}, },
)JSON"), )JSON"),
(*it).first, (*it).second, (*it).first, device_type.c_str(), (*it).first,
(*it).first, (*it).second, (*it).second, (*it).first, (*it).second, (*it).second,
(*it).first + 0x10000000, (*it).first, (*it).second, (*it).second); (*it).first,
device_type.c_str(),
(*it).first,
(*it).second,
(*it).second,
(*it).first,
(*it).second,
(*it).first + 0x10000000,
(*it).first,
(*it).second,
(*it).second);
} }
} }
......
...@@ -36,6 +36,7 @@ class ChromeTracingLogger : public BaseLogger { ...@@ -36,6 +36,7 @@ class ChromeTracingLogger : public BaseLogger {
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override; void LogNodeTrees(const NodeTrees&) override;
void LogMetaInfo(const std::unordered_map<std::string, std::string>); void LogMetaInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
private: private:
void OpenFile(); void OpenFile();
......
...@@ -19,61 +19,102 @@ ...@@ -19,61 +19,102 @@
#include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_python.h"
using paddle::platform::SerializationLogger;
using paddle::platform::DeserializationReader;
using paddle::platform::NodeTrees;
using paddle::platform::HostTraceEventNode;
using paddle::platform::CudaRuntimeTraceEventNode; using paddle::platform::CudaRuntimeTraceEventNode;
using paddle::platform::DeserializationReader;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::DeviceTraceEventNode; using paddle::platform::DeviceTraceEventNode;
using paddle::platform::HostTraceEvent; using paddle::platform::HostTraceEvent;
using paddle::platform::RuntimeTraceEvent; using paddle::platform::HostTraceEventNode;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::KernelEventInfo; using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo; using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo; using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::ProfilerResult; using paddle::platform::ProfilerResult;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::SerializationLogger;
using paddle::platform::TracerEventType;
TEST(SerializationLoggerTest, dump_case0) { TEST(SerializationLoggerTest, dump_case0) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"), host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000, TracerEventType::Dataloader,
10, 10)); 1000,
10000,
10,
10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10)); std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, runtime_events.push_back(RuntimeTraceEvent(
17000, 10, 10, 1, 0)); std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, runtime_events.push_back(RuntimeTraceEvent(
35000, 10, 10, 2, 0)); std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, runtime_events.push_back(RuntimeTraceEvent(
37000, 10, 11, 3, 0)); std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, runtime_events.push_back(RuntimeTraceEvent(
19000, 10, 10, 4, 0)); std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, runtime_events.push_back(RuntimeTraceEvent(
39000, 10, 11, 5, 0)); std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
device_events.push_back( device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, TracerEventType::Kernel,
55000, 0, 10, 10, 1, KernelEventInfo())); 40000,
device_events.push_back( 55000,
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, 0,
95000, 0, 10, 10, 2, KernelEventInfo())); 10,
device_events.push_back( 10,
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, 1,
65000, 0, 10, 11, 3, KernelEventInfo())); KernelEventInfo()));
device_events.push_back( device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, TracerEventType::Kernel,
59000, 0, 10, 10, 4, MemcpyEventInfo())); 70000,
device_events.push_back( 95000,
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, 0,
69000, 0, 10, 11, 5, MemsetEventInfo())); 10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case0.pb"); SerializationLogger logger("test_serialization_logger_case0.pb");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u); EXPECT_EQ(nodes[10].size(), 4u);
...@@ -102,33 +143,69 @@ TEST(SerializationLoggerTest, dump_case1) { ...@@ -102,33 +143,69 @@ TEST(SerializationLoggerTest, dump_case1) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, std::list<MemTraceEvent> mem_events;
17000, 10, 10, 1, 0)); std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, runtime_events.push_back(RuntimeTraceEvent(
35000, 10, 10, 2, 0)); std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, runtime_events.push_back(RuntimeTraceEvent(
37000, 10, 11, 3, 0)); std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, runtime_events.push_back(RuntimeTraceEvent(
19000, 10, 10, 4, 0)); std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, runtime_events.push_back(RuntimeTraceEvent(
39000, 10, 11, 5, 0)); std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
device_events.push_back( runtime_events.push_back(RuntimeTraceEvent(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
55000, 0, 10, 10, 1, KernelEventInfo())); device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
device_events.push_back( TracerEventType::Kernel,
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, 40000,
95000, 0, 10, 10, 2, KernelEventInfo())); 55000,
device_events.push_back( 0,
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, 10,
65000, 0, 10, 11, 3, KernelEventInfo())); 10,
device_events.push_back( 1,
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, KernelEventInfo()));
59000, 0, 10, 10, 4, MemcpyEventInfo())); device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
device_events.push_back( TracerEventType::Kernel,
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, 70000,
69000, 0, 10, 11, 5, MemsetEventInfo())); 95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case1.pb"); SerializationLogger logger("test_serialization_logger_case1.pb");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u); EXPECT_EQ(nodes[10].size(), 1u);
......
...@@ -17,6 +17,8 @@ limitations under the License. */ ...@@ -17,6 +17,8 @@ limitations under the License. */
#include <set> #include <set>
#include <stack> #include <stack>
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -42,22 +44,34 @@ CudaRuntimeTraceEventNode::~CudaRuntimeTraceEventNode() { ...@@ -42,22 +44,34 @@ CudaRuntimeTraceEventNode::~CudaRuntimeTraceEventNode() {
NodeTrees::~NodeTrees() { NodeTrees::~NodeTrees() {
// delete all root nodes // delete all root nodes
for (auto it = thread_event_trees_map_.begin(); for (auto it = thread_event_trees_map_.begin();
it != thread_event_trees_map_.end(); ++it) { it != thread_event_trees_map_.end();
++it) {
delete it->second; delete it->second;
} }
} }
void NodeTrees::BuildTrees( void NodeTrees::BuildTrees(
const std::vector<HostTraceEventNode*>& host_event_nodes, const std::vector<HostTraceEventNode*>& host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes, const std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
const std::vector<DeviceTraceEventNode*>& device_event_nodes) { const std::vector<DeviceTraceEventNode*>& device_event_nodes,
// seperate Host Event Nodes into different threads const std::vector<MemTraceEventNode*>& mem_event_nodes,
const std::vector<OperatorSupplementEventNode*>& op_supplement_events) {
// separate Host Event Nodes into different threads
std::map<uint64_t, std::vector<HostTraceEventNode*>> std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes; // used to store HostTraceEventNodes per thread thread2host_event_nodes; // used to store HostTraceEventNodes per thread
std::map<uint64_t, std::vector<CudaRuntimeTraceEventNode*>> std::map<uint64_t, std::vector<CudaRuntimeTraceEventNode*>>
thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode
// per // per
// thread // thread
std::map<uint64_t, std::vector<MemTraceEventNode*>>
thread2mem_event_nodes; // used to store MemTraceEventNode
// per
// thread
std::map<uint64_t, std::vector<OperatorSupplementEventNode*>>
thread2op_supplement_event_nodes; // used to store
// OperatorSupplementEventNode
// per
// thread
std::map<uint32_t, CudaRuntimeTraceEventNode*> std::map<uint32_t, CudaRuntimeTraceEventNode*>
correlation_id2runtime_event_node; // used to store the relation between correlation_id2runtime_event_node; // used to store the relation between
// correlation id and runtime node // correlation id and runtime node
...@@ -79,19 +93,31 @@ void NodeTrees::BuildTrees( ...@@ -79,19 +93,31 @@ void NodeTrees::BuildTrees(
auto dst_iter = auto dst_iter =
correlation_id2runtime_event_node.find((*it)->CorrelationId()); correlation_id2runtime_event_node.find((*it)->CorrelationId());
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
dst_iter, correlation_id2runtime_event_node.end(), dst_iter,
correlation_id2runtime_event_node.end(),
platform::errors::NotFound("Unknown device events, " platform::errors::NotFound("Unknown device events, "
"no corresponding cuda runtime events")); "no corresponding cuda runtime events"));
dst_iter->second->AddDeviceTraceEventNode(*it); dst_iter->second->AddDeviceTraceEventNode(*it);
} }
// construct thread2mem_event_nodes
for (auto it = mem_event_nodes.begin(); it != mem_event_nodes.end(); ++it) {
thread2mem_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// construct thread2op_supplement_event_nodes
for (auto it = op_supplement_events.begin(); it != op_supplement_events.end();
++it) {
thread2op_supplement_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// sort host event nodes and runtime event nodes according to start_ns and // sort host event nodes and runtime event nodes according to start_ns and
// end_ns // end_ns
// the smaller start_ns is, the further ahead position is. // the smaller start_ns is, the further ahead position is.
// when start_ns of two nodes are equal, the one with bigger end_ns should be // when start_ns of two nodes are equal, the one with bigger end_ns should be
// ahead. // ahead.
for (auto it = thread2host_event_nodes.begin(); for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) { it != thread2host_event_nodes.end();
std::sort(it->second.begin(), it->second.end(), ++it) {
std::sort(it->second.begin(),
it->second.end(),
[](HostTraceEventNode* node1, HostTraceEventNode* node2) { [](HostTraceEventNode* node1, HostTraceEventNode* node2) {
if (node1->StartNs() < node2->StartNs()) { if (node1->StartNs() < node2->StartNs()) {
return true; return true;
...@@ -104,9 +130,11 @@ void NodeTrees::BuildTrees( ...@@ -104,9 +130,11 @@ void NodeTrees::BuildTrees(
}); });
} }
for (auto it = thread2runtime_event_nodes.begin(); for (auto it = thread2runtime_event_nodes.begin();
it != thread2runtime_event_nodes.end(); ++it) { it != thread2runtime_event_nodes.end();
++it) {
std::sort( std::sort(
it->second.begin(), it->second.end(), it->second.begin(),
it->second.end(),
[](CudaRuntimeTraceEventNode* node1, CudaRuntimeTraceEventNode* node2) { [](CudaRuntimeTraceEventNode* node1, CudaRuntimeTraceEventNode* node2) {
if (node1->StartNs() < node2->StartNs()) { if (node1->StartNs() < node2->StartNs()) {
return true; return true;
...@@ -118,34 +146,82 @@ void NodeTrees::BuildTrees( ...@@ -118,34 +146,82 @@ void NodeTrees::BuildTrees(
return false; return false;
}); });
} }
// sort mem event nodes and operator supplement event nodes
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end();
++it) {
std::sort(it->second.begin(),
it->second.end(),
[](MemTraceEventNode* node1, MemTraceEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end();
++it) {
std::sort(it->second.begin(),
it->second.end(),
[](OperatorSupplementEventNode* node1,
OperatorSupplementEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
// construct trees // construct trees
std::set<uint64_t> thread_set; std::set<uint64_t> thread_set;
for (auto it = thread2host_event_nodes.begin(); for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) { it != thread2host_event_nodes.end();
++it) {
thread_set.insert(it->first); thread_set.insert(it->first);
} }
for (auto it = thread2runtime_event_nodes.begin(); for (auto it = thread2runtime_event_nodes.begin();
it != thread2runtime_event_nodes.end(); ++it) { it != thread2runtime_event_nodes.end();
++it) {
thread_set.insert(it->first);
}
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end();
++it) {
thread_set.insert(it->first);
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end();
++it) {
thread_set.insert(it->first); thread_set.insert(it->first);
} }
for (auto it = thread_set.begin(); it != thread_set.end(); ++it) { for (auto it = thread_set.begin(); it != thread_set.end(); ++it) {
thread_event_trees_map_[*it] = BuildTreeRelationship( thread_event_trees_map_[*it] =
thread2host_event_nodes[*it], thread2runtime_event_nodes[*it]); BuildTreeRelationship(thread2host_event_nodes[*it],
thread2runtime_event_nodes[*it],
thread2mem_event_nodes[*it],
thread2op_supplement_event_nodes[*it]);
} }
} }
HostTraceEventNode* NodeTrees::BuildTreeRelationship( HostTraceEventNode* NodeTrees::BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes, std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes) { std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_events) {
// a stack used for analyse relationship // a stack used for analyse relationship
auto node_stack = std::vector<HostTraceEventNode*>(); auto node_stack = std::vector<HostTraceEventNode*>();
// root node, top level // root node, top level
auto root_node = new HostTraceEventNode( auto root_node =
HostTraceEvent(std::string("root node"), TracerEventType::UserDefined, 0, new HostTraceEventNode(HostTraceEvent(std::string("root node"),
ULLONG_MAX, 0, 0)); TracerEventType::UserDefined,
0,
ULLONG_MAX,
0,
0));
// push root node into node_stack // push root node into node_stack
node_stack.push_back(root_node); node_stack.push_back(root_node);
// handle host_event_nodes // handle host_event_nodes
...@@ -155,7 +231,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( ...@@ -155,7 +231,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
if ((*it)->StartNs() < stack_top_node->EndNs()) { if ((*it)->StartNs() < stack_top_node->EndNs()) {
// current node is the child of stack_top_node // current node is the child of stack_top_node
PADDLE_ENFORCE_LE( PADDLE_ENFORCE_LE(
(*it)->EndNs(), stack_top_node->EndNs(), (*it)->EndNs(),
stack_top_node->EndNs(),
platform::errors::Fatal( platform::errors::Fatal(
"should not have time range intersection within one thread")); "should not have time range intersection within one thread"));
stack_top_node->AddChild(*it); stack_top_node->AddChild(*it);
...@@ -170,7 +247,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( ...@@ -170,7 +247,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
runtime_event_nodes.end(); runtime_event_nodes.end();
bool hasenter = false; bool hasenter = false;
for (auto runtimenode = runtime_event_nodes.begin(); for (auto runtimenode = runtime_event_nodes.begin();
runtimenode != runtime_event_nodes.end(); ++runtimenode) { runtimenode != runtime_event_nodes.end();
++runtimenode) {
if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) && if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
((*runtimenode)->EndNs() <= stack_top_node->EndNs())) { ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
if (!hasenter) { if (!hasenter) {
...@@ -203,7 +281,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( ...@@ -203,7 +281,8 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
runtime_event_nodes.end(); runtime_event_nodes.end();
bool hasenter = false; bool hasenter = false;
for (auto runtimenode = runtime_event_nodes.begin(); for (auto runtimenode = runtime_event_nodes.begin();
runtimenode != runtime_event_nodes.end(); ++runtimenode) { runtimenode != runtime_event_nodes.end();
++runtimenode) {
if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) && if (((*runtimenode)->StartNs() >= stack_top_node->StartNs()) &&
((*runtimenode)->EndNs() <= stack_top_node->EndNs())) { ((*runtimenode)->EndNs() <= stack_top_node->EndNs())) {
if (!hasenter) { if (!hasenter) {
...@@ -225,6 +304,103 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( ...@@ -225,6 +304,103 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
} }
node_stack.pop_back(); node_stack.pop_back();
} }
// build relationship between host event node and mem event node
// First, post-order traverse the tree. Then, insert the memory and op
// supplement node into correct host nodes.
auto stack = std::stack<HostTraceEventNode*>();
auto flag_stack = std::stack<int32_t>();
auto post_order_nodes = std::vector<HostTraceEventNode*>();
stack.push(root_node);
flag_stack.push(0);
while (!stack.empty()) {
auto current_node = stack.top();
stack.pop();
auto flag = flag_stack.top();
flag_stack.pop();
if (flag == 0) {
stack.push(current_node);
flag_stack.push(1);
for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().rend();
++child) {
stack.push(*child);
flag_stack.push(0);
}
} else {
post_order_nodes.push_back(current_node);
}
}
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
bool hasenter = false;
std::vector<MemTraceEventNode*>::iterator firstposition;
std::vector<MemTraceEventNode*>::iterator lastposition =
mem_event_nodes.end();
for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end();
++mem_it) {
if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() &&
(*mem_it)->TimeStampNs() <= (*it)->EndNs()) {
(*it)->AddMemNode(*mem_it);
if (!hasenter) {
firstposition = mem_it;
hasenter = true;
}
} else {
if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) {
lastposition = mem_it;
break;
}
}
}
if (hasenter) {
mem_event_nodes.erase(firstposition, lastposition);
}
}
// build relationship between host event node and op supplement node
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
int op_supplement_count = 0;
bool hasenter = false;
std::vector<OperatorSupplementEventNode*>::iterator firstposition;
std::vector<OperatorSupplementEventNode*>::iterator lastposition =
op_supplement_events.end();
for (auto op_supplement_it = op_supplement_events.begin();
op_supplement_it < op_supplement_events.end();
++op_supplement_it) {
if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() &&
(*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) {
if (!hasenter) {
firstposition = op_supplement_it;
hasenter = true;
}
(*it)->SetOperatorSupplementNode(*op_supplement_it);
PADDLE_ENFORCE_EQ((*it)->Type(),
TracerEventType::Operator,
platform::errors::PreconditionNotMet(
"Operator supplement events should be embraced "
"by event of type TracerEventType::Operator, "
"but got type TracerEventType::%s",
StringTracerEventType((*it)->Type())));
op_supplement_count += 1;
} else {
if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) {
PADDLE_ENFORCE_LE(op_supplement_count,
1,
platform::errors::PreconditionNotMet(
"One event of TracerEventType::Operator has no "
"more than 1 op supplement event, but got %d.",
op_supplement_count));
lastposition = op_supplement_it;
break;
}
}
}
if (hasenter) {
op_supplement_events.erase(firstposition, lastposition);
}
}
return root_node; return root_node;
} }
...@@ -235,7 +411,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse( ...@@ -235,7 +411,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
std::map<uint64_t, std::vector<HostTraceEventNode*>> thread2host_event_nodes; std::map<uint64_t, std::vector<HostTraceEventNode*>> thread2host_event_nodes;
if (bfs == true) { if (bfs == true) {
for (auto it = thread_event_trees_map_.begin(); for (auto it = thread_event_trees_map_.begin();
it != thread_event_trees_map_.end(); ++it) { it != thread_event_trees_map_.end();
++it) {
auto deque = std::deque<HostTraceEventNode*>(); auto deque = std::deque<HostTraceEventNode*>();
uint64_t thread_id = it->first; uint64_t thread_id = it->first;
auto root_node = it->second; auto root_node = it->second;
...@@ -245,7 +422,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse( ...@@ -245,7 +422,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
deque.pop_front(); deque.pop_front();
thread2host_event_nodes[thread_id].push_back(current_node); thread2host_event_nodes[thread_id].push_back(current_node);
for (auto child = current_node->GetChildren().begin(); for (auto child = current_node->GetChildren().begin();
child != current_node->GetChildren().end(); ++child) { child != current_node->GetChildren().end();
++child) {
deque.push_back(*child); deque.push_back(*child);
} }
} }
...@@ -253,7 +431,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse( ...@@ -253,7 +431,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
} else { } else {
for (auto it = thread_event_trees_map_.begin(); for (auto it = thread_event_trees_map_.begin();
it != thread_event_trees_map_.end(); ++it) { it != thread_event_trees_map_.end();
++it) {
auto stack = std::stack<HostTraceEventNode*>(); auto stack = std::stack<HostTraceEventNode*>();
uint64_t thread_id = it->first; uint64_t thread_id = it->first;
auto root_node = it->second; auto root_node = it->second;
...@@ -262,8 +441,9 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse( ...@@ -262,8 +441,9 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
auto current_node = stack.top(); auto current_node = stack.top();
stack.pop(); stack.pop();
thread2host_event_nodes[thread_id].push_back(current_node); thread2host_event_nodes[thread_id].push_back(current_node);
for (auto child = current_node->GetChildren().begin(); for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().end(); ++child) { child != current_node->GetChildren().rend();
++child) {
stack.push(*child); stack.push(*child);
} }
} }
...@@ -277,12 +457,16 @@ void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); } ...@@ -277,12 +457,16 @@ void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); }
void NodeTrees::HandleTrees( void NodeTrees::HandleTrees(
std::function<void(HostTraceEventNode*)> host_event_node_handle, std::function<void(HostTraceEventNode*)> host_event_node_handle,
std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle, std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle,
std::function<void(DeviceTraceEventNode*)> device_event_node_handle) { std::function<void(DeviceTraceEventNode*)> device_event_node_handle,
std::function<void(MemTraceEventNode*)> mem_event_node_handle,
std::function<void(OperatorSupplementEventNode*)>
op_supplement_node_handle) {
// using different user-defined function to handle different nodes // using different user-defined function to handle different nodes
const std::map<uint64_t, std::vector<HostTraceEventNode*>> const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = Traverse(true); thread2host_event_nodes = Traverse(true);
for (auto it = thread2host_event_nodes.begin(); for (auto it = thread2host_event_nodes.begin();
it != thread2host_event_nodes.end(); ++it) { it != thread2host_event_nodes.end();
++it) {
for (auto hostnode = it->second.begin(); hostnode != it->second.end(); for (auto hostnode = it->second.begin(); hostnode != it->second.end();
++hostnode) { ++hostnode) {
if (hostnode != it->second.begin()) { // skip root node if (hostnode != it->second.begin()) { // skip root node
...@@ -299,6 +483,15 @@ void NodeTrees::HandleTrees( ...@@ -299,6 +483,15 @@ void NodeTrees::HandleTrees(
device_event_node_handle(*devicenode); device_event_node_handle(*devicenode);
} }
} }
for (auto memeventnode = (*hostnode)->GetMemTraceEventNodes().begin();
memeventnode != (*hostnode)->GetMemTraceEventNodes().end();
++memeventnode) {
mem_event_node_handle(*memeventnode);
}
if ((*hostnode)->GetOperatorSupplementEventNode()) {
op_supplement_node_handle(
(*hostnode)->GetOperatorSupplementEventNode());
}
} }
} }
} }
......
...@@ -21,12 +21,67 @@ limitations under the License. */ ...@@ -21,12 +21,67 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/output_logger.h" #include "paddle/fluid/platform/profiler/output_logger.h"
#include "paddle/fluid/platform/profiler/trace_event.h" #include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
class MemTraceEventNode {
public:
// constructor
explicit MemTraceEventNode(const MemTraceEvent& mem_event)
: mem_event_(mem_event) {}
// destructor
~MemTraceEventNode();
// getter
TracerMemEventType Type() const { return mem_event_.type; }
uint64_t Addr() const { return mem_event_.addr; }
uint64_t TimeStampNs() const { return mem_event_.timestamp_ns; }
uint64_t ProcessId() const { return mem_event_.process_id; }
uint64_t ThreadId() const { return mem_event_.thread_id; }
int64_t IncreaseBytes() const { return mem_event_.increase_bytes; }
std::string Place() const { return mem_event_.place; }
uint64_t CurrentAllocated() const { return mem_event_.current_allocated; }
uint64_t CurrentReserved() const { return mem_event_.current_reserved; }
// member function
void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); }
private:
// data
MemTraceEvent mem_event_;
};
class OperatorSupplementEventNode {
public:
// constructor
explicit OperatorSupplementEventNode(
const OperatorSupplementEvent& op_supplement_event)
: op_supplement_event_(op_supplement_event) {}
// destructor
~OperatorSupplementEventNode() {}
// getter
std::string Name() const { return op_supplement_event_.op_type; }
uint64_t TimeStampNs() const { return op_supplement_event_.timestamp_ns; }
std::map<std::string, std::vector<std::vector<int64_t>>>& InputShapes() {
return op_supplement_event_.input_shapes;
}
std::map<std::string, std::vector<std::string>>& Dtypes() {
return op_supplement_event_.dtypes;
}
std::string CallStack() { return op_supplement_event_.callstack; }
uint64_t ProcessId() const { return op_supplement_event_.process_id; }
uint64_t ThreadId() const { return op_supplement_event_.thread_id; }
private:
// data
OperatorSupplementEvent op_supplement_event_;
};
class DeviceTraceEventNode { class DeviceTraceEventNode {
public: public:
// constructor // constructor
...@@ -48,7 +103,8 @@ class DeviceTraceEventNode { ...@@ -48,7 +103,8 @@ class DeviceTraceEventNode {
uint32_t CorrelationId() const { return device_event_.correlation_id; } uint32_t CorrelationId() const { return device_event_.correlation_id; }
KernelEventInfo KernelInfo() const { KernelEventInfo KernelInfo() const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
device_event_.type, TracerEventType::Kernel, device_event_.type,
TracerEventType::Kernel,
platform::errors::Unavailable( platform::errors::Unavailable(
"Can not kernel_info, " "Can not kernel_info, "
"TracerEventType in node must be TracerEventType::Kernel.")); "TracerEventType in node must be TracerEventType::Kernel."));
...@@ -56,7 +112,8 @@ class DeviceTraceEventNode { ...@@ -56,7 +112,8 @@ class DeviceTraceEventNode {
} }
MemcpyEventInfo MemcpyInfo() const { MemcpyEventInfo MemcpyInfo() const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
device_event_.type, TracerEventType::Memcpy, device_event_.type,
TracerEventType::Memcpy,
platform::errors::Unavailable( platform::errors::Unavailable(
"Can not get memcpy_info, " "Can not get memcpy_info, "
"TracerEventType in node must be TracerEventType::Memcpy.")); "TracerEventType in node must be TracerEventType::Memcpy."));
...@@ -64,7 +121,8 @@ class DeviceTraceEventNode { ...@@ -64,7 +121,8 @@ class DeviceTraceEventNode {
} }
MemsetEventInfo MemsetInfo() const { MemsetEventInfo MemsetInfo() const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
device_event_.type, TracerEventType::Memset, device_event_.type,
TracerEventType::Memset,
platform::errors::Unavailable( platform::errors::Unavailable(
"Can not get memset_info, " "Can not get memset_info, "
"TracerEventType in node must be TracerEventType::Memset.")); "TracerEventType in node must be TracerEventType::Memset."));
...@@ -139,10 +197,25 @@ class HostTraceEventNode { ...@@ -139,10 +197,25 @@ class HostTraceEventNode {
void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) { void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) {
runtime_node_ptrs_.push_back(node); runtime_node_ptrs_.push_back(node);
} }
std::vector<HostTraceEventNode*>& GetChildren() { return children_; } void AddMemNode(MemTraceEventNode* node) { mem_node_ptrs_.push_back(node); }
std::vector<CudaRuntimeTraceEventNode*>& GetRuntimeTraceEventNodes() { void SetOperatorSupplementNode(OperatorSupplementEventNode* node) {
op_supplement_node_ptr_ = node;
}
const std::vector<HostTraceEventNode*>& GetChildren() const {
return children_;
}
const std::vector<CudaRuntimeTraceEventNode*>& GetRuntimeTraceEventNodes()
const {
return runtime_node_ptrs_; return runtime_node_ptrs_;
} }
const std::vector<MemTraceEventNode*>& GetMemTraceEventNodes() const {
return mem_node_ptrs_;
}
OperatorSupplementEventNode* GetOperatorSupplementEventNode() const {
return op_supplement_node_ptr_;
}
void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); } void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); }
private: private:
...@@ -152,6 +225,9 @@ class HostTraceEventNode { ...@@ -152,6 +225,9 @@ class HostTraceEventNode {
std::vector<CudaRuntimeTraceEventNode*> runtime_node_ptrs_; std::vector<CudaRuntimeTraceEventNode*> runtime_node_ptrs_;
// host events called by this // host events called by this
std::vector<HostTraceEventNode*> children_; std::vector<HostTraceEventNode*> children_;
// memory events happened in this event period
std::vector<MemTraceEventNode*> mem_node_ptrs_;
OperatorSupplementEventNode* op_supplement_node_ptr_ = nullptr;
}; };
class NodeTrees { class NodeTrees {
...@@ -159,10 +235,14 @@ class NodeTrees { ...@@ -159,10 +235,14 @@ class NodeTrees {
// constructor // constructor
NodeTrees(const std::list<HostTraceEvent>& host_events, NodeTrees(const std::list<HostTraceEvent>& host_events,
const std::list<RuntimeTraceEvent>& runtime_events, const std::list<RuntimeTraceEvent>& runtime_events,
const std::list<DeviceTraceEvent>& device_events) { const std::list<DeviceTraceEvent>& device_events,
const std::list<MemTraceEvent>& mem_events,
const std::list<OperatorSupplementEvent>& op_supplement_events) {
std::vector<HostTraceEventNode*> host_event_nodes; std::vector<HostTraceEventNode*> host_event_nodes;
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes; std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes;
std::vector<DeviceTraceEventNode*> device_event_nodes; std::vector<DeviceTraceEventNode*> device_event_nodes;
std::vector<MemTraceEventNode*> mem_event_nodes;
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes;
// encapsulate event into nodes // encapsulate event into nodes
for (auto it = host_events.begin(); it != host_events.end(); ++it) { for (auto it = host_events.begin(); it != host_events.end(); ++it) {
host_event_nodes.push_back(new HostTraceEventNode(*it)); host_event_nodes.push_back(new HostTraceEventNode(*it));
...@@ -173,8 +253,20 @@ class NodeTrees { ...@@ -173,8 +253,20 @@ class NodeTrees {
for (auto it = device_events.begin(); it != device_events.end(); ++it) { for (auto it = device_events.begin(); it != device_events.end(); ++it) {
device_event_nodes.push_back(new DeviceTraceEventNode(*it)); device_event_nodes.push_back(new DeviceTraceEventNode(*it));
} }
for (auto it = mem_events.begin(); it != mem_events.end(); ++it) {
mem_event_nodes.push_back(new MemTraceEventNode(*it));
}
for (auto it = op_supplement_events.begin();
it != op_supplement_events.end();
++it) {
op_supplement_event_nodes.push_back(new OperatorSupplementEventNode(*it));
}
// build tree // build tree
BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes); BuildTrees(host_event_nodes,
runtime_event_nodes,
device_event_nodes,
mem_event_nodes,
op_supplement_event_nodes);
} }
explicit NodeTrees( explicit NodeTrees(
...@@ -187,8 +279,10 @@ class NodeTrees { ...@@ -187,8 +279,10 @@ class NodeTrees {
void LogMe(BaseLogger* logger); void LogMe(BaseLogger* logger);
void HandleTrees(std::function<void(HostTraceEventNode*)>, void HandleTrees(std::function<void(HostTraceEventNode*)>,
std::function<void(CudaRuntimeTraceEventNode*)>, std::function<void(CudaRuntimeTraceEventNode*)>,
std::function<void(DeviceTraceEventNode*)>); std::function<void(DeviceTraceEventNode*)>,
std::map<uint64_t, HostTraceEventNode*> GetNodeTrees() { std::function<void(MemTraceEventNode*)>,
std::function<void(OperatorSupplementEventNode*)>);
const std::map<uint64_t, HostTraceEventNode*>& GetNodeTrees() const {
return thread_event_trees_map_; return thread_event_trees_map_;
} }
std::map<uint64_t, std::vector<HostTraceEventNode*>> Traverse(bool bfs) const; std::map<uint64_t, std::vector<HostTraceEventNode*>> Traverse(bool bfs) const;
...@@ -196,11 +290,15 @@ class NodeTrees { ...@@ -196,11 +290,15 @@ class NodeTrees {
private: private:
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map_; std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map_;
void BuildTrees(const std::vector<HostTraceEventNode*>&, void BuildTrees(const std::vector<HostTraceEventNode*>&,
std::vector<CudaRuntimeTraceEventNode*>&, const std::vector<CudaRuntimeTraceEventNode*>&,
const std::vector<DeviceTraceEventNode*>&); const std::vector<DeviceTraceEventNode*>&,
const std::vector<MemTraceEventNode*>&,
const std::vector<OperatorSupplementEventNode*>&);
HostTraceEventNode* BuildTreeRelationship( HostTraceEventNode* BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes, std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes); std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes);
}; };
} // namespace platform } // namespace platform
......
...@@ -24,6 +24,7 @@ class DeviceTraceEventNode; // forward declaration ...@@ -24,6 +24,7 @@ class DeviceTraceEventNode; // forward declaration
class HostTraceEventNode; // forward declaration class HostTraceEventNode; // forward declaration
class CudaRuntimeTraceEventNode; // forward declaration class CudaRuntimeTraceEventNode; // forward declaration
class NodeTrees; // forward declaration class NodeTrees; // forward declaration
class MemTraceEventNode; // forward declaration
class BaseLogger { class BaseLogger {
public: public:
...@@ -33,6 +34,7 @@ class BaseLogger { ...@@ -33,6 +34,7 @@ class BaseLogger {
virtual void LogHostTraceEventNode(const HostTraceEventNode&) {} virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {} virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
virtual void LogNodeTrees(const NodeTrees&) {} virtual void LogNodeTrees(const NodeTrees&) {}
virtual void LogMemTraceEventNode(const MemTraceEventNode&) {}
}; };
} // namespace platform } // namespace platform
......
...@@ -100,9 +100,12 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() { ...@@ -100,9 +100,12 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
tracer.Get().StopTracing(); tracer.Get().StopTracing();
tracer.Get().CollectTraceData(&collector); tracer.Get().CollectTraceData(&collector);
} }
std::unique_ptr<NodeTrees> tree(new NodeTrees(collector.HostEvents(), std::unique_ptr<NodeTrees> tree(
new NodeTrees(collector.HostEvents(),
collector.RuntimeEvents(), collector.RuntimeEvents(),
collector.DeviceEvents())); collector.DeviceEvents(),
collector.MemEvents(),
collector.OperatorSupplementEvents()));
cpu_utilization_.RecordEndTimeInfo(); cpu_utilization_.RecordEndTimeInfo();
ExtraInfo extrainfo; ExtraInfo extrainfo;
extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), extrainfo.AddExtraInfo(std::string("System Cpu Utilization"),
...@@ -115,7 +118,8 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() { ...@@ -115,7 +118,8 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
collector.ThreadNames(); collector.ThreadNames();
for (const auto& kv : thread_names) { for (const auto& kv : thread_names) {
extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first),
std::string("%s"), kv.second.c_str()); std::string("%s"),
kv.second.c_str());
} }
return std::unique_ptr<ProfilerResult>( return std::unique_ptr<ProfilerResult>(
new platform::ProfilerResult(std::move(tree), extrainfo)); new platform::ProfilerResult(std::move(tree), extrainfo));
......
...@@ -18,57 +18,120 @@ ...@@ -18,57 +18,120 @@
#include "paddle/fluid/platform/profiler/event_node.h" #include "paddle/fluid/platform/profiler/event_node.h"
using paddle::platform::ChromeTracingLogger; using paddle::platform::ChromeTracingLogger;
using paddle::platform::NodeTrees;
using paddle::platform::HostTraceEventNode;
using paddle::platform::CudaRuntimeTraceEventNode; using paddle::platform::CudaRuntimeTraceEventNode;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::DeviceTraceEventNode; using paddle::platform::DeviceTraceEventNode;
using paddle::platform::HostTraceEvent; using paddle::platform::HostTraceEvent;
using paddle::platform::RuntimeTraceEvent; using paddle::platform::HostTraceEventNode;
using paddle::platform::DeviceTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::KernelEventInfo; using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo; using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo; using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::MemTraceEventNode;
using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::OperatorSupplementEventNode;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::TracerMemEventType;
TEST(NodeTreesTest, LogMe_case0) { TEST(NodeTreesTest, LogMe_case0) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"), host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000, TracerEventType::Dataloader,
10, 10)); 1000,
10000,
10,
10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10)); std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, mem_events.push_back(MemTraceEvent(11500,
17000, 10, 10, 1, 0)); 0x1000,
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, TracerMemEventType::Allocate,
35000, 10, 10, 2, 0)); 10,
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, 10,
37000, 10, 11, 3, 0)); 50,
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, "GPU:0",
19000, 10, 10, 4, 0)); 50,
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, 50));
39000, 10, 11, 5, 0)); mem_events.push_back(MemTraceEvent(
device_events.push_back( 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50));
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
55000, 0, 10, 10, 1, KernelEventInfo())); std::map<std::string, std::vector<std::string>> dtypes;
device_events.push_back( input_shapes[std::string("X")].push_back(std::vector<int64_t>{1, 2, 3});
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, input_shapes[std::string("X")].push_back(std::vector<int64_t>{4, 5, 6, 7});
95000, 0, 10, 10, 2, KernelEventInfo())); dtypes[std::string("X")].push_back(std::string("int8"));
device_events.push_back( dtypes[std::string("X")].push_back(std::string("float32"));
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, op_supplement_events.push_back(OperatorSupplementEvent(
65000, 0, 10, 11, 3, KernelEventInfo())); 11600, "op1", input_shapes, dtypes, "op1()", 10, 10));
device_events.push_back( runtime_events.push_back(RuntimeTraceEvent(
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
59000, 0, 10, 10, 4, MemcpyEventInfo())); runtime_events.push_back(RuntimeTraceEvent(
device_events.push_back( std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, runtime_events.push_back(RuntimeTraceEvent(
69000, 0, 10, 11, 5, MemsetEventInfo())); std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
TracerEventType::Kernel,
40000,
55000,
0,
10,
10,
1,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
TracerEventType::Kernel,
70000,
95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case0.json"); ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u); EXPECT_EQ(nodes[10].size(), 4u);
...@@ -82,6 +145,8 @@ TEST(NodeTreesTest, LogMe_case0) { ...@@ -82,6 +145,8 @@ TEST(NodeTreesTest, LogMe_case0) {
if ((*it)->Name() == "op1") { if ((*it)->Name() == "op1") {
EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u);
EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr);
} }
} }
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
...@@ -91,39 +156,76 @@ TEST(NodeTreesTest, LogMe_case0) { ...@@ -91,39 +156,76 @@ TEST(NodeTreesTest, LogMe_case0) {
} }
} }
tree.LogMe(&logger); tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
} }
TEST(NodeTreesTest, LogMe_case1) { TEST(NodeTreesTest, LogMe_case1) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, std::list<MemTraceEvent> mem_events;
17000, 10, 10, 1, 0)); std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, runtime_events.push_back(RuntimeTraceEvent(
35000, 10, 10, 2, 0)); std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000, runtime_events.push_back(RuntimeTraceEvent(
37000, 10, 11, 3, 0)); std::string("cudalaunch2"), 25000, 35000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000, runtime_events.push_back(RuntimeTraceEvent(
19000, 10, 10, 4, 0)); std::string("cudalaunch3"), 33000, 37000, 10, 11, 3, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000, runtime_events.push_back(RuntimeTraceEvent(
39000, 10, 11, 5, 0)); std::string("cudaMemcpy1"), 18000, 19000, 10, 10, 4, 0));
device_events.push_back( runtime_events.push_back(RuntimeTraceEvent(
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, std::string("cudaMemset1"), 38000, 39000, 10, 11, 5, 0));
55000, 0, 10, 10, 1, KernelEventInfo())); device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
device_events.push_back( TracerEventType::Kernel,
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, 40000,
95000, 0, 10, 10, 2, KernelEventInfo())); 55000,
device_events.push_back( 0,
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, 10,
65000, 0, 10, 11, 3, KernelEventInfo())); 10,
device_events.push_back( 1,
DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000, KernelEventInfo()));
59000, 0, 10, 10, 4, MemcpyEventInfo())); device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
device_events.push_back( TracerEventType::Kernel,
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, 70000,
69000, 0, 10, 11, 5, MemsetEventInfo())); 95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
65000,
0,
10,
11,
3,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memcpy1"),
TracerEventType::Memcpy,
56000,
59000,
0,
10,
10,
4,
MemcpyEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("memset1"),
TracerEventType::Memset,
66000,
69000,
0,
10,
11,
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case1.json"); ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u); EXPECT_EQ(nodes[10].size(), 1u);
...@@ -142,35 +244,79 @@ TEST(NodeTreesTest, LogMe_case1) { ...@@ -142,35 +244,79 @@ TEST(NodeTreesTest, LogMe_case1) {
} }
} }
tree.LogMe(&logger); tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
} }
TEST(NodeTreesTest, HandleTrees_case0) { TEST(NodeTreesTest, HandleTrees_case0) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 10000, 100000, 10, 10)); std::string("op1"), TracerEventType::Operator, 10000, 100000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 30000, 70000, 10, 10)); std::string("op2"), TracerEventType::Operator, 30000, 70000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11)); std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, mem_events.push_back(MemTraceEvent(11500,
25000, 10, 10, 1, 0)); 0x1000,
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 35000, TracerMemEventType::Allocate,
45000, 10, 10, 2, 0)); 10,
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 10000, 10,
55000, 10, 11, 3, 0)); 50,
device_events.push_back( "GPU:0",
DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000, 50,
55000, 0, 10, 10, 1, KernelEventInfo())); 50));
device_events.push_back( mem_events.push_back(MemTraceEvent(
DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000, 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50));
95000, 0, 10, 10, 2, KernelEventInfo())); op_supplement_events.push_back(OperatorSupplementEvent(
device_events.push_back( 11600,
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, "op1",
75000, 0, 10, 11, 3, KernelEventInfo())); std::map<std::string, std::vector<std::vector<int64_t>>>(),
std::map<std::string, std::vector<std::string>>(),
"op1()",
10,
10));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch1"), 15000, 25000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch2"), 35000, 45000, 10, 10, 2, 0));
runtime_events.push_back(RuntimeTraceEvent(
std::string("cudalaunch3"), 10000, 55000, 10, 11, 3, 0));
device_events.push_back(DeviceTraceEvent(std::string("kernel1"),
TracerEventType::Kernel,
40000,
55000,
0,
10,
10,
1,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel2"),
TracerEventType::Kernel,
70000,
95000,
0,
10,
10,
2,
KernelEventInfo()));
device_events.push_back(DeviceTraceEvent(std::string("kernel3"),
TracerEventType::Kernel,
60000,
75000,
0,
10,
11,
3,
KernelEventInfo()));
ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json"); ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events,
runtime_events,
device_events,
mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 3u); EXPECT_EQ(nodes[10].size(), 3u);
...@@ -194,10 +340,20 @@ TEST(NodeTreesTest, HandleTrees_case0) { ...@@ -194,10 +340,20 @@ TEST(NodeTreesTest, HandleTrees_case0) {
} }
std::function<void(HostTraceEventNode*)> host_event_node_handle( std::function<void(HostTraceEventNode*)> host_event_node_handle(
[&](HostTraceEventNode* a) { logger.LogHostTraceEventNode(*a); }); [&](HostTraceEventNode* a) { logger.LogHostTraceEventNode(*a); });
std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle([&]( std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle(
CudaRuntimeTraceEventNode* a) { logger.LogRuntimeTraceEventNode(*a); }); [&](CudaRuntimeTraceEventNode* a) {
logger.LogRuntimeTraceEventNode(*a);
});
std::function<void(DeviceTraceEventNode*)> device_event_node_handle( std::function<void(DeviceTraceEventNode*)> device_event_node_handle(
[&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); }); [&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); });
tree.HandleTrees(host_event_node_handle, runtime_event_node_handle, std::function<void(MemTraceEventNode*)> mem_event_node_handle(
device_event_node_handle); [&](MemTraceEventNode* a) { logger.LogMemTraceEventNode(*a); });
std::function<void(OperatorSupplementEventNode*)>
op_supplement_event_node_handle([&](OperatorSupplementEventNode* a) {});
tree.HandleTrees(host_event_node_handle,
runtime_event_node_handle,
device_event_node_handle,
mem_event_node_handle,
op_supplement_event_node_handle);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
} }
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <string> #include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -56,6 +58,15 @@ enum class TracerEventType { ...@@ -56,6 +58,15 @@ enum class TracerEventType {
NumTypes NumTypes
}; };
enum class TracerMemEventType {
// Used to mark memory allocation
Allocate = 0,
// Used to mark memory free
Free = 1,
// A flag to denote the number of current types
NumTypes
};
struct KernelEventInfo { struct KernelEventInfo {
// The X-dimension block size for the kernel. // The X-dimension block size for the kernel.
uint32_t block_x; uint32_t block_x;
...@@ -118,10 +129,46 @@ struct MemsetEventInfo { ...@@ -118,10 +129,46 @@ struct MemsetEventInfo {
uint32_t value; uint32_t value;
}; };
struct OperatorSupplementEvent {
OperatorSupplementEvent() = default;
OperatorSupplementEvent(
uint64_t timestamp_ns,
const std::string& op_type,
const std::map<std::string, std::vector<std::vector<int64_t>>>&
input_shapes,
const std::map<std::string, std::vector<std::string>>& dtypes,
const std::string& callstack,
uint64_t process_id,
uint64_t thread_id)
: timestamp_ns(timestamp_ns),
op_type(op_type),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack),
process_id(process_id),
thread_id(thread_id) {}
// timestamp of the record
uint64_t timestamp_ns;
// op type name
std::string op_type;
// input shapes
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
// call stack
std::string callstack;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
};
struct HostTraceEvent { struct HostTraceEvent {
HostTraceEvent() = default; HostTraceEvent() = default;
HostTraceEvent(const std::string& name, TracerEventType type, HostTraceEvent(const std::string& name,
uint64_t start_ns, uint64_t end_ns, uint64_t process_id, TracerEventType type,
uint64_t start_ns,
uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id) uint64_t thread_id)
: name(name), : name(name),
type(type), type(type),
...@@ -145,9 +192,13 @@ struct HostTraceEvent { ...@@ -145,9 +192,13 @@ struct HostTraceEvent {
struct RuntimeTraceEvent { struct RuntimeTraceEvent {
RuntimeTraceEvent() = default; RuntimeTraceEvent() = default;
RuntimeTraceEvent(const std::string& name, uint64_t start_ns, uint64_t end_ns, RuntimeTraceEvent(const std::string& name,
uint64_t process_id, uint64_t thread_id, uint64_t start_ns,
uint32_t correlation_id, uint32_t callback_id) uint64_t end_ns,
uint64_t process_id,
uint64_t thread_id,
uint32_t correlation_id,
uint32_t callback_id)
: name(name), : name(name),
start_ns(start_ns), start_ns(start_ns),
end_ns(end_ns), end_ns(end_ns),
...@@ -176,10 +227,15 @@ struct RuntimeTraceEvent { ...@@ -176,10 +227,15 @@ struct RuntimeTraceEvent {
struct DeviceTraceEvent { struct DeviceTraceEvent {
DeviceTraceEvent() = default; DeviceTraceEvent() = default;
DeviceTraceEvent(const std::string& name, TracerEventType type, DeviceTraceEvent(const std::string& name,
uint64_t start_ns, uint64_t end_ns, uint64_t device_id, TracerEventType type,
uint64_t context_id, uint64_t stream_id, uint64_t start_ns,
uint32_t correlation_id, const KernelEventInfo& kernel_info) uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const KernelEventInfo& kernel_info)
: name(name), : name(name),
type(type), type(type),
start_ns(start_ns), start_ns(start_ns),
...@@ -189,10 +245,15 @@ struct DeviceTraceEvent { ...@@ -189,10 +245,15 @@ struct DeviceTraceEvent {
stream_id(stream_id), stream_id(stream_id),
correlation_id(correlation_id), correlation_id(correlation_id),
kernel_info(kernel_info) {} kernel_info(kernel_info) {}
DeviceTraceEvent(const std::string& name, TracerEventType type, DeviceTraceEvent(const std::string& name,
uint64_t start_ns, uint64_t end_ns, uint64_t device_id, TracerEventType type,
uint64_t context_id, uint64_t stream_id, uint64_t start_ns,
uint32_t correlation_id, const MemcpyEventInfo& memcpy_info) uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemcpyEventInfo& memcpy_info)
: name(name), : name(name),
type(type), type(type),
start_ns(start_ns), start_ns(start_ns),
...@@ -202,10 +263,15 @@ struct DeviceTraceEvent { ...@@ -202,10 +263,15 @@ struct DeviceTraceEvent {
stream_id(stream_id), stream_id(stream_id),
correlation_id(correlation_id), correlation_id(correlation_id),
memcpy_info(memcpy_info) {} memcpy_info(memcpy_info) {}
DeviceTraceEvent(const std::string& name, TracerEventType type, DeviceTraceEvent(const std::string& name,
uint64_t start_ns, uint64_t end_ns, uint64_t device_id, TracerEventType type,
uint64_t context_id, uint64_t stream_id, uint64_t start_ns,
uint32_t correlation_id, const MemsetEventInfo& memset_info) uint64_t end_ns,
uint64_t device_id,
uint64_t context_id,
uint64_t stream_id,
uint32_t correlation_id,
const MemsetEventInfo& memset_info)
: name(name), : name(name),
type(type), type(type),
start_ns(start_ns), start_ns(start_ns),
...@@ -242,5 +308,47 @@ struct DeviceTraceEvent { ...@@ -242,5 +308,47 @@ struct DeviceTraceEvent {
}; };
}; };
struct MemTraceEvent {
MemTraceEvent() = default;
MemTraceEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
uint64_t process_id,
uint64_t thread_id,
int64_t increase_bytes,
const std::string& place,
uint64_t current_allocated,
uint64_t current_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
process_id(process_id),
thread_id(thread_id),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved) {}
// timestamp of the record
uint64_t timestamp_ns;
// memory addr of allocation or free
uint64_t addr;
// memory manipulation type
TracerMemEventType type;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// increase bytes after this manipulation, allocation for sign +, free for
// sign -
int64_t increase_bytes;
// place
std::string place;
// current total allocated memory
uint64_t current_allocated;
// current total reserved memory
uint64_t current_reserved;
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -38,6 +38,12 @@ class TraceEventCollector { ...@@ -38,6 +38,12 @@ class TraceEventCollector {
thread_names_[tid] = name; thread_names_[tid] = name;
} }
void AddMemEvent(MemTraceEvent&& event) { mem_events_.push_back(event); }
void AddOperatorSupplementEvent(OperatorSupplementEvent&& event) {
op_supplement_events_.push_back(event);
}
const std::list<HostTraceEvent>& HostEvents() const { return host_events_; } const std::list<HostTraceEvent>& HostEvents() const { return host_events_; }
const std::list<RuntimeTraceEvent>& RuntimeEvents() const { const std::list<RuntimeTraceEvent>& RuntimeEvents() const {
...@@ -48,6 +54,12 @@ class TraceEventCollector { ...@@ -48,6 +54,12 @@ class TraceEventCollector {
return device_events_; return device_events_;
} }
const std::list<MemTraceEvent>& MemEvents() const { return mem_events_; }
const std::list<OperatorSupplementEvent>& OperatorSupplementEvents() const {
return op_supplement_events_;
}
const std::unordered_map<uint64_t, std::string>& ThreadNames() const { const std::unordered_map<uint64_t, std::string>& ThreadNames() const {
return thread_names_; return thread_names_;
} }
...@@ -57,6 +69,8 @@ class TraceEventCollector { ...@@ -57,6 +69,8 @@ class TraceEventCollector {
host_events_.clear(); host_events_.clear();
runtime_events_.clear(); runtime_events_.clear();
device_events_.clear(); device_events_.clear();
mem_events_.clear();
op_supplement_events_.clear();
} }
private: private:
...@@ -64,6 +78,8 @@ class TraceEventCollector { ...@@ -64,6 +78,8 @@ class TraceEventCollector {
std::list<HostTraceEvent> host_events_; std::list<HostTraceEvent> host_events_;
std::list<RuntimeTraceEvent> runtime_events_; std::list<RuntimeTraceEvent> runtime_events_;
std::list<DeviceTraceEvent> device_events_; std::list<DeviceTraceEvent> device_events_;
std::list<MemTraceEvent> mem_events_;
std::list<OperatorSupplementEvent> op_supplement_events_;
}; };
} // namespace platform } // namespace platform
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler/utils.h" #include "paddle/fluid/platform/profiler/utils.h"
#include <sstream>
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
...@@ -21,11 +22,35 @@ limitations under the License. */ ...@@ -21,11 +22,35 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << "\"" << (*it) << "\""
<< ",";
} else {
res_stream << "\"" << (*it) << "\"";
}
count--;
}
res_stream << "]";
return res_stream.str();
}
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, float CalculateEstOccupancy(uint32_t DeviceId,
uint16_t RegistersPerThread,
int32_t StaticSharedMemory, int32_t StaticSharedMemory,
int32_t DynamicSharedMemory, int32_t BlockX, int32_t DynamicSharedMemory,
int32_t BlockY, int32_t BlockZ, float BlocksPerSm) { int32_t BlockX,
int32_t BlockY,
int32_t BlockZ,
float BlocksPerSm) {
float occupancy = 0.0; float occupancy = 0.0;
std::vector<int> device_ids = GetSelectedDevices(); std::vector<int> device_ids = GetSelectedDevices();
if (DeviceId < device_ids.size()) { if (DeviceId < device_ids.size()) {
...@@ -42,8 +67,12 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, ...@@ -42,8 +67,12 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
size_t dynamicSmemSize = DynamicSharedMemory; size_t dynamicSmemSize = DynamicSharedMemory;
cudaOccResult occ_result; cudaOccResult occ_result;
cudaOccDeviceProp prop(device_property); cudaOccDeviceProp prop(device_property);
cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( cudaOccError status =
&occ_result, &prop, &occFuncAttr, &occDeviceState, blockSize, cudaOccMaxActiveBlocksPerMultiprocessor(&occ_result,
&prop,
&occFuncAttr,
&occDeviceState,
blockSize,
dynamicSmemSize); dynamicSmemSize);
if (status == CUDA_OCC_SUCCESS) { if (status == CUDA_OCC_SUCCESS) {
if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) { if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) {
...@@ -61,5 +90,30 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, ...@@ -61,5 +90,30 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
} }
#endif #endif
const char* StringTracerMemEventType(TracerMemEventType type) {
static const char* categary_name_[] = {"Allocate", "Free"};
return categary_name_[static_cast<int>(type)];
}
const char* StringTracerEventType(TracerEventType type) {
static const char* categary_name_[] = {"Operator",
"Dataloader",
"ProfileStep",
"CudaRuntime",
"Kernel",
"Memcpy",
"Memset",
"UserDefined",
"OperatorInner",
"Forward",
"Backward",
"Optimization",
"Communication",
"PythonOp",
"PythonUserDefined",
"MluRuntime"};
return categary_name_[static_cast<int>(type)];
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -14,10 +14,15 @@ limitations under the License. */ ...@@ -14,10 +14,15 @@ limitations under the License. */
#pragma once #pragma once
#include <ctime> #include <ctime>
#include <map>
#include <ostream>
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -26,14 +31,74 @@ template <typename... Args> ...@@ -26,14 +31,74 @@ template <typename... Args>
std::string string_format(const std::string& format, Args... args) { std::string string_format(const std::string& format, Args... args) {
int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) +
1; // Extra space for '\0' 1; // Extra space for '\0'
PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( PADDLE_ENFORCE_GE(
"Error during profiler data formatting.")); size_s,
0,
platform::errors::Fatal("Error during profiler data formatting."));
auto size = static_cast<size_t>(size_s); auto size = static_cast<size_t>(size_s);
auto buf = std::make_unique<char[]>(size); auto buf = std::make_unique<char[]>(size);
std::snprintf(buf.get(), size, format.c_str(), args...); std::snprintf(buf.get(), size, format.c_str(), args...);
return std::string(buf.get(), size - 1); // exclude the '\0' return std::string(buf.get(), size - 1); // exclude the '\0'
} }
template <typename basic_type>
std::string json_vector(const std::vector<basic_type> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << (*it) << ",";
} else {
res_stream << (*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <typename basic_type>
std::string json_vector(
const std::vector<std::vector<basic_type>> shape_vector) {
std::ostringstream res_stream;
auto count = shape_vector.size();
res_stream << "[";
for (auto it = shape_vector.begin(); it != shape_vector.end(); it++) {
if (count > 1) {
res_stream << json_vector(*it) << ",";
} else {
res_stream << json_vector(*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector);
template <typename type>
std::string json_dict(const std::map<std::string, std::vector<type>> data_map) {
std::ostringstream res_stream;
auto count = data_map.size();
res_stream << "{";
for (auto it = data_map.begin(); it != data_map.end(); it++) {
if (count > 1) {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second) << ",";
} else {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second);
}
count--;
}
res_stream << "}";
return res_stream.str();
}
static std::string GetStringFormatLocalTime() { static std::string GetStringFormatLocalTime() {
std::time_t rawtime; std::time_t rawtime;
std::tm* timeinfo; std::tm* timeinfo;
...@@ -48,6 +113,10 @@ static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) { ...@@ -48,6 +113,10 @@ static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) {
return (end_ns - start_ns) / 1000; return (end_ns - start_ns) / 1000;
} }
const char* StringTracerMemEventType(TracerMemEventType type);
const char* StringTracerEventType(TracerEventType type);
static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) { static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
return static_cast<float>(end_ns - start_ns) / 1000; return static_cast<float>(end_ns - start_ns) / 1000;
} }
...@@ -56,10 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) { ...@@ -56,10 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
} }
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, float CalculateEstOccupancy(uint32_t deviceId,
uint16_t registersPerThread,
int32_t staticSharedMemory, int32_t staticSharedMemory,
int32_t dynamicSharedMemory, int32_t blockX, int32_t dynamicSharedMemory,
int32_t blockY, int32_t blockZ, float blocksPerSm); int32_t blockX,
int32_t blockY,
int32_t blockZ,
float blocksPerSm);
#endif #endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册