未验证 提交 06de4891 编写于 作者: C chenjian 提交者: GitHub

add new field for event node (#43223)

* add new field for event node

* fix

* fix bug

* fix bug

* fix clang

* fix clang format

* fix code format
上级 6d3a68cb
...@@ -27,7 +27,7 @@ limitations under the License. */ ...@@ -27,7 +27,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
static const char* kSchemaVersion = "1.0.0"; static const char* kSchemaVersion = "1.0.1";
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json"; static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
static uint32_t span_indx = 0; static uint32_t span_indx = 0;
...@@ -37,14 +37,6 @@ static std::string DefaultFileName() { ...@@ -37,14 +37,6 @@ static std::string DefaultFileName() {
GetStringFormatLocalTime().c_str()); GetStringFormatLocalTime().c_str());
} }
const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
void ChromeTracingLogger::OpenFile() { void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_, output_file_stream_.open(filename_,
std::ofstream::out | std::ofstream::trunc); std::ofstream::out | std::ofstream::trunc);
...@@ -116,10 +108,41 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { ...@@ -116,10 +108,41 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
(*devicenode)->LogMe(this); (*devicenode)->LogMe(this);
} }
} }
for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin();
memnode != (*hostnode)->GetMemTraceEventNodes().end(); ++memnode) {
(*memnode)->LogMe(this);
}
} }
} }
} }
void ChromeTracingLogger::LogMemTraceEventNode(
const MemTraceEventNode& mem_node) {
if (!output_file_stream_) {
return;
}
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "[memory]", "pid": %lld, "tid": "%lld",
"ts": %lld,
"ph": "i", "cat": "%s",
"args": {
"place": "%s",
"addr": "%llu",
"current_allocated": %llu,
"current_reserved": %llu,
"increase_bytes": %lld
}
},
)JSON"),
mem_node.ProcessId(), mem_node.ThreadId(), mem_node.TimeStampNs(),
StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(),
mem_node.Addr(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(),
mem_node.IncreaseBytes());
}
void ChromeTracingLogger::LogHostTraceEventNode( void ChromeTracingLogger::LogHostTraceEventNode(
const HostTraceEventNode& host_node) { const HostTraceEventNode& host_node) {
if (!output_file_stream_) { if (!output_file_stream_) {
...@@ -132,6 +155,16 @@ void ChromeTracingLogger::LogHostTraceEventNode( ...@@ -132,6 +155,16 @@ void ChromeTracingLogger::LogHostTraceEventNode(
} else { } else {
dur_display = string_format(std::string("%.3f us"), dur * 1000); dur_display = string_format(std::string("%.3f us"), dur * 1000);
} }
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> input_dtypes;
std::string callstack;
OperatorSupplementEventNode* op_supplement_node =
host_node.GetOperatorSupplementEventNode();
if (op_supplement_node != nullptr) {
input_shapes = op_supplement_node->InputShapes();
input_dtypes = op_supplement_node->Dtypes();
callstack = op_supplement_node->CallStack();
}
switch (host_node.Type()) { switch (host_node.Type()) {
case TracerEventType::ProfileStep: case TracerEventType::ProfileStep:
case TracerEventType::Forward: case TracerEventType::Forward:
...@@ -159,10 +192,48 @@ void ChromeTracingLogger::LogHostTraceEventNode( ...@@ -159,10 +192,48 @@ void ChromeTracingLogger::LogHostTraceEventNode(
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()), host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()), nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())], StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_)); nsToUsFloat(host_node.EndNs(), start_time_));
break; break;
case TracerEventType::Operator:
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "thread_state_runnable",
"args": {
"start_time": "%.3f us",
"end_time": "%.3f us",
"input_shapes": %s,
"input_dtypes": %s,
"callstack": "%s"
}
},
)JSON"),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_),
json_dict(input_shapes).c_str(), json_dict(input_dtypes).c_str(),
callstack.c_str());
break;
case TracerEventType::CudaRuntime:
case TracerEventType::Kernel:
case TracerEventType::Memcpy:
case TracerEventType::Memset:
case TracerEventType::UserDefined:
case TracerEventType::OperatorInner:
case TracerEventType::Communication:
case TracerEventType::MluRuntime:
case TracerEventType::NumTypes:
default: default:
output_file_stream_ << string_format( output_file_stream_ << string_format(
std::string( std::string(
...@@ -181,7 +252,7 @@ void ChromeTracingLogger::LogHostTraceEventNode( ...@@ -181,7 +252,7 @@ void ChromeTracingLogger::LogHostTraceEventNode(
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()), host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()), nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())], StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_)); nsToUsFloat(host_node.EndNs(), start_time_));
break; break;
...@@ -220,8 +291,7 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( ...@@ -220,8 +291,7 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
runtime_node.Name().c_str(), dur_display.c_str(), runtime_node.Name().c_str(), dur_display.c_str(),
runtime_node.ProcessId(), runtime_node.ThreadId(), runtime_node.ProcessId(), runtime_node.ThreadId(),
nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()),
categary_name_[static_cast<int>(runtime_node.Type())], StringTracerEventType(runtime_node.Type()), runtime_node.CorrelationId(),
runtime_node.CorrelationId(),
nsToUsFloat(runtime_node.StartNs(), start_time_), nsToUsFloat(runtime_node.StartNs(), start_time_),
nsToUsFloat(runtime_node.EndNs(), start_time_)); nsToUsFloat(runtime_node.EndNs(), start_time_));
pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()});
...@@ -347,7 +417,7 @@ void ChromeTracingLogger::HandleTypeKernel( ...@@ -347,7 +417,7 @@ void ChromeTracingLogger::HandleTypeKernel(
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()), device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()), nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())], StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(), device_node.ContextId(), device_node.StreamId(),
...@@ -391,7 +461,7 @@ void ChromeTracingLogger::HandleTypeMemcpy( ...@@ -391,7 +461,7 @@ void ChromeTracingLogger::HandleTypeMemcpy(
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()), device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()), nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())], StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(),
device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth);
...@@ -427,7 +497,7 @@ void ChromeTracingLogger::HandleTypeMemset( ...@@ -427,7 +497,7 @@ void ChromeTracingLogger::HandleTypeMemset(
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()), device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()), nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())], StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(), device_node.ContextId(), device_node.StreamId(),
......
...@@ -37,6 +37,7 @@ class ChromeTracingLogger : public BaseLogger { ...@@ -37,6 +37,7 @@ class ChromeTracingLogger : public BaseLogger {
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override; void LogNodeTrees(const NodeTrees&) override;
void LogMetaInfo(const std::unordered_map<std::string, std::string>); void LogMetaInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
private: private:
void OpenFile(); void OpenFile();
......
...@@ -27,7 +27,9 @@ using paddle::platform::HostTraceEventNode; ...@@ -27,7 +27,9 @@ using paddle::platform::HostTraceEventNode;
using paddle::platform::KernelEventInfo; using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo; using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo; using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::NodeTrees; using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::ProfilerResult; using paddle::platform::ProfilerResult;
using paddle::platform::RuntimeTraceEvent; using paddle::platform::RuntimeTraceEvent;
using paddle::platform::SerializationLogger; using paddle::platform::SerializationLogger;
...@@ -37,6 +39,8 @@ TEST(SerializationLoggerTest, dump_case0) { ...@@ -37,6 +39,8 @@ TEST(SerializationLoggerTest, dump_case0) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"), host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000, TracerEventType::Dataloader, 1000, 10000,
10, 10)); 10, 10));
...@@ -72,7 +76,8 @@ TEST(SerializationLoggerTest, dump_case0) { ...@@ -72,7 +76,8 @@ TEST(SerializationLoggerTest, dump_case0) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo())); 69000, 0, 10, 11, 5, MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case0.pb"); SerializationLogger logger("test_serialization_logger_case0.pb");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u); EXPECT_EQ(nodes[10].size(), 4u);
...@@ -101,6 +106,8 @@ TEST(SerializationLoggerTest, dump_case1) { ...@@ -101,6 +106,8 @@ TEST(SerializationLoggerTest, dump_case1) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0)); 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
...@@ -127,7 +134,8 @@ TEST(SerializationLoggerTest, dump_case1) { ...@@ -127,7 +134,8 @@ TEST(SerializationLoggerTest, dump_case1) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo())); 69000, 0, 10, 11, 5, MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case1.pb"); SerializationLogger logger("test_serialization_logger_case1.pb");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u); EXPECT_EQ(nodes[10].size(), 1u);
......
...@@ -18,6 +18,8 @@ limitations under the License. */ ...@@ -18,6 +18,8 @@ limitations under the License. */
#include <set> #include <set>
#include <stack> #include <stack>
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -50,8 +52,10 @@ NodeTrees::~NodeTrees() { ...@@ -50,8 +52,10 @@ NodeTrees::~NodeTrees() {
void NodeTrees::BuildTrees( void NodeTrees::BuildTrees(
const std::vector<HostTraceEventNode*>& host_event_nodes, const std::vector<HostTraceEventNode*>& host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes, const std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
const std::vector<DeviceTraceEventNode*>& device_event_nodes) { const std::vector<DeviceTraceEventNode*>& device_event_nodes,
const std::vector<MemTraceEventNode*>& mem_event_nodes,
const std::vector<OperatorSupplementEventNode*>& op_supplement_events) {
// separate Host Event Nodes into different threads // separate Host Event Nodes into different threads
std::map<uint64_t, std::vector<HostTraceEventNode*>> std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes; // used to store HostTraceEventNodes per thread thread2host_event_nodes; // used to store HostTraceEventNodes per thread
...@@ -59,6 +63,15 @@ void NodeTrees::BuildTrees( ...@@ -59,6 +63,15 @@ void NodeTrees::BuildTrees(
thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode
// per // per
// thread // thread
std::map<uint64_t, std::vector<MemTraceEventNode*>>
thread2mem_event_nodes; // used to store MemTraceEventNode
// per
// thread
std::map<uint64_t, std::vector<OperatorSupplementEventNode*>>
thread2op_supplement_event_nodes; // used to store
// OperatorSupplementEventNode
// per
// thread
std::map<uint32_t, CudaRuntimeTraceEventNode*> std::map<uint32_t, CudaRuntimeTraceEventNode*>
correlation_id2runtime_event_node; // used to store the relation between correlation_id2runtime_event_node; // used to store the relation between
// correlation id and runtime node // correlation id and runtime node
...@@ -85,6 +98,15 @@ void NodeTrees::BuildTrees( ...@@ -85,6 +98,15 @@ void NodeTrees::BuildTrees(
"no corresponding cuda runtime events")); "no corresponding cuda runtime events"));
dst_iter->second->AddDeviceTraceEventNode(*it); dst_iter->second->AddDeviceTraceEventNode(*it);
} }
// construct thread2mem_event_nodes
for (auto it = mem_event_nodes.begin(); it != mem_event_nodes.end(); ++it) {
thread2mem_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// construct thread2op_supplement_event_nodes
for (auto it = op_supplement_events.begin(); it != op_supplement_events.end();
++it) {
thread2op_supplement_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// sort host event nodes and runtime event nodes according to start_ns and // sort host event nodes and runtime event nodes according to start_ns and
// end_ns // end_ns
// the smaller start_ns is, the further ahead position is. // the smaller start_ns is, the further ahead position is.
...@@ -119,6 +141,29 @@ void NodeTrees::BuildTrees( ...@@ -119,6 +141,29 @@ void NodeTrees::BuildTrees(
return false; return false;
}); });
} }
// sort mem event nodes and operator supplement event nodes
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end(); ++it) {
std::sort(it->second.begin(), it->second.end(),
[](MemTraceEventNode* node1, MemTraceEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end(); ++it) {
std::sort(it->second.begin(), it->second.end(),
[](OperatorSupplementEventNode* node1,
OperatorSupplementEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
// construct trees // construct trees
std::set<uint64_t> thread_set; std::set<uint64_t> thread_set;
...@@ -131,16 +176,27 @@ void NodeTrees::BuildTrees( ...@@ -131,16 +176,27 @@ void NodeTrees::BuildTrees(
it != thread2runtime_event_nodes.end(); ++it) { it != thread2runtime_event_nodes.end(); ++it) {
thread_set.insert(it->first); thread_set.insert(it->first);
} }
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end(); ++it) {
thread_set.insert(it->first);
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end(); ++it) {
thread_set.insert(it->first);
}
for (auto it = thread_set.begin(); it != thread_set.end(); ++it) { for (auto it = thread_set.begin(); it != thread_set.end(); ++it) {
thread_event_trees_map_[*it] = BuildTreeRelationship( thread_event_trees_map_[*it] = BuildTreeRelationship(
thread2host_event_nodes[*it], thread2runtime_event_nodes[*it]); thread2host_event_nodes[*it], thread2runtime_event_nodes[*it],
thread2mem_event_nodes[*it], thread2op_supplement_event_nodes[*it]);
} }
} }
HostTraceEventNode* NodeTrees::BuildTreeRelationship( HostTraceEventNode* NodeTrees::BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes, std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes) { std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_events) {
// a stack used for analyse relationship // a stack used for analyse relationship
auto node_stack = std::vector<HostTraceEventNode*>(); auto node_stack = std::vector<HostTraceEventNode*>();
// root node, top level // root node, top level
...@@ -226,6 +282,99 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( ...@@ -226,6 +282,99 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
} }
node_stack.pop_back(); node_stack.pop_back();
} }
// build relationship between host event node and mem event node
// First, post-order traverse the tree. Then, insert the memory and op
// supplement node into correct host nodes.
auto stack = std::stack<HostTraceEventNode*>();
auto flag_stack = std::stack<int32_t>();
auto post_order_nodes = std::vector<HostTraceEventNode*>();
stack.push(root_node);
flag_stack.push(0);
while (!stack.empty()) {
auto current_node = stack.top();
stack.pop();
auto flag = flag_stack.top();
flag_stack.pop();
if (flag == 0) {
stack.push(current_node);
flag_stack.push(1);
for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().rend(); ++child) {
stack.push(*child);
flag_stack.push(0);
}
} else {
post_order_nodes.push_back(current_node);
}
}
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
bool hasenter = false;
std::vector<MemTraceEventNode*>::iterator firstposition;
std::vector<MemTraceEventNode*>::iterator lastposition =
mem_event_nodes.end();
for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end();
++mem_it) {
if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() &&
(*mem_it)->TimeStampNs() <= (*it)->EndNs()) {
(*it)->AddMemNode(*mem_it);
if (!hasenter) {
firstposition = mem_it;
hasenter = true;
}
} else {
if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) {
lastposition = mem_it;
break;
}
}
}
if (hasenter) {
mem_event_nodes.erase(firstposition, lastposition);
}
}
// build relationship between host event node and op supplement node
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
int op_supplement_count = 0;
bool hasenter = false;
std::vector<OperatorSupplementEventNode*>::iterator firstposition;
std::vector<OperatorSupplementEventNode*>::iterator lastposition =
op_supplement_events.end();
for (auto op_supplement_it = op_supplement_events.begin();
op_supplement_it < op_supplement_events.end(); ++op_supplement_it) {
if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() &&
(*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) {
if (!hasenter) {
firstposition = op_supplement_it;
hasenter = true;
}
(*it)->SetOperatorSupplementNode(*op_supplement_it);
PADDLE_ENFORCE_EQ((*it)->Type(), TracerEventType::Operator,
platform::errors::PreconditionNotMet(
"Operator supplement events should be embraced "
"by event of type TracerEventType::Operator, "
"but got type TracerEventType::%s",
StringTracerEventType((*it)->Type())));
op_supplement_count += 1;
} else {
if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) {
PADDLE_ENFORCE_LE(op_supplement_count, 1,
platform::errors::PreconditionNotMet(
"One event of TracerEventType::Operator has no "
"more than 1 op supplement event, but got %d.",
op_supplement_count));
lastposition = op_supplement_it;
break;
}
}
}
if (hasenter) {
op_supplement_events.erase(firstposition, lastposition);
}
}
return root_node; return root_node;
} }
...@@ -263,8 +412,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse( ...@@ -263,8 +412,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
auto current_node = stack.top(); auto current_node = stack.top();
stack.pop(); stack.pop();
thread2host_event_nodes[thread_id].push_back(current_node); thread2host_event_nodes[thread_id].push_back(current_node);
for (auto child = current_node->GetChildren().begin(); for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().end(); ++child) { child != current_node->GetChildren().rend(); ++child) {
stack.push(*child); stack.push(*child);
} }
} }
...@@ -278,7 +427,10 @@ void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); } ...@@ -278,7 +427,10 @@ void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); }
void NodeTrees::HandleTrees( void NodeTrees::HandleTrees(
std::function<void(HostTraceEventNode*)> host_event_node_handle, std::function<void(HostTraceEventNode*)> host_event_node_handle,
std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle, std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle,
std::function<void(DeviceTraceEventNode*)> device_event_node_handle) { std::function<void(DeviceTraceEventNode*)> device_event_node_handle,
std::function<void(MemTraceEventNode*)> mem_event_node_handle,
std::function<void(OperatorSupplementEventNode*)>
op_supplement_node_handle) {
// using different user-defined function to handle different nodes // using different user-defined function to handle different nodes
const std::map<uint64_t, std::vector<HostTraceEventNode*>> const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = Traverse(true); thread2host_event_nodes = Traverse(true);
...@@ -300,6 +452,15 @@ void NodeTrees::HandleTrees( ...@@ -300,6 +452,15 @@ void NodeTrees::HandleTrees(
device_event_node_handle(*devicenode); device_event_node_handle(*devicenode);
} }
} }
for (auto memeventnode = (*hostnode)->GetMemTraceEventNodes().begin();
memeventnode != (*hostnode)->GetMemTraceEventNodes().end();
++memeventnode) {
mem_event_node_handle(*memeventnode);
}
if ((*hostnode)->GetOperatorSupplementEventNode()) {
op_supplement_node_handle(
(*hostnode)->GetOperatorSupplementEventNode());
}
} }
} }
} }
......
...@@ -21,12 +21,67 @@ limitations under the License. */ ...@@ -21,12 +21,67 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/output_logger.h" #include "paddle/fluid/platform/profiler/output_logger.h"
#include "paddle/fluid/platform/profiler/trace_event.h" #include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
class MemTraceEventNode {
public:
// constructor
explicit MemTraceEventNode(const MemTraceEvent& mem_event)
: mem_event_(mem_event) {}
// destructor
~MemTraceEventNode();
// getter
TracerMemEventType Type() const { return mem_event_.type; }
uint64_t Addr() const { return mem_event_.addr; }
uint64_t TimeStampNs() const { return mem_event_.timestamp_ns; }
uint64_t ProcessId() const { return mem_event_.process_id; }
uint64_t ThreadId() const { return mem_event_.thread_id; }
int64_t IncreaseBytes() const { return mem_event_.increase_bytes; }
std::string Place() const { return mem_event_.place; }
uint64_t CurrentAllocated() const { return mem_event_.current_allocated; }
uint64_t CurrentReserved() const { return mem_event_.current_reserved; }
// member function
void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); }
private:
// data
MemTraceEvent mem_event_;
};
class OperatorSupplementEventNode {
public:
// constructor
explicit OperatorSupplementEventNode(
const OperatorSupplementEvent& op_supplement_event)
: op_supplement_event_(op_supplement_event) {}
// destructor
~OperatorSupplementEventNode() {}
// getter
std::string Name() const { return op_supplement_event_.op_type; }
uint64_t TimeStampNs() const { return op_supplement_event_.timestamp_ns; }
std::map<std::string, std::vector<std::vector<int64_t>>>& InputShapes() {
return op_supplement_event_.input_shapes;
}
std::map<std::string, std::vector<std::string>>& Dtypes() {
return op_supplement_event_.dtypes;
}
std::string CallStack() { return op_supplement_event_.callstack; }
uint64_t ProcessId() const { return op_supplement_event_.process_id; }
uint64_t ThreadId() const { return op_supplement_event_.thread_id; }
private:
// data
OperatorSupplementEvent op_supplement_event_;
};
class DeviceTraceEventNode { class DeviceTraceEventNode {
public: public:
// constructor // constructor
...@@ -139,6 +194,10 @@ class HostTraceEventNode { ...@@ -139,6 +194,10 @@ class HostTraceEventNode {
void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) { void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) {
runtime_node_ptrs_.push_back(node); runtime_node_ptrs_.push_back(node);
} }
void AddMemNode(MemTraceEventNode* node) { mem_node_ptrs_.push_back(node); }
void SetOperatorSupplementNode(OperatorSupplementEventNode* node) {
op_supplement_node_ptr_ = node;
}
const std::vector<HostTraceEventNode*>& GetChildren() const { const std::vector<HostTraceEventNode*>& GetChildren() const {
return children_; return children_;
} }
...@@ -146,6 +205,14 @@ class HostTraceEventNode { ...@@ -146,6 +205,14 @@ class HostTraceEventNode {
const { const {
return runtime_node_ptrs_; return runtime_node_ptrs_;
} }
const std::vector<MemTraceEventNode*>& GetMemTraceEventNodes() const {
return mem_node_ptrs_;
}
OperatorSupplementEventNode* GetOperatorSupplementEventNode() const {
return op_supplement_node_ptr_;
}
void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); } void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); }
private: private:
...@@ -155,6 +222,9 @@ class HostTraceEventNode { ...@@ -155,6 +222,9 @@ class HostTraceEventNode {
std::vector<CudaRuntimeTraceEventNode*> runtime_node_ptrs_; std::vector<CudaRuntimeTraceEventNode*> runtime_node_ptrs_;
// host events called by this // host events called by this
std::vector<HostTraceEventNode*> children_; std::vector<HostTraceEventNode*> children_;
// memory events happened in this event period
std::vector<MemTraceEventNode*> mem_node_ptrs_;
OperatorSupplementEventNode* op_supplement_node_ptr_ = nullptr;
}; };
class NodeTrees { class NodeTrees {
...@@ -162,10 +232,14 @@ class NodeTrees { ...@@ -162,10 +232,14 @@ class NodeTrees {
// constructor // constructor
NodeTrees(const std::list<HostTraceEvent>& host_events, NodeTrees(const std::list<HostTraceEvent>& host_events,
const std::list<RuntimeTraceEvent>& runtime_events, const std::list<RuntimeTraceEvent>& runtime_events,
const std::list<DeviceTraceEvent>& device_events) { const std::list<DeviceTraceEvent>& device_events,
const std::list<MemTraceEvent>& mem_events,
const std::list<OperatorSupplementEvent>& op_supplement_events) {
std::vector<HostTraceEventNode*> host_event_nodes; std::vector<HostTraceEventNode*> host_event_nodes;
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes; std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes;
std::vector<DeviceTraceEventNode*> device_event_nodes; std::vector<DeviceTraceEventNode*> device_event_nodes;
std::vector<MemTraceEventNode*> mem_event_nodes;
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes;
// encapsulate event into nodes // encapsulate event into nodes
for (auto it = host_events.begin(); it != host_events.end(); ++it) { for (auto it = host_events.begin(); it != host_events.end(); ++it) {
host_event_nodes.push_back(new HostTraceEventNode(*it)); host_event_nodes.push_back(new HostTraceEventNode(*it));
...@@ -176,8 +250,16 @@ class NodeTrees { ...@@ -176,8 +250,16 @@ class NodeTrees {
for (auto it = device_events.begin(); it != device_events.end(); ++it) { for (auto it = device_events.begin(); it != device_events.end(); ++it) {
device_event_nodes.push_back(new DeviceTraceEventNode(*it)); device_event_nodes.push_back(new DeviceTraceEventNode(*it));
} }
for (auto it = mem_events.begin(); it != mem_events.end(); ++it) {
mem_event_nodes.push_back(new MemTraceEventNode(*it));
}
for (auto it = op_supplement_events.begin();
it != op_supplement_events.end(); ++it) {
op_supplement_event_nodes.push_back(new OperatorSupplementEventNode(*it));
}
// build tree // build tree
BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes); BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes,
mem_event_nodes, op_supplement_event_nodes);
} }
explicit NodeTrees( explicit NodeTrees(
...@@ -190,7 +272,9 @@ class NodeTrees { ...@@ -190,7 +272,9 @@ class NodeTrees {
void LogMe(BaseLogger* logger); void LogMe(BaseLogger* logger);
void HandleTrees(std::function<void(HostTraceEventNode*)>, void HandleTrees(std::function<void(HostTraceEventNode*)>,
std::function<void(CudaRuntimeTraceEventNode*)>, std::function<void(CudaRuntimeTraceEventNode*)>,
std::function<void(DeviceTraceEventNode*)>); std::function<void(DeviceTraceEventNode*)>,
std::function<void(MemTraceEventNode*)>,
std::function<void(OperatorSupplementEventNode*)>);
const std::map<uint64_t, HostTraceEventNode*>& GetNodeTrees() const { const std::map<uint64_t, HostTraceEventNode*>& GetNodeTrees() const {
return thread_event_trees_map_; return thread_event_trees_map_;
} }
...@@ -199,11 +283,15 @@ class NodeTrees { ...@@ -199,11 +283,15 @@ class NodeTrees {
private: private:
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map_; std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map_;
void BuildTrees(const std::vector<HostTraceEventNode*>&, void BuildTrees(const std::vector<HostTraceEventNode*>&,
std::vector<CudaRuntimeTraceEventNode*>&, const std::vector<CudaRuntimeTraceEventNode*>&,
const std::vector<DeviceTraceEventNode*>&); const std::vector<DeviceTraceEventNode*>&,
const std::vector<MemTraceEventNode*>&,
const std::vector<OperatorSupplementEventNode*>&);
HostTraceEventNode* BuildTreeRelationship( HostTraceEventNode* BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes, std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes); std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes);
}; };
} // namespace platform } // namespace platform
......
...@@ -24,6 +24,7 @@ class DeviceTraceEventNode; // forward declaration ...@@ -24,6 +24,7 @@ class DeviceTraceEventNode; // forward declaration
class HostTraceEventNode; // forward declaration class HostTraceEventNode; // forward declaration
class CudaRuntimeTraceEventNode; // forward declaration class CudaRuntimeTraceEventNode; // forward declaration
class NodeTrees; // forward declaration class NodeTrees; // forward declaration
class MemTraceEventNode; // forward declaration
class BaseLogger { class BaseLogger {
public: public:
...@@ -33,6 +34,7 @@ class BaseLogger { ...@@ -33,6 +34,7 @@ class BaseLogger {
virtual void LogHostTraceEventNode(const HostTraceEventNode&) {} virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {} virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
virtual void LogNodeTrees(const NodeTrees&) {} virtual void LogNodeTrees(const NodeTrees&) {}
virtual void LogMemTraceEventNode(const MemTraceEventNode&) {}
}; };
} // namespace platform } // namespace platform
......
...@@ -101,9 +101,10 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() { ...@@ -101,9 +101,10 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
tracer.Get().StopTracing(); tracer.Get().StopTracing();
tracer.Get().CollectTraceData(&collector); tracer.Get().CollectTraceData(&collector);
} }
std::unique_ptr<NodeTrees> tree(new NodeTrees(collector.HostEvents(), std::unique_ptr<NodeTrees> tree(
collector.RuntimeEvents(), new NodeTrees(collector.HostEvents(), collector.RuntimeEvents(),
collector.DeviceEvents())); collector.DeviceEvents(), collector.MemEvents(),
collector.OperatorSupplementEvents()));
cpu_utilization_.RecordEndTimeInfo(); cpu_utilization_.RecordEndTimeInfo();
ExtraInfo extrainfo; ExtraInfo extrainfo;
extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), extrainfo.AddExtraInfo(std::string("System Cpu Utilization"),
......
...@@ -25,13 +25,20 @@ using paddle::platform::HostTraceEventNode; ...@@ -25,13 +25,20 @@ using paddle::platform::HostTraceEventNode;
using paddle::platform::KernelEventInfo; using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo; using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo; using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::MemTraceEventNode;
using paddle::platform::NodeTrees; using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::OperatorSupplementEventNode;
using paddle::platform::RuntimeTraceEvent; using paddle::platform::RuntimeTraceEvent;
using paddle::platform::TracerEventType; using paddle::platform::TracerEventType;
using paddle::platform::TracerMemEventType;
TEST(NodeTreesTest, LogMe_case0) { TEST(NodeTreesTest, LogMe_case0) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"), host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000, TracerEventType::Dataloader, 1000, 10000,
10, 10)); 10, 10));
...@@ -41,6 +48,19 @@ TEST(NodeTreesTest, LogMe_case0) { ...@@ -41,6 +48,19 @@ TEST(NodeTreesTest, LogMe_case0) {
std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
mem_events.push_back(MemTraceEvent(11500, 0x1000,
TracerMemEventType::Allocate, 10, 10, 50,
"GPU:0", 50, 50));
mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free,
10, 10, -50, "GPU:0", 0, 50));
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
input_shapes[std::string("X")].push_back(std::vector<int64_t>{1, 2, 3});
input_shapes[std::string("X")].push_back(std::vector<int64_t>{4, 5, 6, 7});
dtypes[std::string("X")].push_back(std::string("int8"));
dtypes[std::string("X")].push_back(std::string("float32"));
op_supplement_events.push_back(OperatorSupplementEvent(
11600, "op1", input_shapes, dtypes, "op1()", 10, 10));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0)); 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
...@@ -67,7 +87,8 @@ TEST(NodeTreesTest, LogMe_case0) { ...@@ -67,7 +87,8 @@ TEST(NodeTreesTest, LogMe_case0) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo())); 69000, 0, 10, 11, 5, MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case0.json"); ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u); EXPECT_EQ(nodes[10].size(), 4u);
...@@ -81,6 +102,8 @@ TEST(NodeTreesTest, LogMe_case0) { ...@@ -81,6 +102,8 @@ TEST(NodeTreesTest, LogMe_case0) {
if ((*it)->Name() == "op1") { if ((*it)->Name() == "op1") {
EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u);
EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr);
} }
} }
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
...@@ -90,12 +113,15 @@ TEST(NodeTreesTest, LogMe_case0) { ...@@ -90,12 +113,15 @@ TEST(NodeTreesTest, LogMe_case0) {
} }
} }
tree.LogMe(&logger); tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
} }
TEST(NodeTreesTest, LogMe_case1) { TEST(NodeTreesTest, LogMe_case1) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0)); 17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
...@@ -122,7 +148,8 @@ TEST(NodeTreesTest, LogMe_case1) { ...@@ -122,7 +148,8 @@ TEST(NodeTreesTest, LogMe_case1) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo())); 69000, 0, 10, 11, 5, MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case1.json"); ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u); EXPECT_EQ(nodes[10].size(), 1u);
...@@ -141,18 +168,29 @@ TEST(NodeTreesTest, LogMe_case1) { ...@@ -141,18 +168,29 @@ TEST(NodeTreesTest, LogMe_case1) {
} }
} }
tree.LogMe(&logger); tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
} }
TEST(NodeTreesTest, HandleTrees_case0) { TEST(NodeTreesTest, HandleTrees_case0) {
std::list<HostTraceEvent> host_events; std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events; std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events; std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 10000, 100000, 10, 10)); std::string("op1"), TracerEventType::Operator, 10000, 100000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 30000, 70000, 10, 10)); std::string("op2"), TracerEventType::Operator, 30000, 70000, 10, 10));
host_events.push_back(HostTraceEvent( host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11)); std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11));
mem_events.push_back(MemTraceEvent(11500, 0x1000,
TracerMemEventType::Allocate, 10, 10, 50,
"GPU:0", 50, 50));
mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free,
10, 10, -50, "GPU:0", 0, 50));
op_supplement_events.push_back(OperatorSupplementEvent(
11600, "op1", std::map<std::string, std::vector<std::vector<int64_t>>>(),
std::map<std::string, std::vector<std::string>>(), "op1()", 10, 10));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
25000, 10, 10, 1, 0)); 25000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 35000, runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 35000,
...@@ -169,7 +207,8 @@ TEST(NodeTreesTest, HandleTrees_case0) { ...@@ -169,7 +207,8 @@ TEST(NodeTreesTest, HandleTrees_case0) {
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
75000, 0, 10, 11, 3, KernelEventInfo())); 75000, 0, 10, 11, 3, KernelEventInfo()));
ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json"); ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
NodeTrees tree(host_events, runtime_events, device_events); NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes = std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true); tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 3u); EXPECT_EQ(nodes[10].size(), 3u);
...@@ -199,6 +238,12 @@ TEST(NodeTreesTest, HandleTrees_case0) { ...@@ -199,6 +238,12 @@ TEST(NodeTreesTest, HandleTrees_case0) {
}); });
std::function<void(DeviceTraceEventNode*)> device_event_node_handle( std::function<void(DeviceTraceEventNode*)> device_event_node_handle(
[&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); }); [&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); });
std::function<void(MemTraceEventNode*)> mem_event_node_handle(
[&](MemTraceEventNode* a) { logger.LogMemTraceEventNode(*a); });
std::function<void(OperatorSupplementEventNode*)>
op_supplement_event_node_handle([&](OperatorSupplementEventNode* a) {});
tree.HandleTrees(host_event_node_handle, runtime_event_node_handle, tree.HandleTrees(host_event_node_handle, runtime_event_node_handle,
device_event_node_handle); device_event_node_handle, mem_event_node_handle,
op_supplement_event_node_handle);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
} }
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <map>
#include <string> #include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -56,6 +58,15 @@ enum class TracerEventType { ...@@ -56,6 +58,15 @@ enum class TracerEventType {
NumTypes NumTypes
}; };
enum class TracerMemEventType {
// Used to mark memory allocation
Allocate = 0,
// Used to mark memory free
Free = 1,
// A flag to denote the number of current types
NumTypes
};
struct KernelEventInfo { struct KernelEventInfo {
// The X-dimension block size for the kernel. // The X-dimension block size for the kernel.
uint32_t block_x; uint32_t block_x;
...@@ -118,6 +129,36 @@ struct MemsetEventInfo { ...@@ -118,6 +129,36 @@ struct MemsetEventInfo {
uint32_t value; uint32_t value;
}; };
struct OperatorSupplementEvent {
OperatorSupplementEvent() = default;
OperatorSupplementEvent(
uint64_t timestamp_ns, const std::string& op_type,
const std::map<std::string, std::vector<std::vector<int64_t>>>&
input_shapes,
const std::map<std::string, std::vector<std::string>>& dtypes,
const std::string& callstack, uint64_t process_id, uint64_t thread_id)
: timestamp_ns(timestamp_ns),
op_type(op_type),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack),
process_id(process_id),
thread_id(thread_id) {}
// timestamp of the record
uint64_t timestamp_ns;
// op type name
std::string op_type;
// input shapes
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
// call stack
std::string callstack;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
};
struct HostTraceEvent { struct HostTraceEvent {
HostTraceEvent() = default; HostTraceEvent() = default;
HostTraceEvent(const std::string& name, TracerEventType type, HostTraceEvent(const std::string& name, TracerEventType type,
...@@ -242,5 +283,42 @@ struct DeviceTraceEvent { ...@@ -242,5 +283,42 @@ struct DeviceTraceEvent {
}; };
}; };
struct MemTraceEvent {
MemTraceEvent() = default;
MemTraceEvent(uint64_t timestamp_ns, uint64_t addr, TracerMemEventType type,
uint64_t process_id, uint64_t thread_id, int64_t increase_bytes,
const std::string& place, uint64_t current_allocated,
uint64_t current_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
process_id(process_id),
thread_id(thread_id),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved) {}
// timestamp of the record
uint64_t timestamp_ns;
// memory addr of allocation or free
uint64_t addr;
// memory manipulation type
TracerMemEventType type;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// increase bytes after this manipulation, allocation for sign +, free for
// sign -
int64_t increase_bytes;
// place
std::string place;
// current total allocated memory
uint64_t current_allocated;
// current total reserved memory
uint64_t current_reserved;
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -39,6 +39,12 @@ class TraceEventCollector { ...@@ -39,6 +39,12 @@ class TraceEventCollector {
thread_names_[tid] = name; thread_names_[tid] = name;
} }
void AddMemEvent(MemTraceEvent&& event) { mem_events_.push_back(event); }
void AddOperatorSupplementEvent(OperatorSupplementEvent&& event) {
op_supplement_events_.push_back(event);
}
const std::list<HostTraceEvent>& HostEvents() const { return host_events_; } const std::list<HostTraceEvent>& HostEvents() const { return host_events_; }
const std::list<RuntimeTraceEvent>& RuntimeEvents() const { const std::list<RuntimeTraceEvent>& RuntimeEvents() const {
...@@ -49,6 +55,12 @@ class TraceEventCollector { ...@@ -49,6 +55,12 @@ class TraceEventCollector {
return device_events_; return device_events_;
} }
const std::list<MemTraceEvent>& MemEvents() const { return mem_events_; }
const std::list<OperatorSupplementEvent>& OperatorSupplementEvents() const {
return op_supplement_events_;
}
const std::unordered_map<uint64_t, std::string>& ThreadNames() const { const std::unordered_map<uint64_t, std::string>& ThreadNames() const {
return thread_names_; return thread_names_;
} }
...@@ -58,6 +70,8 @@ class TraceEventCollector { ...@@ -58,6 +70,8 @@ class TraceEventCollector {
host_events_.clear(); host_events_.clear();
runtime_events_.clear(); runtime_events_.clear();
device_events_.clear(); device_events_.clear();
mem_events_.clear();
op_supplement_events_.clear();
} }
private: private:
...@@ -65,6 +79,8 @@ class TraceEventCollector { ...@@ -65,6 +79,8 @@ class TraceEventCollector {
std::list<HostTraceEvent> host_events_; std::list<HostTraceEvent> host_events_;
std::list<RuntimeTraceEvent> runtime_events_; std::list<RuntimeTraceEvent> runtime_events_;
std::list<DeviceTraceEvent> device_events_; std::list<DeviceTraceEvent> device_events_;
std::list<MemTraceEvent> mem_events_;
std::list<OperatorSupplementEvent> op_supplement_events_;
}; };
} // namespace platform } // namespace platform
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler/utils.h" #include "paddle/fluid/platform/profiler/utils.h"
#include <sstream>
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
...@@ -21,6 +22,26 @@ limitations under the License. */ ...@@ -21,6 +22,26 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << "\"" << (*it) << "\""
<< ",";
} else {
res_stream << "\"" << (*it) << "\"";
}
count--;
}
res_stream << "]";
return res_stream.str();
}
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
int32_t StaticSharedMemory, int32_t StaticSharedMemory,
...@@ -61,5 +82,21 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, ...@@ -61,5 +82,21 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
} }
#endif #endif
const char* StringTracerMemEventType(TracerMemEventType type) {
static const char* categary_name_[] = {"Allocate", "Free"};
return categary_name_[static_cast<int>(type)];
}
const char* StringTracerEventType(TracerEventType type) {
static const char* categary_name_[] = {
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
return categary_name_[static_cast<int>(type)];
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -14,11 +14,15 @@ limitations under the License. */ ...@@ -14,11 +14,15 @@ limitations under the License. */
#pragma once #pragma once
#include <ctime> #include <ctime>
#include <map>
#include <ostream>
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -36,6 +40,64 @@ std::string string_format(const std::string& format, Args... args) { ...@@ -36,6 +40,64 @@ std::string string_format(const std::string& format, Args... args) {
return std::string(buf.get(), size - 1); // exclude the '\0' return std::string(buf.get(), size - 1); // exclude the '\0'
} }
template <typename basic_type>
std::string json_vector(const std::vector<basic_type> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << (*it) << ",";
} else {
res_stream << (*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <typename basic_type>
std::string json_vector(
const std::vector<std::vector<basic_type>> shape_vector) {
std::ostringstream res_stream;
auto count = shape_vector.size();
res_stream << "[";
for (auto it = shape_vector.begin(); it != shape_vector.end(); it++) {
if (count > 1) {
res_stream << json_vector(*it) << ",";
} else {
res_stream << json_vector(*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector);
template <typename type>
std::string json_dict(const std::map<std::string, std::vector<type>> data_map) {
std::ostringstream res_stream;
auto count = data_map.size();
res_stream << "{";
for (auto it = data_map.begin(); it != data_map.end(); it++) {
if (count > 1) {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second) << ",";
} else {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second);
}
count--;
}
res_stream << "}";
return res_stream.str();
}
static std::string GetStringFormatLocalTime() { static std::string GetStringFormatLocalTime() {
std::time_t rawtime; std::time_t rawtime;
std::tm* timeinfo; std::tm* timeinfo;
...@@ -50,6 +112,10 @@ static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) { ...@@ -50,6 +112,10 @@ static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) {
return (end_ns - start_ns) / 1000; return (end_ns - start_ns) / 1000;
} }
const char* StringTracerMemEventType(TracerMemEventType type);
const char* StringTracerEventType(TracerEventType type);
static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) { static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
return static_cast<float>(end_ns - start_ns) / 1000; return static_cast<float>(end_ns - start_ns) / 1000;
} }
...@@ -63,5 +129,6 @@ float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, ...@@ -63,5 +129,6 @@ float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread,
int32_t dynamicSharedMemory, int32_t blockX, int32_t dynamicSharedMemory, int32_t blockX,
int32_t blockY, int32_t blockZ, float blocksPerSm); int32_t blockY, int32_t blockZ, float blocksPerSm);
#endif #endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册