未验证 提交 06de4891 编写于 作者: C chenjian 提交者: GitHub

add new field for event node (#43223)

* add new field for event node

* fix

* fix bug

* fix bug

* fix clang

* fix clang format

* fix code format
上级 6d3a68cb
......@@ -27,7 +27,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
static const char* kSchemaVersion = "1.0.0";
static const char* kSchemaVersion = "1.0.1";
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
static uint32_t span_indx = 0;
......@@ -37,14 +37,6 @@ static std::string DefaultFileName() {
GetStringFormatLocalTime().c_str());
}
const char* ChromeTracingLogger::categary_name_[] = {
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_,
std::ofstream::out | std::ofstream::trunc);
......@@ -116,10 +108,41 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
(*devicenode)->LogMe(this);
}
}
for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin();
memnode != (*hostnode)->GetMemTraceEventNodes().end(); ++memnode) {
(*memnode)->LogMe(this);
}
}
}
}
void ChromeTracingLogger::LogMemTraceEventNode(
const MemTraceEventNode& mem_node) {
if (!output_file_stream_) {
return;
}
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "[memory]", "pid": %lld, "tid": "%lld",
"ts": %lld,
"ph": "i", "cat": "%s",
"args": {
"place": "%s",
"addr": "%llu",
"current_allocated": %llu,
"current_reserved": %llu,
"increase_bytes": %lld
}
},
)JSON"),
mem_node.ProcessId(), mem_node.ThreadId(), mem_node.TimeStampNs(),
StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(),
mem_node.Addr(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(),
mem_node.IncreaseBytes());
}
void ChromeTracingLogger::LogHostTraceEventNode(
const HostTraceEventNode& host_node) {
if (!output_file_stream_) {
......@@ -132,6 +155,16 @@ void ChromeTracingLogger::LogHostTraceEventNode(
} else {
dur_display = string_format(std::string("%.3f us"), dur * 1000);
}
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> input_dtypes;
std::string callstack;
OperatorSupplementEventNode* op_supplement_node =
host_node.GetOperatorSupplementEventNode();
if (op_supplement_node != nullptr) {
input_shapes = op_supplement_node->InputShapes();
input_dtypes = op_supplement_node->Dtypes();
callstack = op_supplement_node->CallStack();
}
switch (host_node.Type()) {
case TracerEventType::ProfileStep:
case TracerEventType::Forward:
......@@ -159,10 +192,48 @@ void ChromeTracingLogger::LogHostTraceEventNode(
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_));
break;
case TracerEventType::Operator:
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %.3f,
"ph": "X", "cat": "%s",
"cname": "thread_state_runnable",
"args": {
"start_time": "%.3f us",
"end_time": "%.3f us",
"input_shapes": %s,
"input_dtypes": %s,
"callstack": "%s"
}
},
)JSON"),
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_),
json_dict(input_shapes).c_str(), json_dict(input_dtypes).c_str(),
callstack.c_str());
break;
case TracerEventType::CudaRuntime:
case TracerEventType::Kernel:
case TracerEventType::Memcpy:
case TracerEventType::Memset:
case TracerEventType::UserDefined:
case TracerEventType::OperatorInner:
case TracerEventType::Communication:
case TracerEventType::MluRuntime:
case TracerEventType::NumTypes:
default:
output_file_stream_ << string_format(
std::string(
......@@ -181,7 +252,7 @@ void ChromeTracingLogger::LogHostTraceEventNode(
host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(),
host_node.ThreadId(), nsToUs(host_node.StartNs()),
nsToUsFloat(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
StringTracerEventType(host_node.Type()),
nsToUsFloat(host_node.StartNs(), start_time_),
nsToUsFloat(host_node.EndNs(), start_time_));
break;
......@@ -220,8 +291,7 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
runtime_node.Name().c_str(), dur_display.c_str(),
runtime_node.ProcessId(), runtime_node.ThreadId(),
nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()),
categary_name_[static_cast<int>(runtime_node.Type())],
runtime_node.CorrelationId(),
StringTracerEventType(runtime_node.Type()), runtime_node.CorrelationId(),
nsToUsFloat(runtime_node.StartNs(), start_time_),
nsToUsFloat(runtime_node.EndNs(), start_time_));
pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()});
......@@ -347,7 +417,7 @@ void ChromeTracingLogger::HandleTypeKernel(
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
......@@ -391,7 +461,7 @@ void ChromeTracingLogger::HandleTypeMemcpy(
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(),
device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth);
......@@ -427,7 +497,7 @@ void ChromeTracingLogger::HandleTypeMemset(
device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUsFloat(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
StringTracerEventType(device_node.Type()),
nsToUsFloat(device_node.StartNs(), start_time_),
nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
......
......@@ -37,6 +37,7 @@ class ChromeTracingLogger : public BaseLogger {
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogMetaInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
private:
void OpenFile();
......
......@@ -27,7 +27,9 @@ using paddle::platform::HostTraceEventNode;
using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::ProfilerResult;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::SerializationLogger;
......@@ -37,6 +39,8 @@ TEST(SerializationLoggerTest, dump_case0) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000,
10, 10));
......@@ -72,7 +76,8 @@ TEST(SerializationLoggerTest, dump_case0) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case0.pb");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u);
......@@ -101,6 +106,8 @@ TEST(SerializationLoggerTest, dump_case1) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
......@@ -127,7 +134,8 @@ TEST(SerializationLoggerTest, dump_case1) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case1.pb");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u);
......
......@@ -18,6 +18,8 @@ limitations under the License. */
#include <set>
#include <stack>
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
namespace platform {
......@@ -50,8 +52,10 @@ NodeTrees::~NodeTrees() {
void NodeTrees::BuildTrees(
const std::vector<HostTraceEventNode*>& host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
const std::vector<DeviceTraceEventNode*>& device_event_nodes) {
const std::vector<CudaRuntimeTraceEventNode*>& runtime_event_nodes,
const std::vector<DeviceTraceEventNode*>& device_event_nodes,
const std::vector<MemTraceEventNode*>& mem_event_nodes,
const std::vector<OperatorSupplementEventNode*>& op_supplement_events) {
// separate Host Event Nodes into different threads
std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes; // used to store HostTraceEventNodes per thread
......@@ -59,6 +63,15 @@ void NodeTrees::BuildTrees(
thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode
// per
// thread
std::map<uint64_t, std::vector<MemTraceEventNode*>>
thread2mem_event_nodes; // used to store MemTraceEventNode
// per
// thread
std::map<uint64_t, std::vector<OperatorSupplementEventNode*>>
thread2op_supplement_event_nodes; // used to store
// OperatorSupplementEventNode
// per
// thread
std::map<uint32_t, CudaRuntimeTraceEventNode*>
correlation_id2runtime_event_node; // used to store the relation between
// correlation id and runtime node
......@@ -85,6 +98,15 @@ void NodeTrees::BuildTrees(
"no corresponding cuda runtime events"));
dst_iter->second->AddDeviceTraceEventNode(*it);
}
// construct thread2mem_event_nodes
for (auto it = mem_event_nodes.begin(); it != mem_event_nodes.end(); ++it) {
thread2mem_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// construct thread2op_supplement_event_nodes
for (auto it = op_supplement_events.begin(); it != op_supplement_events.end();
++it) {
thread2op_supplement_event_nodes[(*it)->ThreadId()].push_back(*it);
}
// sort host event nodes and runtime event nodes according to start_ns and
// end_ns
// the smaller start_ns is, the further ahead position is.
......@@ -119,6 +141,29 @@ void NodeTrees::BuildTrees(
return false;
});
}
// sort mem event nodes and operator supplement event nodes
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end(); ++it) {
std::sort(it->second.begin(), it->second.end(),
[](MemTraceEventNode* node1, MemTraceEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end(); ++it) {
std::sort(it->second.begin(), it->second.end(),
[](OperatorSupplementEventNode* node1,
OperatorSupplementEventNode* node2) {
if (node1->TimeStampNs() <= node2->TimeStampNs()) {
return true;
}
return false;
});
}
// construct trees
std::set<uint64_t> thread_set;
......@@ -131,16 +176,27 @@ void NodeTrees::BuildTrees(
it != thread2runtime_event_nodes.end(); ++it) {
thread_set.insert(it->first);
}
for (auto it = thread2mem_event_nodes.begin();
it != thread2mem_event_nodes.end(); ++it) {
thread_set.insert(it->first);
}
for (auto it = thread2op_supplement_event_nodes.begin();
it != thread2op_supplement_event_nodes.end(); ++it) {
thread_set.insert(it->first);
}
for (auto it = thread_set.begin(); it != thread_set.end(); ++it) {
thread_event_trees_map_[*it] = BuildTreeRelationship(
thread2host_event_nodes[*it], thread2runtime_event_nodes[*it]);
thread2host_event_nodes[*it], thread2runtime_event_nodes[*it],
thread2mem_event_nodes[*it], thread2op_supplement_event_nodes[*it]);
}
}
HostTraceEventNode* NodeTrees::BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes) {
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_events) {
// a stack used for analyse relationship
auto node_stack = std::vector<HostTraceEventNode*>();
// root node, top level
......@@ -226,6 +282,99 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship(
}
node_stack.pop_back();
}
// build relationship between host event node and mem event node
// First, post-order traverse the tree. Then, insert the memory and op
// supplement node into correct host nodes.
auto stack = std::stack<HostTraceEventNode*>();
auto flag_stack = std::stack<int32_t>();
auto post_order_nodes = std::vector<HostTraceEventNode*>();
stack.push(root_node);
flag_stack.push(0);
while (!stack.empty()) {
auto current_node = stack.top();
stack.pop();
auto flag = flag_stack.top();
flag_stack.pop();
if (flag == 0) {
stack.push(current_node);
flag_stack.push(1);
for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().rend(); ++child) {
stack.push(*child);
flag_stack.push(0);
}
} else {
post_order_nodes.push_back(current_node);
}
}
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
bool hasenter = false;
std::vector<MemTraceEventNode*>::iterator firstposition;
std::vector<MemTraceEventNode*>::iterator lastposition =
mem_event_nodes.end();
for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end();
++mem_it) {
if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() &&
(*mem_it)->TimeStampNs() <= (*it)->EndNs()) {
(*it)->AddMemNode(*mem_it);
if (!hasenter) {
firstposition = mem_it;
hasenter = true;
}
} else {
if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) {
lastposition = mem_it;
break;
}
}
}
if (hasenter) {
mem_event_nodes.erase(firstposition, lastposition);
}
}
// build relationship between host event node and op supplement node
for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) {
int op_supplement_count = 0;
bool hasenter = false;
std::vector<OperatorSupplementEventNode*>::iterator firstposition;
std::vector<OperatorSupplementEventNode*>::iterator lastposition =
op_supplement_events.end();
for (auto op_supplement_it = op_supplement_events.begin();
op_supplement_it < op_supplement_events.end(); ++op_supplement_it) {
if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() &&
(*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) {
if (!hasenter) {
firstposition = op_supplement_it;
hasenter = true;
}
(*it)->SetOperatorSupplementNode(*op_supplement_it);
PADDLE_ENFORCE_EQ((*it)->Type(), TracerEventType::Operator,
platform::errors::PreconditionNotMet(
"Operator supplement events should be embraced "
"by event of type TracerEventType::Operator, "
"but got type TracerEventType::%s",
StringTracerEventType((*it)->Type())));
op_supplement_count += 1;
} else {
if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) {
PADDLE_ENFORCE_LE(op_supplement_count, 1,
platform::errors::PreconditionNotMet(
"One event of TracerEventType::Operator has no "
"more than 1 op supplement event, but got %d.",
op_supplement_count));
lastposition = op_supplement_it;
break;
}
}
}
if (hasenter) {
op_supplement_events.erase(firstposition, lastposition);
}
}
return root_node;
}
......@@ -263,8 +412,8 @@ std::map<uint64_t, std::vector<HostTraceEventNode*>> NodeTrees::Traverse(
auto current_node = stack.top();
stack.pop();
thread2host_event_nodes[thread_id].push_back(current_node);
for (auto child = current_node->GetChildren().begin();
child != current_node->GetChildren().end(); ++child) {
for (auto child = current_node->GetChildren().rbegin();
child != current_node->GetChildren().rend(); ++child) {
stack.push(*child);
}
}
......@@ -278,7 +427,10 @@ void NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); }
void NodeTrees::HandleTrees(
std::function<void(HostTraceEventNode*)> host_event_node_handle,
std::function<void(CudaRuntimeTraceEventNode*)> runtime_event_node_handle,
std::function<void(DeviceTraceEventNode*)> device_event_node_handle) {
std::function<void(DeviceTraceEventNode*)> device_event_node_handle,
std::function<void(MemTraceEventNode*)> mem_event_node_handle,
std::function<void(OperatorSupplementEventNode*)>
op_supplement_node_handle) {
// using different user-defined function to handle different nodes
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = Traverse(true);
......@@ -300,6 +452,15 @@ void NodeTrees::HandleTrees(
device_event_node_handle(*devicenode);
}
}
for (auto memeventnode = (*hostnode)->GetMemTraceEventNodes().begin();
memeventnode != (*hostnode)->GetMemTraceEventNodes().end();
++memeventnode) {
mem_event_node_handle(*memeventnode);
}
if ((*hostnode)->GetOperatorSupplementEventNode()) {
op_supplement_node_handle(
(*hostnode)->GetOperatorSupplementEventNode());
}
}
}
}
......
......@@ -21,12 +21,67 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
class MemTraceEventNode {
public:
// constructor
explicit MemTraceEventNode(const MemTraceEvent& mem_event)
: mem_event_(mem_event) {}
// destructor
~MemTraceEventNode();
// getter
TracerMemEventType Type() const { return mem_event_.type; }
uint64_t Addr() const { return mem_event_.addr; }
uint64_t TimeStampNs() const { return mem_event_.timestamp_ns; }
uint64_t ProcessId() const { return mem_event_.process_id; }
uint64_t ThreadId() const { return mem_event_.thread_id; }
int64_t IncreaseBytes() const { return mem_event_.increase_bytes; }
std::string Place() const { return mem_event_.place; }
uint64_t CurrentAllocated() const { return mem_event_.current_allocated; }
uint64_t CurrentReserved() const { return mem_event_.current_reserved; }
// member function
void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); }
private:
// data
MemTraceEvent mem_event_;
};
class OperatorSupplementEventNode {
public:
// constructor
explicit OperatorSupplementEventNode(
const OperatorSupplementEvent& op_supplement_event)
: op_supplement_event_(op_supplement_event) {}
// destructor
~OperatorSupplementEventNode() {}
// getter
std::string Name() const { return op_supplement_event_.op_type; }
uint64_t TimeStampNs() const { return op_supplement_event_.timestamp_ns; }
std::map<std::string, std::vector<std::vector<int64_t>>>& InputShapes() {
return op_supplement_event_.input_shapes;
}
std::map<std::string, std::vector<std::string>>& Dtypes() {
return op_supplement_event_.dtypes;
}
std::string CallStack() { return op_supplement_event_.callstack; }
uint64_t ProcessId() const { return op_supplement_event_.process_id; }
uint64_t ThreadId() const { return op_supplement_event_.thread_id; }
private:
// data
OperatorSupplementEvent op_supplement_event_;
};
class DeviceTraceEventNode {
public:
// constructor
......@@ -139,6 +194,10 @@ class HostTraceEventNode {
void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) {
runtime_node_ptrs_.push_back(node);
}
void AddMemNode(MemTraceEventNode* node) { mem_node_ptrs_.push_back(node); }
void SetOperatorSupplementNode(OperatorSupplementEventNode* node) {
op_supplement_node_ptr_ = node;
}
const std::vector<HostTraceEventNode*>& GetChildren() const {
return children_;
}
......@@ -146,6 +205,14 @@ class HostTraceEventNode {
const {
return runtime_node_ptrs_;
}
const std::vector<MemTraceEventNode*>& GetMemTraceEventNodes() const {
return mem_node_ptrs_;
}
OperatorSupplementEventNode* GetOperatorSupplementEventNode() const {
return op_supplement_node_ptr_;
}
void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); }
private:
......@@ -155,6 +222,9 @@ class HostTraceEventNode {
std::vector<CudaRuntimeTraceEventNode*> runtime_node_ptrs_;
// host events called by this
std::vector<HostTraceEventNode*> children_;
// memory events happened in this event period
std::vector<MemTraceEventNode*> mem_node_ptrs_;
OperatorSupplementEventNode* op_supplement_node_ptr_ = nullptr;
};
class NodeTrees {
......@@ -162,10 +232,14 @@ class NodeTrees {
// constructor
NodeTrees(const std::list<HostTraceEvent>& host_events,
const std::list<RuntimeTraceEvent>& runtime_events,
const std::list<DeviceTraceEvent>& device_events) {
const std::list<DeviceTraceEvent>& device_events,
const std::list<MemTraceEvent>& mem_events,
const std::list<OperatorSupplementEvent>& op_supplement_events) {
std::vector<HostTraceEventNode*> host_event_nodes;
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes;
std::vector<DeviceTraceEventNode*> device_event_nodes;
std::vector<MemTraceEventNode*> mem_event_nodes;
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes;
// encapsulate event into nodes
for (auto it = host_events.begin(); it != host_events.end(); ++it) {
host_event_nodes.push_back(new HostTraceEventNode(*it));
......@@ -176,8 +250,16 @@ class NodeTrees {
for (auto it = device_events.begin(); it != device_events.end(); ++it) {
device_event_nodes.push_back(new DeviceTraceEventNode(*it));
}
for (auto it = mem_events.begin(); it != mem_events.end(); ++it) {
mem_event_nodes.push_back(new MemTraceEventNode(*it));
}
for (auto it = op_supplement_events.begin();
it != op_supplement_events.end(); ++it) {
op_supplement_event_nodes.push_back(new OperatorSupplementEventNode(*it));
}
// build tree
BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes);
BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes,
mem_event_nodes, op_supplement_event_nodes);
}
explicit NodeTrees(
......@@ -190,7 +272,9 @@ class NodeTrees {
void LogMe(BaseLogger* logger);
void HandleTrees(std::function<void(HostTraceEventNode*)>,
std::function<void(CudaRuntimeTraceEventNode*)>,
std::function<void(DeviceTraceEventNode*)>);
std::function<void(DeviceTraceEventNode*)>,
std::function<void(MemTraceEventNode*)>,
std::function<void(OperatorSupplementEventNode*)>);
const std::map<uint64_t, HostTraceEventNode*>& GetNodeTrees() const {
return thread_event_trees_map_;
}
......@@ -199,11 +283,15 @@ class NodeTrees {
private:
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map_;
void BuildTrees(const std::vector<HostTraceEventNode*>&,
std::vector<CudaRuntimeTraceEventNode*>&,
const std::vector<DeviceTraceEventNode*>&);
const std::vector<CudaRuntimeTraceEventNode*>&,
const std::vector<DeviceTraceEventNode*>&,
const std::vector<MemTraceEventNode*>&,
const std::vector<OperatorSupplementEventNode*>&);
HostTraceEventNode* BuildTreeRelationship(
std::vector<HostTraceEventNode*> host_event_nodes,
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes);
std::vector<CudaRuntimeTraceEventNode*> runtime_event_nodes,
std::vector<MemTraceEventNode*> mem_event_nodes,
std::vector<OperatorSupplementEventNode*> op_supplement_event_nodes);
};
} // namespace platform
......
......@@ -24,6 +24,7 @@ class DeviceTraceEventNode; // forward declaration
class HostTraceEventNode; // forward declaration
class CudaRuntimeTraceEventNode; // forward declaration
class NodeTrees; // forward declaration
class MemTraceEventNode; // forward declaration
class BaseLogger {
public:
......@@ -33,6 +34,7 @@ class BaseLogger {
virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
virtual void LogNodeTrees(const NodeTrees&) {}
virtual void LogMemTraceEventNode(const MemTraceEventNode&) {}
};
} // namespace platform
......
......@@ -101,9 +101,10 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
tracer.Get().StopTracing();
tracer.Get().CollectTraceData(&collector);
}
std::unique_ptr<NodeTrees> tree(new NodeTrees(collector.HostEvents(),
collector.RuntimeEvents(),
collector.DeviceEvents()));
std::unique_ptr<NodeTrees> tree(
new NodeTrees(collector.HostEvents(), collector.RuntimeEvents(),
collector.DeviceEvents(), collector.MemEvents(),
collector.OperatorSupplementEvents()));
cpu_utilization_.RecordEndTimeInfo();
ExtraInfo extrainfo;
extrainfo.AddExtraInfo(std::string("System Cpu Utilization"),
......
......@@ -25,13 +25,20 @@ using paddle::platform::HostTraceEventNode;
using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo;
using paddle::platform::MemTraceEvent;
using paddle::platform::MemTraceEventNode;
using paddle::platform::NodeTrees;
using paddle::platform::OperatorSupplementEvent;
using paddle::platform::OperatorSupplementEventNode;
using paddle::platform::RuntimeTraceEvent;
using paddle::platform::TracerEventType;
using paddle::platform::TracerMemEventType;
TEST(NodeTreesTest, LogMe_case0) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
TracerEventType::Dataloader, 1000, 10000,
10, 10));
......@@ -41,6 +48,19 @@ TEST(NodeTreesTest, LogMe_case0) {
std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
mem_events.push_back(MemTraceEvent(11500, 0x1000,
TracerMemEventType::Allocate, 10, 10, 50,
"GPU:0", 50, 50));
mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free,
10, 10, -50, "GPU:0", 0, 50));
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
input_shapes[std::string("X")].push_back(std::vector<int64_t>{1, 2, 3});
input_shapes[std::string("X")].push_back(std::vector<int64_t>{4, 5, 6, 7});
dtypes[std::string("X")].push_back(std::string("int8"));
dtypes[std::string("X")].push_back(std::string("float32"));
op_supplement_events.push_back(OperatorSupplementEvent(
11600, "op1", input_shapes, dtypes, "op1()", 10, 10));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
......@@ -67,7 +87,8 @@ TEST(NodeTreesTest, LogMe_case0) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u);
......@@ -81,6 +102,8 @@ TEST(NodeTreesTest, LogMe_case0) {
if ((*it)->Name() == "op1") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u);
EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr);
}
}
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
......@@ -90,12 +113,15 @@ TEST(NodeTreesTest, LogMe_case0) {
}
}
tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, LogMe_case1) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
17000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
......@@ -122,7 +148,8 @@ TEST(NodeTreesTest, LogMe_case1) {
DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
69000, 0, 10, 11, 5, MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u);
......@@ -141,18 +168,29 @@ TEST(NodeTreesTest, LogMe_case1) {
}
}
tree.LogMe(&logger);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, HandleTrees_case0) {
std::list<HostTraceEvent> host_events;
std::list<RuntimeTraceEvent> runtime_events;
std::list<DeviceTraceEvent> device_events;
std::list<MemTraceEvent> mem_events;
std::list<OperatorSupplementEvent> op_supplement_events;
host_events.push_back(HostTraceEvent(
std::string("op1"), TracerEventType::Operator, 10000, 100000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op2"), TracerEventType::Operator, 30000, 70000, 10, 10));
host_events.push_back(HostTraceEvent(
std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11));
mem_events.push_back(MemTraceEvent(11500, 0x1000,
TracerMemEventType::Allocate, 10, 10, 50,
"GPU:0", 50, 50));
mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free,
10, 10, -50, "GPU:0", 0, 50));
op_supplement_events.push_back(OperatorSupplementEvent(
11600, "op1", std::map<std::string, std::vector<std::vector<int64_t>>>(),
std::map<std::string, std::vector<std::string>>(), "op1()", 10, 10));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
25000, 10, 10, 1, 0));
runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 35000,
......@@ -169,7 +207,8 @@ TEST(NodeTreesTest, HandleTrees_case0) {
DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
75000, 0, 10, 11, 3, KernelEventInfo()));
ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
NodeTrees tree(host_events, runtime_events, device_events);
NodeTrees tree(host_events, runtime_events, device_events, mem_events,
op_supplement_events);
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree.Traverse(true);
EXPECT_EQ(nodes[10].size(), 3u);
......@@ -199,6 +238,12 @@ TEST(NodeTreesTest, HandleTrees_case0) {
});
std::function<void(DeviceTraceEventNode*)> device_event_node_handle(
[&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); });
std::function<void(MemTraceEventNode*)> mem_event_node_handle(
[&](MemTraceEventNode* a) { logger.LogMemTraceEventNode(*a); });
std::function<void(OperatorSupplementEventNode*)>
op_supplement_event_node_handle([&](OperatorSupplementEventNode* a) {});
tree.HandleTrees(host_event_node_handle, runtime_event_node_handle,
device_event_node_handle);
device_event_node_handle, mem_event_node_handle,
op_supplement_event_node_handle);
logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
}
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <map>
#include <string>
#include <vector>
namespace paddle {
namespace platform {
......@@ -56,6 +58,15 @@ enum class TracerEventType {
NumTypes
};
enum class TracerMemEventType {
// Used to mark memory allocation
Allocate = 0,
// Used to mark memory free
Free = 1,
// A flag to denote the number of current types
NumTypes
};
struct KernelEventInfo {
// The X-dimension block size for the kernel.
uint32_t block_x;
......@@ -118,6 +129,36 @@ struct MemsetEventInfo {
uint32_t value;
};
struct OperatorSupplementEvent {
OperatorSupplementEvent() = default;
OperatorSupplementEvent(
uint64_t timestamp_ns, const std::string& op_type,
const std::map<std::string, std::vector<std::vector<int64_t>>>&
input_shapes,
const std::map<std::string, std::vector<std::string>>& dtypes,
const std::string& callstack, uint64_t process_id, uint64_t thread_id)
: timestamp_ns(timestamp_ns),
op_type(op_type),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack),
process_id(process_id),
thread_id(thread_id) {}
// timestamp of the record
uint64_t timestamp_ns;
// op type name
std::string op_type;
// input shapes
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
// call stack
std::string callstack;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
};
struct HostTraceEvent {
HostTraceEvent() = default;
HostTraceEvent(const std::string& name, TracerEventType type,
......@@ -242,5 +283,42 @@ struct DeviceTraceEvent {
};
};
struct MemTraceEvent {
MemTraceEvent() = default;
MemTraceEvent(uint64_t timestamp_ns, uint64_t addr, TracerMemEventType type,
uint64_t process_id, uint64_t thread_id, int64_t increase_bytes,
const std::string& place, uint64_t current_allocated,
uint64_t current_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
process_id(process_id),
thread_id(thread_id),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved) {}
// timestamp of the record
uint64_t timestamp_ns;
// memory addr of allocation or free
uint64_t addr;
// memory manipulation type
TracerMemEventType type;
// process id of the record
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// increase bytes after this manipulation, allocation for sign +, free for
// sign -
int64_t increase_bytes;
// place
std::string place;
// current total allocated memory
uint64_t current_allocated;
// current total reserved memory
uint64_t current_reserved;
};
} // namespace platform
} // namespace paddle
......@@ -39,6 +39,12 @@ class TraceEventCollector {
thread_names_[tid] = name;
}
void AddMemEvent(MemTraceEvent&& event) { mem_events_.push_back(event); }
void AddOperatorSupplementEvent(OperatorSupplementEvent&& event) {
op_supplement_events_.push_back(event);
}
const std::list<HostTraceEvent>& HostEvents() const { return host_events_; }
const std::list<RuntimeTraceEvent>& RuntimeEvents() const {
......@@ -49,6 +55,12 @@ class TraceEventCollector {
return device_events_;
}
const std::list<MemTraceEvent>& MemEvents() const { return mem_events_; }
const std::list<OperatorSupplementEvent>& OperatorSupplementEvents() const {
return op_supplement_events_;
}
const std::unordered_map<uint64_t, std::string>& ThreadNames() const {
return thread_names_;
}
......@@ -58,6 +70,8 @@ class TraceEventCollector {
host_events_.clear();
runtime_events_.clear();
device_events_.clear();
mem_events_.clear();
op_supplement_events_.clear();
}
private:
......@@ -65,6 +79,8 @@ class TraceEventCollector {
std::list<HostTraceEvent> host_events_;
std::list<RuntimeTraceEvent> runtime_events_;
std::list<DeviceTraceEvent> device_events_;
std::list<MemTraceEvent> mem_events_;
std::list<OperatorSupplementEvent> op_supplement_events_;
};
} // namespace platform
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler/utils.h"
#include <sstream>
#include <vector>
#include "glog/logging.h"
......@@ -21,6 +22,26 @@ limitations under the License. */
namespace paddle {
namespace platform {
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << "\"" << (*it) << "\""
<< ",";
} else {
res_stream << "\"" << (*it) << "\"";
}
count--;
}
res_stream << "]";
return res_stream.str();
}
#ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
int32_t StaticSharedMemory,
......@@ -61,5 +82,21 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
}
#endif
const char* StringTracerMemEventType(TracerMemEventType type) {
static const char* categary_name_[] = {"Allocate", "Free"};
return categary_name_[static_cast<int>(type)];
}
const char* StringTracerEventType(TracerEventType type) {
static const char* categary_name_[] = {
"Operator", "Dataloader", "ProfileStep",
"CudaRuntime", "Kernel", "Memcpy",
"Memset", "UserDefined", "OperatorInner",
"Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined",
"MluRuntime"};
return categary_name_[static_cast<int>(type)];
}
} // namespace platform
} // namespace paddle
......@@ -14,11 +14,15 @@ limitations under the License. */
#pragma once
#include <ctime>
#include <map>
#include <ostream>
#include <string>
#include <vector>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
......@@ -36,6 +40,64 @@ std::string string_format(const std::string& format, Args... args) {
return std::string(buf.get(), size - 1); // exclude the '\0'
}
template <typename basic_type>
std::string json_vector(const std::vector<basic_type> type_vector) {
std::ostringstream res_stream;
auto count = type_vector.size();
res_stream << "[";
for (auto it = type_vector.begin(); it != type_vector.end(); it++) {
if (count > 1) {
res_stream << (*it) << ",";
} else {
res_stream << (*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <typename basic_type>
std::string json_vector(
const std::vector<std::vector<basic_type>> shape_vector) {
std::ostringstream res_stream;
auto count = shape_vector.size();
res_stream << "[";
for (auto it = shape_vector.begin(); it != shape_vector.end(); it++) {
if (count > 1) {
res_stream << json_vector(*it) << ",";
} else {
res_stream << json_vector(*it);
}
count--;
}
res_stream << "]";
return res_stream.str();
}
template <>
std::string json_vector<std::string>(
const std::vector<std::string> type_vector);
template <typename type>
std::string json_dict(const std::map<std::string, std::vector<type>> data_map) {
std::ostringstream res_stream;
auto count = data_map.size();
res_stream << "{";
for (auto it = data_map.begin(); it != data_map.end(); it++) {
if (count > 1) {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second) << ",";
} else {
res_stream << "\"" << it->first << "\""
<< ":" << json_vector(it->second);
}
count--;
}
res_stream << "}";
return res_stream.str();
}
static std::string GetStringFormatLocalTime() {
std::time_t rawtime;
std::tm* timeinfo;
......@@ -50,6 +112,10 @@ static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) {
return (end_ns - start_ns) / 1000;
}
const char* StringTracerMemEventType(TracerMemEventType type);
const char* StringTracerEventType(TracerEventType type);
static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
return static_cast<float>(end_ns - start_ns) / 1000;
}
......@@ -63,5 +129,6 @@ float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread,
int32_t dynamicSharedMemory, int32_t blockX,
int32_t blockY, int32_t blockZ, float blocksPerSm);
#endif
} // namespace platform
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册