未验证 提交 0c3f7fbc 编写于 作者: C chenjian 提交者: GitHub

Upgrade new profiler (#39984)

* add new profiler components

* fix bug

* upgrade new profiler

* fix operator.cc

* fix operator.cc

* fix cmakelists.txt

* fix bug

* fix according to pr

* fix bug

* fix cmake

* fix bug

* fix a bug

* fix bug

* fix bug
上级 bc113e10
......@@ -264,10 +264,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
// and different op name cost time,we set two event.
platform::RecordEvent op_type_record_event(
Type(), platform::TracerEventType::Operator, 1);
// auto op_name = platform::OpName(outputs_, Type());
// platform::RecordEvent op_name_record_event(
// op_name, platform::TracerEventType::Operator, 1,
// platform::EventRole::kUniqueOp);
auto op_name = platform::OpName(outputs_, Type());
platform::RecordEvent op_name_record_event(
op_name, platform::TracerEventType::Operator, 10,
platform::EventRole::kUniqueOp);
RunImpl(scope, place);
}
......
......@@ -2,10 +2,12 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
cc_library(event_node SRCS event_node.cc DEPS enforce)
cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger)
add_subdirectory(dump)
cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils)
cc_library(event_bind SRCS event_python.cc DEPS profiler_logger)
cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger)
cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization)
cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node)
cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind)
cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler)
......@@ -18,40 +18,17 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/chrometracing_logger.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
namespace platform {
static const char* kSchemaVersion = "1.0.0";
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
static uint32_t num_span = 0;
static int64_t nsToUs(int64_t ns) { return ns / 1000; }
template <typename... Args>
std::string string_format(const std::string& format, Args... args) {
int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) +
1; // Extra space for '\0'
PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal(
"Error during profiler data formatting."));
auto size = static_cast<size_t>(size_s);
auto buf = std::make_unique<char[]>(size);
std::snprintf(buf.get(), size, format.c_str(), args...);
return std::string(buf.get(), size - 1); // exclude the '\0'
}
std::string GetStringFormatLocalTime() {
std::time_t rawtime;
std::tm* timeinfo;
char buf[100];
std::time(&rawtime);
timeinfo = std::localtime(&rawtime);
std::strftime(buf, 100, "%F-%X", timeinfo);
return std::string(buf);
}
static uint32_t span_indx = 0;
static std::string DefaultFileName() {
auto pid = GetProcessId();
......@@ -60,16 +37,19 @@ static std::string DefaultFileName() {
}
const char* ChromeTracingLogger::categary_name_[] = {
"operator", "dataloader", "profile_step", "cuda_runtime", "kernel",
"memcpy", "memset", "user_defined", "others"};
"Operator", "Dataloader", "ProfileStep", "CudaRuntime",
"Kernel", "Memcpy", "Memset", "UserDefined",
"OperatorInner", "Forward", "Backward", "Optimization",
"Communication", "PythonOp", "PythonUserDefined"};
void ChromeTracingLogger::OpenFile() {
output_file_stream_.open(filename_,
std::ofstream::out | std::ofstream::trunc);
if (!output_file_stream_) {
VLOG(2) << "Unable to open file for writing profiling data." << std::endl;
LOG(WARNING) << "Unable to open file for writing profiling data."
<< std::endl;
} else {
VLOG(0) << "writing profiling data to " << filename_ << std::endl;
LOG(INFO) << "writing profiling data to " << filename_ << std::endl;
}
}
......@@ -122,21 +102,54 @@ void ChromeTracingLogger::LogHostTraceEventNode(
if (!output_file_stream_) {
return;
}
output_file_stream_ << string_format(
std::string(
R"JSON(
switch (host_node.Type()) {
case TracerEventType::ProfileStep:
case TracerEventType::Forward:
case TracerEventType::Backward:
case TracerEventType::Dataloader:
case TracerEventType::Optimization:
case TracerEventType::PythonOp:
case TracerEventType::PythonUserDefined:
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": %lld,
"name": "%s", "pid": %lld, "tid": "%lld(Python)",
"ts": %lld, "dur": %lld,
"ph": "X", "cat": "%s",
"args": {
"start_ns": %lld,
"end_ns": %lld
}
},
)JSON"),
host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(),
nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())]);
host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(),
nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
host_node.StartNs(), host_node.EndNs());
break;
default:
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %lld,
"ph": "X", "cat": "%s",
"args": {
"start_ns": %lld,
"end_ns": %lld
}
},
)JSON"),
host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(),
nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()),
categary_name_[static_cast<int>(host_node.Type())],
host_node.StartNs(), host_node.EndNs());
break;
}
pid_tid_set_.insert({host_node.ProcessId(), host_node.ThreadId()});
}
void ChromeTracingLogger::LogRuntimeTraceEventNode(
......@@ -148,11 +161,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
std::string(
R"JSON(
{
"name": "%s", "pid": %lld, "tid": %lld,
"name": "%s", "pid": %lld, "tid": "%lld(C++)",
"ts": %lld, "dur": %lld,
"ph": "X", "cat": "%s",
"args": {
"correlation id": %d
"correlation id": %d,
"start_ns": %lld,
"end_ns": %lld
}
},
)JSON"),
......@@ -160,7 +175,23 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode(
runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()),
nsToUs(runtime_node.Duration()),
categary_name_[static_cast<int>(runtime_node.Type())],
runtime_node.CorrelationId());
runtime_node.CorrelationId(), runtime_node.StartNs(),
runtime_node.EndNs());
pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()});
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "launch", "id": %d, "pid": %lld, "tid": "%lld(C++)",
"ts": %lld,
"ph": "s", "cat": "async"
},
)JSON"),
runtime_node.CorrelationId(), runtime_node.ProcessId(),
runtime_node.ThreadId(),
nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1));
pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()});
}
void ChromeTracingLogger::LogDeviceTraceEventNode(
......@@ -180,6 +211,36 @@ void ChromeTracingLogger::LogDeviceTraceEventNode(
default:
break;
}
if (nsToUs(device_node.Duration()) == 0) {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "launch", "id": %d, "pid": %lld, "tid": %lld,
"ts": %lld,
"ph": "f", "cat": "async"
},
)JSON"),
device_node.CorrelationId(), device_node.DeviceId(),
device_node.StreamId(), nsToUs(device_node.StartNs()));
deviceid_streamid_set_.insert(
{device_node.DeviceId(), device_node.StreamId()});
} else {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "launch", "id": %d, "pid": %lld, "tid": %lld,
"ts": %lld,
"ph": "f", "cat": "async", "bp": "e"
},
)JSON"),
device_node.CorrelationId(), device_node.DeviceId(),
device_node.StreamId(),
nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1));
deviceid_streamid_set_.insert(
{device_node.DeviceId(), device_node.StreamId()});
}
}
void ChromeTracingLogger::HandleTypeKernel(
......@@ -188,16 +249,21 @@ void ChromeTracingLogger::HandleTypeKernel(
float blocks_per_sm = 0.0;
float warps_per_sm = 0.0;
float occupancy = 0.0;
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUPTI)
constexpr int threads_per_warp = 32;
const gpuDeviceProp& device_property =
GetDeviceProperties(device_node.DeviceId());
blocks_per_sm =
(kernel_info.grid_x * kernel_info.grid_y * kernel_info.grid_z) /
device_property.multiProcessorCount;
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
kernel_info.grid_z) /
device_property.multiProcessorCount;
warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y *
kernel_info.block_z) /
threads_per_warp;
occupancy = CalculateEstOccupancy(
device_node.DeviceId(), kernel_info.registers_per_thread,
kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory,
kernel_info.block_x, kernel_info.block_y, kernel_info.block_z,
blocks_per_sm);
#endif
output_file_stream_ << string_format(
......@@ -208,15 +274,17 @@ void ChromeTracingLogger::HandleTypeKernel(
"ts": %lld, "dur": %lld,
"ph": "X", "cat": "%s",
"args": {
"start_ns": %lld,
"end_ns": %lld,
"device": %d, "context": %d,
"stream": %d, "correlation id": %d,
"registers per thread": %d,
"shared memory": %f,
"shared memory": %d,
"blocks per SM": %f,
"warps per SM": %f,
"grid": [%d, %d, %d],
"block": [%d, %d, %d],
"est. achieved occupancy %": %f
"theoretical achieved occupancy %%": %f
}
},
)JSON"),
......@@ -224,12 +292,13 @@ void ChromeTracingLogger::HandleTypeKernel(
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUs(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(),
device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
device_node.CorrelationId(), kernel_info.registers_per_thread,
kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory,
blocks_per_sm, warps_per_sm, kernel_info.grid_x, kernel_info.grid_y,
kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y,
kernel_info.block_z, occupancy);
kernel_info.block_z, occupancy * 100);
}
void ChromeTracingLogger::HandleTypeMemcpy(
......@@ -247,6 +316,8 @@ void ChromeTracingLogger::HandleTypeMemcpy(
"ts": %lld, "dur": %lld,
"ph": "X", "cat": "%s",
"args": {
"start_ns": %lld,
"end_ns": %lld,
"stream": %d, "correlation id": %d,
"bytes": %d, "memory bandwidth (GB/s)": %f
}
......@@ -256,8 +327,8 @@ void ChromeTracingLogger::HandleTypeMemcpy(
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUs(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
device_node.StreamId(), device_node.CorrelationId(),
memcpy_info.num_bytes, memory_bandwidth);
device_node.StartNs(), device_node.EndNs(), device_node.StreamId(),
device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth);
}
void ChromeTracingLogger::HandleTypeMemset(
......@@ -271,6 +342,8 @@ void ChromeTracingLogger::HandleTypeMemset(
"ts": %lld, "dur": %lld,
"ph": "X", "cat": "%s",
"args": {
"start_ns": %lld,
"end_ns": %lld,
"device": %d, "context": %d,
"stream": %d, "correlation id": %d,
"bytes": %d, "value": %d
......@@ -281,7 +354,8 @@ void ChromeTracingLogger::HandleTypeMemset(
device_node.StreamId(), nsToUs(device_node.StartNs()),
nsToUs(device_node.Duration()),
categary_name_[static_cast<int>(device_node.Type())],
device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(),
device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(),
device_node.ContextId(), device_node.StreamId(),
device_node.CorrelationId(), memset_info.num_bytes, memset_info.value);
}
......@@ -290,10 +364,10 @@ void ChromeTracingLogger::StartLog() {
R"JSON(
{
"schemaVersion": "%s",
"displayTimeUnit": "us",
"SpanNumber": "%d",
"displayTimeUnit": "ms",
"span_indx": "%d",
)JSON"),
kSchemaVersion, num_span);
kSchemaVersion, span_indx++);
// add device property information
#if defined(PADDLE_WITH_CUDA)
output_file_stream_ << std::string(R"JSON(
......@@ -358,11 +432,143 @@ void ChromeTracingLogger::StartLog() {
)JSON");
}
void ChromeTracingLogger::EndLog() {
void ChromeTracingLogger::LogMetaInfo(
const std::unordered_map<std::string, std::string> extra_info) {
RefineDisplayName(extra_info);
output_file_stream_ << std::string(
R"JSON(
{}
]
],
)JSON");
output_file_stream_ << std::string(R"JSON(
"ExtraInfo": {)JSON");
size_t count = extra_info.size();
for (const auto& kv : extra_info) {
if (count > 1) {
output_file_stream_ << string_format(std::string(R"JSON(
"%s": "%s",
)JSON"),
kv.first.c_str(), kv.second.c_str());
} else {
output_file_stream_ << string_format(std::string(R"JSON(
"%s": "%s"
)JSON"),
kv.first.c_str(), kv.second.c_str());
}
count--;
}
output_file_stream_ << std::string(R"JSON(
})JSON");
}
void ChromeTracingLogger::RefineDisplayName(
std::unordered_map<std::string, std::string> extra_info) {
for (auto it = pid_tid_set_.begin(); it != pid_tid_set_.end(); ++it) {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "process_name", "pid": %lld, "tid": "%lld(Python)",
"ph": "M",
"args": {
"name": "Process %lld (CPU)"
}
},
{
"name": "process_name", "pid": %lld, "tid": "%lld(C++)",
"ph": "M",
"args": {
"name": "Process %lld (CPU)"
}
},
{
"name": "thread_name", "pid": %lld, "tid": "%lld(Python)",
"ph": "M",
"args": {
"name": "thread %lld:%s(Python)"
}
},
{
"name": "thread_name", "pid": %lld, "tid": "%lld(C++)",
"ph": "M",
"args": {
"name": "thread %lld:%s(C++)"
}
},
{
"name": "process_sort_index", "pid": %lld, "tid": %lld,
"ph": "M",
"args": {
"sort_index": %lld
}
},
{
"name": "thread_sort_index", "pid": %lld, "tid": "%lld(Python)",
"ph": "M",
"args": {
"sort_index": %lld
}
},
{
"name": "thread_sort_index", "pid": %lld, "tid": "%lld(C++)",
"ph": "M",
"args": {
"sort_index": %lld
}
},
)JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second,
(*it).first, (*it).first, (*it).second, (*it).second,
extra_info[string_format(std::string("%lld"), (*it).second)].c_str(),
(*it).first, (*it).second, (*it).second,
extra_info[string_format(std::string("%lld"), (*it).second)].c_str(),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second,
(*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1);
}
for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); ++it) {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"name": "process_name", "pid": %lld, "tid": %lld,
"ph": "M",
"args": {
"name": "Deivce %lld (GPU)"
}
},
{
"name": "thread_name", "pid": %lld, "tid": %lld,
"ph": "M",
"args": {
"name": "stream %lld"
}
},
{
"name": "process_sort_index", "pid": %lld, "tid": %lld,
"ph": "M",
"args": {
"sort_index": %lld
}
},
{
"name": "thread_sort_index", "pid": %lld, "tid": %lld,
"ph": "M",
"args": {
"sort_index": %lld
}
},
)JSON"),
(*it).first, (*it).second, (*it).first, (*it).first, (*it).second,
(*it).second, (*it).first, (*it).second, (*it).first + 0x10000000,
(*it).first, (*it).second, (*it).second);
}
}
void ChromeTracingLogger::EndLog() {
output_file_stream_ << std::string(
R"JSON(
}
)JSON");
}
......
......@@ -13,11 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <set>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/platform/profiler/output_logger.h"
namespace paddle {
namespace platform {
// Dump a NodeTrees into a chrome tracing file.
// A ChromeTracingLogger object can only dump a NodeTrees object,
// creates a file in the constructor and closes the file in the destructor.
// should only call LogNodeTrees and LogMetaInfo in order.
class ChromeTracingLogger : public BaseLogger {
public:
explicit ChromeTracingLogger(const std::string& filename);
......@@ -28,6 +35,7 @@ class ChromeTracingLogger : public BaseLogger {
void LogHostTraceEventNode(const HostTraceEventNode&) override;
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogMetaInfo(const std::unordered_map<std::string, std::string>);
private:
void OpenFile();
......@@ -36,9 +44,12 @@ class ChromeTracingLogger : public BaseLogger {
void HandleTypeMemcpy(const DeviceTraceEventNode&);
void StartLog();
void EndLog();
void RefineDisplayName(std::unordered_map<std::string, std::string>);
std::string filename_;
std::ofstream output_file_stream_;
static const char* categary_name_[];
std::set<std::pair<uint64_t, uint64_t>> pid_tid_set_;
std::set<std::pair<uint64_t, uint64_t>> deviceid_streamid_set_;
};
} // namespace platform
......
......@@ -54,19 +54,16 @@ void CpuUtilization::RecordBeginTimeInfo() {
if (stat_file != nullptr) {
char temp_str[200];
uint64_t temp_lu;
while (true) {
int retval = fscanf(
stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
"%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
temp_str, &system_tms_start_.tms_utime, &nice_time_start_,
&system_tms_start_.tms_stime, &idle_start_, &iowait_start_,
&irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu);
if (std::string(temp_str).find("cpu") != 0) {
break;
}
if (retval != 11) {
return;
}
int retval = fscanf(
stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
"%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
temp_str, &system_tms_start_.tms_utime, &nice_time_start_,
&system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_,
&softirq_start_, &steal_start_, &temp_lu, &temp_lu);
if (retval != 11) {
LOG(WARNING)
<< "Failed to read cpu utilization information at record beginning."
<< std::endl;
}
fclose(stat_file);
}
......@@ -90,19 +87,17 @@ void CpuUtilization::RecordEndTimeInfo() {
if (stat_file != nullptr) {
char temp_str[200];
uint64_t temp_lu;
while (true) {
int retval = fscanf(
stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
"%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
temp_str, &system_tms_end_.tms_utime, &nice_time_end_,
&system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_,
&softirq_end_, &steal_end_, &temp_lu, &temp_lu);
if (std::string(temp_str).find("cpu") != 0) {
break;
}
if (retval != 11) {
return;
}
int retval = fscanf(
stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
"%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
temp_str, &system_tms_end_.tms_utime, &nice_time_end_,
&system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_,
&softirq_end_, &steal_end_, &temp_lu, &temp_lu);
if (retval != 11) {
LOG(WARNING)
<< "Failed to read cpu utilization information at record end."
<< std::endl;
}
fclose(stat_file);
}
......
proto_library(nodetreeproto SRCS nodetree.proto)
cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node)
cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node)
cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node)
......@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
#include <cstring>
#include "paddle/fluid/platform/profiler/extra_info.h"
namespace paddle {
namespace platform {
......@@ -36,11 +36,19 @@ void DeserializationReader::OpenFile() {
}
}
std::unique_ptr<NodeTrees> DeserializationReader::Parse() {
std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) {
VLOG(2) << "Unable to load node trees in protobuf." << std::endl;
return nullptr;
}
// restore extra info
ExtraInfo extrainfo;
for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) {
ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx);
extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"),
extra_info_map.value().c_str());
}
// restore NodeTrees
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map;
for (int node_tree_index = 0;
node_tree_index < node_trees_proto_->thread_trees_size();
......@@ -95,7 +103,9 @@ std::unique_ptr<NodeTrees> DeserializationReader::Parse() {
}
}
// restore NodeTrees object
return std::unique_ptr<NodeTrees>(new NodeTrees(thread_event_trees_map));
std::unique_ptr<NodeTrees> tree(new NodeTrees(thread_event_trees_map));
return std::unique_ptr<ProfilerResult>(
new ProfilerResult(std::move(tree), extrainfo));
}
DeserializationReader::~DeserializationReader() {
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include <memory>
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/event_python.h"
namespace paddle {
namespace platform {
......@@ -24,7 +24,7 @@ class DeserializationReader {
explicit DeserializationReader(const std::string& filename);
explicit DeserializationReader(const char* filename);
~DeserializationReader();
std::unique_ptr<NodeTrees> Parse();
std::unique_ptr<ProfilerResult> Parse();
private:
void OpenFile();
......
......@@ -32,9 +32,21 @@ enum TracerEventTypeProto {
Memset = 6;
// Used to mark record defined by user
UserDefined = 7;
// A flag to denote the number of current types
NumTypes = 8;
}
// Used to mark operator detail, (such as infer shape, compute)
OperatorInner = 8;
// Used to mark model training or testing perspective, forward process
Forward = 9;
// Used to mark model training perspective, backward process
Backward = 10;
// Used to mark model training perspective, optimization process
Optimization = 11;
// Used to mark distributed training perspective
Communication = 12;
// Used to mark python api
PythonOp = 13;
// Used to mark python level userdefined
PythonUserDefined = 14;
};
message KernelEventInfoProto {
// The X-dimension block size for the kernel.
......@@ -175,7 +187,14 @@ message ThreadNodeTreeProto {
repeated HostTraceEventNodeProto host_nodes = 2;
}
message ExtraInfoMap {
required string key = 1;
required string value = 2;
}
message NodeTreesProto {
required string version = 1;
repeated ThreadNodeTreeProto thread_trees = 2;
required uint32 span_indx = 2;
repeated ThreadNodeTreeProto thread_trees = 3;
repeated ExtraInfoMap extra_info = 4;
}
......@@ -13,6 +13,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
......@@ -20,6 +21,7 @@ namespace platform {
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb";
static const char* version = "1.0.0";
static uint32_t span_indx = 0;
static std::string DefaultFileName() {
auto pid = GetProcessId();
......@@ -39,6 +41,7 @@ void SerializationLogger::OpenFile() {
}
node_trees_proto_ = new NodeTreesProto();
node_trees_proto_->set_version(std::string(version));
node_trees_proto_->set_span_indx(span_indx++);
}
void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) {
......@@ -240,6 +243,15 @@ void SerializationLogger::HandleTypeMemset(
device_trace_event);
}
void SerializationLogger::LogMetaInfo(
const std::unordered_map<std::string, std::string> extra_info) {
for (const auto& kv : extra_info) {
ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info();
extra_info_map->set_key(kv.first);
extra_info_map->set_value(kv.second);
}
}
SerializationLogger::SerializationLogger(const std::string& filename) {
filename_ = filename.empty() ? DefaultFileName() : filename;
OpenFile();
......
......@@ -11,6 +11,8 @@ limitations under the License. */
#pragma once
#include <unordered_map>
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
......@@ -20,6 +22,7 @@ namespace platform {
// Dump a NodeTrees into a profobuf file.
// A SerializationLogger object can only dump a NodeTrees object,
// creates a file in the constructor and closes the file in the destructor.
// Should only call LogNodeTrees and LogMetaInfo.
class SerializationLogger : public BaseLogger {
public:
explicit SerializationLogger(const std::string& filename);
......@@ -30,12 +33,14 @@ class SerializationLogger : public BaseLogger {
void LogHostTraceEventNode(const HostTraceEventNode&) override;
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogMetaInfo(const std::unordered_map<std::string, std::string>);
private:
void OpenFile();
void HandleTypeKernel(const DeviceTraceEventNode&);
void HandleTypeMemset(const DeviceTraceEventNode&);
void HandleTypeMemcpy(const DeviceTraceEventNode&);
std::string filename_;
std::ofstream output_file_stream_;
NodeTreesProto* node_trees_proto_;
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
#include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/event_python.h"
using paddle::platform::SerializationLogger;
using paddle::platform::DeserializationReader;
......@@ -31,6 +32,7 @@ using paddle::platform::TracerEventType;
using paddle::platform::KernelEventInfo;
using paddle::platform::MemcpyEventInfo;
using paddle::platform::MemsetEventInfo;
using paddle::platform::ProfilerResult;
TEST(SerializationLoggerTest, dump_case0) {
std::list<HostTraceEvent> host_events;
......@@ -149,7 +151,8 @@ TEST(SerializationLoggerTest, dump_case1) {
TEST(DeserializationReaderTest, restore_case0) {
DeserializationReader reader("test_serialization_logger_case0.pb");
std::unique_ptr<NodeTrees> tree = reader.Parse();
auto profiler_result = reader.Parse();
auto& tree = profiler_result->GetNodeTrees();
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree->Traverse(true);
EXPECT_EQ(nodes[10].size(), 4u);
......@@ -172,3 +175,26 @@ TEST(DeserializationReaderTest, restore_case0) {
}
}
}
TEST(DeserializationReaderTest, restore_case1) {
DeserializationReader reader("test_serialization_logger_case1.pb");
auto profiler_result = reader.Parse();
auto& tree = profiler_result->GetNodeTrees();
std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
tree->Traverse(true);
EXPECT_EQ(nodes[10].size(), 1u);
EXPECT_EQ(nodes[11].size(), 1u);
std::vector<HostTraceEventNode*> thread1_nodes = nodes[10];
std::vector<HostTraceEventNode*> thread2_nodes = nodes[11];
for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) {
if ((*it)->Name() == "root node") {
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u);
}
}
for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
if ((*it)->Name() == "root node") {
EXPECT_EQ((*it)->GetChildren().size(), 0u);
EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
}
}
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/chrometracing_logger.h"
#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
#include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
namespace paddle {
namespace platform {
HostPythonNode::~HostPythonNode() {
// delete all runtime or device nodes and recursive delete children
for (auto it = children_node_ptrs.begin(); it != children_node_ptrs.end();
++it) {
delete *it;
}
for (auto it = runtime_node_ptrs.begin(); it != runtime_node_ptrs.end();
++it) {
delete *it;
}
for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) {
delete *it;
}
}
HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
// Copy and transfer EventNode in NodeTree to PythonNode
if (root == nullptr) {
return nullptr;
}
// copy HostTraceEventNode and its children
HostPythonNode* host_python_node = new HostPythonNode();
host_python_node->name = root->Name();
host_python_node->type = root->Type();
host_python_node->start_ns = root->StartNs();
host_python_node->end_ns = root->EndNs();
host_python_node->process_id = root->ProcessId();
host_python_node->thread_id = root->ThreadId();
for (auto it = root->GetChildren().begin(); it != root->GetChildren().end();
++it) {
host_python_node->children_node_ptrs.push_back(CopyTree(*it));
}
// copy its CudaRuntimeTraceEventNode
for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin();
runtimenode != root->GetRuntimeTraceEventNodes().end(); ++runtimenode) {
HostPythonNode* runtime_python_node = new HostPythonNode();
runtime_python_node->name = (*runtimenode)->Name();
runtime_python_node->type = (*runtimenode)->Type();
runtime_python_node->start_ns = (*runtimenode)->StartNs();
runtime_python_node->end_ns = (*runtimenode)->EndNs();
runtime_python_node->process_id = (*runtimenode)->ProcessId();
runtime_python_node->thread_id = (*runtimenode)->ThreadId();
host_python_node->runtime_node_ptrs.push_back(runtime_python_node);
// copy DeviceTraceEventNode
for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin();
devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end();
++devicenode) {
DevicePythonNode* device_python_node = new DevicePythonNode();
device_python_node->name = (*devicenode)->Name();
device_python_node->type = (*devicenode)->Type();
device_python_node->start_ns = (*devicenode)->StartNs();
device_python_node->end_ns = (*devicenode)->EndNs();
device_python_node->device_id = (*devicenode)->DeviceId();
device_python_node->context_id = (*devicenode)->ContextId();
device_python_node->stream_id = (*devicenode)->StreamId();
runtime_python_node->device_node_ptrs.push_back(device_python_node);
}
}
return host_python_node;
}
ProfilerResult::ProfilerResult(std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info)
: tree_(std::move(tree)), extra_info_(extra_info) {
if (tree_ != nullptr) {
std::map<uint64_t, HostTraceEventNode*> nodetrees = tree_->GetNodeTrees();
for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) {
thread_event_trees_map_[it->first] = CopyTree(it->second);
}
}
}
ProfilerResult::~ProfilerResult() {
// delete all root nodes
for (auto it = thread_event_trees_map_.begin();
it != thread_event_trees_map_.end(); ++it) {
delete it->second;
}
}
void ProfilerResult::Save(const std::string& file_name,
const std::string format) {
if (format == std::string("json")) {
ChromeTracingLogger logger(file_name);
tree_->LogMe(&logger);
logger.LogMetaInfo(GetExtraInfo());
} else if (format == std::string("pb")) {
SerializationLogger logger(file_name);
tree_->LogMe(&logger);
logger.LogMetaInfo(GetExtraInfo());
}
return;
}
std::unique_ptr<ProfilerResult> LoadProfilerResult(std::string filename) {
DeserializationReader reader(filename);
std::unique_ptr<ProfilerResult> result = reader.Parse();
return result;
}
} // namespace platform
} // namespace paddle
......@@ -15,8 +15,11 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <unordered_map>
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
namespace paddle {
namespace platform {
......@@ -66,18 +69,29 @@ struct HostPythonNode {
class ProfilerResult {
public:
ProfilerResult() : tree_(nullptr) {}
explicit ProfilerResult(NodeTrees* tree);
explicit ProfilerResult(std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info);
~ProfilerResult();
std::map<uint64_t, HostPythonNode*> GetData() {
return thread_event_trees_map;
return thread_event_trees_map_;
}
void Save(const std::string& file_name);
std::unordered_map<std::string, std::string> GetExtraInfo() {
return extra_info_.GetExtraInfo();
}
void Save(const std::string& file_name,
const std::string format = std::string("json"));
std::unique_ptr<NodeTrees>& GetNodeTrees() { return tree_; }
private:
std::map<uint64_t, HostPythonNode*> thread_event_trees_map;
NodeTrees* tree_;
HostPythonNode* CopyTree(HostTraceEventNode* node);
std::map<uint64_t, HostPythonNode*> thread_event_trees_map_;
std::unique_ptr<NodeTrees> tree_;
ExtraInfo extra_info_;
HostPythonNode* CopyTree(HostTraceEventNode* root);
};
std::unique_ptr<ProfilerResult> LoadProfilerResult(std::string filename);
} // namespace platform
} // namespace paddle
......@@ -25,8 +25,10 @@
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/cuda_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
namespace platform {
......@@ -44,10 +46,15 @@ std::unique_ptr<Profiler> Profiler::Create(const ProfilerOptions& options) {
Profiler::Profiler(const ProfilerOptions& options) {
options_ = options;
HostTracerOptions host_tracer_options;
host_tracer_options.trace_level = options.trace_level;
tracers_.emplace_back(new HostTracer(host_tracer_options), true);
tracers_.emplace_back(&CudaTracer::GetInstance(), false);
std::bitset<32> trace_switch(options_.trace_switch);
if (trace_switch.test(kProfileCPUOptionBit)) {
HostTracerOptions host_tracer_options;
host_tracer_options.trace_level = options_.trace_level;
tracers_.emplace_back(new HostTracer(host_tracer_options), true);
}
if (trace_switch.test(kProfileGPUOptionBit)) {
tracers_.emplace_back(&CudaTracer::GetInstance(), false);
}
}
Profiler::~Profiler() { alive_.store(false); }
......@@ -63,9 +70,10 @@ void Profiler::Start() {
for (auto& tracer : tracers_) {
tracer.Get().StartTracing();
}
cpu_utilization_.RecordBeginTimeInfo();
}
std::unique_ptr<NodeTrees> Profiler::Stop() {
std::unique_ptr<ProfilerResult> Profiler::Stop() {
SynchronizeAllDevice();
TraceEventCollector collector;
for (auto& tracer : tracers_) {
......@@ -75,7 +83,22 @@ std::unique_ptr<NodeTrees> Profiler::Stop() {
std::unique_ptr<NodeTrees> tree(new NodeTrees(collector.HostEvents(),
collector.RuntimeEvents(),
collector.DeviceEvents()));
return tree;
cpu_utilization_.RecordEndTimeInfo();
ExtraInfo extrainfo;
extrainfo.AddExtraInfo(std::string("System Cpu Utilization"),
std::string("%f"),
cpu_utilization_.GetCpuUtilization());
extrainfo.AddExtraInfo(std::string("Process Cpu Utilization"),
std::string("%f"),
cpu_utilization_.GetCpuCurProcessUtilization());
const std::unordered_map<uint64_t, std::string> thread_names =
collector.ThreadNames();
for (const auto& kv : thread_names) {
extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first),
kv.second);
}
return std::unique_ptr<ProfilerResult>(
new platform::ProfilerResult(std::move(tree), extrainfo));
}
} // namespace platform
......
......@@ -15,12 +15,15 @@
#pragma once
#include <atomic>
#include <bitset>
#include <cstdint>
#include <functional>
#include <list>
#include <memory>
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/cpu_utilization.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
DECLARE_int64(host_trace_level);
......@@ -28,7 +31,11 @@ DECLARE_int64(host_trace_level);
namespace paddle {
namespace platform {
static constexpr uint32_t kProfileCPUOptionBit = 0;
static constexpr uint32_t kProfileGPUOptionBit = 1;
struct ProfilerOptions {
uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu
uint32_t trace_level = FLAGS_host_trace_level;
};
......@@ -40,7 +47,7 @@ class Profiler {
void Start();
std::unique_ptr<NodeTrees> Stop();
std::unique_ptr<ProfilerResult> Stop();
~Profiler();
......@@ -70,6 +77,7 @@ class Profiler {
ProfilerOptions options_;
uint64_t start_ns_ = UINT64_MAX;
std::list<TracerHolder> tracers_;
CpuUtilization cpu_utilization_;
};
} // namespace platform
......
......@@ -22,6 +22,7 @@
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
......@@ -30,8 +31,10 @@ TEST(ProfilerTest, TestHostTracer) {
using paddle::platform::Profiler;
using paddle::platform::RecordInstantEvent;
using paddle::platform::TracerEventType;
using paddle::platform::ProfilerResult;
ProfilerOptions options;
options.trace_level = 2;
options.trace_switch = 3;
auto profiler = Profiler::Create(options);
EXPECT_TRUE(profiler);
profiler->Prepare();
......@@ -42,7 +45,8 @@ TEST(ProfilerTest, TestHostTracer) {
RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined,
3);
}
auto nodetree = profiler->Stop();
auto profiler_result = profiler->Stop();
auto& nodetree = profiler_result->GetNodeTrees();
std::set<std::string> host_events;
for (const auto pair : nodetree->Traverse(true)) {
for (const auto evt : pair.second) {
......@@ -56,8 +60,10 @@ TEST(ProfilerTest, TestHostTracer) {
TEST(ProfilerTest, TestCudaTracer) {
using paddle::platform::ProfilerOptions;
using paddle::platform::Profiler;
using paddle::platform::ProfilerResult;
ProfilerOptions options;
options.trace_level = 0;
options.trace_switch = 3;
auto profiler = Profiler::Create(options);
EXPECT_TRUE(profiler);
profiler->Prepare();
......@@ -72,7 +78,8 @@ TEST(ProfilerTest, TestCudaTracer) {
hipStreamCreate(&stream);
hipStreamSynchronize(stream);
#endif
auto nodetree = profiler->Stop();
auto profiler_result = profiler->Stop();
auto& nodetree = profiler_result->GetNodeTrees();
std::vector<std::string> runtime_events;
for (const auto pair : nodetree->Traverse(true)) {
for (const auto host_node : pair.second) {
......
......@@ -48,6 +48,8 @@ enum class TracerEventType {
Communication = 12,
// Used to mark python api
PythonOp = 13,
// Used to mark python level userdefined
PythonUserDefined = 14,
// A flag to denote the number of current types
NumTypes
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册