Unverified commit 23bc0e3c, authored by chenjian, committed by GitHub

Update protobuf output format for profiler (#45724)

* update protobuf format

* fix protobuf content

* fix file mode

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* support rocm
Parent: 93e03fd7
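For orientation before the diff: the commit splits the loggers' old all-in-one `LogMetaInfo(extra_info)` into three responsibilities — `LogMetaInfo(version, span_indx)` for the schema header, `LogDeviceProperty(...)` for GPU properties, and `LogExtraInfo(...)` for the trailing key/value map. A minimal sketch of the resulting call order, mirroring `ProfilerResult::Save` and the updated tests below (the file name, version string, and the `tree` / `device_property_map` variables are illustrative):

```cpp
// Sketch only: the call order the loggers now expect.
ChromeTracingLogger logger("demo.paddle_trace.json");       // illustrative path
logger.LogMetaInfo(std::string("1.0.2"), /*span_indx=*/0);  // "schemaVersion" / "span_indx"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
logger.LogDeviceProperty(device_property_map);              // "deviceProperties": [...]
#endif
tree.LogMe(&logger);  // LogNodeTrees now opens "traceEvents" itself
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
```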
@@ -28,9 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-static const char* kSchemaVersion = "1.0.1";
 static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
-static uint32_t span_indx = 0;
 
 static std::string DefaultFileName() {
   auto pid = GetProcessId();
@@ -68,6 +66,10 @@ ChromeTracingLogger::~ChromeTracingLogger() {
 }
 
 void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
+  output_file_stream_ << std::string(
+      R"JSON(
+  "traceEvents": [
+  )JSON");
   // log all nodes except root node, root node is a helper node.
   const std::map<uint64_t, std::vector<HostTraceEventNode*>>
       thread2host_event_nodes = node_trees.Traverse(true);
@@ -545,28 +547,44 @@ void ChromeTracingLogger::HandleTypeMemset(
 void ChromeTracingLogger::StartLog() {
   output_file_stream_ << string_format(std::string(
       R"JSON(
  {
-  "schemaVersion": "%s",
-  "displayTimeUnit": "ms",
-  "span_indx": "%d",
-  )JSON"),
-                                       kSchemaVersion,
-                                       span_indx++);
+  "displayTimeUnit": "ms",)JSON"));
+}
+
+void ChromeTracingLogger::LogMetaInfo(const std::string& version,
+                                      uint32_t span_indx) {
+  output_file_stream_ << string_format(std::string(
+      R"JSON(
+  "schemaVersion": "%s",
+  "span_indx": "%d",)JSON"),
+                                       version.c_str(),
+                                       span_indx);
+}
 
-  // add device property information
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void ChromeTracingLogger::LogDeviceProperty(
+    const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
+  // add device property information
   output_file_stream_ << std::string(R"JSON(
   "deviceProperties": [
   )JSON");
-  std::vector<int> device_ids = GetSelectedDevices();
-  for (auto index = 0u; index < device_ids.size() - 1; index++) {
-    const gpuDeviceProp& device_property =
-        GetDeviceProperties(device_ids[index]);
-    output_file_stream_ << string_format(
-        std::string(
-            R"JSON(
+  auto device_nums = device_property_map.size();
+  if (device_nums == 0) {
+    output_file_stream_ << std::string(R"JSON(
+  ],
+  )JSON");
+  }
+#if defined(PADDLE_WITH_CUDA)
+  for (auto it = device_property_map.begin(); it != device_property_map.end();
+       it++) {
+    const gpuDeviceProp& device_property = it->second;
+    if (device_nums > 1) {
+      output_file_stream_ << string_format(
+          std::string(
+              R"JSON(
    {
-     "id": %d, "name": "%s", "totalGlobalMem": %llu,
+     "id": %u, "name": "%s", "totalGlobalMem": %llu,
      "computeMajor": %d, "computeMinor": %d,
      "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
      "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
@@ -574,60 +592,93 @@ void ChromeTracingLogger::StartLog() {
      "smCount": %d, "sharedMemPerBlockOptin": %d
    },
  )JSON"),
-        device_ids[index],
+          it->first,
           device_property.name,
           device_property.totalGlobalMem,
           device_property.major,
           device_property.minor,
           device_property.maxThreadsPerBlock,
           device_property.maxThreadsPerMultiProcessor,
           device_property.regsPerBlock,
           device_property.regsPerMultiprocessor,
           device_property.warpSize,
           device_property.sharedMemPerBlock,
           device_property.sharedMemPerMultiprocessor,
           device_property.multiProcessorCount,
           device_property.sharedMemPerBlockOptin);
+    } else {
+      output_file_stream_ << string_format(
+          std::string(
+              R"JSON(
+   {
+     "id": %u, "name": "%s", "totalGlobalMem": %llu,
+     "computeMajor": %d, "computeMinor": %d,
+     "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
+     "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
+     "sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
+     "smCount": %d, "sharedMemPerBlockOptin": %d
+   }],
+  )JSON"),
+          it->first,
+          device_property.name,
+          device_property.totalGlobalMem,
+          device_property.major,
+          device_property.minor,
+          device_property.maxThreadsPerBlock,
+          device_property.maxThreadsPerMultiProcessor,
+          device_property.regsPerBlock,
+          device_property.regsPerMultiprocessor,
+          device_property.warpSize,
+          device_property.sharedMemPerBlock,
+          device_property.sharedMemPerMultiprocessor,
+          device_property.multiProcessorCount,
+          device_property.sharedMemPerBlockOptin);
+    }
+    device_nums -= 1;
   }
-  if (device_ids.size() > 0) {
-    const gpuDeviceProp& device_property =
-        GetDeviceProperties(device_ids[device_ids.size() - 1]);
-    output_file_stream_ << string_format(
-        std::string(
-            R"JSON(
+#endif
+#if defined(PADDLE_WITH_HIP)
+  for (auto it = device_property_map.begin(); it != device_property_map.end();
+       it++) {
+    const gpuDeviceProp& device_property = it->second;
+    if (device_nums > 1) {
+      output_file_stream_ << string_format(std::string(
+                                               R"JSON(
    {
-     "id": %d, "name": "%s", "totalGlobalMem": %llu,
+     "id": %u, "name": "%s", "totalGlobalMem": %llu,
      "computeMajor": %d, "computeMinor": %d,
-     "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
-     "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
-     "sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
-     "smCount": %d, "sharedMemPerBlockOptin": %d
-   }],
+     "smCount": %d
+   },
  )JSON"),
-        device_ids[device_ids.size() - 1],
+                                           it->first,
         device_property.name,
         device_property.totalGlobalMem,
         device_property.major,
         device_property.minor,
-        device_property.maxThreadsPerBlock,
-        device_property.maxThreadsPerMultiProcessor,
-        device_property.regsPerBlock,
-        device_property.regsPerMultiprocessor,
-        device_property.warpSize,
-        device_property.sharedMemPerBlock,
-        device_property.sharedMemPerMultiprocessor,
-        device_property.multiProcessorCount,
-        device_property.sharedMemPerBlockOptin);
+                                           device_property.multiProcessorCount);
+    } else {
+      output_file_stream_ << string_format(std::string(
+                                               R"JSON(
+   {
+     "id": %u, "name": "%s", "totalGlobalMem": %llu,
+     "computeMajor": %d, "computeMinor": %d,
+     "smCount": %d
+   }],
+  )JSON"),
+                                           it->first,
+                                           device_property.name,
+                                           device_property.totalGlobalMem,
+                                           device_property.major,
+                                           device_property.minor,
+                                           device_property.multiProcessorCount);
+    }
+    device_nums -= 1;
   }
 #endif
-  output_file_stream_ << std::string(
-      R"JSON(
-  "traceEvents": [
-  )JSON");
 }
+#endif
 
-void ChromeTracingLogger::LogMetaInfo(
+void ChromeTracingLogger::LogExtraInfo(
    const std::unordered_map<std::string, std::string> extra_info) {
  RefineDisplayName(extra_info);
  output_file_stream_ << std::string(
...
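Net effect on the JSON output: `StartLog` now emits only the opening brace and `displayTimeUnit`, while the schema header, the device properties, and the `traceEvents` opener are appended by the new calls, in that order. A sketch of the assembled file layout (all values illustrative):

```json
{
  "displayTimeUnit": "ms",
  "schemaVersion": "1.0.2",
  "span_indx": "0",
  "deviceProperties": [
    { "id": 0, "name": "...", "totalGlobalMem": 42505273344,
      "computeMajor": 8, "computeMinor": 0, "smCount": 108 }
  ],
  "traceEvents": [
  ]
}
```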
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
 
+#include <map>
 #include <set>
 #include <unordered_map>
 #include <utility>
 
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/profiler/output_logger.h"
 
 namespace paddle {
@@ -36,8 +38,13 @@ class ChromeTracingLogger : public BaseLogger {
   void LogHostTraceEventNode(const HostTraceEventNode&) override;
   void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
   void LogNodeTrees(const NodeTrees&) override;
-  void LogMetaInfo(const std::unordered_map<std::string, std::string>);
+  void LogExtraInfo(const std::unordered_map<std::string, std::string>);
   void LogMemTraceEventNode(const MemTraceEventNode&) override;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void LogDeviceProperty(
+      const std::map<uint32_t, gpuDeviceProp>& device_property_map);
+#endif
+  void LogMetaInfo(const std::string& version, uint32_t span_indx);
 
  private:
  void OpenFile();
...
@@ -51,6 +51,7 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
         std::string("%s"),
         extra_info_map.value().c_str());
   }
+
   // restore NodeTrees
   std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map;
   for (int node_tree_index = 0;
@@ -127,8 +128,26 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
   }
   // restore NodeTrees object
   std::unique_ptr<NodeTrees> tree(new NodeTrees(thread_event_trees_map));
-  return std::unique_ptr<ProfilerResult>(
-      new ProfilerResult(std::move(tree), extrainfo));
+  // restore gpuDeviceProp
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> device_property_map;
+  for (auto indx = 0; indx < node_trees_proto_->device_property_size();
+       indx++) {
+    const DevicePropertyProto& device_property_proto =
+        node_trees_proto_->device_property(indx);
+    device_property_map[device_property_proto.id()] =
+        RestoreDeviceProperty(device_property_proto);
+  }
+  ProfilerResult* profiler_result_ptr =
+      new ProfilerResult(std::move(tree), extrainfo, device_property_map);
+#else
+  ProfilerResult* profiler_result_ptr =
+      new ProfilerResult(std::move(tree), extrainfo);
+#endif
+  // restore version and span indx
+  profiler_result_ptr->SetVersion(node_trees_proto_->version());
+  profiler_result_ptr->SetSpanIndx(node_trees_proto_->span_indx());
+  return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
 }
 
 DeserializationReader::~DeserializationReader() {
@@ -136,6 +155,37 @@ DeserializationReader::~DeserializationReader() {
   input_file_stream_.close();
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+gpuDeviceProp DeserializationReader::RestoreDeviceProperty(
+    const DevicePropertyProto& device_property_proto) {
+  gpuDeviceProp device_property;
+  strncpy(device_property.name,
+          device_property_proto.name().c_str(),
+          device_property_proto.name().length() + 1);
+  device_property.totalGlobalMem = device_property_proto.total_global_memory();
+  device_property.major = device_property_proto.compute_major();
+  device_property.minor = device_property_proto.compute_minor();
+  device_property.multiProcessorCount = device_property_proto.sm_count();
+#if defined(PADDLE_WITH_CUDA)
+  device_property.maxThreadsPerBlock =
+      device_property_proto.max_threads_per_block();
+  device_property.maxThreadsPerMultiProcessor =
+      device_property_proto.max_threads_per_multiprocessor();
+  device_property.regsPerBlock = device_property_proto.regs_per_block();
+  device_property.regsPerMultiprocessor =
+      device_property_proto.regs_per_multiprocessor();
+  device_property.warpSize = device_property_proto.warp_size();
+  device_property.sharedMemPerBlock =
+      device_property_proto.shared_memory_per_block();
+  device_property.sharedMemPerMultiprocessor =
+      device_property_proto.shared_memory_per_multiprocessor();
+  device_property.sharedMemPerBlockOptin =
+      device_property_proto.shared_memory_per_block_optin();
+#endif
+  return device_property;
+}
+#endif
+
 DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode(
     const DeviceTraceEventNodeProto& device_node_proto) {
   const DeviceTraceEventProto& device_event_proto =
@@ -275,6 +325,10 @@ KernelEventInfo DeserializationReader::HandleKernelEventInfoProto(
   kernel_info.queued = kernel_info_proto.queued();
   kernel_info.submitted = kernel_info_proto.submitted();
   kernel_info.completed = kernel_info_proto.completed();
+  // version 1.0.2
+  kernel_info.blocks_per_sm = kernel_info_proto.blocks_per_sm();
+  kernel_info.warps_per_sm = kernel_info_proto.warps_per_sm();
+  kernel_info.occupancy = kernel_info_proto.occupancy();
   return kernel_info;
 }
...
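Round-tripping a dump through the reader now restores this metadata too. A hedged usage sketch (the path is illustrative; a filename-taking constructor is assumed, consistent with the `filename_` / `input_file_stream_` members declared in the header below):

```cpp
// Sketch: parse a serialized trace and inspect the restored metadata.
DeserializationReader reader("pid_123_time_xyz.paddle_trace.pb");  // illustrative path
std::unique_ptr<ProfilerResult> result = reader.Parse();
std::string version = result->GetVersion();  // e.g. "1.0.2"
uint32_t span = result->GetSpanIndx();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (const auto& kv : result->GetDeviceProperty()) {
  // kv.first is the device id; kv.second is the restored gpuDeviceProp
}
#endif
```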
@@ -39,6 +39,10 @@ class DeserializationReader {
   MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&);
   OperatorSupplementEventNode* RestoreOperatorSupplementEventNode(
       const OperatorSupplementEventNodeProto&);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&);
+#endif
 
   std::string filename_;
   std::ifstream input_file_stream_;
   NodeTreesProto* node_trees_proto_;
...
@@ -95,6 +95,12 @@ message KernelEventInfoProto {
   required uint64 submitted = 13;
   // The completed timestamp for the kernel execution, in ns.
   required uint64 completed = 14;
+  // blocks per sm
+  required float blocks_per_sm = 15;
+  // warps per sm
+  required float warps_per_sm = 16;
+  // theoretical achieved occupancy
+  required float occupancy = 17;
 }
 
 message MemcpyEventInfoProto {
@@ -270,9 +276,27 @@ message ExtraInfoMap {
   required string value = 2;
 }
 
+message DevicePropertyProto {
+  required uint32 id = 1;
+  required string name = 2;
+  required uint64 total_global_memory = 3;
+  required uint32 compute_major = 4;
+  required uint32 compute_minor = 5;
+  required uint32 max_threads_per_block = 6;
+  required uint32 max_threads_per_multiprocessor = 7;
+  required uint32 regs_per_block = 8;
+  required uint32 regs_per_multiprocessor = 9;
+  required uint32 warp_size = 10;
+  required uint64 shared_memory_per_block = 11;
+  required uint64 shared_memory_per_multiprocessor = 12;
+  required uint32 sm_count = 13;
+  required uint64 shared_memory_per_block_optin = 14;
+}
+
 message NodeTreesProto {
   required string version = 1;
   required uint32 span_indx = 2;
   repeated ThreadNodeTreeProto thread_trees = 3;
   repeated ExtraInfoMap extra_info = 4;
+  repeated DevicePropertyProto device_property = 5;
 }
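Since the dump is a plain protobuf message, the new fields can also be read directly with the generated C++ API, without going through `DeserializationReader`. A minimal sketch (the generated header name follows protoc conventions; the path is illustrative, and namespace qualifiers are elided as in the profiler sources above):

```cpp
#include <cstdio>
#include <fstream>

#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"  // generated from this .proto

int main() {
  NodeTreesProto node_trees;
  std::ifstream in("trace.pb", std::ios::binary);  // illustrative path
  if (!node_trees.ParseFromIstream(&in)) return 1;
  // version = 1, span_indx = 2, device_property = 5 are the fields used here.
  std::printf("version=%s span_indx=%u devices=%d\n",
              node_trees.version().c_str(),
              node_trees.span_indx(),
              node_trees.device_property_size());
  for (int i = 0; i < node_trees.device_property_size(); ++i) {
    const DevicePropertyProto& p = node_trees.device_property(i);
    std::printf("  gpu %u: %s, sm_count=%u\n",
                p.id(), p.name().c_str(), p.sm_count());
  }
  return 0;
}
```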
@@ -20,8 +20,6 @@ namespace paddle {
 namespace platform {
 
 static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb";
-static const char* version = "1.0.1";
-static uint32_t span_indx = 0;
 
 static std::string DefaultFileName() {
   auto pid = GetProcessId();
@@ -40,10 +38,43 @@ void SerializationLogger::OpenFile() {
     LOG(INFO) << "writing profiling data to " << filename_ << std::endl;
   }
   node_trees_proto_ = new NodeTreesProto();
-  node_trees_proto_->set_version(std::string(version));
-  node_trees_proto_->set_span_indx(span_indx++);
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void SerializationLogger::LogDeviceProperty(
+    const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
+  for (auto it = device_property_map.begin(); it != device_property_map.end();
+       it++) {
+    const gpuDeviceProp& device_property = it->second;
+    DevicePropertyProto* device_property_proto =
+        node_trees_proto_->add_device_property();
+    device_property_proto->set_id(it->first);
+    device_property_proto->set_name(device_property.name);
+    device_property_proto->set_total_global_memory(
+        device_property.totalGlobalMem);
+    device_property_proto->set_compute_major(device_property.major);
+    device_property_proto->set_compute_minor(device_property.minor);
+    device_property_proto->set_sm_count(device_property.multiProcessorCount);
+#if defined(PADDLE_WITH_CUDA)
+    device_property_proto->set_max_threads_per_block(
+        device_property.maxThreadsPerBlock);
+    device_property_proto->set_max_threads_per_multiprocessor(
+        device_property.maxThreadsPerMultiProcessor);
+    device_property_proto->set_regs_per_block(device_property.regsPerBlock);
+    device_property_proto->set_regs_per_multiprocessor(
+        device_property.regsPerMultiprocessor);
+    device_property_proto->set_warp_size(device_property.warpSize);
+    device_property_proto->set_shared_memory_per_block(
+        device_property.sharedMemPerBlock);
+    device_property_proto->set_shared_memory_per_multiprocessor(
+        device_property.sharedMemPerMultiprocessor);
+    device_property_proto->set_shared_memory_per_block_optin(
+        device_property.sharedMemPerBlockOptin);
+#endif
+  }
+}
+#endif
+
 void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) {
   // dump the whole tree into file
   const std::map<uint64_t, std::vector<HostTraceEventNode*>>
@@ -271,6 +302,9 @@ void SerializationLogger::HandleTypeKernel(
   kernel_info->set_queued(info.queued);
   kernel_info->set_submitted(info.submitted);
   kernel_info->set_completed(info.completed);
+  kernel_info->set_blocks_per_sm(info.blocks_per_sm);
+  kernel_info->set_warps_per_sm(info.warps_per_sm);
+  kernel_info->set_occupancy(info.occupancy);
   // binding
   device_trace_event->set_allocated_kernel_info(kernel_info);
   current_device_trace_event_node_proto_->set_allocated_device_event(
@@ -328,7 +362,7 @@ void SerializationLogger::HandleTypeMemset(
       device_trace_event);
 }
 
-void SerializationLogger::LogMetaInfo(
+void SerializationLogger::LogExtraInfo(
     const std::unordered_map<std::string, std::string> extra_info) {
   for (const auto& kv : extra_info) {
     ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info();
@@ -337,6 +371,12 @@ void SerializationLogger::LogMetaInfo(
   }
 }
 
+void SerializationLogger::LogMetaInfo(const std::string& version,
+                                      uint32_t span_indx) {
+  node_trees_proto_->set_version(version);
+  node_trees_proto_->set_span_indx(span_indx);
+}
+
 SerializationLogger::SerializationLogger(const std::string& filename) {
   filename_ = filename.empty() ? DefaultFileName() : filename;
   OpenFile();
...
@@ -11,8 +11,10 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
 #include <unordered_map>
 
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
 #include "paddle/fluid/platform/profiler/output_logger.h"
 
@@ -33,8 +35,13 @@ class SerializationLogger : public BaseLogger {
   void LogHostTraceEventNode(const HostTraceEventNode&) override;
   void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
   void LogNodeTrees(const NodeTrees&) override;
-  void LogMetaInfo(const std::unordered_map<std::string, std::string>);
+  void LogExtraInfo(const std::unordered_map<std::string, std::string>);
   void LogMemTraceEventNode(const MemTraceEventNode&) override;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void LogDeviceProperty(
+      const std::map<uint32_t, gpuDeviceProp>& device_property_map);
+#endif
+  void LogMetaInfo(const std::string& version, uint32_t span_indx);
 
  private:
  void OpenFile();
...
@@ -140,6 +140,7 @@ TEST(SerializationLoggerTest, dump_case0) {
                                         5,
                                         MemsetEventInfo()));
   SerializationLogger logger("test_serialization_logger_case0.pb");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -169,7 +170,7 @@ TEST(SerializationLoggerTest, dump_case0) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(SerializationLoggerTest, dump_case1) {
@@ -234,6 +235,7 @@ TEST(SerializationLoggerTest, dump_case1) {
                                         5,
                                         MemsetEventInfo()));
   SerializationLogger logger("test_serialization_logger_case1.pb");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -257,7 +259,7 @@ TEST(SerializationLoggerTest, dump_case1) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(DeserializationReaderTest, restore_case0) {
...
@@ -65,6 +65,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
       runtime_python_node->end_ns = (*runtimenode)->EndNs();
       runtime_python_node->process_id = (*runtimenode)->ProcessId();
       runtime_python_node->thread_id = (*runtimenode)->ThreadId();
+      runtime_python_node->correlation_id = (*runtimenode)->CorrelationId();
       host_python_node->runtime_node_ptrs.push_back(runtime_python_node);
       // copy DeviceTraceEventNode
       for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin();
@@ -78,6 +79,30 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
         device_python_node->device_id = (*devicenode)->DeviceId();
         device_python_node->context_id = (*devicenode)->ContextId();
         device_python_node->stream_id = (*devicenode)->StreamId();
+        device_python_node->correlation_id = (*devicenode)->CorrelationId();
+        if (device_python_node->type == TracerEventType::Kernel) {
+          KernelEventInfo kernel_info = (*devicenode)->KernelInfo();
+          device_python_node->block_x = kernel_info.block_x;
+          device_python_node->block_y = kernel_info.block_y;
+          device_python_node->block_z = kernel_info.block_z;
+          device_python_node->grid_x = kernel_info.grid_x;
+          device_python_node->grid_y = kernel_info.grid_y;
+          device_python_node->grid_z = kernel_info.grid_z;
+          device_python_node->shared_memory = kernel_info.dynamic_shared_memory +
+                                              kernel_info.static_shared_memory;
+          device_python_node->registers_per_thread =
+              kernel_info.registers_per_thread;
+          device_python_node->blocks_per_sm = kernel_info.blocks_per_sm;
+          device_python_node->warps_per_sm = kernel_info.warps_per_sm;
+          device_python_node->occupancy = kernel_info.occupancy;
+        } else if (device_python_node->type == TracerEventType::Memcpy) {
+          MemcpyEventInfo memcpy_info = (*devicenode)->MemcpyInfo();
+          device_python_node->num_bytes = memcpy_info.num_bytes;
+        } else if (device_python_node->type == TracerEventType::Memset) {
+          MemsetEventInfo memset_info = (*devicenode)->MemsetInfo();
+          device_python_node->num_bytes = memset_info.num_bytes;
+          device_python_node->value = memset_info.value;
+        }
         runtime_python_node->device_node_ptrs.push_back(device_python_node);
       }
     }
@@ -110,6 +135,23 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
   return host_python_node;
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+ProfilerResult::ProfilerResult(
+    std::unique_ptr<NodeTrees> tree,
+    const ExtraInfo& extra_info,
+    const std::map<uint32_t, gpuDeviceProp> device_property_map)
+    : tree_(tree.release()),
+      extra_info_(extra_info),
+      device_property_map_(device_property_map) {
+  if (tree_ != nullptr) {
+    std::map<uint64_t, HostTraceEventNode*> nodetrees = tree_->GetNodeTrees();
+    for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) {
+      thread_event_trees_map_[it->first] = CopyTree(it->second);
+    }
+  }
+}
+#endif
+
 ProfilerResult::ProfilerResult(std::unique_ptr<NodeTrees> tree,
                                const ExtraInfo& extra_info)
     : tree_(tree.release()), extra_info_(extra_info) {
@@ -134,12 +176,20 @@ void ProfilerResult::Save(const std::string& file_name,
                           const std::string format) {
   if (format == std::string("json")) {
     ChromeTracingLogger logger(file_name);
+    logger.LogMetaInfo(version_, span_indx_);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    logger.LogDeviceProperty(device_property_map_);
+#endif
     tree_->LogMe(&logger);
-    logger.LogMetaInfo(GetExtraInfo());
+    logger.LogExtraInfo(GetExtraInfo());
   } else if (format == std::string("pb")) {
     SerializationLogger logger(file_name);
+    logger.LogMetaInfo(version_, span_indx_);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    logger.LogDeviceProperty(device_property_map_);
+#endif
     tree_->LogMe(&logger);
-    logger.LogMetaInfo(GetExtraInfo());
+    logger.LogExtraInfo(GetExtraInfo());
   }
   return;
 }
...
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <memory>
 #include <unordered_map>
 
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/profiler/event_node.h"
 #include "paddle/fluid/platform/profiler/extra_info.h"
 
@@ -41,6 +42,32 @@ struct DevicePythonNode {
   uint64_t context_id;
   // stream id
   uint64_t stream_id;
+  // correlation id, used for correlating async activities happened on device
+  uint32_t correlation_id;
+  // The X-dimension block size for the kernel.
+  uint32_t block_x;
+  // The Y-dimension block size for the kernel.
+  uint32_t block_y;
+  // The Z-dimension block size for the kernel.
+  uint32_t block_z;
+  // X-dimension of a grid.
+  uint32_t grid_x;
+  // Y-dimension of a grid.
+  uint32_t grid_y;
+  // Z-dimension of a grid.
+  uint32_t grid_z;
+  // dynamic + static
+  uint64_t shared_memory;
+  // The number of registers required for each thread executing the kernel.
+  uint32_t registers_per_thread;
+  float blocks_per_sm;
+  float warps_per_sm;
+  // theoretical achieved occupancy
+  float occupancy;
+  // The number of bytes transferred by the memory copy.
+  uint64_t num_bytes;
+  // the value being assigned to memory by the memory set.
+  uint32_t value;
 };
 
 struct MemPythonNode {
@@ -87,6 +114,8 @@ struct HostPythonNode {
   uint64_t process_id;
   // thread id of the record
   uint64_t thread_id;
+  // correlation id, used for correlating async activities happened on device
+  uint32_t correlation_id;
   // input shapes
   std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
   std::map<std::string, std::vector<std::string>> dtypes;
@@ -105,8 +134,15 @@ struct HostPythonNode {
 class ProfilerResult {
  public:
   ProfilerResult() : tree_(nullptr) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  explicit ProfilerResult(
+      std::unique_ptr<NodeTrees> tree,
+      const ExtraInfo& extra_info,
+      const std::map<uint32_t, gpuDeviceProp> device_property_map);
+#endif
   explicit ProfilerResult(std::unique_ptr<NodeTrees> tree,
                           const ExtraInfo& extra_info);
+
   ~ProfilerResult();
   std::map<uint64_t, HostPythonNode*> GetData() {
     return thread_event_trees_map_;
@@ -120,10 +156,27 @@ class ProfilerResult {
   std::shared_ptr<NodeTrees> GetNodeTrees() { return tree_; }
 
+  void SetVersion(const std::string& version) { version_ = version; }
+
+  void SetSpanIndx(uint32_t span_indx) { span_indx_ = span_indx; }
+
+  std::string GetVersion() { return version_; }
+
+  uint32_t GetSpanIndx() { return span_indx_; }
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> GetDeviceProperty() {
+    return device_property_map_;
+  }
+#endif
+
  private:
   std::map<uint64_t, HostPythonNode*> thread_event_trees_map_;
   std::shared_ptr<NodeTrees> tree_;
   ExtraInfo extra_info_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> device_property_map_;
+#endif
+  std::string version_;
+  uint32_t span_indx_;
   HostPythonNode* CopyTree(HostTraceEventNode* root);
 };
...
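The copied tree exposes these metrics on the C++ side as well as through the Python bindings below. A sketch walking only the members shown above (real trees also nest host child nodes, elided here; `result` is an assumed `ProfilerResult*`, and the element types of `runtime_node_ptrs` / `device_node_ptrs` are inferred from `CopyTree` above):

```cpp
#include <cstdio>
#include <map>

// Sketch: read the kernel metrics added in this commit off the
// python-facing copy of the event tree.
void DumpKernelMetrics(paddle::platform::ProfilerResult* result) {
  using paddle::platform::DevicePythonNode;
  using paddle::platform::HostPythonNode;
  using paddle::platform::TracerEventType;
  std::map<uint64_t, HostPythonNode*> trees = result->GetData();
  for (const auto& tid_and_root : trees) {
    for (HostPythonNode* runtime_node : tid_and_root.second->runtime_node_ptrs) {
      for (DevicePythonNode* device_node : runtime_node->device_node_ptrs) {
        if (device_node->type != TracerEventType::Kernel) continue;
        std::printf("occupancy=%.2f blocks_per_sm=%.2f warps_per_sm=%.2f\n",
                    device_node->occupancy,
                    device_node->blocks_per_sm,
                    device_node->warps_per_sm);
      }
    }
  }
}
```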
@@ -40,6 +40,9 @@ void SynchronizeAllDevice();
 
 std::atomic<bool> Profiler::alive_{false};
 
+uint32_t Profiler::span_indx = 0;
+const char* Profiler::version = "1.0.2";
+
 std::unique_ptr<Profiler> Profiler::Create(
     const ProfilerOptions& options,
     const std::vector<std::string>& custom_device_types) {
@@ -131,8 +134,24 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
                               std::string("%s"),
                               kv.second.c_str());
   }
-  return std::unique_ptr<ProfilerResult>(
-      new platform::ProfilerResult(std::move(tree), extrainfo));
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> device_property_map;
+  std::vector<int32_t> device_ids = GetSelectedDevices();
+  for (auto index = 0u; index < device_ids.size(); index++) {
+    const gpuDeviceProp& device_property =
+        GetDeviceProperties(device_ids[index]);
+    device_property_map[device_ids[index]] = device_property;
+  }
+  ProfilerResult* profiler_result_ptr = new platform::ProfilerResult(
+      std::move(tree), extrainfo, device_property_map);
+#else
+  ProfilerResult* profiler_result_ptr =
+      new platform::ProfilerResult(std::move(tree), extrainfo);
+#endif
+  profiler_result_ptr->SetVersion(std::string(version));
+  profiler_result_ptr->SetSpanIndx(span_indx);
+  span_indx += 1;
+  return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
 }
 
 }  // namespace platform
...
@@ -44,6 +44,10 @@ struct ProfilerOptions {
 
 class Profiler {
  public:
+  static uint32_t
+      span_indx;  // index of profiler range, when user profiles multiple ranges
+                  // such as [2,4], [6,8], the first range is index 0.
+  static const char* version;  // profiler version.
   static std::unique_ptr<Profiler> Create(
       const ProfilerOptions& options,
       const std::vector<std::string>& custom_device_types = {});
...
@@ -137,6 +137,7 @@ TEST(NodeTreesTest, LogMe_case0) {
                                         5,
                                         MemsetEventInfo()));
   ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -166,7 +167,7 @@ TEST(NodeTreesTest, LogMe_case0) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(NodeTreesTest, LogMe_case1) {
@@ -231,6 +232,7 @@ TEST(NodeTreesTest, LogMe_case1) {
                                         5,
                                         MemsetEventInfo()));
   ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -254,7 +256,7 @@ TEST(NodeTreesTest, LogMe_case1) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(NodeTreesTest, HandleTrees_case0) {
@@ -333,6 +335,7 @@ TEST(NodeTreesTest, HandleTrees_case0) {
                                         3,
                                         KernelEventInfo()));
   ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -376,5 +379,5 @@ TEST(NodeTreesTest, HandleTrees_case0) {
                    device_event_node_handle,
                    mem_event_node_handle,
                    op_supplement_event_node_handle);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
@@ -2064,7 +2064,15 @@ All parameter, weight, gradient are variables in Paddle.
            &paddle::platform::ProfilerResult::GetData,
            py::return_value_policy::automatic_reference)
       .def("save", &paddle::platform::ProfilerResult::Save)
-      .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
+      .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo)
+      .def("get_version", &paddle::platform::ProfilerResult::GetVersion)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx)
+      .def("get_device_property",
+           &paddle::platform::ProfilerResult::GetDeviceProperty);
+#else
+      .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx);
+#endif
 
   py::class_<paddle::platform::MemPythonNode>(m, "MemPythonNode")
       .def(py::init<>())
@@ -2097,7 +2105,28 @@ All parameter, weight, gradient are variables in Paddle.
       .def_readwrite("context_id",
                      &paddle::platform::DevicePythonNode::context_id)
       .def_readwrite("stream_id",
-                     &paddle::platform::DevicePythonNode::stream_id);
+                     &paddle::platform::DevicePythonNode::stream_id)
+      .def_readwrite("correlation_id",
+                     &paddle::platform::DevicePythonNode::correlation_id)
+      .def_readwrite("block_x", &paddle::platform::DevicePythonNode::block_x)
+      .def_readwrite("block_y", &paddle::platform::DevicePythonNode::block_y)
+      .def_readwrite("block_z", &paddle::platform::DevicePythonNode::block_z)
+      .def_readwrite("grid_x", &paddle::platform::DevicePythonNode::grid_x)
+      .def_readwrite("grid_y", &paddle::platform::DevicePythonNode::grid_y)
+      .def_readwrite("grid_z", &paddle::platform::DevicePythonNode::grid_z)
+      .def_readwrite("shared_memory",
+                     &paddle::platform::DevicePythonNode::shared_memory)
+      .def_readwrite("registers_per_thread",
+                     &paddle::platform::DevicePythonNode::registers_per_thread)
+      .def_readwrite("blocks_per_sm",
+                     &paddle::platform::DevicePythonNode::blocks_per_sm)
+      .def_readwrite("warps_per_sm",
+                     &paddle::platform::DevicePythonNode::warps_per_sm)
+      .def_readwrite("occupancy",
+                     &paddle::platform::DevicePythonNode::occupancy)
+      .def_readwrite("num_bytes",
+                     &paddle::platform::DevicePythonNode::num_bytes)
+      .def_readwrite("value", &paddle::platform::DevicePythonNode::value);
 
   py::class_<paddle::platform::HostPythonNode>(m, "HostPythonNode")
       .def(py::init<>())
@@ -2108,6 +2137,8 @@ All parameter, weight, gradient are variables in Paddle.
       .def_readwrite("process_id",
                      &paddle::platform::HostPythonNode::process_id)
       .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
+      .def_readwrite("correlation_id",
+                     &paddle::platform::HostPythonNode::correlation_id)
       .def_readwrite("input_shapes",
                      &paddle::platform::HostPythonNode::input_shapes)
       .def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes)
...