Unverified commit 23bc0e3c, authored by chenjian, committed by GitHub

Update protobuf output format for profiler (#45724)

* update protobuf format

* fix protobuf content

* fix file mode

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* support rocm
Parent 93e03fd7
......@@ -28,9 +28,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
static std::string DefaultFileName() {
auto pid = GetProcessId();
......@@ -68,6 +66,10 @@ ChromeTracingLogger::~ChromeTracingLogger() {
}
void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
output_file_stream_ << std::string(
R"JSON(
"traceEvents": [
)JSON");
// log all nodes except root node, root node is a helper node.
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = node_trees.Traverse(true);
......@@ -545,28 +547,44 @@ void ChromeTracingLogger::HandleTypeMemset(
void ChromeTracingLogger::StartLog() {
output_file_stream_ << string_format(std::string(
R"JSON(
{
"displayTimeUnit": "ms",)JSON"));
}
void ChromeTracingLogger::LogMetaInfo(const std::string& version,
uint32_t span_indx) {
output_file_stream_ << string_format(std::string(
R"JSON(
"schemaVersion": "%s",
"span_indx": "%d",)JSON"),
version.c_str(),
span_indx);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void ChromeTracingLogger::LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
// add device property information
output_file_stream_ << std::string(R"JSON(
"deviceProperties": [
)JSON");
auto device_nums = device_property_map.size();
if (device_nums == 0) {
output_file_stream_ << std::string(R"JSON(
],
)JSON");
}
#if defined(PADDLE_WITH_CUDA)
for (auto it = device_property_map.begin(); it != device_property_map.end();
it++) {
const gpuDeviceProp& device_property = it->second;
if (device_nums > 1) {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"id": %d, "name": "%s", "totalGlobalMem": %llu,
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
......@@ -574,60 +592,93 @@ void ChromeTracingLogger::StartLog() {
"smCount": %d, "sharedMemPerBlockOptin": %d
},
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock,
device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin);
} else {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
"sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
"smCount": %d, "sharedMemPerBlockOptin": %d
}],
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock,
device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin);
}
device_nums -= 1;
}
#endif
#if defined(PADDLE_WITH_HIP)
for (auto it = device_property_map.begin(); it != device_property_map.end();
it++) {
const gpuDeviceProp& device_property = it->second;
if (device_nums > 1) {
output_file_stream_ << string_format(std::string(
R"JSON(
{
"id": %d, "name": "%s", "totalGlobalMem": %llu,
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
"sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
"smCount": %d, "sharedMemPerBlockOptin": %d
}],
"smCount": %d
},
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.multiProcessorCount);
} else {
output_file_stream_ << string_format(std::string(
R"JSON(
{
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"smCount": %d
}],
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.multiProcessorCount);
}
device_nums -= 1;
}
#endif
}
#endif
void ChromeTracingLogger::LogExtraInfo(
const std::unordered_map<std::string, std::string> extra_info) {
RefineDisplayName(extra_info);
output_file_stream_ << std::string(
......
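
With this change the Chrome trace JSON is emitted in pieces: StartLog() opens the object, LogMetaInfo() writes the schema version and span index, LogDeviceProperty() writes the deviceProperties array, and LogNodeTrees() opens the traceEvents array. A rough sketch of the assembled output, with placeholder values and only the reduced (HIP) subset of device fields; the trailing extra-info section and closing braces written by the remaining logger methods are omitted:

// Rough shape of the file assembled by the calls above; values are placeholders.
const char* kTraceSkeleton = R"JSON(
{
  "displayTimeUnit": "ms",
  "schemaVersion": "1.0.2",
  "span_indx": "0",
  "deviceProperties": [
    { "id": 0, "name": "<gpu name>", "totalGlobalMem": 0,
      "computeMajor": 0, "computeMinor": 0, "smCount": 0 }
  ],
  "traceEvents": [
  ]
}
)JSON";
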
......@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <set>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
namespace paddle {
......@@ -36,8 +38,13 @@ class ChromeTracingLogger : public BaseLogger {
void LogHostTraceEventNode(const HostTraceEventNode&) override;
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogExtraInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map);
#endif
void LogMetaInfo(const std::string& version, uint32_t span_indx);
private:
void OpenFile();
......
......@@ -51,6 +51,7 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
std::string("%s"),
extra_info_map.value().c_str());
}
// restore NodeTrees
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map;
for (int node_tree_index = 0;
......@@ -127,8 +128,26 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
}
// restore NodeTrees object
std::unique_ptr<NodeTrees> tree(new NodeTrees(thread_event_trees_map));
// restore gpuDeviceProp
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> device_property_map;
for (auto indx = 0; indx < node_trees_proto_->device_property_size();
indx++) {
const DevicePropertyProto& device_property_proto =
node_trees_proto_->device_property(indx);
device_property_map[device_property_proto.id()] =
RestoreDeviceProperty(device_property_proto);
}
ProfilerResult* profiler_result_ptr =
new ProfilerResult(std::move(tree), extrainfo, device_property_map);
#else
ProfilerResult* profiler_result_ptr =
new ProfilerResult(std::move(tree), extrainfo);
#endif
// restore version and span indx
profiler_result_ptr->SetVersion(node_trees_proto_->version());
profiler_result_ptr->SetSpanIndx(node_trees_proto_->span_indx());
return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
}
DeserializationReader::~DeserializationReader() {
......@@ -136,6 +155,37 @@ DeserializationReader::~DeserializationReader() {
input_file_stream_.close();
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuDeviceProp DeserializationReader::RestoreDeviceProperty(
const DevicePropertyProto& device_property_proto) {
gpuDeviceProp device_property;
strncpy(device_property.name,
device_property_proto.name().c_str(),
device_property_proto.name().length() + 1);
device_property.totalGlobalMem = device_property_proto.total_global_memory();
device_property.major = device_property_proto.compute_major();
device_property.minor = device_property_proto.compute_minor();
device_property.multiProcessorCount = device_property_proto.sm_count();
#if defined(PADDLE_WITH_CUDA)
device_property.maxThreadsPerBlock =
device_property_proto.max_threads_per_block();
device_property.maxThreadsPerMultiProcessor =
device_property_proto.max_threads_per_multiprocessor();
device_property.regsPerBlock = device_property_proto.regs_per_block();
device_property.regsPerMultiprocessor =
device_property_proto.regs_per_multiprocessor();
device_property.warpSize = device_property_proto.warp_size();
device_property.sharedMemPerBlock =
device_property_proto.shared_memory_per_block();
device_property.sharedMemPerMultiprocessor =
device_property_proto.shared_memory_per_multiprocessor();
device_property.sharedMemPerBlockOptin =
device_property_proto.shared_memory_per_block_optin();
#endif
return device_property;
}
#endif
DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode(
const DeviceTraceEventNodeProto& device_node_proto) {
const DeviceTraceEventProto& device_event_proto =
......@@ -275,6 +325,10 @@ KernelEventInfo DeserializationReader::HandleKernelEventInfoProto(
kernel_info.queued = kernel_info_proto.queued();
kernel_info.submitted = kernel_info_proto.submitted();
kernel_info.completed = kernel_info_proto.completed();
// version 1.0.2
kernel_info.blocks_per_sm = kernel_info_proto.blocks_per_sm();
kernel_info.warps_per_sm = kernel_info_proto.warps_per_sm();
kernel_info.occupancy = kernel_info_proto.occupancy();
return kernel_info;
}
......
......@@ -39,6 +39,10 @@ class DeserializationReader {
MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&);
OperatorSupplementEventNode* RestoreOperatorSupplementEventNode(
const OperatorSupplementEventNodeProto&);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&);
#endif
std::string filename_;
std::ifstream input_file_stream_;
NodeTreesProto* node_trees_proto_;
......
......@@ -95,6 +95,12 @@ message KernelEventInfoProto {
required uint64 submitted = 13;
// The completed timestamp for the kernel execution, in ns.
required uint64 completed = 14;
// blocks per sm
required float blocks_per_sm = 15;
// warps per sm
required float warps_per_sm = 16;
// theoretical achieved occupancy
required float occupancy = 17;
}
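
The three new kernel fields describe how densely a launch covers the device. As a rough illustration only, not necessarily the exact formula the profiler uses, they relate to the launch configuration and device properties like this:

// Illustrative derivation of the per-SM kernel metrics; an assumption for
// explanation, not taken from the Paddle sources.
float EstimateBlocksPerSm(int grid_size, int sm_count) {
  return static_cast<float>(grid_size) / static_cast<float>(sm_count);
}
float EstimateWarpsPerSm(float blocks_per_sm, int block_size, int warp_size) {
  int warps_per_block = (block_size + warp_size - 1) / warp_size;
  return blocks_per_sm * static_cast<float>(warps_per_block);
}
float EstimateOccupancy(float warps_per_sm, int max_threads_per_sm,
                        int warp_size) {
  float max_warps_per_sm = static_cast<float>(max_threads_per_sm) / warp_size;
  return warps_per_sm / max_warps_per_sm;
}
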
message MemcpyEventInfoProto {
......@@ -270,9 +276,27 @@ message ExtraInfoMap {
required string value = 2;
}
message DevicePropertyProto {
required uint32 id = 1;
required string name = 2;
required uint64 total_global_memory = 3;
required uint32 compute_major = 4;
required uint32 compute_minor = 5;
required uint32 max_threads_per_block = 6;
required uint32 max_threads_per_multiprocessor = 7;
required uint32 regs_per_block = 8;
required uint32 regs_per_multiprocessor = 9;
required uint32 warp_size = 10;
required uint64 shared_memory_per_block = 11;
required uint64 shared_memory_per_multiprocessor = 12;
required uint32 sm_count = 13;
required uint64 shared_memory_per_block_optin = 14;
}
message NodeTreesProto {
required string version = 1;
required uint32 span_indx = 2;
repeated ThreadNodeTreeProto thread_trees = 3;
repeated ExtraInfoMap extra_info = 4;
repeated DevicePropertyProto device_property = 5;
}
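
For reference, the dumped file can be read back with the generated protobuf classes. A minimal consumer sketch, assuming the generated types live in paddle::platform as the reader code suggests; the header path matches the include used by SerializationLogger and the filename is a placeholder following kDefaultFilename:

#include <fstream>
#include <iostream>
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"

int main() {
  // Placeholder filename; the real name is formatted from pid and timestamp.
  std::ifstream in("pid_1234_time_xxx.paddle_trace.pb", std::ios::binary);
  paddle::platform::NodeTreesProto trees;
  if (!trees.ParseFromIstream(&in)) {
    std::cerr << "failed to parse trace file" << std::endl;
    return 1;
  }
  std::cout << "version: " << trees.version()
            << " span_indx: " << trees.span_indx() << std::endl;
  for (int i = 0; i < trees.device_property_size(); ++i) {
    const auto& prop = trees.device_property(i);
    std::cout << "device " << prop.id() << ": " << prop.name()
              << " smCount=" << prop.sm_count() << std::endl;
  }
  return 0;
}
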
......@@ -20,8 +20,6 @@ namespace paddle {
namespace platform {
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb";
static std::string DefaultFileName() {
auto pid = GetProcessId();
......@@ -40,10 +38,43 @@ void SerializationLogger::OpenFile() {
LOG(INFO) << "writing profiling data to " << filename_ << std::endl;
}
node_trees_proto_ = new NodeTreesProto();
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void SerializationLogger::LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
for (auto it = device_property_map.begin(); it != device_property_map.end();
it++) {
const gpuDeviceProp& device_property = it->second;
DevicePropertyProto* device_property_proto =
node_trees_proto_->add_device_property();
device_property_proto->set_id(it->first);
device_property_proto->set_name(device_property.name);
device_property_proto->set_total_global_memory(
device_property.totalGlobalMem);
device_property_proto->set_compute_major(device_property.major);
device_property_proto->set_compute_minor(device_property.minor);
device_property_proto->set_sm_count(device_property.multiProcessorCount);
#if defined(PADDLE_WITH_CUDA)
device_property_proto->set_max_threads_per_block(
device_property.maxThreadsPerBlock);
device_property_proto->set_max_threads_per_multiprocessor(
device_property.maxThreadsPerMultiProcessor);
device_property_proto->set_regs_per_block(device_property.regsPerBlock);
device_property_proto->set_regs_per_multiprocessor(
device_property.regsPerMultiprocessor);
device_property_proto->set_warp_size(device_property.warpSize);
device_property_proto->set_shared_memory_per_block(
device_property.sharedMemPerBlock);
device_property_proto->set_shared_memory_per_multiprocessor(
device_property.sharedMemPerMultiprocessor);
device_property_proto->set_shared_memory_per_block_optin(
device_property.sharedMemPerBlockOptin);
#endif
}
}
#endif
void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) {
// dump the whole tree into file
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
......@@ -271,6 +302,9 @@ void SerializationLogger::HandleTypeKernel(
kernel_info->set_queued(info.queued);
kernel_info->set_submitted(info.submitted);
kernel_info->set_completed(info.completed);
kernel_info->set_blocks_per_sm(info.blocks_per_sm);
kernel_info->set_warps_per_sm(info.warps_per_sm);
kernel_info->set_occupancy(info.occupancy);
// binding
device_trace_event->set_allocated_kernel_info(kernel_info);
current_device_trace_event_node_proto_->set_allocated_device_event(
......@@ -328,7 +362,7 @@ void SerializationLogger::HandleTypeMemset(
device_trace_event);
}
void SerializationLogger::LogExtraInfo(
const std::unordered_map<std::string, std::string> extra_info) {
for (const auto& kv : extra_info) {
ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info();
......@@ -337,6 +371,12 @@ void SerializationLogger::LogMetaInfo(
}
}
void SerializationLogger::LogMetaInfo(const std::string& version,
uint32_t span_indx) {
node_trees_proto_->set_version(version);
node_trees_proto_->set_span_indx(span_indx);
}
SerializationLogger::SerializationLogger(const std::string& filename) {
filename_ = filename.empty() ? DefaultFileName() : filename;
OpenFile();
......
......@@ -11,8 +11,10 @@ limitations under the License. */
#pragma once
#include <map>
#include <unordered_map>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
......@@ -33,8 +35,13 @@ class SerializationLogger : public BaseLogger {
void LogHostTraceEventNode(const HostTraceEventNode&) override;
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogExtraInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map);
#endif
void LogMetaInfo(const std::string& version, uint32_t span_indx);
private:
void OpenFile();
......
......@@ -140,6 +140,7 @@ TEST(SerializationLoggerTest, dump_case0) {
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case0.pb");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -169,7 +170,7 @@ TEST(SerializationLoggerTest, dump_case0) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(SerializationLoggerTest, dump_case1) {
......@@ -234,6 +235,7 @@ TEST(SerializationLoggerTest, dump_case1) {
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case1.pb");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -257,7 +259,7 @@ TEST(SerializationLoggerTest, dump_case1) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(DeserializationReaderTest, restore_case0) {
......
......@@ -65,6 +65,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
runtime_python_node->end_ns = (*runtimenode)->EndNs();
runtime_python_node->process_id = (*runtimenode)->ProcessId();
runtime_python_node->thread_id = (*runtimenode)->ThreadId();
runtime_python_node->correlation_id = (*runtimenode)->CorrelationId();
host_python_node->runtime_node_ptrs.push_back(runtime_python_node);
// copy DeviceTraceEventNode
for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin();
......@@ -78,6 +79,30 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
device_python_node->device_id = (*devicenode)->DeviceId();
device_python_node->context_id = (*devicenode)->ContextId();
device_python_node->stream_id = (*devicenode)->StreamId();
device_python_node->correlation_id = (*devicenode)->CorrelationId();
if (device_python_node->type == TracerEventType::Kernel) {
KernelEventInfo kernel_info = (*devicenode)->KernelInfo();
device_python_node->block_x = kernel_info.block_x;
device_python_node->block_y = kernel_info.block_y;
device_python_node->block_z = kernel_info.block_z;
device_python_node->grid_x = kernel_info.grid_x;
device_python_node->grid_y = kernel_info.grid_y;
device_python_node->grid_z = kernel_info.grid_z;
device_python_node->shared_memory = kernel_info.dynamic_shared_memory +
kernel_info.static_shared_memory;
device_python_node->registers_per_thread =
kernel_info.registers_per_thread;
device_python_node->blocks_per_sm = kernel_info.blocks_per_sm;
device_python_node->warps_per_sm = kernel_info.warps_per_sm;
device_python_node->occupancy = kernel_info.occupancy;
} else if (device_python_node->type == TracerEventType::Memcpy) {
MemcpyEventInfo memcpy_info = (*devicenode)->MemcpyInfo();
device_python_node->num_bytes = memcpy_info.num_bytes;
} else if (device_python_node->type == TracerEventType::Memset) {
MemsetEventInfo memset_info = (*devicenode)->MemsetInfo();
device_python_node->num_bytes = memset_info.num_bytes;
device_python_node->value = memset_info.value;
}
runtime_python_node->device_node_ptrs.push_back(device_python_node);
}
}
......@@ -110,6 +135,23 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
return host_python_node;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ProfilerResult::ProfilerResult(
std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info,
const std::map<uint32_t, gpuDeviceProp> device_property_map)
: tree_(tree.release()),
extra_info_(extra_info),
device_property_map_(device_property_map) {
if (tree_ != nullptr) {
std::map<uint64_t, HostTraceEventNode*> nodetrees = tree_->GetNodeTrees();
for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) {
thread_event_trees_map_[it->first] = CopyTree(it->second);
}
}
}
#endif
ProfilerResult::ProfilerResult(std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info)
: tree_(tree.release()), extra_info_(extra_info) {
......@@ -134,12 +176,20 @@ void ProfilerResult::Save(const std::string& file_name,
const std::string format) {
if (format == std::string("json")) {
ChromeTracingLogger logger(file_name);
logger.LogMetaInfo(version_, span_indx_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
logger.LogDeviceProperty(device_property_map_);
#endif
tree_->LogMe(&logger);
logger.LogExtraInfo(GetExtraInfo());
} else if (format == std::string("pb")) {
SerializationLogger logger(file_name);
logger.LogMetaInfo(version_, span_indx_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
logger.LogDeviceProperty(device_property_map_);
#endif
tree_->LogMe(&logger);
logger.LogExtraInfo(GetExtraInfo());
}
return;
}
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <memory>
#include <unordered_map>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
......@@ -41,6 +42,32 @@ struct DevicePythonNode {
uint64_t context_id;
// stream id
uint64_t stream_id;
// correlation id, used for correlating async activities happened on device
uint32_t correlation_id;
// The X-dimension block size for the kernel.
uint32_t block_x;
// The Y-dimension block size for the kernel.
uint32_t block_y;
// The Z-dimension block size for the kernel.
uint32_t block_z;
// X-dimension of a grid.
uint32_t grid_x;
// Y-dimension of a grid.
uint32_t grid_y;
// Z-dimension of a grid.
uint32_t grid_z;
// dynamic + static
uint64_t shared_memory;
// The number of registers required for each thread executing the kernel.
uint32_t registers_per_thread;
float blocks_per_sm;
float warps_per_sm;
// theoretical achieved occupancy
float occupancy;
// The number of bytes transferred by the memory copy.
uint64_t num_bytes;
// the value being assigned to memory by the memory set.
uint32_t value;
};
struct MemPythonNode {
......@@ -87,6 +114,8 @@ struct HostPythonNode {
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// correlation id, used for correlating async activities happened on device
uint32_t correlation_id;
// input shapes
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
......@@ -105,8 +134,15 @@ struct HostPythonNode {
class ProfilerResult {
public:
ProfilerResult() : tree_(nullptr) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
explicit ProfilerResult(
std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info,
const std::map<uint32_t, gpuDeviceProp> device_property_map);
#endif
explicit ProfilerResult(std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info);
~ProfilerResult();
std::map<uint64_t, HostPythonNode*> GetData() {
return thread_event_trees_map_;
......@@ -120,10 +156,27 @@ class ProfilerResult {
std::shared_ptr<NodeTrees> GetNodeTrees() { return tree_; }
void SetVersion(const std::string& version) { version_ = version; }
void SetSpanIndx(uint32_t span_indx) { span_indx_ = span_indx; }
std::string GetVersion() { return version_; }
uint32_t GetSpanIndx() { return span_indx_; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> GetDeviceProperty() {
return device_property_map_;
}
#endif
private:
std::map<uint64_t, HostPythonNode*> thread_event_trees_map_;
std::shared_ptr<NodeTrees> tree_;
ExtraInfo extra_info_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> device_property_map_;
#endif
std::string version_;
uint32_t span_indx_;
HostPythonNode* CopyTree(HostTraceEventNode* root);
};
......
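
Downstream of these structs, the per-thread trees returned by ProfilerResult::GetData() can be walked to pick up the new kernel fields. A minimal sketch; the header path and the children_node_ptrs member name are assumptions, the other members appear in the struct above:

#include <iostream>
#include "paddle/fluid/platform/profiler/event_python.h"  // assumed path

void PrintKernelStats(const paddle::platform::HostPythonNode* node) {
  for (auto* runtime_node : node->runtime_node_ptrs) {
    for (auto* device_node : runtime_node->device_node_ptrs) {
      if (device_node->type == paddle::platform::TracerEventType::Kernel) {
        std::cout << "grid=(" << device_node->grid_x << ","
                  << device_node->grid_y << "," << device_node->grid_z
                  << ") blocks/SM=" << device_node->blocks_per_sm
                  << " occupancy=" << device_node->occupancy << std::endl;
      }
    }
  }
  for (auto* child : node->children_node_ptrs) {  // assumed member name
    PrintKernelStats(child);
  }
}
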
......@@ -40,6 +40,9 @@ void SynchronizeAllDevice();
std::atomic<bool> Profiler::alive_{false};
uint32_t Profiler::span_indx = 0;
const char* Profiler::version = "1.0.2";
std::unique_ptr<Profiler> Profiler::Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types) {
......@@ -131,8 +134,24 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
std::string("%s"),
kv.second.c_str());
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> device_property_map;
std::vector<int32_t> device_ids = GetSelectedDevices();
for (auto index = 0u; index < device_ids.size(); index++) {
const gpuDeviceProp& device_property =
GetDeviceProperties(device_ids[index]);
device_property_map[device_ids[index]] = device_property;
}
ProfilerResult* profiler_result_ptr = new platform::ProfilerResult(
std::move(tree), extrainfo, device_property_map);
#else
ProfilerResult* profiler_result_ptr =
new platform::ProfilerResult(std::move(tree), extrainfo);
#endif
profiler_result_ptr->SetVersion(std::string(version));
profiler_result_ptr->SetSpanIndx(span_indx);
span_indx += 1;
return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
}
} // namespace platform
......
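
With the version and span index now owned by the Profiler class, a typical profiling span ends up looking like the sketch below. Save(), GetVersion() and GetSpanIndx() are shown in this change; the header path and the Prepare()/Start() calls are assumptions:

#include <iostream>
#include "paddle/fluid/platform/profiler/profiler.h"  // assumed path

void RunOneProfiledSpan() {
  paddle::platform::ProfilerOptions options;
  auto profiler = paddle::platform::Profiler::Create(options);
  profiler->Prepare();  // assumed entry point
  profiler->Start();    // assumed entry point
  // ... run the workload to be profiled ...
  auto result = profiler->Stop();
  std::cout << "profiler version " << result->GetVersion()
            << " span " << result->GetSpanIndx() << std::endl;
  result->Save("worker0.paddle_trace.json", "json");  // Chrome tracing output
  result->Save("worker0.paddle_trace.pb", "pb");      // protobuf dump
}
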
......@@ -44,6 +44,10 @@ struct ProfilerOptions {
class Profiler {
public:
static uint32_t
span_indx; // index of profiler range, when user profiles multiple ranges
// such as [2,4], [6,8], the first range is index 0.
static const char* version; // profiler version.
static std::unique_ptr<Profiler> Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types = {});
......
......@@ -137,6 +137,7 @@ TEST(NodeTreesTest, LogMe_case0) {
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -166,7 +167,7 @@ TEST(NodeTreesTest, LogMe_case0) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, LogMe_case1) {
......@@ -231,6 +232,7 @@ TEST(NodeTreesTest, LogMe_case1) {
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -254,7 +256,7 @@ TEST(NodeTreesTest, LogMe_case1) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, HandleTrees_case0) {
......@@ -333,6 +335,7 @@ TEST(NodeTreesTest, HandleTrees_case0) {
3,
KernelEventInfo()));
ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -376,5 +379,5 @@ TEST(NodeTreesTest, HandleTrees_case0) {
device_event_node_handle,
mem_event_node_handle,
op_supplement_event_node_handle);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
......@@ -2064,7 +2064,15 @@ All parameter, weight, gradient are variables in Paddle.
&paddle::platform::ProfilerResult::GetData,
py::return_value_policy::automatic_reference)
.def("save", &paddle::platform::ProfilerResult::Save)
.def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
.def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo)
.def("get_version", &paddle::platform::ProfilerResult::GetVersion)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx)
.def("get_device_property",
&paddle::platform::ProfilerResult::GetDeviceProperty);
#else
.def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx);
#endif
py::class_<paddle::platform::MemPythonNode>(m, "MemPythonNode")
.def(py::init<>())
......@@ -2097,7 +2105,28 @@ All parameter, weight, gradient are variables in Paddle.
.def_readwrite("context_id",
&paddle::platform::DevicePythonNode::context_id)
.def_readwrite("stream_id",
&paddle::platform::DevicePythonNode::stream_id);
&paddle::platform::DevicePythonNode::stream_id)
.def_readwrite("correlation_id",
&paddle::platform::DevicePythonNode::correlation_id)
.def_readwrite("block_x", &paddle::platform::DevicePythonNode::block_x)
.def_readwrite("block_y", &paddle::platform::DevicePythonNode::block_y)
.def_readwrite("block_z", &paddle::platform::DevicePythonNode::block_z)
.def_readwrite("grid_x", &paddle::platform::DevicePythonNode::grid_x)
.def_readwrite("grid_y", &paddle::platform::DevicePythonNode::grid_y)
.def_readwrite("grid_z", &paddle::platform::DevicePythonNode::grid_z)
.def_readwrite("shared_memory",
&paddle::platform::DevicePythonNode::shared_memory)
.def_readwrite("registers_per_thread",
&paddle::platform::DevicePythonNode::registers_per_thread)
.def_readwrite("blocks_per_sm",
&paddle::platform::DevicePythonNode::blocks_per_sm)
.def_readwrite("warps_per_sm",
&paddle::platform::DevicePythonNode::warps_per_sm)
.def_readwrite("occupancy",
&paddle::platform::DevicePythonNode::occupancy)
.def_readwrite("num_bytes",
&paddle::platform::DevicePythonNode::num_bytes)
.def_readwrite("value", &paddle::platform::DevicePythonNode::value);
py::class_<paddle::platform::HostPythonNode>(m, "HostPythonNode")
.def(py::init<>())
......@@ -2108,6 +2137,8 @@ All parameter, weight, gradient are variables in Paddle.
.def_readwrite("process_id",
&paddle::platform::HostPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
.def_readwrite("correlation_id",
&paddle::platform::HostPythonNode::correlation_id)
.def_readwrite("input_shapes",
&paddle::platform::HostPythonNode::input_shapes)
.def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes)
......