diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc
index b825a68fad220bdd2a45afd3962a1c6c98cde070..1e22ffe1a8dcf78208ad5ce2fb388a6179dfcd18 100644
--- a/paddle/fluid/platform/profiler/chrometracing_logger.cc
+++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -28,9 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-static const char* kSchemaVersion = "1.0.1";
 static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
-static uint32_t span_indx = 0;
 
 static std::string DefaultFileName() {
   auto pid = GetProcessId();
@@ -68,6 +66,10 @@ ChromeTracingLogger::~ChromeTracingLogger() {
 }
 
 void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
+  output_file_stream_ << std::string(
+      R"JSON(
+  "traceEvents": [
+  )JSON");
   // log all nodes except root node, root node is a helper node.
   const std::map<uint64_t, std::vector<HostTraceEventNode*>>
       thread2host_event_nodes = node_trees.Traverse(true);
@@ -545,28 +547,44 @@ void ChromeTracingLogger::HandleTypeMemset(
 
 void ChromeTracingLogger::StartLog() {
   output_file_stream_ << string_format(std::string(
-                                           R"JSON(
+      R"JSON(
   {
+    "displayTimeUnit": "ms",)JSON"));
+}
+
+void ChromeTracingLogger::LogMetaInfo(const std::string& version,
+                                      uint32_t span_indx) {
+  output_file_stream_ << string_format(std::string(
+                                           R"JSON(
     "schemaVersion": "%s",
-    "displayTimeUnit": "ms",
-    "span_indx": "%d",
-  )JSON"),
-                                       kSchemaVersion,
-                                       span_indx++);
-// add device property information
-#if defined(PADDLE_WITH_CUDA)
+    "span_indx": "%d",)JSON"),
+                                       version.c_str(),
+                                       span_indx);
+}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void ChromeTracingLogger::LogDeviceProperty(
+    const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
+  // add device property information
   output_file_stream_ << std::string(R"JSON(
     "deviceProperties": [
-  )JSON");
-  std::vector<int> device_ids = GetSelectedDevices();
-  for (auto index = 0u; index < device_ids.size() - 1; index++) {
-    const gpuDeviceProp& device_property =
-        GetDeviceProperties(device_ids[index]);
-    output_file_stream_ << string_format(
-        std::string(
-            R"JSON(
+  )JSON");
+  auto device_nums = device_property_map.size();
+  if (device_nums == 0) {
+    output_file_stream_ << std::string(R"JSON(
+    ],
+  )JSON");
+  }
+#if defined(PADDLE_WITH_CUDA)
+  for (auto it = device_property_map.begin(); it != device_property_map.end();
+       it++) {
+    const gpuDeviceProp& device_property = it->second;
+    if (device_nums > 1) {
+      output_file_stream_ << string_format(
+          std::string(
+              R"JSON(
     {
-      "id": %d, "name": "%s", "totalGlobalMem": %llu,
+      "id": %u, "name": "%s", "totalGlobalMem": %llu,
       "computeMajor": %d, "computeMinor": %d,
       "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
       "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
@@ -574,60 +592,93 @@ void ChromeTracingLogger::StartLog() {
       "smCount": %d, "sharedMemPerBlockOptin": %d
     },
   )JSON"),
-        device_ids[index],
-        device_property.name,
-        device_property.totalGlobalMem,
-        device_property.major,
-        device_property.minor,
-        device_property.maxThreadsPerBlock,
-        device_property.maxThreadsPerMultiProcessor,
-        device_property.regsPerBlock,
-        device_property.regsPerMultiprocessor,
-        device_property.warpSize,
-        device_property.sharedMemPerBlock,
-        device_property.sharedMemPerMultiprocessor,
-        device_property.multiProcessorCount,
-        device_property.sharedMemPerBlockOptin);
+          it->first,
+          device_property.name,
+          device_property.totalGlobalMem,
+          device_property.major,
+          device_property.minor,
+          device_property.maxThreadsPerBlock,
+          device_property.maxThreadsPerMultiProcessor,
+          device_property.regsPerBlock,
+          device_property.regsPerMultiprocessor,
+          device_property.warpSize,
+          device_property.sharedMemPerBlock,
+          device_property.sharedMemPerMultiprocessor,
+          device_property.multiProcessorCount,
+          device_property.sharedMemPerBlockOptin);
+    } else {
+      output_file_stream_ << string_format(
+          std::string(
+              R"JSON(
+    {
+      "id": %u, "name": "%s", "totalGlobalMem": %llu,
+      "computeMajor": %d, "computeMinor": %d,
+      "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
+      "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
+      "sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
+      "smCount": %d, "sharedMemPerBlockOptin": %d
+    }],
+  )JSON"),
+          it->first,
+          device_property.name,
+          device_property.totalGlobalMem,
+          device_property.major,
+          device_property.minor,
+          device_property.maxThreadsPerBlock,
+          device_property.maxThreadsPerMultiProcessor,
+          device_property.regsPerBlock,
+          device_property.regsPerMultiprocessor,
+          device_property.warpSize,
+          device_property.sharedMemPerBlock,
+          device_property.sharedMemPerMultiprocessor,
+          device_property.multiProcessorCount,
+          device_property.sharedMemPerBlockOptin);
+    }
+    device_nums -= 1;
   }
-  if (device_ids.size() > 0) {
-    const gpuDeviceProp& device_property =
-        GetDeviceProperties(device_ids[device_ids.size() - 1]);
-    output_file_stream_ << string_format(
-        std::string(
-            R"JSON(
+#endif
+#if defined(PADDLE_WITH_HIP)
+  for (auto it = device_property_map.begin(); it != device_property_map.end();
+       it++) {
+    const gpuDeviceProp& device_property = it->second;
+    if (device_nums > 1) {
+      output_file_stream_ << string_format(std::string(
+                                               R"JSON(
     {
-      "id": %d, "name": "%s", "totalGlobalMem": %llu,
+      "id": %u, "name": "%s", "totalGlobalMem": %llu,
       "computeMajor": %d, "computeMinor": %d,
-      "maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
-      "regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
-      "sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
-      "smCount": %d, "sharedMemPerBlockOptin": %d
-    }],
+      "smCount": %d
+    },
   )JSON"),
-        device_ids[device_ids.size() - 1],
-        device_property.name,
-        device_property.totalGlobalMem,
-        device_property.major,
-        device_property.minor,
-        device_property.maxThreadsPerBlock,
-        device_property.maxThreadsPerMultiProcessor,
-        device_property.regsPerBlock,
-        device_property.regsPerMultiprocessor,
-        device_property.warpSize,
-        device_property.sharedMemPerBlock,
-        device_property.sharedMemPerMultiprocessor,
-        device_property.multiProcessorCount,
-        device_property.sharedMemPerBlockOptin);
+                                           it->first,
+                                           device_property.name,
+                                           device_property.totalGlobalMem,
+                                           device_property.major,
+                                           device_property.minor,
+                                           device_property.multiProcessorCount);
+    } else {
+      output_file_stream_ << string_format(std::string(
+                                               R"JSON(
+    {
+      "id": %u, "name": "%s", "totalGlobalMem": %llu,
+      "computeMajor": %d, "computeMinor": %d,
+      "smCount": %d
+    }],
+  )JSON"),
+                                           it->first,
+                                           device_property.name,
+                                           device_property.totalGlobalMem,
+                                           device_property.major,
+                                           device_property.minor,
+                                           device_property.multiProcessorCount);
+    }
+    device_nums -= 1;
   }
 #endif
-
-  output_file_stream_ << std::string(
-      R"JSON(
-  "traceEvents": [
-  )JSON");
 }
+#endif
 
-void ChromeTracingLogger::LogMetaInfo(
+void ChromeTracingLogger::LogExtraInfo(
     const std::unordered_map<std::string, std::string> extra_info) {
   RefineDisplayName(extra_info);
   output_file_stream_ << std::string(
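The refactor above splits the old `StartLog()` into three pieces, so the JSON skeleton is now assembled across several calls rather than in one shot. Below is a minimal sketch of the call order a writer must follow to get well-formed output; the driver function and file name are illustrative, and it mirrors what `ProfilerResult::Save()` does later in this patch:

```cpp
// Hypothetical driver showing the new call order for ChromeTracingLogger.
#include "paddle/fluid/platform/profiler/chrometracing_logger.h"

void WriteTrace(paddle::platform::NodeTrees& trees) {
  paddle::platform::ChromeTracingLogger logger("demo.paddle_trace.json");
  // StartLog(), invoked during construction, now emits only the opening
  // brace and "displayTimeUnit"; the metadata that used to live there
  // has to be logged explicitly, in this order:
  logger.LogMetaInfo("1.0.2", 0);  // "schemaVersion" + "span_indx"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  logger.LogDeviceProperty({});    // "deviceProperties": [...]
#endif
  trees.LogMe(&logger);            // opens "traceEvents": [ and logs nodes
  logger.LogExtraInfo({});         // extra info, then the closing braces
}
```

Calling `LogExtraInfo()` before `LogNodeTrees()` (via `LogMe`) would emit the extra-info keys outside the `"traceEvents"` array, so the order matters.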
diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h
index 3cbf9ccf6a0cc50fde08bbeef8a1a079cfa9aaa0..7f9bec1c32a534535057868b12d7cd230425f89e 100644
--- a/paddle/fluid/platform/profiler/chrometracing_logger.h
+++ b/paddle/fluid/platform/profiler/chrometracing_logger.h
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <map>
 #include <string>
 #include <unordered_map>
 
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/profiler/output_logger.h"
 
 namespace paddle {
@@ -36,8 +38,13 @@ class ChromeTracingLogger : public BaseLogger {
   void LogHostTraceEventNode(const HostTraceEventNode&) override;
   void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
   void LogNodeTrees(const NodeTrees&) override;
-  void LogMetaInfo(const std::unordered_map<std::string, std::string>);
+  void LogExtraInfo(const std::unordered_map<std::string, std::string>);
   void LogMemTraceEventNode(const MemTraceEventNode&) override;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void LogDeviceProperty(
+      const std::map<uint32_t, gpuDeviceProp>& device_property_map);
+#endif
+  void LogMetaInfo(const std::string& version, uint32_t span_indx);
 
  private:
   void OpenFile();
diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
index e98622321995a6d4a3c1e45dcd000f2a40d84551..e6388fe275a9a8ccf5cff359910576e15363ee57 100644
--- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
+++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
@@ -51,6 +51,7 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
                                       std::string("%s"),
                                       extra_info_map.value().c_str());
   }
+
   // restore NodeTrees
   std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map;
   for (int node_tree_index = 0;
@@ -127,8 +128,26 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
   }
   // restore NodeTrees object
   std::unique_ptr<NodeTrees> tree(new NodeTrees(thread_event_trees_map));
-  return std::unique_ptr<ProfilerResult>(
-      new ProfilerResult(std::move(tree), extrainfo));
+// restore gpuDeviceProp
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> device_property_map;
+  for (auto indx = 0; indx < node_trees_proto_->device_property_size();
+       indx++) {
+    const DevicePropertyProto& device_property_proto =
+        node_trees_proto_->device_property(indx);
+    device_property_map[device_property_proto.id()] =
+        RestoreDeviceProperty(device_property_proto);
+  }
+  ProfilerResult* profiler_result_ptr =
+      new ProfilerResult(std::move(tree), extrainfo, device_property_map);
+#else
+  ProfilerResult* profiler_result_ptr =
+      new ProfilerResult(std::move(tree), extrainfo);
+#endif
+  // restore version and span indx
+  profiler_result_ptr->SetVersion(node_trees_proto_->version());
+  profiler_result_ptr->SetSpanIndx(node_trees_proto_->span_indx());
+  return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
 }
 
 DeserializationReader::~DeserializationReader() {
@@ -136,6 +155,37 @@ DeserializationReader::~DeserializationReader() {
   input_file_stream_.close();
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+gpuDeviceProp DeserializationReader::RestoreDeviceProperty(
+    const DevicePropertyProto& device_property_proto) {
+  gpuDeviceProp device_property;
+  strncpy(device_property.name,
+          device_property_proto.name().c_str(),
+          device_property_proto.name().length() + 1);
+  device_property.totalGlobalMem = device_property_proto.total_global_memory();
+  device_property.major = device_property_proto.compute_major();
+  device_property.minor = device_property_proto.compute_minor();
+  device_property.multiProcessorCount = device_property_proto.sm_count();
+#if defined(PADDLE_WITH_CUDA)
+  device_property.maxThreadsPerBlock =
+      device_property_proto.max_threads_per_block();
+  device_property.maxThreadsPerMultiProcessor =
+      device_property_proto.max_threads_per_multiprocessor();
+  device_property.regsPerBlock = device_property_proto.regs_per_block();
+  device_property.regsPerMultiprocessor =
+      device_property_proto.regs_per_multiprocessor();
+  device_property.warpSize = device_property_proto.warp_size();
+  device_property.sharedMemPerBlock =
+      device_property_proto.shared_memory_per_block();
+  device_property.sharedMemPerMultiprocessor =
+      device_property_proto.shared_memory_per_multiprocessor();
+  device_property.sharedMemPerBlockOptin =
+      device_property_proto.shared_memory_per_block_optin();
+#endif
+  return device_property;
+}
+#endif
+
 DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode(
     const DeviceTraceEventNodeProto& device_node_proto) {
   const DeviceTraceEventProto& device_event_proto =
@@ -275,6 +325,10 @@ KernelEventInfo DeserializationReader::HandleKernelEventInfoProto(
   kernel_info.queued = kernel_info_proto.queued();
   kernel_info.submitted = kernel_info_proto.submitted();
   kernel_info.completed = kernel_info_proto.completed();
+  // version 1.0.2
+  kernel_info.blocks_per_sm = kernel_info_proto.blocks_per_sm();
+  kernel_info.warps_per_sm = kernel_info_proto.warps_per_sm();
+  kernel_info.occupancy = kernel_info_proto.occupancy();
   return kernel_info;
 }
 
diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h
index 7df93b7703c328db04ca895d14b76b04cf1f4082..5f99f6fd82c55d0e1c7b81d041702e3ae8c06c2a 100644
--- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h
+++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h
@@ -39,6 +39,10 @@ class DeserializationReader {
   MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&);
   OperatorSupplementEventNode* RestoreOperatorSupplementEventNode(
       const OperatorSupplementEventNodeProto&);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&);
+#endif
+
   std::string filename_;
   std::ifstream input_file_stream_;
   NodeTreesProto* node_trees_proto_;
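With the reader-side changes in place, a dumped `.pb` file round-trips the new metadata. A minimal sketch, assuming the filename constructor declared in the header above; the file name is illustrative:

```cpp
// Hypothetical round-trip check for the fields restored by Parse().
#include <iostream>

#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"

int main() {
  paddle::platform::DeserializationReader reader("worker0.paddle_trace.pb");
  auto result = reader.Parse();  // std::unique_ptr<ProfilerResult>
  std::cout << "version: " << result->GetVersion()
            << ", span: " << result->GetSpanIndx() << "\n";
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  for (const auto& kv : result->GetDeviceProperty()) {
    std::cout << "gpu " << kv.first << ": " << kv.second.name << "\n";
  }
#endif
  return 0;
}
```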
diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto
index 4ebfb6e73b331ccb340db842c87c4a1f93d5c4f7..af9d6ed9e00e0e30ef28fdba78e04ea214e551ee 100644
--- a/paddle/fluid/platform/profiler/dump/nodetree.proto
+++ b/paddle/fluid/platform/profiler/dump/nodetree.proto
@@ -95,6 +95,12 @@ message KernelEventInfoProto {
   required uint64 submitted = 13;
   // The completed timestamp for the kernel execution, in ns.
   required uint64 completed = 14;
+  // blocks per sm
+  required float blocks_per_sm = 15;
+  // warps per sm
+  required float warps_per_sm = 16;
+  // theoretical achieved occupancy
+  required float occupancy = 17;
 }
 
 message MemcpyEventInfoProto {
@@ -270,9 +276,27 @@ message ExtraInfoMap {
   required string value = 2;
 }
 
+message DevicePropertyProto {
+  required uint32 id = 1;
+  required string name = 2;
+  required uint64 total_global_memory = 3;
+  required uint32 compute_major = 4;
+  required uint32 compute_minor = 5;
+  required uint32 max_threads_per_block = 6;
+  required uint32 max_threads_per_multiprocessor = 7;
+  required uint32 regs_per_block = 8;
+  required uint32 regs_per_multiprocessor = 9;
+  required uint32 warp_size = 10;
+  required uint64 shared_memory_per_block = 11;
+  required uint64 shared_memory_per_multiprocessor = 12;
+  required uint32 sm_count = 13;
+  required uint64 shared_memory_per_block_optin = 14;
+}
+
 message NodeTreesProto {
   required string version = 1;
   required uint32 span_indx = 2;
   repeated ThreadNodeTreeProto thread_trees = 3;
   repeated ExtraInfoMap extra_info = 4;
+  repeated DevicePropertyProto device_property = 5;
 }
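The three new `KernelEventInfoProto` fields are filled by the tracer. The sketch below is not code from this patch; it is the standard occupancy arithmetic, included only to document what the fields mean. All names (`KernelLaunch`, the helper functions) are illustrative:

```cpp
// Standard formulas behind blocks_per_sm, warps_per_sm and occupancy.
#include <algorithm>
#include <cstdint>

struct KernelLaunch {
  uint32_t grid_x, grid_y, grid_z;
  uint32_t block_x, block_y, block_z;
};

// blocks_per_sm: average number of blocks each SM has to run.
float BlocksPerSm(const KernelLaunch& k, int sm_count) {
  float blocks = static_cast<float>(k.grid_x) * k.grid_y * k.grid_z;
  return blocks / sm_count;
}

// warps_per_sm: blocks_per_sm scaled by the (rounded-up) warps per block.
float WarpsPerSm(const KernelLaunch& k, int sm_count, int warp_size) {
  float threads = static_cast<float>(k.block_x) * k.block_y * k.block_z;
  float warps_per_block = (threads + warp_size - 1) / warp_size;
  return BlocksPerSm(k, sm_count) * warps_per_block;
}

// occupancy: active warps per SM over the hardware maximum; active_blocks
// would come from cudaOccupancyMaxActiveBlocksPerMultiprocessor on CUDA.
float Occupancy(int active_blocks, int warps_per_block,
                int max_threads_per_sm, int warp_size) {
  float max_warps = static_cast<float>(max_threads_per_sm) / warp_size;
  return std::min(1.0f, active_blocks * warps_per_block / max_warps);
}
```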
diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc
index 698c0b4231f5c60bd01c374b747e2a31f1fc790b..ce6fcf5b56538129c3f1d5eb95c33e891a30736b 100644
--- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc
+++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc
@@ -20,8 +20,6 @@ namespace paddle {
 namespace platform {
 
 static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb";
-static const char* version = "1.0.1";
-static uint32_t span_indx = 0;
 
 static std::string DefaultFileName() {
   auto pid = GetProcessId();
@@ -40,10 +38,43 @@ void SerializationLogger::OpenFile() {
     LOG(INFO) << "writing profiling data to " << filename_ << std::endl;
   }
   node_trees_proto_ = new NodeTreesProto();
-  node_trees_proto_->set_version(std::string(version));
-  node_trees_proto_->set_span_indx(span_indx++);
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void SerializationLogger::LogDeviceProperty(
+    const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
+  for (auto it = device_property_map.begin(); it != device_property_map.end();
+       it++) {
+    const gpuDeviceProp& device_property = it->second;
+    DevicePropertyProto* device_property_proto =
+        node_trees_proto_->add_device_property();
+    device_property_proto->set_id(it->first);
+    device_property_proto->set_name(device_property.name);
+    device_property_proto->set_total_global_memory(
+        device_property.totalGlobalMem);
+    device_property_proto->set_compute_major(device_property.major);
+    device_property_proto->set_compute_minor(device_property.minor);
+    device_property_proto->set_sm_count(device_property.multiProcessorCount);
+#if defined(PADDLE_WITH_CUDA)
+    device_property_proto->set_max_threads_per_block(
+        device_property.maxThreadsPerBlock);
+    device_property_proto->set_max_threads_per_multiprocessor(
+        device_property.maxThreadsPerMultiProcessor);
+    device_property_proto->set_regs_per_block(device_property.regsPerBlock);
+    device_property_proto->set_regs_per_multiprocessor(
+        device_property.regsPerMultiprocessor);
+    device_property_proto->set_warp_size(device_property.warpSize);
+    device_property_proto->set_shared_memory_per_block(
+        device_property.sharedMemPerBlock);
+    device_property_proto->set_shared_memory_per_multiprocessor(
+        device_property.sharedMemPerMultiprocessor);
+    device_property_proto->set_shared_memory_per_block_optin(
+        device_property.sharedMemPerBlockOptin);
+#endif
+  }
+}
+#endif
+
 void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) {
   // dump the whole tree into file
   const std::map<uint64_t, std::vector<HostTraceEventNode*>>
@@ -271,6 +302,9 @@ void SerializationLogger::HandleTypeKernel(
   kernel_info->set_queued(info.queued);
   kernel_info->set_submitted(info.submitted);
   kernel_info->set_completed(info.completed);
+  kernel_info->set_blocks_per_sm(info.blocks_per_sm);
+  kernel_info->set_warps_per_sm(info.warps_per_sm);
+  kernel_info->set_occupancy(info.occupancy);
   // binding
   device_trace_event->set_allocated_kernel_info(kernel_info);
   current_device_trace_event_node_proto_->set_allocated_device_event(
@@ -328,7 +362,7 @@ void SerializationLogger::HandleTypeMemset(
       device_trace_event);
 }
 
-void SerializationLogger::LogMetaInfo(
+void SerializationLogger::LogExtraInfo(
     const std::unordered_map<std::string, std::string> extra_info) {
   for (const auto& kv : extra_info) {
     ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info();
@@ -337,6 +371,12 @@ void SerializationLogger::LogExtraInfo(
   }
 }
 
+void SerializationLogger::LogMetaInfo(const std::string& version,
+                                      uint32_t span_indx) {
+  node_trees_proto_->set_version(version);
+  node_trees_proto_->set_span_indx(span_indx);
+}
+
 SerializationLogger::SerializationLogger(const std::string& filename) {
   filename_ = filename.empty() ? DefaultFileName() : filename;
   OpenFile();
diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h
index 31910cb68c5d797e8069cb4487a66ce9c9b52387..80d5413106dedc76d32a4031fe55b8b8a328d255 100644
--- a/paddle/fluid/platform/profiler/dump/serialization_logger.h
+++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h
@@ -11,8 +11,10 @@ limitations under the License.
 */
 
 #pragma once
+#include <map>
 #include <string>
 
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
 #include "paddle/fluid/platform/profiler/output_logger.h"
 
@@ -33,8 +35,13 @@ class SerializationLogger : public BaseLogger {
   void LogHostTraceEventNode(const HostTraceEventNode&) override;
   void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
   void LogNodeTrees(const NodeTrees&) override;
-  void LogMetaInfo(const std::unordered_map<std::string, std::string>);
+  void LogExtraInfo(const std::unordered_map<std::string, std::string>);
   void LogMemTraceEventNode(const MemTraceEventNode&) override;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void LogDeviceProperty(
+      const std::map<uint32_t, gpuDeviceProp>& device_property_map);
+#endif
+  void LogMetaInfo(const std::string& version, uint32_t span_indx);
 
  private:
   void OpenFile();
diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
index f606be4bf451e88d812c9ec1d1b2a6e3500244a9..9ebaaaa01d1b2c47a389e6bac018d72e433233e2 100644
--- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
+++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
@@ -140,6 +140,7 @@ TEST(SerializationLoggerTest, dump_case0) {
                                         5,
                                         MemsetEventInfo()));
   SerializationLogger logger("test_serialization_logger_case0.pb");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -169,7 +170,7 @@ TEST(SerializationLoggerTest, dump_case0) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(SerializationLoggerTest, dump_case1) {
@@ -234,6 +235,7 @@ TEST(SerializationLoggerTest, dump_case1) {
                                         5,
                                         MemsetEventInfo()));
   SerializationLogger logger("test_serialization_logger_case1.pb");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -257,7 +259,7 @@ TEST(SerializationLoggerTest, dump_case1) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(DeserializationReaderTest, restore_case0) {
diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc
index 75bb5086fdacddb4ca022a1ea9700a298fc5d412..231c0e0beaf5ab001ba471ca57ec9c630e4ef725 100644
--- a/paddle/fluid/platform/profiler/event_python.cc
+++ b/paddle/fluid/platform/profiler/event_python.cc
@@ -65,6 +65,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
       runtime_python_node->end_ns = (*runtimenode)->EndNs();
       runtime_python_node->process_id = (*runtimenode)->ProcessId();
       runtime_python_node->thread_id = (*runtimenode)->ThreadId();
+      runtime_python_node->correlation_id = (*runtimenode)->CorrelationId();
       host_python_node->runtime_node_ptrs.push_back(runtime_python_node);
       // copy DeviceTraceEventNode
       for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin();
@@ -78,6 +79,30 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
         device_python_node->device_id = (*devicenode)->DeviceId();
         device_python_node->context_id = (*devicenode)->ContextId();
         device_python_node->stream_id = (*devicenode)->StreamId();
+        device_python_node->correlation_id = (*devicenode)->CorrelationId();
+        if (device_python_node->type == TracerEventType::Kernel) {
+          KernelEventInfo kernel_info = (*devicenode)->KernelInfo();
+          device_python_node->block_x = kernel_info.block_x;
+          device_python_node->block_y = kernel_info.block_y;
+          device_python_node->block_z = kernel_info.block_z;
+          device_python_node->grid_x = kernel_info.grid_x;
+          device_python_node->grid_y = kernel_info.grid_y;
+          device_python_node->grid_z = kernel_info.grid_z;
+          device_python_node->shared_memory = kernel_info.dynamic_shared_memory +
+                                              kernel_info.static_shared_memory;
+          device_python_node->registers_per_thread =
+              kernel_info.registers_per_thread;
+          device_python_node->blocks_per_sm = kernel_info.blocks_per_sm;
+          device_python_node->warps_per_sm = kernel_info.warps_per_sm;
+          device_python_node->occupancy = kernel_info.occupancy;
+        } else if (device_python_node->type == TracerEventType::Memcpy) {
+          MemcpyEventInfo memcpy_info = (*devicenode)->MemcpyInfo();
+          device_python_node->num_bytes = memcpy_info.num_bytes;
+        } else if (device_python_node->type == TracerEventType::Memset) {
+          MemsetEventInfo memset_info = (*devicenode)->MemsetInfo();
+          device_python_node->num_bytes = memset_info.num_bytes;
+          device_python_node->value = memset_info.value;
+        }
         runtime_python_node->device_node_ptrs.push_back(device_python_node);
       }
     }
@@ -110,6 +135,23 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
   return host_python_node;
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+ProfilerResult::ProfilerResult(
+    std::unique_ptr<NodeTrees> tree,
+    const ExtraInfo& extra_info,
+    const std::map<uint32_t, gpuDeviceProp> device_property_map)
+    : tree_(tree.release()),
+      extra_info_(extra_info),
+      device_property_map_(device_property_map) {
+  if (tree_ != nullptr) {
+    std::map<uint64_t, HostTraceEventNode*> nodetrees = tree_->GetNodeTrees();
+    for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) {
+      thread_event_trees_map_[it->first] = CopyTree(it->second);
+    }
+  }
+}
+#endif
+
 ProfilerResult::ProfilerResult(std::unique_ptr<NodeTrees> tree,
                                const ExtraInfo& extra_info)
     : tree_(tree.release()), extra_info_(extra_info) {
@@ -134,12 +176,20 @@ void ProfilerResult::Save(const std::string& file_name,
                           const std::string format) {
   if (format == std::string("json")) {
     ChromeTracingLogger logger(file_name);
+    logger.LogMetaInfo(version_, span_indx_);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    logger.LogDeviceProperty(device_property_map_);
+#endif
     tree_->LogMe(&logger);
-    logger.LogMetaInfo(GetExtraInfo());
+    logger.LogExtraInfo(GetExtraInfo());
   } else if (format == std::string("pb")) {
     SerializationLogger logger(file_name);
+    logger.LogMetaInfo(version_, span_indx_);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    logger.LogDeviceProperty(device_property_map_);
+#endif
    tree_->LogMe(&logger);
-    logger.LogMetaInfo(GetExtraInfo());
+    logger.LogExtraInfo(GetExtraInfo());
   }
   return;
 }
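`CopyTree()` now surfaces kernel launch configuration and occupancy on the python-facing nodes. Below is a hedged consumer sketch, not part of this patch; it assumes the existing `children_node_ptrs`, `runtime_node_ptrs` and `device_node_ptrs` members of `HostPythonNode`, and the `PrintKernels` helper is hypothetical:

```cpp
// Walk the tree CopyTree() builds and print kernel launch configs.
#include <iostream>

#include "paddle/fluid/platform/profiler/event_python.h"

using paddle::platform::HostPythonNode;
using paddle::platform::TracerEventType;

void PrintKernels(const HostPythonNode* host) {
  for (auto* runtime : host->runtime_node_ptrs) {
    for (auto* device : runtime->device_node_ptrs) {
      if (device->type == TracerEventType::Kernel) {
        std::cout << device->name << " grid=(" << device->grid_x << ","
                  << device->grid_y << "," << device->grid_z << ")"
                  << " occupancy=" << device->occupancy << "\n";
      }
    }
  }
  for (auto* child : host->children_node_ptrs) {
    PrintKernels(child);  // recurse into nested host events
  }
}
```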
diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h
index 9c5ac28f36f5b25c9889599e93718f1fabc72de6..e27bdf0696324b233a39115f970b711db25c4e5d 100644
--- a/paddle/fluid/platform/profiler/event_python.h
+++ b/paddle/fluid/platform/profiler/event_python.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <map>
 #include <string>
 
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/profiler/event_node.h"
 #include "paddle/fluid/platform/profiler/extra_info.h"
 
@@ -41,6 +42,32 @@ struct DevicePythonNode {
   uint64_t context_id;
   // stream id
   uint64_t stream_id;
+  // correlation id, used for correlating async activities that happen on the
+  // device
+  uint32_t correlation_id;
+  // The X-dimension block size for the kernel.
+  uint32_t block_x;
+  // The Y-dimension block size for the kernel.
+  uint32_t block_y;
+  // The Z-dimension block size for the kernel.
+  uint32_t block_z;
+  // X-dimension of a grid.
+  uint32_t grid_x;
+  // Y-dimension of a grid.
+  uint32_t grid_y;
+  // Z-dimension of a grid.
+  uint32_t grid_z;
+  // dynamic + static shared memory, in bytes
+  uint64_t shared_memory;
+  // The number of registers required for each thread executing the kernel.
+  uint32_t registers_per_thread;
+  float blocks_per_sm;
+  float warps_per_sm;
+  // theoretical achieved occupancy
+  float occupancy;
+  // The number of bytes transferred by the memory copy.
+  uint64_t num_bytes;
+  // The value assigned to memory by the memory set.
+  uint32_t value;
 };
 
 struct MemPythonNode {
@@ -87,6 +114,8 @@ struct HostPythonNode {
   uint64_t process_id;
   // thread id of the record
   uint64_t thread_id;
+  // correlation id, used for correlating async activities that happen on the
+  // device
+  uint32_t correlation_id;
   // input shapes
   std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
   std::map<std::string, std::vector<std::string>> dtypes;
@@ -105,8 +134,15 @@ struct HostPythonNode {
 class ProfilerResult {
  public:
   ProfilerResult() : tree_(nullptr) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  explicit ProfilerResult(
+      std::unique_ptr<NodeTrees> tree,
+      const ExtraInfo& extra_info,
+      const std::map<uint32_t, gpuDeviceProp> device_property_map);
+#endif
   explicit ProfilerResult(std::unique_ptr<NodeTrees> tree,
                           const ExtraInfo& extra_info);
+  ~ProfilerResult();
   std::map<uint64_t, HostPythonNode*> GetData() {
     return thread_event_trees_map_;
   }
@@ -120,10 +156,27 @@ class ProfilerResult {
 
   std::shared_ptr<NodeTrees> GetNodeTrees() { return tree_; }
 
+  void SetVersion(const std::string& version) { version_ = version; }
+
+  void SetSpanIndx(uint32_t span_indx) { span_indx_ = span_indx; }
+
+  std::string GetVersion() { return version_; }
+
+  uint32_t GetSpanIndx() { return span_indx_; }
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> GetDeviceProperty() {
+    return device_property_map_;
+  }
+#endif
+
  private:
   std::map<uint64_t, HostPythonNode*> thread_event_trees_map_;
   std::shared_ptr<NodeTrees> tree_;
   ExtraInfo extra_info_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> device_property_map_;
+#endif
+  std::string version_;
+  uint32_t span_indx_;
   HostPythonNode* CopyTree(HostTraceEventNode* root);
 };
 
diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc
index 6365586c684ea2180afed5b1098488864ba99bb9..5957c4c24ca3be1357b6c5864b7493d9c115ed69 100644
--- a/paddle/fluid/platform/profiler/profiler.cc
+++ b/paddle/fluid/platform/profiler/profiler.cc
@@ -40,6 +40,9 @@ void SynchronizeAllDevice();
 
 std::atomic<bool> Profiler::alive_{false};
 
+uint32_t Profiler::span_indx = 0;
+const char* Profiler::version = "1.0.2";
+
 std::unique_ptr<Profiler> Profiler::Create(
     const ProfilerOptions& options,
     const std::vector<std::string>& custom_device_types) {
@@ -131,8 +134,24 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
                                    std::string("%s"),
                                    kv.second.c_str());
   }
-  return std::unique_ptr<ProfilerResult>(
-      new platform::ProfilerResult(std::move(tree), extrainfo));
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::map<uint32_t, gpuDeviceProp> device_property_map;
+  std::vector<int> device_ids = GetSelectedDevices();
+  for (auto index = 0u; index < device_ids.size(); index++) {
+    const gpuDeviceProp& device_property =
+        GetDeviceProperties(device_ids[index]);
+    device_property_map[device_ids[index]] = device_property;
+  }
+  ProfilerResult* profiler_result_ptr = new platform::ProfilerResult(
+      std::move(tree), extrainfo, device_property_map);
+#else
+  ProfilerResult* profiler_result_ptr =
+      new platform::ProfilerResult(std::move(tree), extrainfo);
+#endif
+  profiler_result_ptr->SetVersion(std::string(version));
+  profiler_result_ptr->SetSpanIndx(span_indx);
+  span_indx += 1;
+  return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h
index 2480f3a6073e1915eda7b7a4b101c5030f8a1dce..878f73f2b98021483fa43d943039e1f129fabf0a 100644
--- a/paddle/fluid/platform/profiler/profiler.h
+++ b/paddle/fluid/platform/profiler/profiler.h
@@ -44,6 +44,10 @@ struct ProfilerOptions {
 
 class Profiler {
  public:
+  static uint32_t
+      span_indx;  // index of the profiled range; when the user profiles
+                  // multiple ranges such as [2,4], [6,8], the first range
+                  // has index 0.
+  static const char* version;  // profiler version.
   static std::unique_ptr<Profiler> Create(
       const ProfilerOptions& options,
       const std::vector<std::string>& custom_device_types = {});
diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc
index 105f938cb97b5787c74a37974eef9c37731084d7..617000a5e1c80417f3408b7c9bbcabafefa94521 100644
--- a/paddle/fluid/platform/profiler/test_event_node.cc
+++ b/paddle/fluid/platform/profiler/test_event_node.cc
@@ -137,6 +137,7 @@ TEST(NodeTreesTest, LogMe_case0) {
                                         5,
                                         MemsetEventInfo()));
   ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -166,7 +167,7 @@ TEST(NodeTreesTest, LogMe_case0) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(NodeTreesTest, LogMe_case1) {
@@ -231,6 +232,7 @@ TEST(NodeTreesTest, LogMe_case1) {
                                         5,
                                         MemsetEventInfo()));
   ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -254,7 +256,7 @@ TEST(NodeTreesTest, LogMe_case1) {
     }
   }
   tree.LogMe(&logger);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
 
 TEST(NodeTreesTest, HandleTrees_case0) {
@@ -333,6 +335,7 @@ TEST(NodeTreesTest, HandleTrees_case0) {
                                         3,
                                         KernelEventInfo()));
   ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
+  logger.LogMetaInfo(std::string("1.0.2"), 0);
   NodeTrees tree(host_events,
                  runtime_events,
                  device_events,
@@ -376,5 +379,5 @@ TEST(NodeTreesTest, HandleTrees_case0) {
                    device_event_node_handle,
                    mem_event_node_handle,
                    op_supplement_event_node_handle);
-  logger.LogMetaInfo(std::unordered_map<std::string, std::string>());
+  logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
 }
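The version string and span index now originate in `Profiler` and are stamped onto every `ProfilerResult` before it reaches a logger. A sketch of the end-to-end flow, with illustrative options and file name:

```cpp
// Hypothetical end-to-end usage of the profiler after this patch.
#include "paddle/fluid/platform/profiler/profiler.h"

void ProfileOnce() {
  using paddle::platform::Profiler;
  paddle::platform::ProfilerOptions options;  // defaults, for illustration
  auto profiler = Profiler::Create(options);
  profiler->Prepare();
  profiler->Start();
  // ... run the workload to be profiled ...
  auto result = profiler->Stop();  // stamps version "1.0.2" plus the current
                                   // span index, then bumps Profiler::span_indx
  result->Save("trace.json", "json");  // "json" or "pb"
}
```

Because `span_indx` is a static counter incremented in `Stop()`, profiling two separate ranges in one process yields results tagged 0 and 1, which the trace viewer can use to tell the spans apart.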
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 328b7fc74eb190c17d2c708dc7beb927ba78ec01..4c5fd8a6a39844da8c58952795d0f4633b462d6f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2064,7 +2064,15 @@ All parameter, weight, gradient are variables in Paddle.
            &paddle::platform::ProfilerResult::GetData,
            py::return_value_policy::automatic_reference)
       .def("save", &paddle::platform::ProfilerResult::Save)
-      .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
+      .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo)
+      .def("get_version", &paddle::platform::ProfilerResult::GetVersion)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx)
+      .def("get_device_property",
+           &paddle::platform::ProfilerResult::GetDeviceProperty);
+#else
+      .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx);
+#endif
 
   py::class_<paddle::platform::MemPythonNode>(m, "MemPythonNode")
       .def(py::init<>())
@@ -2097,7 +2105,28 @@ All parameter, weight, gradient are variables in Paddle.
       .def_readwrite("context_id",
                      &paddle::platform::DevicePythonNode::context_id)
       .def_readwrite("stream_id",
-                     &paddle::platform::DevicePythonNode::stream_id);
+                     &paddle::platform::DevicePythonNode::stream_id)
+      .def_readwrite("correlation_id",
+                     &paddle::platform::DevicePythonNode::correlation_id)
+      .def_readwrite("block_x", &paddle::platform::DevicePythonNode::block_x)
+      .def_readwrite("block_y", &paddle::platform::DevicePythonNode::block_y)
+      .def_readwrite("block_z", &paddle::platform::DevicePythonNode::block_z)
+      .def_readwrite("grid_x", &paddle::platform::DevicePythonNode::grid_x)
+      .def_readwrite("grid_y", &paddle::platform::DevicePythonNode::grid_y)
+      .def_readwrite("grid_z", &paddle::platform::DevicePythonNode::grid_z)
+      .def_readwrite("shared_memory",
+                     &paddle::platform::DevicePythonNode::shared_memory)
+      .def_readwrite("registers_per_thread",
+                     &paddle::platform::DevicePythonNode::registers_per_thread)
+      .def_readwrite("blocks_per_sm",
+                     &paddle::platform::DevicePythonNode::blocks_per_sm)
+      .def_readwrite("warps_per_sm",
+                     &paddle::platform::DevicePythonNode::warps_per_sm)
+      .def_readwrite("occupancy",
+                     &paddle::platform::DevicePythonNode::occupancy)
+      .def_readwrite("num_bytes",
+                     &paddle::platform::DevicePythonNode::num_bytes)
+      .def_readwrite("value", &paddle::platform::DevicePythonNode::value);
 
   py::class_<paddle::platform::HostPythonNode>(m, "HostPythonNode")
       .def(py::init<>())
@@ -2108,6 +2137,8 @@ All parameter, weight, gradient are variables in Paddle.
       .def_readwrite("process_id",
                      &paddle::platform::HostPythonNode::process_id)
       .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
+      .def_readwrite("correlation_id",
+                     &paddle::platform::HostPythonNode::correlation_id)
       .def_readwrite("input_shapes",
                      &paddle::platform::HostPythonNode::input_shapes)
       .def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes)