Unverified commit 23bc0e3c, authored by chenjian, committed by GitHub

Update protobuf output format for profiler (#45724)

* update protobuf format

* fix protobuf content

* fix file mode

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* fix compiling error when gpu not exists

* support rocm
Parent 93e03fd7
......@@ -28,9 +28,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json";
static std::string DefaultFileName() {
auto pid = GetProcessId();
......@@ -68,6 +66,10 @@ ChromeTracingLogger::~ChromeTracingLogger() {
}
void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) {
output_file_stream_ << std::string(
R"JSON(
"traceEvents": [
)JSON");
// log all nodes except root node, root node is a helper node.
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
thread2host_event_nodes = node_trees.Traverse(true);
......@@ -545,28 +547,44 @@ void ChromeTracingLogger::HandleTypeMemset(
void ChromeTracingLogger::StartLog() {
output_file_stream_ << string_format(std::string(
R"JSON(
{
"displayTimeUnit": "ms",)JSON"));
}
void ChromeTracingLogger::LogMetaInfo(const std::string& version,
uint32_t span_indx) {
output_file_stream_ << string_format(std::string(
R"JSON(
"schemaVersion": "%s",
"span_indx": "%d",)JSON"),
version.c_str(),
span_indx);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void ChromeTracingLogger::LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
// add device property information
output_file_stream_ << std::string(R"JSON(
"deviceProperties": [
)JSON");
auto device_nums = device_property_map.size();
if (device_nums == 0) {
output_file_stream_ << std::string(R"JSON(
],
)JSON");
}
#if defined(PADDLE_WITH_CUDA)
for (auto it = device_property_map.begin(); it != device_property_map.end();
it++) {
const gpuDeviceProp& device_property = it->second;
if (device_nums > 1) {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"id": %d, "name": "%s", "totalGlobalMem": %llu,
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
......@@ -574,60 +592,93 @@ void ChromeTracingLogger::StartLog() {
"smCount": %d, "sharedMemPerBlockOptin": %d
},
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock,
device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin);
} else {
output_file_stream_ << string_format(
std::string(
R"JSON(
{
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
"sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
"smCount": %d, "sharedMemPerBlockOptin": %d
}],
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.maxThreadsPerBlock,
device_property.maxThreadsPerMultiProcessor,
device_property.regsPerBlock,
device_property.regsPerMultiprocessor,
device_property.warpSize,
device_property.sharedMemPerBlock,
device_property.sharedMemPerMultiprocessor,
device_property.multiProcessorCount,
device_property.sharedMemPerBlockOptin);
}
device_nums -= 1;
}
#endif
#if defined(PADDLE_WITH_HIP)
for (auto it = device_property_map.begin(); it != device_property_map.end();
it++) {
const gpuDeviceProp& device_property = it->second;
if (device_nums > 1) {
output_file_stream_ << string_format(std::string(
R"JSON(
{
"id": %d, "name": "%s", "totalGlobalMem": %llu,
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"maxThreadsPerBlock": %d, "maxThreadsPerMultiprocessor": %d,
"regsPerBlock": %d, "regsPerMultiprocessor": %d, "warpSize": %d,
"sharedMemPerBlock": %d, "sharedMemPerMultiprocessor": %d,
"smCount": %d, "sharedMemPerBlockOptin": %d
}],
"smCount": %d
},
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.multiProcessorCount);
} else {
output_file_stream_ << string_format(std::string(
R"JSON(
{
"id": %u, "name": "%s", "totalGlobalMem": %llu,
"computeMajor": %d, "computeMinor": %d,
"smCount": %d
}],
)JSON"),
it->first,
device_property.name,
device_property.totalGlobalMem,
device_property.major,
device_property.minor,
device_property.multiProcessorCount);
}
device_nums -= 1;
}
#endif
}
#endif
void ChromeTracingLogger::LogExtraInfo(
const std::unordered_map<std::string, std::string> extra_info) {
RefineDisplayName(extra_info);
output_file_stream_ << std::string(
......
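
With this change the Chrome trace JSON is emitted in pieces: StartLog() opens the object, LogMetaInfo() writes the schema version and span index, LogDeviceProperty() writes the deviceProperties array, and LogNodeTrees() opens the traceEvents array. A rough sketch of the assembled output, with placeholder values and only the reduced (HIP) subset of device fields; the trailing extra-info section and closing braces written by the remaining logger methods are omitted:

// Rough shape of the file assembled by the calls above; values are placeholders.
const char* kTraceSkeleton = R"JSON(
{
  "displayTimeUnit": "ms",
  "schemaVersion": "1.0.2",
  "span_indx": "0",
  "deviceProperties": [
    { "id": 0, "name": "<gpu name>", "totalGlobalMem": 0,
      "computeMajor": 0, "computeMinor": 0, "smCount": 0 }
  ],
  "traceEvents": [
  ]
}
)JSON";
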
......@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <set>
#include <unordered_map>
#include <utility>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
namespace paddle {
......@@ -36,8 +38,13 @@ class ChromeTracingLogger : public BaseLogger {
void LogHostTraceEventNode(const HostTraceEventNode&) override;
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogExtraInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map);
#endif
void LogMetaInfo(const std::string& version, uint32_t span_indx);
private:
void OpenFile();
......
......@@ -51,6 +51,7 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
std::string("%s"),
extra_info_map.value().c_str());
}
// restore NodeTrees
std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map;
for (int node_tree_index = 0;
......@@ -127,8 +128,26 @@ std::unique_ptr<ProfilerResult> DeserializationReader::Parse() {
}
// restore NodeTrees object
std::unique_ptr<NodeTrees> tree(new NodeTrees(thread_event_trees_map));
// restore gpuDeviceProp
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> device_property_map;
for (auto indx = 0; indx < node_trees_proto_->device_property_size();
indx++) {
const DevicePropertyProto& device_property_proto =
node_trees_proto_->device_property(indx);
device_property_map[device_property_proto.id()] =
RestoreDeviceProperty(device_property_proto);
}
ProfilerResult* profiler_result_ptr =
new ProfilerResult(std::move(tree), extrainfo, device_property_map);
#else
ProfilerResult* profiler_result_ptr =
new ProfilerResult(std::move(tree), extrainfo);
#endif
// restore version and span indx
profiler_result_ptr->SetVersion(node_trees_proto_->version());
profiler_result_ptr->SetSpanIndx(node_trees_proto_->span_indx());
return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
}
DeserializationReader::~DeserializationReader() {
......@@ -136,6 +155,37 @@ DeserializationReader::~DeserializationReader() {
input_file_stream_.close();
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuDeviceProp DeserializationReader::RestoreDeviceProperty(
const DevicePropertyProto& device_property_proto) {
gpuDeviceProp device_property;
strncpy(device_property.name,
device_property_proto.name().c_str(),
device_property_proto.name().length() + 1);
device_property.totalGlobalMem = device_property_proto.total_global_memory();
device_property.major = device_property_proto.compute_major();
device_property.minor = device_property_proto.compute_minor();
device_property.multiProcessorCount = device_property_proto.sm_count();
#if defined(PADDLE_WITH_CUDA)
device_property.maxThreadsPerBlock =
device_property_proto.max_threads_per_block();
device_property.maxThreadsPerMultiProcessor =
device_property_proto.max_threads_per_multiprocessor();
device_property.regsPerBlock = device_property_proto.regs_per_block();
device_property.regsPerMultiprocessor =
device_property_proto.regs_per_multiprocessor();
device_property.warpSize = device_property_proto.warp_size();
device_property.sharedMemPerBlock =
device_property_proto.shared_memory_per_block();
device_property.sharedMemPerMultiprocessor =
device_property_proto.shared_memory_per_multiprocessor();
device_property.sharedMemPerBlockOptin =
device_property_proto.shared_memory_per_block_optin();
#endif
return device_property;
}
#endif
DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode(
const DeviceTraceEventNodeProto& device_node_proto) {
const DeviceTraceEventProto& device_event_proto =
......@@ -275,6 +325,10 @@ KernelEventInfo DeserializationReader::HandleKernelEventInfoProto(
kernel_info.queued = kernel_info_proto.queued();
kernel_info.submitted = kernel_info_proto.submitted();
kernel_info.completed = kernel_info_proto.completed();
// version 1.0.2
kernel_info.blocks_per_sm = kernel_info_proto.blocks_per_sm();
kernel_info.warps_per_sm = kernel_info_proto.warps_per_sm();
kernel_info.occupancy = kernel_info_proto.occupancy();
return kernel_info;
}
......
......@@ -39,6 +39,10 @@ class DeserializationReader {
MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&);
OperatorSupplementEventNode* RestoreOperatorSupplementEventNode(
const OperatorSupplementEventNodeProto&);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&);
#endif
std::string filename_;
std::ifstream input_file_stream_;
NodeTreesProto* node_trees_proto_;
......
......@@ -95,6 +95,12 @@ message KernelEventInfoProto {
required uint64 submitted = 13;
// The completed timestamp for the kernel execution, in ns.
required uint64 completed = 14;
// blocks per sm
required float blocks_per_sm = 15;
// warps per sm
required float warps_per_sm = 16;
// theoretical achieved occupancy
required float occupancy = 17;
}
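
The three new kernel fields describe how densely a launch covers the device. As a rough illustration only, not necessarily the exact formula the profiler uses, they relate to the launch configuration and device properties like this:

// Illustrative derivation of the per-SM kernel metrics; an assumption for
// explanation, not taken from the Paddle sources.
float EstimateBlocksPerSm(int grid_size, int sm_count) {
  return static_cast<float>(grid_size) / static_cast<float>(sm_count);
}
float EstimateWarpsPerSm(float blocks_per_sm, int block_size, int warp_size) {
  int warps_per_block = (block_size + warp_size - 1) / warp_size;
  return blocks_per_sm * static_cast<float>(warps_per_block);
}
float EstimateOccupancy(float warps_per_sm, int max_threads_per_sm,
                        int warp_size) {
  float max_warps_per_sm = static_cast<float>(max_threads_per_sm) / warp_size;
  return warps_per_sm / max_warps_per_sm;
}
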
message MemcpyEventInfoProto {
......@@ -270,9 +276,27 @@ message ExtraInfoMap {
required string value = 2;
}
message DevicePropertyProto {
required uint32 id = 1;
required string name = 2;
required uint64 total_global_memory = 3;
required uint32 compute_major = 4;
required uint32 compute_minor = 5;
required uint32 max_threads_per_block = 6;
required uint32 max_threads_per_multiprocessor = 7;
required uint32 regs_per_block = 8;
required uint32 regs_per_multiprocessor = 9;
required uint32 warp_size = 10;
required uint64 shared_memory_per_block = 11;
required uint64 shared_memory_per_multiprocessor = 12;
required uint32 sm_count = 13;
required uint64 shared_memory_per_block_optin = 14;
}
message NodeTreesProto {
required string version = 1;
required uint32 span_indx = 2;
repeated ThreadNodeTreeProto thread_trees = 3;
repeated ExtraInfoMap extra_info = 4;
repeated DevicePropertyProto device_property = 5;
}
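
For reference, the dumped file can be read back with the generated protobuf classes. A minimal consumer sketch, assuming the generated types live in paddle::platform as the reader code suggests; the header path matches the include used by SerializationLogger and the filename is a placeholder following kDefaultFilename:

#include <fstream>
#include <iostream>
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"

int main() {
  // Placeholder filename; the real name is formatted from pid and timestamp.
  std::ifstream in("pid_1234_time_xxx.paddle_trace.pb", std::ios::binary);
  paddle::platform::NodeTreesProto trees;
  if (!trees.ParseFromIstream(&in)) {
    std::cerr << "failed to parse trace file" << std::endl;
    return 1;
  }
  std::cout << "version: " << trees.version()
            << " span_indx: " << trees.span_indx() << std::endl;
  for (int i = 0; i < trees.device_property_size(); ++i) {
    const auto& prop = trees.device_property(i);
    std::cout << "device " << prop.id() << ": " << prop.name()
              << " smCount=" << prop.sm_count() << std::endl;
  }
  return 0;
}
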
......@@ -20,8 +20,6 @@ namespace paddle {
namespace platform {
static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb";
static std::string DefaultFileName() {
auto pid = GetProcessId();
......@@ -40,10 +38,43 @@ void SerializationLogger::OpenFile() {
LOG(INFO) << "writing profiling data to " << filename_ << std::endl;
}
node_trees_proto_ = new NodeTreesProto();
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void SerializationLogger::LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map) {
for (auto it = device_property_map.begin(); it != device_property_map.end();
it++) {
const gpuDeviceProp& device_property = it->second;
DevicePropertyProto* device_property_proto =
node_trees_proto_->add_device_property();
device_property_proto->set_id(it->first);
device_property_proto->set_name(device_property.name);
device_property_proto->set_total_global_memory(
device_property.totalGlobalMem);
device_property_proto->set_compute_major(device_property.major);
device_property_proto->set_compute_minor(device_property.minor);
device_property_proto->set_sm_count(device_property.multiProcessorCount);
#if defined(PADDLE_WITH_CUDA)
device_property_proto->set_max_threads_per_block(
device_property.maxThreadsPerBlock);
device_property_proto->set_max_threads_per_multiprocessor(
device_property.maxThreadsPerMultiProcessor);
device_property_proto->set_regs_per_block(device_property.regsPerBlock);
device_property_proto->set_regs_per_multiprocessor(
device_property.regsPerMultiprocessor);
device_property_proto->set_warp_size(device_property.warpSize);
device_property_proto->set_shared_memory_per_block(
device_property.sharedMemPerBlock);
device_property_proto->set_shared_memory_per_multiprocessor(
device_property.sharedMemPerMultiprocessor);
device_property_proto->set_shared_memory_per_block_optin(
device_property.sharedMemPerBlockOptin);
#endif
}
}
#endif
void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) {
// dump the whole tree into file
const std::map<uint64_t, std::vector<HostTraceEventNode*>>
......@@ -271,6 +302,9 @@ void SerializationLogger::HandleTypeKernel(
kernel_info->set_queued(info.queued);
kernel_info->set_submitted(info.submitted);
kernel_info->set_completed(info.completed);
kernel_info->set_blocks_per_sm(info.blocks_per_sm);
kernel_info->set_warps_per_sm(info.warps_per_sm);
kernel_info->set_occupancy(info.occupancy);
// binding
device_trace_event->set_allocated_kernel_info(kernel_info);
current_device_trace_event_node_proto_->set_allocated_device_event(
......@@ -328,7 +362,7 @@ void SerializationLogger::HandleTypeMemset(
device_trace_event);
}
void SerializationLogger::LogExtraInfo(
const std::unordered_map<std::string, std::string> extra_info) {
for (const auto& kv : extra_info) {
ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info();
......@@ -337,6 +371,12 @@ void SerializationLogger::LogMetaInfo(
}
}
void SerializationLogger::LogMetaInfo(const std::string& version,
uint32_t span_indx) {
node_trees_proto_->set_version(version);
node_trees_proto_->set_span_indx(span_indx);
}
SerializationLogger::SerializationLogger(const std::string& filename) {
filename_ = filename.empty() ? DefaultFileName() : filename;
OpenFile();
......
......@@ -11,8 +11,10 @@ limitations under the License. */
#pragma once
#include <map>
#include <unordered_map>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
#include "paddle/fluid/platform/profiler/output_logger.h"
......@@ -33,8 +35,13 @@ class SerializationLogger : public BaseLogger {
void LogHostTraceEventNode(const HostTraceEventNode&) override;
void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
void LogNodeTrees(const NodeTrees&) override;
void LogExtraInfo(const std::unordered_map<std::string, std::string>);
void LogMemTraceEventNode(const MemTraceEventNode&) override;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void LogDeviceProperty(
const std::map<uint32_t, gpuDeviceProp>& device_property_map);
#endif
void LogMetaInfo(const std::string& version, uint32_t span_indx);
private:
void OpenFile();
......
......@@ -140,6 +140,7 @@ TEST(SerializationLoggerTest, dump_case0) {
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case0.pb");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -169,7 +170,7 @@ TEST(SerializationLoggerTest, dump_case0) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(SerializationLoggerTest, dump_case1) {
......@@ -234,6 +235,7 @@ TEST(SerializationLoggerTest, dump_case1) {
5,
MemsetEventInfo()));
SerializationLogger logger("test_serialization_logger_case1.pb");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -257,7 +259,7 @@ TEST(SerializationLoggerTest, dump_case1) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(DeserializationReaderTest, restore_case0) {
......
......@@ -65,6 +65,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
runtime_python_node->end_ns = (*runtimenode)->EndNs();
runtime_python_node->process_id = (*runtimenode)->ProcessId();
runtime_python_node->thread_id = (*runtimenode)->ThreadId();
runtime_python_node->correlation_id = (*runtimenode)->CorrelationId();
host_python_node->runtime_node_ptrs.push_back(runtime_python_node);
// copy DeviceTraceEventNode
for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin();
......@@ -78,6 +79,30 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
device_python_node->device_id = (*devicenode)->DeviceId();
device_python_node->context_id = (*devicenode)->ContextId();
device_python_node->stream_id = (*devicenode)->StreamId();
device_python_node->correlation_id = (*devicenode)->CorrelationId();
if (device_python_node->type == TracerEventType::Kernel) {
KernelEventInfo kernel_info = (*devicenode)->KernelInfo();
device_python_node->block_x = kernel_info.block_x;
device_python_node->block_y = kernel_info.block_y;
device_python_node->block_z = kernel_info.block_z;
device_python_node->grid_x = kernel_info.grid_x;
device_python_node->grid_y = kernel_info.grid_y;
device_python_node->grid_z = kernel_info.grid_z;
device_python_node->shared_memory = kernel_info.dynamic_shared_memory +
kernel_info.static_shared_memory;
device_python_node->registers_per_thread =
kernel_info.registers_per_thread;
device_python_node->blocks_per_sm = kernel_info.blocks_per_sm;
device_python_node->warps_per_sm = kernel_info.warps_per_sm;
device_python_node->occupancy = kernel_info.occupancy;
} else if (device_python_node->type == TracerEventType::Memcpy) {
MemcpyEventInfo memcpy_info = (*devicenode)->MemcpyInfo();
device_python_node->num_bytes = memcpy_info.num_bytes;
} else if (device_python_node->type == TracerEventType::Memset) {
MemsetEventInfo memset_info = (*devicenode)->MemsetInfo();
device_python_node->num_bytes = memset_info.num_bytes;
device_python_node->value = memset_info.value;
}
runtime_python_node->device_node_ptrs.push_back(device_python_node);
}
}
......@@ -110,6 +135,23 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) {
return host_python_node;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ProfilerResult::ProfilerResult(
std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info,
const std::map<uint32_t, gpuDeviceProp> device_property_map)
: tree_(tree.release()),
extra_info_(extra_info),
device_property_map_(device_property_map) {
if (tree_ != nullptr) {
std::map<uint64_t, HostTraceEventNode*> nodetrees = tree_->GetNodeTrees();
for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) {
thread_event_trees_map_[it->first] = CopyTree(it->second);
}
}
}
#endif
ProfilerResult::ProfilerResult(std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info)
: tree_(tree.release()), extra_info_(extra_info) {
......@@ -134,12 +176,20 @@ void ProfilerResult::Save(const std::string& file_name,
const std::string format) {
if (format == std::string("json")) {
ChromeTracingLogger logger(file_name);
logger.LogMetaInfo(version_, span_indx_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
logger.LogDeviceProperty(device_property_map_);
#endif
tree_->LogMe(&logger);
logger.LogExtraInfo(GetExtraInfo());
} else if (format == std::string("pb")) {
SerializationLogger logger(file_name);
logger.LogMetaInfo(version_, span_indx_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
logger.LogDeviceProperty(device_property_map_);
#endif
tree_->LogMe(&logger);
logger.LogExtraInfo(GetExtraInfo());
}
return;
}
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <memory>
#include <unordered_map>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler/event_node.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
......@@ -41,6 +42,32 @@ struct DevicePythonNode {
uint64_t context_id;
// stream id
uint64_t stream_id;
// correlation id, used for correlating async activities happened on device
uint32_t correlation_id;
// The X-dimension block size for the kernel.
uint32_t block_x;
// The Y-dimension block size for the kernel.
uint32_t block_y;
// The Z-dimension block size for the kernel.
uint32_t block_z;
// X-dimension of a grid.
uint32_t grid_x;
// Y-dimension of a grid.
uint32_t grid_y;
// Z-dimension of a grid.
uint32_t grid_z;
// dynamic + static
uint64_t shared_memory;
// The number of registers required for each thread executing the kernel.
uint32_t registers_per_thread;
float blocks_per_sm;
float warps_per_sm;
// theoretical achieved occupancy
float occupancy;
// The number of bytes transferred by the memory copy.
uint64_t num_bytes;
// the value being assigned to memory by the memory set.
uint32_t value;
};
struct MemPythonNode {
......@@ -87,6 +114,8 @@ struct HostPythonNode {
uint64_t process_id;
// thread id of the record
uint64_t thread_id;
// correlation id, used for correlating async activities happened on device
uint32_t correlation_id;
// input shapes
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
......@@ -105,8 +134,15 @@ struct HostPythonNode {
class ProfilerResult {
public:
ProfilerResult() : tree_(nullptr) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
explicit ProfilerResult(
std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info,
const std::map<uint32_t, gpuDeviceProp> device_property_map);
#endif
explicit ProfilerResult(std::unique_ptr<NodeTrees> tree,
const ExtraInfo& extra_info);
~ProfilerResult();
std::map<uint64_t, HostPythonNode*> GetData() {
return thread_event_trees_map_;
......@@ -120,10 +156,27 @@ class ProfilerResult {
std::shared_ptr<NodeTrees> GetNodeTrees() { return tree_; }
void SetVersion(const std::string& version) { version_ = version; }
void SetSpanIndx(uint32_t span_indx) { span_indx_ = span_indx; }
std::string GetVersion() { return version_; }
uint32_t GetSpanIndx() { return span_indx_; }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> GetDeviceProperty() {
return device_property_map_;
}
#endif
private:
std::map<uint64_t, HostPythonNode*> thread_event_trees_map_;
std::shared_ptr<NodeTrees> tree_;
ExtraInfo extra_info_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> device_property_map_;
#endif
std::string version_;
uint32_t span_indx_;
HostPythonNode* CopyTree(HostTraceEventNode* root);
};
......
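
Downstream of these structs, the per-thread trees returned by ProfilerResult::GetData() can be walked to pick up the new kernel fields. A minimal sketch; the header path and the children_node_ptrs member name are assumptions, the other members appear in the struct above:

#include <iostream>
#include "paddle/fluid/platform/profiler/event_python.h"  // assumed path

void PrintKernelStats(const paddle::platform::HostPythonNode* node) {
  for (auto* runtime_node : node->runtime_node_ptrs) {
    for (auto* device_node : runtime_node->device_node_ptrs) {
      if (device_node->type == paddle::platform::TracerEventType::Kernel) {
        std::cout << "grid=(" << device_node->grid_x << ","
                  << device_node->grid_y << "," << device_node->grid_z
                  << ") blocks/SM=" << device_node->blocks_per_sm
                  << " occupancy=" << device_node->occupancy << std::endl;
      }
    }
  }
  for (auto* child : node->children_node_ptrs) {  // assumed member name
    PrintKernelStats(child);
  }
}
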
......@@ -40,6 +40,9 @@ void SynchronizeAllDevice();
std::atomic<bool> Profiler::alive_{false};
uint32_t Profiler::span_indx = 0;
const char* Profiler::version = "1.0.2";
std::unique_ptr<Profiler> Profiler::Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types) {
......@@ -131,8 +134,24 @@ std::unique_ptr<ProfilerResult> Profiler::Stop() {
std::string("%s"),
kv.second.c_str());
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::map<uint32_t, gpuDeviceProp> device_property_map;
std::vector<int32_t> device_ids = GetSelectedDevices();
for (auto index = 0u; index < device_ids.size(); index++) {
const gpuDeviceProp& device_property =
GetDeviceProperties(device_ids[index]);
device_property_map[device_ids[index]] = device_property;
}
ProfilerResult* profiler_result_ptr = new platform::ProfilerResult(
std::move(tree), extrainfo, device_property_map);
#else
ProfilerResult* profiler_result_ptr =
new platform::ProfilerResult(std::move(tree), extrainfo);
#endif
profiler_result_ptr->SetVersion(std::string(version));
profiler_result_ptr->SetSpanIndx(span_indx);
span_indx += 1;
return std::unique_ptr<ProfilerResult>(profiler_result_ptr);
}
} // namespace platform
......
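
With the version and span index now owned by the Profiler class, a typical profiling span ends up looking like the sketch below. Save(), GetVersion() and GetSpanIndx() are shown in this change; the header path and the Prepare()/Start() calls are assumptions:

#include <iostream>
#include "paddle/fluid/platform/profiler/profiler.h"  // assumed path

void RunOneProfiledSpan() {
  paddle::platform::ProfilerOptions options;
  auto profiler = paddle::platform::Profiler::Create(options);
  profiler->Prepare();  // assumed entry point
  profiler->Start();    // assumed entry point
  // ... run the workload to be profiled ...
  auto result = profiler->Stop();
  std::cout << "profiler version " << result->GetVersion()
            << " span " << result->GetSpanIndx() << std::endl;
  result->Save("worker0.paddle_trace.json", "json");  // Chrome tracing output
  result->Save("worker0.paddle_trace.pb", "pb");      // protobuf dump
}
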
......@@ -44,6 +44,10 @@ struct ProfilerOptions {
class Profiler {
public:
static uint32_t
span_indx; // index of profiler range, when user profiles multiple ranges
// such as [2,4], [6,8], the first range is index 0.
static const char* version; // profiler version.
static std::unique_ptr<Profiler> Create(
const ProfilerOptions& options,
const std::vector<std::string>& custom_device_types = {});
......
......@@ -137,6 +137,7 @@ TEST(NodeTreesTest, LogMe_case0) {
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case0.json");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -166,7 +167,7 @@ TEST(NodeTreesTest, LogMe_case0) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, LogMe_case1) {
......@@ -231,6 +232,7 @@ TEST(NodeTreesTest, LogMe_case1) {
5,
MemsetEventInfo()));
ChromeTracingLogger logger("test_nodetrees_logme_case1.json");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -254,7 +256,7 @@ TEST(NodeTreesTest, LogMe_case1) {
}
}
tree.LogMe(&logger);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
TEST(NodeTreesTest, HandleTrees_case0) {
......@@ -333,6 +335,7 @@ TEST(NodeTreesTest, HandleTrees_case0) {
3,
KernelEventInfo()));
ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json");
logger.LogMetaInfo(std::string("1.0.2"), 0);
NodeTrees tree(host_events,
runtime_events,
device_events,
......@@ -376,5 +379,5 @@ TEST(NodeTreesTest, HandleTrees_case0) {
device_event_node_handle,
mem_event_node_handle,
op_supplement_event_node_handle);
logger.LogExtraInfo(std::unordered_map<std::string, std::string>());
}
......@@ -2064,7 +2064,15 @@ All parameter, weight, gradient are variables in Paddle.
&paddle::platform::ProfilerResult::GetData,
py::return_value_policy::automatic_reference)
.def("save", &paddle::platform::ProfilerResult::Save)
.def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
.def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo)
.def("get_version", &paddle::platform::ProfilerResult::GetVersion)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx)
.def("get_device_property",
&paddle::platform::ProfilerResult::GetDeviceProperty);
#else
.def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx);
#endif
py::class_<paddle::platform::MemPythonNode>(m, "MemPythonNode")
.def(py::init<>())
......@@ -2097,7 +2105,28 @@ All parameter, weight, gradient are variables in Paddle.
.def_readwrite("context_id",
&paddle::platform::DevicePythonNode::context_id)
.def_readwrite("stream_id",
&paddle::platform::DevicePythonNode::stream_id);
&paddle::platform::DevicePythonNode::stream_id)
.def_readwrite("correlation_id",
&paddle::platform::DevicePythonNode::correlation_id)
.def_readwrite("block_x", &paddle::platform::DevicePythonNode::block_x)
.def_readwrite("block_y", &paddle::platform::DevicePythonNode::block_y)
.def_readwrite("block_z", &paddle::platform::DevicePythonNode::block_z)
.def_readwrite("grid_x", &paddle::platform::DevicePythonNode::grid_x)
.def_readwrite("grid_y", &paddle::platform::DevicePythonNode::grid_y)
.def_readwrite("grid_z", &paddle::platform::DevicePythonNode::grid_z)
.def_readwrite("shared_memory",
&paddle::platform::DevicePythonNode::shared_memory)
.def_readwrite("registers_per_thread",
&paddle::platform::DevicePythonNode::registers_per_thread)
.def_readwrite("blocks_per_sm",
&paddle::platform::DevicePythonNode::blocks_per_sm)
.def_readwrite("warps_per_sm",
&paddle::platform::DevicePythonNode::warps_per_sm)
.def_readwrite("occupancy",
&paddle::platform::DevicePythonNode::occupancy)
.def_readwrite("num_bytes",
&paddle::platform::DevicePythonNode::num_bytes)
.def_readwrite("value", &paddle::platform::DevicePythonNode::value);
py::class_<paddle::platform::HostPythonNode>(m, "HostPythonNode")
.def(py::init<>())
......@@ -2108,6 +2137,8 @@ All parameter, weight, gradient are variables in Paddle.
.def_readwrite("process_id",
&paddle::platform::HostPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
.def_readwrite("correlation_id",
&paddle::platform::HostPythonNode::correlation_id)
.def_readwrite("input_shapes",
&paddle::platform::HostPythonNode::input_shapes)
.def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes)
......