From 40d2b7c6fe92890d77a400c51cf933f18b62275c Mon Sep 17 00:00:00 2001
From: chenjian <chenjian26@baidu.com>
Date: Wed, 16 Feb 2022 09:45:47 +0800
Subject: [PATCH] Add profiler tree dump code (#39519)

* add profiler tree dump code

* add CMakeLists item

* reduce some contents for pr
---
 paddle/fluid/platform/profiler/CMakeLists.txt |   1 +
 .../platform/profiler/dump/CMakeLists.txt     |   4 +
 .../profiler/dump/deserialization_reader.cc   | 218 ++++++++++++++
 .../profiler/dump/deserialization_reader.h    |  45 +++
 .../platform/profiler/dump/nodetree.proto     | 181 ++++++++++++
 .../profiler/dump/serialization_logger.cc     | 265 ++++++++++++++++++
 .../profiler/dump/serialization_logger.h      |  49 ++++
 .../dump/test_serialization_logger.cc         | 174 ++++++++++++
 paddle/fluid/platform/profiler/utils.h        |  46 +++
 9 files changed, 983 insertions(+)
 create mode 100644 paddle/fluid/platform/profiler/dump/CMakeLists.txt
 create mode 100644 paddle/fluid/platform/profiler/dump/deserialization_reader.cc
 create mode 100644 paddle/fluid/platform/profiler/dump/deserialization_reader.h
 create mode 100644 paddle/fluid/platform/profiler/dump/nodetree.proto
 create mode 100644 paddle/fluid/platform/profiler/dump/serialization_logger.cc
 create mode 100755 paddle/fluid/platform/profiler/dump/serialization_logger.h
 create mode 100644 paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
 create mode 100644 paddle/fluid/platform/profiler/utils.h
diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt
index 626847f046..320e989bd9 100644
--- a/paddle/fluid/platform/profiler/CMakeLists.txt
+++ b/paddle/fluid/platform/profiler/CMakeLists.txt
@@ -5,3 +5,4 @@ cc_library(event_node SRCS event_node.cc DEPS enforce)
 cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node)
 cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger)
 cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node)
+add_subdirectory(dump)
diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt
new file mode 100644
index 0000000000..e25333f7a8
--- /dev/null
+++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt
@@ -0,0 +1,4 @@
+proto_library(nodetreeproto SRCS nodetree.proto)
+cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node)
+cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node)
+cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS  serialization_logger deserialization_reader event_node)
diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
new file mode 100644
index 0000000000..d1049a7dc1
--- /dev/null
+++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc
@@ -0,0 +1,218 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
+
+#include <cstring>
+
+namespace paddle {
+namespace platform {
+
+DeserializationReader::DeserializationReader(const std::string& filename)
+    : filename_(filename) {
+  OpenFile();
+  node_trees_proto_ = new NodeTreesProto();
+}
+
+DeserializationReader::DeserializationReader(const char* filename)
+    : filename_(filename) {
+  OpenFile();
+  node_trees_proto_ = new NodeTreesProto();
+}
+
+void DeserializationReader::OpenFile() {
+  input_file_stream_.open(filename_, std::ifstream::in | std::ifstream::binary);
+  if (!input_file_stream_) {
+    VLOG(2) << "Unable to open file for writing profiling data." << std::endl;
+  } else {
+    VLOG(0) << "Read profiling data from " << filename_ << std::endl;
+  }
+}
+
+std::unique_ptr<NodeTrees> DeserializationReader::Parse() {
+  if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) {
+    VLOG(2) << "Unable to load node trees in protobuf." << std::endl;
+    return nullptr;
+  }
+  std::map<uint64_t, HostTraceEventNode*> thread_event_trees_map;
+  for (int node_tree_index = 0;
+       node_tree_index < node_trees_proto_->thread_trees_size();
+       node_tree_index++) {
+    // handle one thread tree
+    std::map<int64_t, HostTraceEventNode*> index_node_map;
+    std::map<int64_t, int64_t> child_parent_map;
+    const ThreadNodeTreeProto& thread_node_tree_proto =
+        node_trees_proto_->thread_trees(node_tree_index);
+    uint64_t current_threadid = thread_node_tree_proto.thread_id();
+    for (int host_node_index = 0;
+         host_node_index < thread_node_tree_proto.host_nodes_size();
+         host_node_index++) {
+      // handle host node
+      const HostTraceEventNodeProto& host_node_proto =
+          thread_node_tree_proto.host_nodes(host_node_index);
+      HostTraceEventNode* host_node =
+          RestoreHostTraceEventNode(host_node_proto);
+      index_node_map[host_node_proto.id()] = host_node;
+      child_parent_map[host_node_proto.id()] = host_node_proto.parentid();
+      // handle runtime node
+      for (int runtime_node_index = 0;
+           runtime_node_index < host_node_proto.runtime_nodes_size();
+           runtime_node_index++) {
+        const CudaRuntimeTraceEventNodeProto& runtime_node_proto =
+            host_node_proto.runtime_nodes(runtime_node_index);
+        CudaRuntimeTraceEventNode* runtime_node =
+            RestoreCudaRuntimeTraceEventNode(runtime_node_proto);
+        host_node->AddCudaRuntimeNode(runtime_node);  // insert into host_node
+        // handle device node
+        for (int device_node_index = 0;
+             device_node_index < runtime_node_proto.device_nodes_size();
+             device_node_index++) {
+          const DeviceTraceEventNodeProto& device_node_proto =
+              runtime_node_proto.device_nodes(device_node_index);
+          DeviceTraceEventNode* device_node =
+              RestoreDeviceTraceEventNode(device_node_proto);
+          runtime_node->AddDeviceTraceEventNode(
+              device_node);  // insert into runtime_node
+        }
+      }
+    }
+    // restore parent-child relationship
+    for (auto it = child_parent_map.begin(); it != child_parent_map.end();
+         it++) {
+      if (it->second != -1) {  // not root node
+        index_node_map[it->second]->AddChild(index_node_map[it->first]);
+      } else {
+        thread_event_trees_map[current_threadid] =
+            index_node_map[it->first];  // root node
+      }
+    }
+  }
+  // restore NodeTrees object
+  return std::unique_ptr<NodeTrees>(new NodeTrees(thread_event_trees_map));
+}
+
+DeserializationReader::~DeserializationReader() {
+  delete node_trees_proto_;
+  input_file_stream_.close();
+}
+
+DeviceTraceEventNode* DeserializationReader::RestoreDeviceTraceEventNode(
+    const DeviceTraceEventNodeProto& device_node_proto) {
+  const DeviceTraceEventProto& device_event_proto =
+      device_node_proto.device_event();
+  DeviceTraceEvent device_event;
+  device_event.name = device_event_proto.name();
+  device_event.type = static_cast<TracerEventType>(device_event_proto.type());
+  device_event.start_ns = device_event_proto.start_ns();
+  device_event.end_ns = device_event_proto.end_ns();
+  device_event.device_id = device_event_proto.device_id();
+  device_event.context_id = device_event_proto.context_id();
+  device_event.stream_id = device_event_proto.stream_id();
+  device_event.correlation_id = device_event_proto.correlation_id();
+  switch (device_event.type) {
+    case TracerEventType::Kernel:
+      device_event.kernel_info = HandleKernelEventInfoProto(device_event_proto);
+      break;
+
+    case TracerEventType::Memcpy:
+      device_event.memcpy_info = HandleMemcpyEventInfoProto(device_event_proto);
+      break;
+
+    case TracerEventType::Memset:
+      device_event.memset_info = HandleMemsetEventInfoProto(device_event_proto);
+      break;
+    default:
+      break;
+  }
+  return new DeviceTraceEventNode(device_event);
+}
+
+CudaRuntimeTraceEventNode*
+DeserializationReader::RestoreCudaRuntimeTraceEventNode(
+    const CudaRuntimeTraceEventNodeProto& runtime_node_proto) {
+  const CudaRuntimeTraceEventProto& runtime_event_proto =
+      runtime_node_proto.runtime_trace_event();
+  RuntimeTraceEvent runtime_event;
+  runtime_event.name = runtime_event_proto.name();
+  runtime_event.start_ns = runtime_event_proto.start_ns();
+  runtime_event.end_ns = runtime_event_proto.end_ns();
+  runtime_event.process_id = runtime_event_proto.process_id();
+  runtime_event.thread_id = runtime_event_proto.thread_id();
+  runtime_event.correlation_id = runtime_event_proto.correlation_id();
+  runtime_event.callback_id = runtime_event_proto.callback_id();
+  return new CudaRuntimeTraceEventNode(runtime_event);
+}
+
+HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode(
+    const HostTraceEventNodeProto& host_node_proto) {
+  const HostTraceEventProto& host_event_proto =
+      host_node_proto.host_trace_event();
+  HostTraceEvent host_event;
+  host_event.name = host_event_proto.name();
+  host_event.type = static_cast<TracerEventType>(host_event_proto.type());
+  host_event.start_ns = host_event_proto.start_ns();
+  host_event.end_ns = host_event_proto.end_ns();
+  host_event.process_id = host_event_proto.process_id();
+  host_event.thread_id = host_event_proto.thread_id();
+  return new HostTraceEventNode(host_event);
+}
+
+KernelEventInfo DeserializationReader::HandleKernelEventInfoProto(
+    const DeviceTraceEventProto& device_event_proto) {
+  const KernelEventInfoProto& kernel_info_proto =
+      device_event_proto.kernel_info();
+  KernelEventInfo kernel_info;
+  kernel_info.block_x = kernel_info_proto.block_x();
+  kernel_info.block_y = kernel_info_proto.block_y();
+  kernel_info.block_z = kernel_info_proto.block_z();
+  kernel_info.grid_x = kernel_info_proto.grid_x();
+  kernel_info.grid_y = kernel_info_proto.grid_y();
+  kernel_info.grid_z = kernel_info_proto.grid_z();
+  kernel_info.dynamic_shared_memory = kernel_info_proto.dynamic_shared_memory();
+  kernel_info.static_shared_memory = kernel_info_proto.static_shared_memory();
+  kernel_info.registers_per_thread = kernel_info_proto.registers_per_thread();
+  kernel_info.local_memory_per_thread =
+      kernel_info_proto.local_memory_per_thread();
+  kernel_info.local_memory_total = kernel_info_proto.local_memory_total();
+  kernel_info.queued = kernel_info_proto.queued();
+  kernel_info.submitted = kernel_info_proto.submitted();
+  kernel_info.completed = kernel_info_proto.completed();
+  return kernel_info;
+}
+
+MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto(
+    const DeviceTraceEventProto& device_event_proto) {
+  const MemcpyEventInfoProto& memcpy_info_proto =
+      device_event_proto.memcpy_info();
+  MemcpyEventInfo memcpy_info;
+  memcpy_info.num_bytes = memcpy_info_proto.num_bytes();
+  std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(),
+               kMemKindMaxLen - 1);
+  std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(),
+               kMemKindMaxLen - 1);
+  std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(),
+               kMemKindMaxLen - 1);
+  return memcpy_info;
+}
+
+MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto(
+    const DeviceTraceEventProto& device_event_proto) {
+  const MemsetEventInfoProto& memset_info_proto =
+      device_event_proto.memset_info();
+  MemsetEventInfo memset_info;
+  memset_info.num_bytes = memset_info_proto.num_bytes();
+  std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(),
+               kMemKindMaxLen - 1);
+  memset_info.value = memset_info_proto.value();
+  return memset_info;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h
new file mode 100644
index 0000000000..1ad2dabf22
--- /dev/null
+++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
+#include "paddle/fluid/platform/profiler/event_node.h"
+
+namespace paddle {
+namespace platform {
+
+class DeserializationReader {
+ public:
+  explicit DeserializationReader(const std::string& filename);
+  explicit DeserializationReader(const char* filename);
+  ~DeserializationReader();
+  std::unique_ptr<NodeTrees> Parse();
+
+ private:
+  void OpenFile();
+  DeviceTraceEventNode* RestoreDeviceTraceEventNode(
+      const DeviceTraceEventNodeProto&);
+  CudaRuntimeTraceEventNode* RestoreCudaRuntimeTraceEventNode(
+      const CudaRuntimeTraceEventNodeProto&);
+  HostTraceEventNode* RestoreHostTraceEventNode(const HostTraceEventNodeProto&);
+  KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&);
+  MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&);
+  MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&);
+  std::string filename_;
+  std::ifstream input_file_stream_;
+  NodeTreesProto* node_trees_proto_;
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto
new file mode 100644
index 0000000000..37dac0e597
--- /dev/null
+++ b/paddle/fluid/platform/profiler/dump/nodetree.proto
@@ -0,0 +1,181 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+package paddle.platform;
+
+enum TracerEventTypeProto {
+  // Used to mark operator record
+  Operator = 0;
+  // Used to mark dataloader record
+  Dataloader = 1;
+  // Used to mark profile step record
+  ProfileStep = 2;
+  // Used to mark cuda runtime record returned by cupti
+  CudaRuntime = 3;
+  // Used to mark kernel computation record returned by cupti
+  Kernel = 4;
+  // Used to mark memcpy record returned by cupti
+  Memcpy = 5;
+  // Used to mark memset record returned by cupti
+  Memset = 6;
+  // Used to mark record defined by user
+  UserDefined = 7;
+  // A flag to denote the number of current types
+  NumTypes = 8;
+}
+
+message KernelEventInfoProto {
+  // The X-dimension block size for the kernel.
+  required uint32 block_x = 1;
+  // The Y-dimension block size for the kernel.
+  required uint32 block_y = 2;
+  // The Z-dimension grid size for the kernel.
+  required uint32 block_z = 3;
+  // X-dimension of a grid.
+  required uint32 grid_x = 4;
+  // Y-dimension of a grid.
+  required uint32 grid_y = 5;
+  // Z-dimension of a grid.
+  required uint32 grid_z = 6;
+  // The dynamic shared memory reserved for the kernel, in bytes.
+  required uint32 dynamic_shared_memory = 7;
+  // The static shared memory allocated for the kernel, in bytes.
+  required uint32 static_shared_memory = 8;
+  // The number of registers required for each thread executing the kernel.
+  required uint32 registers_per_thread = 9;
+  // The amount of local memory reserved for each thread, in bytes.
+  required uint32 local_memory_per_thread = 10;
+  // The total amount of local memory reserved for the kernel, in bytes.
+  required uint32 local_memory_total = 11;
+  // The timestamp when the kernel is queued up in the command buffer, in ns.
+  // This timestamp is not collected by default. Use API
+  // cuptiActivityEnableLatencyTimestamps() to enable collection.
+  required uint64 queued = 12;
+  // The timestamp when the command buffer containing the kernel launch is
+  // submitted to the GPU, in ns.
+  // This timestamp is not collected by default. Use API
+  // cuptiActivityEnableLatencyTimestamps() to enable collection.
+  required uint64 submitted = 13;
+  // The completed timestamp for the kernel execution, in ns.
+  required uint64 completed = 14;
+}
+
+message MemcpyEventInfoProto {
+  // The number of bytes transferred by the memory copy.
+  required uint64 num_bytes = 1;
+  // The kind of the memory copy.
+  // Each kind represents the source and destination targets of a memory copy.
+  // Targets are host, device, and array. Refer to CUpti_ActivityMemcpyKind
+  required string copy_kind = 2;
+  // The source memory kind read by the memory copy.
+  // Each kind represents the type of the memory accessed by a memory
+  // operation/copy. Refer to CUpti_ActivityMemoryKind
+  required string src_kind = 3;
+  // The destination memory kind read by the memory copy.
+  required string dst_kind = 4;
+}
+
+message MemsetEventInfoProto {
+  // The number of bytes being set by the memory set.
+  required uint64 num_bytes = 1;
+  // The memory kind of the memory set. Refer to CUpti_ActivityMemoryKind
+  required string memory_kind = 2;
+  // the value being assigned to memory by the memory set.
+  required uint32 value = 3;
+}
+
+message HostTraceEventProto {
+  required string name = 1;
+  required TracerEventTypeProto type = 2;
+  // start timestamp of the record
+  required uint64 start_ns = 3;
+  // end timestamp of the record
+  required uint64 end_ns = 4;
+  // process id of the record
+  required uint64 process_id = 5;
+  // thread id of the record
+  required uint64 thread_id = 6;
+}
+
+message CudaRuntimeTraceEventProto {
+  // record name
+  required string name = 1;
+  // start timestamp of the record
+  required uint64 start_ns = 2;
+  // end timestamp of the record
+  required uint64 end_ns = 3;
+  // process id of the record
+  required uint64 process_id = 4;
+  // thread id of the record
+  required uint64 thread_id = 5;
+  // correlation id, used for correlating async activities happened on device
+  required uint32 correlation_id = 6;
+  // callback id, used to identify which cuda runtime api is called
+  required uint32 callback_id = 7;
+}
+
+message DeviceTraceEventProto {
+  // record name
+  required string name = 1;
+  // record type, one of TracerEventType
+  required TracerEventTypeProto type = 2;
+  // start timestamp of the record
+  required uint64 start_ns = 3;
+  // end timestamp of the record
+  required uint64 end_ns = 4;
+  // device id
+  required uint64 device_id = 5;
+  // context id
+  required uint64 context_id = 6;
+  // stream id
+  required uint64 stream_id = 7;
+  // correlation id, used for correlating async activities happened on device
+  required uint32 correlation_id = 8;
+  // union, specific device record type has different detail information
+  oneof detail_info {
+    // used for TracerEventType::Kernel
+    KernelEventInfoProto kernel_info = 9;
+    // used for TracerEventType::Memcpy
+    MemcpyEventInfoProto memcpy_info = 10;
+    // used for TracerEventType::Memset
+    MemsetEventInfoProto memset_info = 11;
+  }
+}
+
+message DeviceTraceEventNodeProto {
+  required DeviceTraceEventProto device_event = 1;
+}
+
+message CudaRuntimeTraceEventNodeProto {
+  required CudaRuntimeTraceEventProto runtime_trace_event = 1;
+  repeated DeviceTraceEventNodeProto device_nodes = 2;
+}
+
+message HostTraceEventNodeProto {
+  required int64 id = 1;
+  required int64 parentid = 2;
+  required HostTraceEventProto host_trace_event = 3;
+  repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4;
+}
+
+message ThreadNodeTreeProto {
+  required uint64 thread_id = 1;
+  repeated HostTraceEventNodeProto host_nodes = 2;
+}
+
+message NodeTreesProto {
+  required string version = 1;
+  repeated ThreadNodeTreeProto thread_trees = 2;
+}
diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc
new file mode 100644
index 0000000000..d9ed84bd43
--- /dev/null
+++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc
@@ -0,0 +1,265 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
+#include "paddle/fluid/platform/profiler/event_node.h"
+#include "paddle/fluid/platform/profiler/utils.h"
+
+namespace paddle {
+namespace platform {
+
+static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb";
+static const char* version = "1.0.0";
+
+static std::string DefaultFileName() {
+  auto pid = GetProcessId();
+  return string_format(std::string(kDefaultFilename), pid,
+                       GetStringFormatLocalTime().c_str());
+}
+
+void SerializationLogger::OpenFile() {
+  output_file_stream_.open(filename_, std::ofstream::out |
+                                          std::ofstream::trunc |
+                                          std::ofstream::binary);
+  if (!output_file_stream_) {
+    LOG(WARNING) << "Unable to open file for writing profiling data."
+                 << std::endl;
+  } else {
+    LOG(INFO) << "writing profiling data to " << filename_ << std::endl;
+  }
+  node_trees_proto_ = new NodeTreesProto();
+  node_trees_proto_->set_version(std::string(version));
+}
+
+void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) {
+  // dump the whole tree into file
+  const std::map<uint64_t, std::vector<HostTraceEventNode*>>
+      thread2host_event_nodes = node_trees.Traverse(true);
+
+  for (auto it = thread2host_event_nodes.begin();
+       it != thread2host_event_nodes.end(); ++it) {
+    // 1. order every node an index, every node a parent
+    std::map<HostTraceEventNode*, int64_t> node_index_map;
+    std::map<HostTraceEventNode*, int64_t> node_parent_map;
+    int64_t index = 0;
+    for (auto hostnode = it->second.begin(); hostnode != it->second.end();
+         ++hostnode) {
+      node_index_map[(*hostnode)] = index;  // order each node
+      index++;
+    }
+    node_parent_map[(*(it->second.begin()))] = -1;  // root's parent set as -1
+    for (auto hostnode = it->second.begin(); hostnode != it->second.end();
+         ++hostnode) {
+      for (auto childnode = (*hostnode)->GetChildren().begin();
+           childnode != (*hostnode)->GetChildren().end(); ++childnode) {
+        node_parent_map[(*childnode)] =
+            node_index_map[(*hostnode)];  // mark each node's parent
+      }
+    }
+
+    // 2. serialize host node, runtime node and device node
+    current_thread_node_tree_proto_ =
+        node_trees_proto_->add_thread_trees();  // add ThreadNodeTreeProto
+    current_thread_node_tree_proto_->set_thread_id(it->first);
+    for (auto hostnode = it->second.begin(); hostnode != it->second.end();
+         ++hostnode) {
+      HostTraceEventNodeProto* host_node_proto =
+          current_thread_node_tree_proto_
+              ->add_host_nodes();  // add HostTraceEventNodeProto
+      host_node_proto->set_id(node_index_map[(*hostnode)]);
+      host_node_proto->set_parentid(node_parent_map[(*hostnode)]);
+      current_host_trace_event_node_proto_ =
+          host_node_proto;       // set current HostTraceEventNodeProto
+      (*hostnode)->LogMe(this);  // fill detail information
+
+      for (auto runtimenode = (*hostnode)->GetRuntimeTraceEventNodes().begin();
+           runtimenode != (*hostnode)->GetRuntimeTraceEventNodes().end();
+           ++runtimenode) {
+        CudaRuntimeTraceEventNodeProto* runtime_node_proto =
+            current_host_trace_event_node_proto_
+                ->add_runtime_nodes();  // add CudaRuntimeTraceEventNodeProto
+        current_runtime_trace_event_node_proto_ =
+            runtime_node_proto;  // set current CudaRuntimeTraceEventNodeProto
+        (*runtimenode)->LogMe(this);  // fill detail information
+        for (auto devicenode =
+                 (*runtimenode)->GetDeviceTraceEventNodes().begin();
+             devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end();
+             ++devicenode) {
+          DeviceTraceEventNodeProto* device_node_proto =
+              current_runtime_trace_event_node_proto_
+                  ->add_device_nodes();  // add DeviceTraceEventNodeProto
+          current_device_trace_event_node_proto_ =
+              device_node_proto;       // set current DeviceTraceEventNodeProto
+          (*devicenode)->LogMe(this);  // fill detail information
+        }
+      }
+    }
+  }
+}
+
+void SerializationLogger::LogHostTraceEventNode(
+    const HostTraceEventNode& host_node) {
+  HostTraceEventProto* host_trace_event = new HostTraceEventProto();
+  host_trace_event->set_name(host_node.Name());
+  host_trace_event->set_type(
+      static_cast<TracerEventTypeProto>(host_node.Type()));
+  host_trace_event->set_start_ns(host_node.StartNs());
+  host_trace_event->set_end_ns(host_node.EndNs());
+  host_trace_event->set_process_id(host_node.ProcessId());
+  host_trace_event->set_thread_id(host_node.ThreadId());
+  current_host_trace_event_node_proto_->set_allocated_host_trace_event(
+      host_trace_event);
+}
+
+void SerializationLogger::LogRuntimeTraceEventNode(
+    const CudaRuntimeTraceEventNode& runtime_node) {
+  CudaRuntimeTraceEventProto* runtime_trace_event =
+      new CudaRuntimeTraceEventProto();
+  runtime_trace_event->set_name(runtime_node.Name());
+  runtime_trace_event->set_start_ns(runtime_node.StartNs());
+  runtime_trace_event->set_end_ns(runtime_node.EndNs());
+  runtime_trace_event->set_process_id(runtime_node.ProcessId());
+  runtime_trace_event->set_thread_id(runtime_node.ThreadId());
+  runtime_trace_event->set_correlation_id(runtime_node.CorrelationId());
+  runtime_trace_event->set_callback_id(runtime_node.CallbackId());
+  current_runtime_trace_event_node_proto_->set_allocated_runtime_trace_event(
+      runtime_trace_event);
+}
+
+void SerializationLogger::LogDeviceTraceEventNode(
+    const DeviceTraceEventNode& device_node) {
+  switch (device_node.Type()) {
+    case TracerEventType::Kernel:
+      HandleTypeKernel(device_node);
+      break;
+    case TracerEventType::Memcpy:
+      HandleTypeMemcpy(device_node);
+      break;
+    case TracerEventType::Memset:
+      HandleTypeMemset(device_node);
+      break;
+    default:
+      break;
+  }
+}
+
+void SerializationLogger::HandleTypeKernel(
+    const DeviceTraceEventNode& device_node) {
+  DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto();
+  KernelEventInfoProto* kernel_info = new KernelEventInfoProto();
+  // fill DeviceTraceEventProto
+  device_trace_event->set_name(device_node.Name());
+  device_trace_event->set_type(
+      static_cast<TracerEventTypeProto>(device_node.Type()));
+  device_trace_event->set_start_ns(device_node.StartNs());
+  device_trace_event->set_end_ns(device_node.EndNs());
+  device_trace_event->set_device_id(device_node.DeviceId());
+  device_trace_event->set_context_id(device_node.ContextId());
+  device_trace_event->set_stream_id(device_node.StreamId());
+  device_trace_event->set_correlation_id(device_node.CorrelationId());
+  // fill KernelEventInfoProto
+  KernelEventInfo info = device_node.KernelInfo();
+  kernel_info->set_block_x(info.block_x);
+  kernel_info->set_block_y(info.block_y);
+  kernel_info->set_block_z(info.block_z);
+  kernel_info->set_grid_x(info.grid_x);
+  kernel_info->set_grid_y(info.grid_y);
+  kernel_info->set_grid_z(info.grid_z);
+  kernel_info->set_dynamic_shared_memory(info.dynamic_shared_memory);
+  kernel_info->set_static_shared_memory(info.static_shared_memory);
+  kernel_info->set_registers_per_thread(info.registers_per_thread);
+  kernel_info->set_local_memory_per_thread(info.local_memory_per_thread);
+  kernel_info->set_local_memory_total(info.local_memory_total);
+  kernel_info->set_queued(info.queued);
+  kernel_info->set_submitted(info.submitted);
+  kernel_info->set_completed(info.completed);
+  // binding
+  device_trace_event->set_allocated_kernel_info(kernel_info);
+  current_device_trace_event_node_proto_->set_allocated_device_event(
+      device_trace_event);
+}
+
+void SerializationLogger::HandleTypeMemcpy(
+    const DeviceTraceEventNode& device_node) {
+  DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto();
+  MemcpyEventInfoProto* memcpy_info = new MemcpyEventInfoProto();
+  // fill DeviceTraceEventProto
+  device_trace_event->set_name(device_node.Name());
+  device_trace_event->set_type(
+      static_cast<TracerEventTypeProto>(device_node.Type()));
+  device_trace_event->set_start_ns(device_node.StartNs());
+  device_trace_event->set_end_ns(device_node.EndNs());
+  device_trace_event->set_device_id(device_node.DeviceId());
+  device_trace_event->set_context_id(device_node.ContextId());
+  device_trace_event->set_stream_id(device_node.StreamId());
+  device_trace_event->set_correlation_id(device_node.CorrelationId());
+  // fill MemcpyEventInfoProto
+  MemcpyEventInfo info = device_node.MemcpyInfo();
+  memcpy_info->set_num_bytes(info.num_bytes);
+  memcpy_info->set_copy_kind(std::string(info.copy_kind));
+  memcpy_info->set_src_kind(std::string(info.src_kind));
+  memcpy_info->set_dst_kind(std::string(info.dst_kind));
+  // binding
+  device_trace_event->set_allocated_memcpy_info(memcpy_info);
+  current_device_trace_event_node_proto_->set_allocated_device_event(
+      device_trace_event);
+}
+
+void SerializationLogger::HandleTypeMemset(
+    const DeviceTraceEventNode& device_node) {
+  DeviceTraceEventProto* device_trace_event = new DeviceTraceEventProto();
+  MemsetEventInfoProto* memset_info = new MemsetEventInfoProto();
+  // fill DeviceTraceEventProto
+  device_trace_event->set_name(device_node.Name());
+  device_trace_event->set_type(
+      static_cast<TracerEventTypeProto>(device_node.Type()));
+  device_trace_event->set_start_ns(device_node.StartNs());
+  device_trace_event->set_end_ns(device_node.EndNs());
+  device_trace_event->set_device_id(device_node.DeviceId());
+  device_trace_event->set_context_id(device_node.ContextId());
+  device_trace_event->set_stream_id(device_node.StreamId());
+  device_trace_event->set_correlation_id(device_node.CorrelationId());
+  // fill MemsetEventInfoProto
+  MemsetEventInfo info = device_node.MemsetInfo();
+  memset_info->set_num_bytes(info.num_bytes);
+  memset_info->set_memory_kind(std::string(info.memory_kind));
+  memset_info->set_value(info.value);
+  // binding
+  device_trace_event->set_allocated_memset_info(memset_info);
+  current_device_trace_event_node_proto_->set_allocated_device_event(
+      device_trace_event);
+}
+
+SerializationLogger::SerializationLogger(const std::string& filename) {
+  filename_ = filename.empty() ? DefaultFileName() : filename;
+  OpenFile();
+}
+
+SerializationLogger::SerializationLogger(const char* filename_cstr) {
+  std::string filename(filename_cstr);
+  filename_ = filename.empty() ? DefaultFileName() : filename;
+  OpenFile();
+}
+
+SerializationLogger::~SerializationLogger() {
+  if (!output_file_stream_) {
+    delete node_trees_proto_;
+    return;
+  }
+  node_trees_proto_->SerializeToOstream(&output_file_stream_);
+  delete node_trees_proto_;
+  output_file_stream_.close();
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h
new file mode 100755
index 0000000000..1295be95d4
--- /dev/null
+++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/platform/profiler/dump/nodetree.pb.h"
+#include "paddle/fluid/platform/profiler/output_logger.h"
+
+namespace paddle {
+namespace platform {
+
+// Dump a NodeTrees into a profobuf file.
+// A SerializationLogger object can only dump a NodeTrees object,
+// creates a file in the constructor and closes the file in the destructor.
+class SerializationLogger : public BaseLogger {
+ public:
+  explicit SerializationLogger(const std::string& filename);
+  explicit SerializationLogger(const char* filename);
+  ~SerializationLogger();
+  std::string filename() { return filename_; }
+  void LogDeviceTraceEventNode(const DeviceTraceEventNode&) override;
+  void LogHostTraceEventNode(const HostTraceEventNode&) override;
+  void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override;
+  void LogNodeTrees(const NodeTrees&) override;
+
+ private:
+  void OpenFile();
+  void HandleTypeKernel(const DeviceTraceEventNode&);
+  void HandleTypeMemset(const DeviceTraceEventNode&);
+  void HandleTypeMemcpy(const DeviceTraceEventNode&);
+  std::string filename_;
+  std::ofstream output_file_stream_;
+  NodeTreesProto* node_trees_proto_;
+  ThreadNodeTreeProto* current_thread_node_tree_proto_;
+  HostTraceEventNodeProto* current_host_trace_event_node_proto_;
+  CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_;
+  DeviceTraceEventNodeProto* current_device_trace_event_node_proto_;
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
new file mode 100644
index 0000000000..2fe9626ec7
--- /dev/null
+++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h"
+#include "paddle/fluid/platform/profiler/dump/serialization_logger.h"
+#include "paddle/fluid/platform/profiler/event_node.h"
+
+using paddle::platform::SerializationLogger;
+using paddle::platform::DeserializationReader;
+using paddle::platform::NodeTrees;
+using paddle::platform::HostTraceEventNode;
+using paddle::platform::CudaRuntimeTraceEventNode;
+using paddle::platform::DeviceTraceEventNode;
+using paddle::platform::HostTraceEvent;
+using paddle::platform::RuntimeTraceEvent;
+using paddle::platform::DeviceTraceEvent;
+using paddle::platform::TracerEventType;
+using paddle::platform::KernelEventInfo;
+using paddle::platform::MemcpyEventInfo;
+using paddle::platform::MemsetEventInfo;
+
+TEST(SerializationLoggerTest, dump_case0) {
+  std::list<HostTraceEvent> host_events;
+  std::list<RuntimeTraceEvent> runtime_events;
+  std::list<DeviceTraceEvent> device_events;
+  host_events.push_back(HostTraceEvent(std::string("dataloader#1"),
+                                       TracerEventType::Dataloader, 1000, 10000,
+                                       10, 10));
+  host_events.push_back(HostTraceEvent(
+      std::string("op1"), TracerEventType::Operator, 11000, 20000, 10, 10));
+  host_events.push_back(HostTraceEvent(
+      std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10));
+  host_events.push_back(HostTraceEvent(
+      std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
+                                             17000, 10, 10, 1, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
+                                             35000, 10, 10, 2, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
+                                             37000, 10, 11, 3, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
+                                             19000, 10, 10, 4, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
+                                             39000, 10, 11, 5, 0));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
+                       55000, 0, 10, 10, 1, KernelEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
+                       95000, 0, 10, 10, 2, KernelEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
+                       65000, 0, 10, 11, 3, KernelEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
+                       59000, 0, 10, 10, 4, MemcpyEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
+                       69000, 0, 10, 11, 5, MemsetEventInfo()));
+  SerializationLogger logger("test_serialization_logger_case0.pb");
+  NodeTrees tree(host_events, runtime_events, device_events);
+  std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
+      tree.Traverse(true);
+  EXPECT_EQ(nodes[10].size(), 4u);
+  EXPECT_EQ(nodes[11].size(), 2u);
+  std::vector<HostTraceEventNode*> thread1_nodes = nodes[10];
+  std::vector<HostTraceEventNode*> thread2_nodes = nodes[11];
+  for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) {
+    if ((*it)->Name() == "root node") {
+      EXPECT_EQ((*it)->GetChildren().size(), 3u);
+    }
+    if ((*it)->Name() == "op1") {
+      EXPECT_EQ((*it)->GetChildren().size(), 0u);
+      EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
+    }
+  }
+  for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
+    if ((*it)->Name() == "op3") {
+      EXPECT_EQ((*it)->GetChildren().size(), 0u);
+      EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
+    }
+  }
+  tree.LogMe(&logger);
+}
+
+TEST(SerializationLoggerTest, dump_case1) {
+  std::list<HostTraceEvent> host_events;
+  std::list<RuntimeTraceEvent> runtime_events;
+  std::list<DeviceTraceEvent> device_events;
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000,
+                                             17000, 10, 10, 1, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000,
+                                             35000, 10, 10, 2, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch3"), 33000,
+                                             37000, 10, 11, 3, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemcpy1"), 18000,
+                                             19000, 10, 10, 4, 0));
+  runtime_events.push_back(RuntimeTraceEvent(std::string("cudaMemset1"), 38000,
+                                             39000, 10, 11, 5, 0));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("kernel1"), TracerEventType::Kernel, 40000,
+                       55000, 0, 10, 10, 1, KernelEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("kernel2"), TracerEventType::Kernel, 70000,
+                       95000, 0, 10, 10, 2, KernelEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000,
+                       65000, 0, 10, 11, 3, KernelEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("memcpy1"), TracerEventType::Memcpy, 56000,
+                       59000, 0, 10, 10, 4, MemcpyEventInfo()));
+  device_events.push_back(
+      DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000,
+                       69000, 0, 10, 11, 5, MemsetEventInfo()));
+  SerializationLogger logger("test_serialization_logger_case1.pb");
+  NodeTrees tree(host_events, runtime_events, device_events);
+  std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
+      tree.Traverse(true);
+  EXPECT_EQ(nodes[10].size(), 1u);
+  EXPECT_EQ(nodes[11].size(), 1u);
+  std::vector<HostTraceEventNode*> thread1_nodes = nodes[10];
+  std::vector<HostTraceEventNode*> thread2_nodes = nodes[11];
+  for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) {
+    if ((*it)->Name() == "root node") {
+      EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u);
+    }
+  }
+  for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
+    if ((*it)->Name() == "root node") {
+      EXPECT_EQ((*it)->GetChildren().size(), 0u);
+      EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
+    }
+  }
+  tree.LogMe(&logger);
+}
+
+TEST(DeserializationReaderTest, restore_case0) {
+  DeserializationReader reader("test_serialization_logger_case0.pb");
+  std::unique_ptr<NodeTrees> tree = reader.Parse();
+  std::map<uint64_t, std::vector<HostTraceEventNode*>> nodes =
+      tree->Traverse(true);
+  EXPECT_EQ(nodes[10].size(), 4u);
+  EXPECT_EQ(nodes[11].size(), 2u);
+  std::vector<HostTraceEventNode*> thread1_nodes = nodes[10];
+  std::vector<HostTraceEventNode*> thread2_nodes = nodes[11];
+  for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) {
+    if ((*it)->Name() == "root node") {
+      EXPECT_EQ((*it)->GetChildren().size(), 3u);
+    }
+    if ((*it)->Name() == "op1") {
+      EXPECT_EQ((*it)->GetChildren().size(), 0u);
+      EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
+    }
+  }
+  for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) {
+    if ((*it)->Name() == "op3") {
+      EXPECT_EQ((*it)->GetChildren().size(), 0u);
+      EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u);
+    }
+  }
+}
diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h
new file mode 100644
index 0000000000..04014b972c
--- /dev/null
+++ b/paddle/fluid/platform/profiler/utils.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "paddle/fluid/platform/os_info.h"
+
+namespace paddle {
+namespace platform {
+
+template <typename... Args>
+std::string string_format(const std::string& format, Args... args) {
+  int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) +
+               1;  // Extra space for '\0'
+  PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal(
+                                   "Error during profiler data formatting."));
+  auto size = static_cast<size_t>(size_s);
+  auto buf = std::make_unique<char[]>(size);
+  std::snprintf(buf.get(), size, format.c_str(), args...);
+  return std::string(buf.get(), size - 1);  // exclude the '\0'
+}
+
+static std::string GetStringFormatLocalTime() {
+  std::time_t rawtime;
+  std::tm* timeinfo;
+  char buf[100];
+  std::time(&rawtime);
+  timeinfo = std::localtime(&rawtime);
+  std::strftime(buf, 100, "%F-%X", timeinfo);
+  return std::string(buf);
+}
+
+static int64_t nsToUs(int64_t ns) { return ns / 1000; }
+
+}  // namespace platform
+}  // namespace paddle
-- 
GitLab