add new profiler components (#39964)

* add new profiler components * fix bug

add new profiler components (#39964)
* add new profiler components * fix bug
d4ae1775 · chenjian · GitHub · 0ff72e5d · d4ae1775 · d4ae1775
11 changed files
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -263,11 +263,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
      // in order to record different op type cost time
      // and different op name cost time,we set two event.
      platform::RecordEvent op_type_record_event(
-          Type().c_str(), platform::TracerEventType::Operator, 1);
-      auto op_name = platform::OpName(outputs_, Type());
-      platform::RecordEvent op_name_record_event(
-          op_name, platform::TracerEventType::Operator, 1,
-          platform::EventRole::kUniqueOp);
+          Type(), platform::TracerEventType::Operator, 1);
+      // auto op_name = platform::OpName(outputs_, Type());
+      // platform::RecordEvent op_name_record_event(
+      //     op_name, platform::TracerEventType::Operator, 1,
+      //     platform::EventRole::kUniqueOp);
      RunImpl(scope, place);
    }


--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUPTI

 #include <cuda.h>
+#include <cuda_occupancy.h>
 #include <cupti.h>
 #include <mutex>  // NOLINT

@@ -50,7 +51,8 @@ namespace dynload {
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
  __macro(cuptiEnableCallback);               \
-  __macro(cuptiEnableDomain);
+  __macro(cuptiEnableDomain);                 \
+  __macro(cudaOccMaxActiveBlocksPerMultiprocessor);

 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);


--- a/paddle/fluid/platform/profiler/CMakeLists.txt
+++ b/paddle/fluid/platform/profiler/CMakeLists.txt
 cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
 cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
-cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer)
 cc_library(event_node SRCS event_node.cc DEPS enforce)
-cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node)
+cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
+cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils)
 cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger)
-cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node)
 add_subdirectory(dump)
+cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
+cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
+cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization)
+cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node)
--- a/paddle/fluid/platform/profiler/cpu_utilization.cc
+++ b/paddle/fluid/platform/profiler/cpu_utilization.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/profiler/cpu_utilization.h"
+
+namespace paddle {
+namespace platform {
+
+#ifdef _MSC_VER
+// Packs the two 32-bit halves of a Windows FILETIME into one 64-bit value:
+// dwHighDateTime becomes the upper 32 bits, dwLowDateTime the lower 32 bits.
+static uint64_t FileTimeToUint64(FILETIME time) {
+  const uint64_t upper_bits = static_cast<uint64_t>(time.dwHighDateTime) << 32;
+  const uint64_t lower_bits = static_cast<uint64_t>(time.dwLowDateTime);
+  return upper_bits | lower_bits;
+}
+#endif
+
+// Samples the "begin" counters used later by GetCpuUtilization() and
+// GetCpuCurProcessUtilization().
+//
+// Windows: stores the current system time plus the system-wide and
+// per-process times reported by GetSystemTimes() / GetProcessTimes().
+// Linux: stores the result of times() for the current process and the
+// counters of the aggregate "cpu" line of /proc/stat.
+// Other platforms: does nothing.
+//
+// If /proc/stat cannot be opened or its first line cannot be parsed, the
+// system-wide begin counters are left unchanged.
+void CpuUtilization::RecordBeginTimeInfo() {
+#if defined(_MSC_VER)
+  HANDLE process_handle = GetCurrentProcess();
+  GetSystemTimeAsFileTime(&start_);
+  GetSystemTimes(&system_idle_time_start_, &system_kernel_time_start_,
+                 &system_user_time_start_);
+  GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_,
+                  &process_kernel_time_start_, &process_user_time_start_);
+
+#elif defined(__linux__)
+  start_ = times(&process_tms_start_);
+  FILE *stat_file = fopen("/proc/stat", "r");
+  if (stat_file == nullptr) {
+    return;
+  }
+  // Only the first line (the aggregate "cpu" entry) is needed. Reading a
+  // single bounded line avoids overflowing a token buffer and avoids
+  // overwriting the aggregate counters with per-core or unrelated lines.
+  char line[512];
+  const bool got_line = fgets(line, sizeof(line), stat_file) != nullptr;
+  // Close before any early return so the handle is never leaked.
+  fclose(stat_file);
+  if (!got_line) {
+    return;
+  }
+  uint64_t user = 0, nice = 0, system = 0, idle = 0;
+  uint64_t iowait = 0, irq = 0, softirq = 0, steal = 0;
+  const int parsed = sscanf(
+      line, "cpu %" SCNu64 "%" SCNu64 "%" SCNu64 "%" SCNu64 "%" SCNu64
+            "%" SCNu64 "%" SCNu64 "%" SCNu64,
+      &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal);
+  // line[3] must be a blank: "cpu0 ..." style per-core lines are rejected.
+  if (parsed != 8 || line[3] != ' ') {
+    return;
+  }
+  // Commit the sample only after the whole line parsed successfully.
+  system_tms_start_.tms_utime = static_cast<clock_t>(user);
+  nice_time_start_ = nice;
+  system_tms_start_.tms_stime = static_cast<clock_t>(system);
+  idle_start_ = idle;
+  iowait_start_ = iowait;
+  irq_start_ = irq;
+  softirq_start_ = softirq;
+  steal_start_ = steal;
+#else
+#endif
+}
+
+// Samples the "end" counters used later by GetCpuUtilization() and
+// GetCpuCurProcessUtilization().
+//
+// Windows: stores the current system time plus the system-wide and
+// per-process times reported by GetSystemTimes() / GetProcessTimes().
+// Linux: stores the result of times() for the current process and the
+// counters of the aggregate "cpu" line of /proc/stat.
+// Other platforms: does nothing.
+//
+// If /proc/stat cannot be opened or its first line cannot be parsed, the
+// system-wide end counters are left unchanged.
+void CpuUtilization::RecordEndTimeInfo() {
+#if defined(_MSC_VER)
+  HANDLE process_handle = GetCurrentProcess();
+  GetSystemTimeAsFileTime(&end_);
+  GetSystemTimes(&system_idle_time_end_, &system_kernel_time_end_,
+                 &system_user_time_end_);
+  GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_,
+                  &process_kernel_time_end_, &process_user_time_end_);
+#elif defined(__linux__)
+  end_ = times(&process_tms_end_);
+  FILE *stat_file = fopen("/proc/stat", "r");
+  if (stat_file == nullptr) {
+    return;
+  }
+  // Only the first line (the aggregate "cpu" entry) is needed. Reading a
+  // single bounded line avoids overflowing a token buffer and avoids
+  // overwriting the aggregate counters with per-core or unrelated lines.
+  char line[512];
+  const bool got_line = fgets(line, sizeof(line), stat_file) != nullptr;
+  // Close before any early return so the handle is never leaked.
+  fclose(stat_file);
+  if (!got_line) {
+    return;
+  }
+  uint64_t user = 0, nice = 0, system = 0, idle = 0;
+  uint64_t iowait = 0, irq = 0, softirq = 0, steal = 0;
+  const int parsed = sscanf(
+      line, "cpu %" SCNu64 "%" SCNu64 "%" SCNu64 "%" SCNu64 "%" SCNu64
+            "%" SCNu64 "%" SCNu64 "%" SCNu64,
+      &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal);
+  // line[3] must be a blank: "cpu0 ..." style per-core lines are rejected.
+  if (parsed != 8 || line[3] != ' ') {
+    return;
+  }
+  // Commit the sample only after the whole line parsed successfully.
+  system_tms_end_.tms_utime = static_cast<clock_t>(user);
+  nice_time_end_ = nice;
+  system_tms_end_.tms_stime = static_cast<clock_t>(system);
+  idle_end_ = idle;
+  iowait_end_ = iowait;
+  irq_end_ = irq;
+  softirq_end_ = softirq;
+  steal_end_ = steal;
+#else
+#endif
+}
+
+// Returns the system-wide CPU utilization between the samples taken by
+// RecordBeginTimeInfo() and RecordEndTimeInfo(), as busy / (busy + idle).
+//
+// Returns 0 when the sampled interval is empty (no elapsed CPU time), and on
+// platforms other than Windows/Linux, where a warning is logged instead.
+float CpuUtilization::GetCpuUtilization() {
+  float cpu_utilization = 0.0;
+#if defined(_MSC_VER)
+  uint64_t system_user_time_start = FileTimeToUint64(system_user_time_start_);
+  uint64_t system_user_time_end = FileTimeToUint64(system_user_time_end_);
+  uint64_t system_kernel_time_start =
+      FileTimeToUint64(system_kernel_time_start_);
+  uint64_t system_kernel_time_end = FileTimeToUint64(system_kernel_time_end_);
+  uint64_t system_idle_time_start = FileTimeToUint64(system_idle_time_start_);
+  uint64_t system_idle_time_end = FileTimeToUint64(system_idle_time_end_);
+  // The kernel time reported by GetSystemTimes() already includes the idle
+  // time, so kernel + user is the total and idle must be subtracted from it
+  // (not added to it) to obtain the busy share.
+  float total_time = (system_kernel_time_end - system_kernel_time_start) +
+                     (system_user_time_end - system_user_time_start);
+  float idle_time = system_idle_time_end - system_idle_time_start;
+  // Guard against an empty interval, which would otherwise yield NaN.
+  if (total_time > 0) {
+    cpu_utilization = (total_time - idle_time) / total_time;
+  }
+
+#elif defined(__linux__)
+  float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) +
+                    (system_tms_end_.tms_stime - system_tms_start_.tms_stime) +
+                    (nice_time_end_ - nice_time_start_) +
+                    (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) +
+                    (steal_end_ - steal_start_);
+  float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_);
+  float total_time = busy_time + idle_time;
+  // Guard against an empty interval, which would otherwise yield NaN.
+  if (total_time > 0) {
+    cpu_utilization = busy_time / total_time;
+  }
+#else
+  LOG(WARNING)
+      << "Current System is not supported to get system cpu utilization"
+      << cpu_utilization << std::endl;
+#endif
+  return cpu_utilization;
+}
+
+// Returns the CPU time (user + kernel) consumed by the current process
+// between RecordBeginTimeInfo() and RecordEndTimeInfo(), divided by the
+// elapsed time between the two samples.
+//
+// Returns 0 when no time elapsed between the samples, and on platforms other
+// than Windows/Linux, where a warning is logged instead.
+float CpuUtilization::GetCpuCurProcessUtilization() {
+  float cpu_process_utilization = 0.0;
+#ifdef _MSC_VER
+  uint64_t process_user_time_start = FileTimeToUint64(process_user_time_start_);
+  uint64_t process_user_time_end = FileTimeToUint64(process_user_time_end_);
+  uint64_t process_kernel_time_start =
+      FileTimeToUint64(process_kernel_time_start_);
+  uint64_t process_kernel_time_end = FileTimeToUint64(process_kernel_time_end_);
+  uint64_t start = FileTimeToUint64(start_);
+  uint64_t end = FileTimeToUint64(end_);
+  float busy_time = (process_kernel_time_end - process_kernel_time_start) +
+                    (process_user_time_end - process_user_time_start);
+  // Guard against an empty interval, which would otherwise divide by zero.
+  if (end > start) {
+    cpu_process_utilization = busy_time / (end - start);
+  }
+  LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl;
+#elif defined(__linux__)
+  float busy_time =
+      (process_tms_end_.tms_utime - process_tms_start_.tms_utime) +
+      (process_tms_end_.tms_stime - process_tms_start_.tms_stime);
+  // Guard against an empty interval, which would otherwise divide by zero.
+  if (end_ > start_) {
+    cpu_process_utilization = busy_time / (end_ - start_);
+  }
+#else
+  LOG(WARNING)
+      << "Current System is not supported to get process cpu utilization"
+      << cpu_process_utilization << std::endl;
+#endif
+  return cpu_process_utilization;
+}
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/profiler/cpu_utilization.h
+++ b/paddle/fluid/platform/profiler/cpu_utilization.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdio.h>
+#include <cinttypes>
+#include <cstdint>
+#include "glog/logging.h"
+#ifdef _MSC_VER
+#include <windows.h>
+#else
+#include <sys/times.h>
+#include <unistd.h>
+#endif
+
+namespace paddle {
+namespace platform {
+
+// Measures CPU utilization over an interval delimited by
+// RecordBeginTimeInfo() and RecordEndTimeInfo().
+//
+// All counters are zero-initialized so that querying a utilization before
+// (or after a failed) sampling reads well-defined values instead of
+// indeterminate memory.
+class CpuUtilization {
+ public:
+  CpuUtilization() {}
+  // Samples the counters that mark the start of the measured interval.
+  void RecordBeginTimeInfo();
+  // Samples the counters that mark the end of the measured interval.
+  void RecordEndTimeInfo();
+  // System-wide CPU utilization over the recorded interval.
+  float GetCpuUtilization();
+  // CPU utilization of the current process over the recorded interval.
+  float GetCpuCurProcessUtilization();
+
+ private:
+#ifdef _MSC_VER
+  FILETIME start_{}, end_{};
+  FILETIME process_user_time_start_{}, process_user_time_end_{};
+  FILETIME process_kernel_time_start_{}, process_kernel_time_end_{};
+  FILETIME system_user_time_start_{}, system_user_time_end_{};
+  FILETIME system_kernel_time_start_{}, system_kernel_time_end_{};
+  FILETIME system_idle_time_start_{}, system_idle_time_end_{};
+  FILETIME process_creation_time_{}, process_exit_time_{};
+#else
+  clock_t start_ = 0, end_ = 0;
+  uint64_t idle_start_ = 0, idle_end_ = 0;
+  uint64_t iowait_start_ = 0, iowait_end_ = 0;
+  uint64_t nice_time_start_ = 0, nice_time_end_ = 0;
+  uint64_t irq_start_ = 0, irq_end_ = 0;
+  uint64_t softirq_start_ = 0, softirq_end_ = 0;
+  uint64_t steal_start_ = 0, steal_end_ = 0;
+  struct tms system_tms_start_ = {}, system_tms_end_ = {};
+  struct tms process_tms_start_ = {}, process_tms_end_ = {};
+#endif
+};
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/profiler/extra_info.h
+++ b/paddle/fluid/platform/profiler/extra_info.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/platform/profiler/utils.h"
+
+namespace paddle {
+namespace platform {
+
+// Holds string key/value pairs in an unordered map.
+class ExtraInfo {
+ public:
+  ExtraInfo() {}
+  // Stores, under `key`, the string produced by string_format(format, args...).
+  // An existing entry with the same key is overwritten.
+  template <typename... Args>
+  void AddExtraInfo(const std::string& key, const std::string& format,
+                    Args... args);
+  // Removes every stored entry.
+  void Clear() { extra_info_.clear(); }
+  // Returns a copy of the stored entries. Declared const so that it can be
+  // called through const references as well.
+  std::unordered_map<std::string, std::string> GetExtraInfo() const {
+    return extra_info_;
+  }
+
+ private:
+  std::unordered_map<std::string, std::string> extra_info_;
+};
+
+// Formats `args` with string_format() according to `format` and stores the
+// resulting string under `key`, replacing any entry already stored there.
+template <typename... Args>
+void ExtraInfo::AddExtraInfo(const std::string& key, const std::string& format,
+                             Args... args) {
+  // Format first, then touch the map.
+  const std::string formatted = string_format(format, args...);
+  extra_info_[key] = formatted;
+}
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/profiler/output_logger.h
+++ b/paddle/fluid/platform/profiler/output_logger.h
@@ -33,7 +33,6 @@ class BaseLogger {
  virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
  virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
  virtual void LogNodeTrees(const NodeTrees&) {}
-  virtual void LogMetaInfo() {}
 };

 }  // namespace platform

--- a/paddle/fluid/platform/profiler/test_extra_info.cc
+++ b/paddle/fluid/platform/profiler/test_extra_info.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/platform/profiler/extra_info.h"
+
+using paddle::platform::ExtraInfo;
+
+// Checks that formatted entries can be stored and read back, and that
+// Clear() empties the container.
+TEST(ExtraInfoTest, case0) {
+  ExtraInfo info;
+  info.AddExtraInfo("info1", "%d", 20);
+  info.AddExtraInfo("info2", "%s", "helloworld");
+
+  std::unordered_map<std::string, std::string> snapshot = info.GetExtraInfo();
+  EXPECT_EQ(snapshot["info1"], "20");
+  EXPECT_EQ(snapshot["info2"], "helloworld");
+  EXPECT_EQ(snapshot.size(), 2u);
+
+  // After Clear() a fresh snapshot must contain no entries.
+  info.Clear();
+  snapshot = info.GetExtraInfo();
+  EXPECT_EQ(snapshot.size(), 0u);
+}
--- a/paddle/fluid/platform/profiler/utils.cc
+++ b/paddle/fluid/platform/profiler/utils.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/profiler/utils.h"
+
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/dynload/cupti.h"
+
+namespace paddle {
+namespace platform {
+#ifdef PADDLE_WITH_CUPTI
+// Estimates the occupancy of a kernel launch as
+//   min(BlocksPerSm, active blocks per SM) * threads per block
+//   / maxThreadsPerMultiProcessor of the device,
+// where "active blocks per SM" comes from
+// cudaOccMaxActiveBlocksPerMultiprocessor() and "threads per block" is
+// BlockX * BlockY * BlockZ.
+//
+// Returns 0 when DeviceId is not smaller than the number of entries returned
+// by GetSelectedDevices(), or when the occupancy calculation reports an
+// error (a warning is logged in that case).
+// NOTE(review): DeviceId is compared against the count of selected devices,
+// not against their ids - confirm this is the intended validity check.
+float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
+                            int32_t StaticSharedMemory,
+                            int32_t DynamicSharedMemory, int32_t BlockX,
+                            int32_t BlockY, int32_t BlockZ, float BlocksPerSm) {
+  std::vector<int> selected_devices = GetSelectedDevices();
+  if (DeviceId >= selected_devices.size()) {
+    return 0.0;
+  }
+
+  const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId);
+
+  // Describe the kernel's resource usage for the occupancy calculator.
+  cudaOccFuncAttributes func_attr;
+  func_attr.maxThreadsPerBlock = INT_MAX;
+  func_attr.numRegs = RegistersPerThread;
+  func_attr.sharedSizeBytes = StaticSharedMemory;
+  func_attr.partitionedGCConfig = PARTITIONED_GC_OFF;
+  func_attr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
+  func_attr.maxDynamicSharedSizeBytes = 0;
+
+  const cudaOccDeviceState device_state = {};
+  const int threads_per_block = BlockX * BlockY * BlockZ;
+  const size_t dynamic_smem_bytes = DynamicSharedMemory;
+  cudaOccResult occ_result;
+  cudaOccDeviceProp occ_device_prop(device_property);
+
+  const cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
+      &occ_result, &occ_device_prop, &func_attr, &device_state,
+      threads_per_block, dynamic_smem_bytes);
+  if (status != CUDA_OCC_SUCCESS) {
+    LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
+                 << status << std::endl;
+    return 0.0;
+  }
+
+  // Never count more blocks per SM than the calculator says can be active.
+  const float resident_blocks =
+      occ_result.activeBlocksPerMultiprocessor < BlocksPerSm
+          ? occ_result.activeBlocksPerMultiprocessor
+          : BlocksPerSm;
+  return resident_blocks * threads_per_block /
+         static_cast<float>(device_property.maxThreadsPerMultiProcessor);
+}
+#endif
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/profiler/utils.h
+++ b/paddle/fluid/platform/profiler/utils.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

+#include <ctime>
+#include <string>
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/os_info.h"

 namespace paddle {
@@ -42,5 +45,11 @@ static std::string GetStringFormatLocalTime() {

 static int64_t nsToUs(int64_t ns) { return ns / 1000; }

+#ifdef PADDLE_WITH_CUPTI
+float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread,
+                            int32_t staticSharedMemory,
+                            int32_t dynamicSharedMemory, int32_t blockX,
+                            int32_t blockY, int32_t blockZ, float blocksPerSm);
+#endif
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/phi/backends/dynload/cupti.h
+++ b/paddle/phi/backends/dynload/cupti.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUPTI

 #include <cuda.h>
+#include <cuda_occupancy.h>
 #include <cupti.h>
 #include <mutex>  // NOLINT

@@ -63,7 +64,8 @@ extern void *cupti_dso_handle;
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
  __macro(cuptiEnableCallback);               \
-  __macro(cuptiEnableDomain);
+  __macro(cuptiEnableDomain);                 \
+  __macro(cudaOccMaxActiveBlocksPerMultiprocessor);

 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);