From d4ae17754cb4d49f06b7e5007fc7890f5316def2 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Feb 2022 10:11:54 +0800 Subject: [PATCH] add new profiler components (#39964) * add new profiler components * fix bug --- paddle/fluid/framework/operator.cc | 10 +- paddle/fluid/platform/dynload/cupti.h | 4 +- paddle/fluid/platform/profiler/CMakeLists.txt | 9 +- .../platform/profiler/cpu_utilization.cc | 172 ++++++++++++++++++ .../fluid/platform/profiler/cpu_utilization.h | 62 +++++++ paddle/fluid/platform/profiler/extra_info.h | 49 +++++ .../fluid/platform/profiler/output_logger.h | 1 - .../platform/profiler/test_extra_info.cc | 31 ++++ paddle/fluid/platform/profiler/utils.cc | 66 +++++++ paddle/fluid/platform/profiler/utils.h | 9 + paddle/phi/backends/dynload/cupti.h | 4 +- 11 files changed, 406 insertions(+), 11 deletions(-) mode change 100644 => 100755 paddle/fluid/platform/profiler/CMakeLists.txt create mode 100644 paddle/fluid/platform/profiler/cpu_utilization.cc create mode 100644 paddle/fluid/platform/profiler/cpu_utilization.h create mode 100644 paddle/fluid/platform/profiler/extra_info.h create mode 100644 paddle/fluid/platform/profiler/test_extra_info.cc create mode 100644 paddle/fluid/platform/profiler/utils.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 692ebf6f332..b7332896818 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -263,11 +263,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // in order to record different op type cost time // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( - Type().c_str(), platform::TracerEventType::Operator, 1); - auto op_name = platform::OpName(outputs_, Type()); - platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, 1, - platform::EventRole::kUniqueOp); + Type(), platform::TracerEventType::Operator, 1); + // auto op_name = platform::OpName(outputs_, Type()); + // platform::RecordEvent op_name_record_event( + // op_name, platform::TracerEventType::Operator, 1, + // platform::EventRole::kUniqueOp); RunImpl(scope, place); } diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index dacfe2bd2e7..854e5a7b9f0 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -50,7 +51,8 @@ namespace dynload { __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt old mode 100644 new mode 100755 index 320e989bd9b..5acdfa39569 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,8 +1,11 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer) cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node) +cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) add_subdirectory(dump) +cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc new file mode 100644 index 00000000000..672a9a15453 --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/profiler/cpu_utilization.h" + +namespace paddle { +namespace platform { + +#ifdef _MSC_VER +static uint64_t FileTimeToUint64(FILETIME time) { + uint64_t low_part = time.dwLowDateTime; + uint64_t high_part = time.dwHighDateTime; + uint64_t result = (high_part << 32) | low_part; + return result; +} +#endif + +void CpuUtilization::RecordBeginTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&start_); + GetSystemTimes(&system_idle_time_start_, &system_kernel_time_start_, + &system_user_time_start_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_start_, &process_user_time_start_); + +#elif defined(__linux__) + start_ = times(&process_tms_start_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, + &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + } +#else +#endif +} + +void CpuUtilization::RecordEndTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&end_); + GetSystemTimes(&system_idle_time_end_, &system_kernel_time_end_, + &system_user_time_end_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_end_, &process_user_time_end_); +#elif defined(__linux__) + end_ = times(&process_tms_end_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + } +#else +#endif +} + +float CpuUtilization::GetCpuUtilization() { + float cpu_utilization = 0.0; +#if defined(_MSC_VER) + uint64_t system_user_time_start = FileTimeToUint64(system_user_time_start_); + uint64_t system_user_time_end = FileTimeToUint64(system_user_time_end_); + uint64_t system_kernel_time_start = + FileTimeToUint64(system_kernel_time_start_); + uint64_t system_kernel_time_end = FileTimeToUint64(system_kernel_time_end_); + uint64_t system_idle_time_start = FileTimeToUint64(system_idle_time_start_); + uint64_t system_idle_time_end = FileTimeToUint64(system_idle_time_end_); + float busy_time = (system_kernel_time_end - system_kernel_time_start) + + (system_user_time_end - system_user_time_start); + float idle_time = system_idle_time_end - system_idle_time_start; + cpu_utilization = busy_time / (busy_time + idle_time); + +#elif defined(__linux__) + float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) + + (system_tms_end_.tms_stime - system_tms_start_.tms_stime) + + (nice_time_end_ - nice_time_start_) + + (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) + + (steal_end_ - steal_start_); + float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_); + cpu_utilization = busy_time / (busy_time + idle_time); +#else + LOG(WARNING) + << "Current System is not supported to get system cpu utilization" + << cpu_utilization << std::endl; +#endif + return cpu_utilization; +} + +float CpuUtilization::GetCpuCurProcessUtilization() { + float cpu_process_utilization = 0.0; +#ifdef _MSC_VER + uint64_t process_user_time_start = FileTimeToUint64(process_user_time_start_); + uint64_t process_user_time_end = FileTimeToUint64(process_user_time_end_); + uint64_t process_kernel_time_start = + FileTimeToUint64(process_kernel_time_start_); + uint64_t process_kernel_time_end = FileTimeToUint64(process_kernel_time_end_); + uint64_t start = FileTimeToUint64(start_); + uint64_t end = FileTimeToUint64(end_); + float busy_time = (process_kernel_time_end - process_kernel_time_start) + + (process_user_time_end - process_user_time_start); + cpu_process_utilization = busy_time / (end - start); + LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl; +#elif defined(__linux__) + float busy_time = + (process_tms_end_.tms_utime - process_tms_start_.tms_utime) + + (process_tms_end_.tms_stime - process_tms_start_.tms_stime); + cpu_process_utilization = busy_time / (end_ - start_); +#else + LOG(WARNING) + << "Current System is not supported to get process cpu utilization" + << cpu_process_utilization << std::endl; +#endif + return cpu_process_utilization; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/cpu_utilization.h b/paddle/fluid/platform/profiler/cpu_utilization.h new file mode 100644 index 00000000000..7b05a6302cd --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "glog/logging.h" +#ifdef _MSC_VER +#include +#else +#include +#include +#endif + +namespace paddle { +namespace platform { + +class CpuUtilization { + public: + CpuUtilization() {} + void RecordBeginTimeInfo(); + void RecordEndTimeInfo(); + float GetCpuUtilization(); + float GetCpuCurProcessUtilization(); + + private: +#ifdef _MSC_VER + FILETIME start_, end_; + FILETIME process_user_time_start_, process_user_time_end_; + FILETIME process_kernel_time_start_, process_kernel_time_end_; + FILETIME system_user_time_start_, system_user_time_end_; + FILETIME system_kernel_time_start_, system_kernel_time_end_; + FILETIME system_idle_time_start_, system_idle_time_end_; + FILETIME process_creation_time_, process_exit_time_; +#else + clock_t start_, end_; + uint64_t idle_start_, idle_end_; + uint64_t iowait_start_, iowait_end_; + uint64_t nice_time_start_, nice_time_end_; + uint64_t irq_start_, irq_end_; + uint64_t softirq_start_, softirq_end_; + uint64_t steal_start_, steal_end_; + struct tms system_tms_start_, system_tms_end_; + struct tms process_tms_start_, process_tms_end_; +#endif +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/extra_info.h b/paddle/fluid/platform/profiler/extra_info.h new file mode 100644 index 00000000000..04532592ebd --- /dev/null +++ b/paddle/fluid/platform/profiler/extra_info.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/profiler/utils.h" + +namespace paddle { +namespace platform { + +class ExtraInfo { + public: + ExtraInfo() {} + template + void AddExtraInfo(const std::string& key, const std::string& format, + Args... args); + void Clear() { extra_info_.clear(); } + std::unordered_map GetExtraInfo() { + return extra_info_; + } + + private: + std::unordered_map extra_info_; +}; + +template +void ExtraInfo::AddExtraInfo(const std::string& key, const std::string& format, + Args... args) { + std::string value = string_format(format, args...); + extra_info_[key] = value; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h index ff4effad5ec..05a68cf2a4a 100644 --- a/paddle/fluid/platform/profiler/output_logger.h +++ b/paddle/fluid/platform/profiler/output_logger.h @@ -33,7 +33,6 @@ class BaseLogger { virtual void LogHostTraceEventNode(const HostTraceEventNode&) {} virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {} virtual void LogNodeTrees(const NodeTrees&) {} - virtual void LogMetaInfo() {} }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/test_extra_info.cc b/paddle/fluid/platform/profiler/test_extra_info.cc new file mode 100644 index 00000000000..7274c9de977 --- /dev/null +++ b/paddle/fluid/platform/profiler/test_extra_info.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +using paddle::platform::ExtraInfo; + +TEST(ExtraInfoTest, case0) { + ExtraInfo instance; + instance.AddExtraInfo(std::string("info1"), std::string("%d"), 20); + instance.AddExtraInfo(std::string("info2"), std::string("%s"), "helloworld"); + std::unordered_map map = instance.GetExtraInfo(); + EXPECT_EQ(map["info1"], "20"); + EXPECT_EQ(map["info2"], "helloworld"); + EXPECT_EQ(map.size(), 2u); + instance.Clear(); + map = instance.GetExtraInfo(); + EXPECT_EQ(map.size(), 0u); +} diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc new file mode 100644 index 00000000000..b43389866c7 --- /dev/null +++ b/paddle/fluid/platform/profiler/utils.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler/utils.h" + +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/cupti.h" + +namespace paddle { +namespace platform { +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, + int32_t StaticSharedMemory, + int32_t DynamicSharedMemory, int32_t BlockX, + int32_t BlockY, int32_t BlockZ, float BlocksPerSm) { + float occupancy = 0.0; + std::vector device_ids = GetSelectedDevices(); + if (DeviceId < device_ids.size()) { + const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId); + cudaOccFuncAttributes occFuncAttr; + occFuncAttr.maxThreadsPerBlock = INT_MAX; + occFuncAttr.numRegs = RegistersPerThread; + occFuncAttr.sharedSizeBytes = StaticSharedMemory; + occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF; + occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; + occFuncAttr.maxDynamicSharedSizeBytes = 0; + const cudaOccDeviceState occDeviceState = {}; + int blockSize = BlockX * BlockY * BlockZ; + size_t dynamicSmemSize = DynamicSharedMemory; + cudaOccResult occ_result; + cudaOccDeviceProp prop(device_property); + cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( + &occ_result, &prop, &occFuncAttr, &occDeviceState, blockSize, + dynamicSmemSize); + if (status == CUDA_OCC_SUCCESS) { + if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) { + BlocksPerSm = occ_result.activeBlocksPerMultiprocessor; + } + occupancy = + BlocksPerSm * blockSize / + static_cast(device_property.maxThreadsPerMultiProcessor); + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } + return occupancy; +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index 04014b972c3..cd56d343842 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -42,5 +45,11 @@ static std::string GetStringFormatLocalTime() { static int64_t nsToUs(int64_t ns) { return ns / 1000; } +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, int32_t blockX, + int32_t blockY, int32_t blockZ, float blocksPerSm); +#endif } // namespace platform } // namespace paddle diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index fe98fa6bd37..a526fbfd926 100644 --- a/paddle/phi/backends/dynload/cupti.h +++ b/paddle/phi/backends/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -63,7 +64,8 @@ extern void *cupti_dso_handle; __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); -- GitLab