未验证 提交 d4ae1775 编写于 作者: C chenjian 提交者: GitHub

add new profiler components (#39964)

* add new profiler components

* fix bug
上级 0ff72e5d
......@@ -263,11 +263,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
// in order to record different op type cost time
// and different op name cost time,we set two event.
platform::RecordEvent op_type_record_event(
Type().c_str(), platform::TracerEventType::Operator, 1);
auto op_name = platform::OpName(outputs_, Type());
platform::RecordEvent op_name_record_event(
op_name, platform::TracerEventType::Operator, 1,
platform::EventRole::kUniqueOp);
Type(), platform::TracerEventType::Operator, 1);
// auto op_name = platform::OpName(outputs_, Type());
// platform::RecordEvent op_name_record_event(
// op_name, platform::TracerEventType::Operator, 1,
// platform::EventRole::kUniqueOp);
RunImpl(scope, place);
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cuda_occupancy.h>
#include <cupti.h>
#include <mutex> // NOLINT
......@@ -50,7 +51,8 @@ namespace dynload {
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
__macro(cuptiEnableDomain); \
__macro(cudaOccMaxActiveBlocksPerMultiprocessor);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
......
cc_library(host_tracer SRCS host_tracer.cc DEPS enforce)
cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer)
cc_library(event_node SRCS event_node.cc DEPS enforce)
cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node)
cc_library(profiler_utils SRCS utils.cc DEPS enforce glog)
cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils)
cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger)
cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node)
add_subdirectory(dump)
cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog)
cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils)
cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization)
cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/cpu_utilization.h"
namespace paddle {
namespace platform {
#ifdef _MSC_VER
static uint64_t FileTimeToUint64(FILETIME time) {
uint64_t low_part = time.dwLowDateTime;
uint64_t high_part = time.dwHighDateTime;
uint64_t result = (high_part << 32) | low_part;
return result;
}
#endif
void CpuUtilization::RecordBeginTimeInfo() {
#if defined(_MSC_VER)
HANDLE process_handle = GetCurrentProcess();
GetSystemTimeAsFileTime(&start_);
GetSystemTimes(&system_idle_time_start_, &system_kernel_time_start_,
&system_user_time_start_);
GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_,
&process_kernel_time_start_, &process_user_time_start_);
#elif defined(__linux__)
start_ = times(&process_tms_start_);
#define proc_path_size 1024
static char proc_stat_path[proc_path_size] = "/proc/stat";
FILE *stat_file = fopen(proc_stat_path, "r");
if (stat_file != nullptr) {
char temp_str[200];
uint64_t temp_lu;
while (true) {
int retval = fscanf(
stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
"%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
temp_str, &system_tms_start_.tms_utime, &nice_time_start_,
&system_tms_start_.tms_stime, &idle_start_, &iowait_start_,
&irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu);
if (std::string(temp_str).find("cpu") != 0) {
break;
}
if (retval != 11) {
return;
}
}
fclose(stat_file);
}
#else
#endif
}
void CpuUtilization::RecordEndTimeInfo() {
#if defined(_MSC_VER)
HANDLE process_handle = GetCurrentProcess();
GetSystemTimeAsFileTime(&end_);
GetSystemTimes(&system_idle_time_end_, &system_kernel_time_end_,
&system_user_time_end_);
GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_,
&process_kernel_time_end_, &process_user_time_end_);
#elif defined(__linux__)
end_ = times(&process_tms_end_);
#define proc_path_size 1024
static char proc_stat_path[proc_path_size] = "/proc/stat";
FILE *stat_file = fopen(proc_stat_path, "r");
if (stat_file != nullptr) {
char temp_str[200];
uint64_t temp_lu;
while (true) {
int retval = fscanf(
stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64
"%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64,
temp_str, &system_tms_end_.tms_utime, &nice_time_end_,
&system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_,
&softirq_end_, &steal_end_, &temp_lu, &temp_lu);
if (std::string(temp_str).find("cpu") != 0) {
break;
}
if (retval != 11) {
return;
}
}
fclose(stat_file);
}
#else
#endif
}
float CpuUtilization::GetCpuUtilization() {
float cpu_utilization = 0.0;
#if defined(_MSC_VER)
uint64_t system_user_time_start = FileTimeToUint64(system_user_time_start_);
uint64_t system_user_time_end = FileTimeToUint64(system_user_time_end_);
uint64_t system_kernel_time_start =
FileTimeToUint64(system_kernel_time_start_);
uint64_t system_kernel_time_end = FileTimeToUint64(system_kernel_time_end_);
uint64_t system_idle_time_start = FileTimeToUint64(system_idle_time_start_);
uint64_t system_idle_time_end = FileTimeToUint64(system_idle_time_end_);
float busy_time = (system_kernel_time_end - system_kernel_time_start) +
(system_user_time_end - system_user_time_start);
float idle_time = system_idle_time_end - system_idle_time_start;
cpu_utilization = busy_time / (busy_time + idle_time);
#elif defined(__linux__)
float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) +
(system_tms_end_.tms_stime - system_tms_start_.tms_stime) +
(nice_time_end_ - nice_time_start_) +
(irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) +
(steal_end_ - steal_start_);
float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_);
cpu_utilization = busy_time / (busy_time + idle_time);
#else
LOG(WARNING)
<< "Current System is not supported to get system cpu utilization"
<< cpu_utilization << std::endl;
#endif
return cpu_utilization;
}
float CpuUtilization::GetCpuCurProcessUtilization() {
float cpu_process_utilization = 0.0;
#ifdef _MSC_VER
uint64_t process_user_time_start = FileTimeToUint64(process_user_time_start_);
uint64_t process_user_time_end = FileTimeToUint64(process_user_time_end_);
uint64_t process_kernel_time_start =
FileTimeToUint64(process_kernel_time_start_);
uint64_t process_kernel_time_end = FileTimeToUint64(process_kernel_time_end_);
uint64_t start = FileTimeToUint64(start_);
uint64_t end = FileTimeToUint64(end_);
float busy_time = (process_kernel_time_end - process_kernel_time_start) +
(process_user_time_end - process_user_time_start);
cpu_process_utilization = busy_time / (end - start);
LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl;
#elif defined(__linux__)
float busy_time =
(process_tms_end_.tms_utime - process_tms_start_.tms_utime) +
(process_tms_end_.tms_stime - process_tms_start_.tms_stime);
cpu_process_utilization = busy_time / (end_ - start_);
#else
LOG(WARNING)
<< "Current System is not supported to get process cpu utilization"
<< cpu_process_utilization << std::endl;
#endif
return cpu_process_utilization;
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdio.h>
#include <cinttypes>
#include <cstdint>
#include "glog/logging.h"
#ifdef _MSC_VER
#include <windows.h>
#else
#include <sys/times.h>
#include <unistd.h>
#endif
namespace paddle {
namespace platform {
class CpuUtilization {
public:
CpuUtilization() {}
void RecordBeginTimeInfo();
void RecordEndTimeInfo();
float GetCpuUtilization();
float GetCpuCurProcessUtilization();
private:
#ifdef _MSC_VER
FILETIME start_, end_;
FILETIME process_user_time_start_, process_user_time_end_;
FILETIME process_kernel_time_start_, process_kernel_time_end_;
FILETIME system_user_time_start_, system_user_time_end_;
FILETIME system_kernel_time_start_, system_kernel_time_end_;
FILETIME system_idle_time_start_, system_idle_time_end_;
FILETIME process_creation_time_, process_exit_time_;
#else
clock_t start_, end_;
uint64_t idle_start_, idle_end_;
uint64_t iowait_start_, iowait_end_;
uint64_t nice_time_start_, nice_time_end_;
uint64_t irq_start_, irq_end_;
uint64_t softirq_start_, softirq_end_;
uint64_t steal_start_, steal_end_;
struct tms system_tms_start_, system_tms_end_;
struct tms process_tms_start_, process_tms_end_;
#endif
};
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/platform/profiler/utils.h"
namespace paddle {
namespace platform {
class ExtraInfo {
public:
ExtraInfo() {}
template <typename... Args>
void AddExtraInfo(const std::string& key, const std::string& format,
Args... args);
void Clear() { extra_info_.clear(); }
std::unordered_map<std::string, std::string> GetExtraInfo() {
return extra_info_;
}
private:
std::unordered_map<std::string, std::string> extra_info_;
};
template <typename... Args>
void ExtraInfo::AddExtraInfo(const std::string& key, const std::string& format,
Args... args) {
std::string value = string_format(format, args...);
extra_info_[key] = value;
}
} // namespace platform
} // namespace paddle
......@@ -33,7 +33,6 @@ class BaseLogger {
virtual void LogHostTraceEventNode(const HostTraceEventNode&) {}
virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {}
virtual void LogNodeTrees(const NodeTrees&) {}
virtual void LogMetaInfo() {}
};
} // namespace platform
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include "paddle/fluid/platform/profiler/extra_info.h"
using paddle::platform::ExtraInfo;
TEST(ExtraInfoTest, case0) {
ExtraInfo instance;
instance.AddExtraInfo(std::string("info1"), std::string("%d"), 20);
instance.AddExtraInfo(std::string("info2"), std::string("%s"), "helloworld");
std::unordered_map<std::string, std::string> map = instance.GetExtraInfo();
EXPECT_EQ(map["info1"], "20");
EXPECT_EQ(map["info2"], "helloworld");
EXPECT_EQ(map.size(), 2u);
instance.Clear();
map = instance.GetExtraInfo();
EXPECT_EQ(map.size(), 0u);
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler/utils.h"
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/dynload/cupti.h"
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread,
int32_t StaticSharedMemory,
int32_t DynamicSharedMemory, int32_t BlockX,
int32_t BlockY, int32_t BlockZ, float BlocksPerSm) {
float occupancy = 0.0;
std::vector<int> device_ids = GetSelectedDevices();
if (DeviceId < device_ids.size()) {
const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId);
cudaOccFuncAttributes occFuncAttr;
occFuncAttr.maxThreadsPerBlock = INT_MAX;
occFuncAttr.numRegs = RegistersPerThread;
occFuncAttr.sharedSizeBytes = StaticSharedMemory;
occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF;
occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
occFuncAttr.maxDynamicSharedSizeBytes = 0;
const cudaOccDeviceState occDeviceState = {};
int blockSize = BlockX * BlockY * BlockZ;
size_t dynamicSmemSize = DynamicSharedMemory;
cudaOccResult occ_result;
cudaOccDeviceProp prop(device_property);
cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
&occ_result, &prop, &occFuncAttr, &occDeviceState, blockSize,
dynamicSmemSize);
if (status == CUDA_OCC_SUCCESS) {
if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) {
BlocksPerSm = occ_result.activeBlocksPerMultiprocessor;
}
occupancy =
BlocksPerSm * blockSize /
static_cast<float>(device_property.maxThreadsPerMultiProcessor);
} else {
LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
<< status << std::endl;
}
}
return occupancy;
}
#endif
} // namespace platform
} // namespace paddle
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ctime>
#include <string>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
namespace paddle {
......@@ -42,5 +45,11 @@ static std::string GetStringFormatLocalTime() {
static int64_t nsToUs(int64_t ns) { return ns / 1000; }
#ifdef PADDLE_WITH_CUPTI
float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread,
int32_t staticSharedMemory,
int32_t dynamicSharedMemory, int32_t blockX,
int32_t blockY, int32_t blockZ, float blocksPerSm);
#endif
} // namespace platform
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cuda_occupancy.h>
#include <cupti.h>
#include <mutex> // NOLINT
......@@ -63,7 +64,8 @@ extern void *cupti_dso_handle;
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
__macro(cuptiEnableDomain); \
__macro(cudaOccMaxActiveBlocksPerMultiprocessor);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册