未验证 提交 8037901b 编写于 作者: Y yuguo 提交者: GitHub

Add code of occupancy computing on DCU and avoid threadID bug for DCU profiler (#44520)

上级 fcfaa104
...@@ -435,6 +435,7 @@ endif() ...@@ -435,6 +435,7 @@ endif()
if(WITH_ROCM) if(WITH_ROCM)
include(hip) include(hip)
include(miopen) # set miopen libraries, must before configure include(miopen) # set miopen libraries, must before configure
include(cupti)
endif() endif()
if(WITH_XPU_KP) if(WITH_XPU_KP)
......
...@@ -178,6 +178,13 @@ elseif(WITH_ROCM) ...@@ -178,6 +178,13 @@ elseif(WITH_ROCM)
add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_GPU)
add_definitions(-DEIGEN_USE_HIP) add_definitions(-DEIGEN_USE_HIP)
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
else()
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
if(NOT MIOPEN_FOUND) if(NOT MIOPEN_FOUND)
message(FATAL_ERROR "Paddle needs MIOpen to compile") message(FATAL_ERROR "Paddle needs MIOpen to compile")
endif() endif()
......
if(NOT WITH_GPU) if(NOT WITH_GPU AND NOT WITH_ROCM)
return() return()
endif() endif()
......
...@@ -35,6 +35,9 @@ if(NOT APPLE) ...@@ -35,6 +35,9 @@ if(NOT APPLE)
if(WITH_RCCL) if(WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc) list(APPEND HIP_SRCS rccl.cc)
endif() endif()
if(CUPTI_FOUND)
list(APPEND HIP_SRCS cupti.cc)
endif()
endif() endif()
endif() endif()
......
...@@ -401,7 +401,11 @@ void ChromeTracingLogger::HandleTypeKernel( ...@@ -401,7 +401,11 @@ void ChromeTracingLogger::HandleTypeKernel(
float warps_per_sm = 0.0; float warps_per_sm = 0.0;
float occupancy = 0.0; float occupancy = 0.0;
#if defined(PADDLE_WITH_CUPTI) #if defined(PADDLE_WITH_CUPTI)
#ifdef PADDLE_WITH_HIP
constexpr int threads_per_warp = 64;
#else
constexpr int threads_per_warp = 32; constexpr int threads_per_warp = 32;
#endif
const gpuDeviceProp& device_property = const gpuDeviceProp& device_property =
GetDeviceProperties(device_node.DeviceId()); GetDeviceProperties(device_node.DeviceId());
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y * blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
...@@ -411,6 +415,15 @@ void ChromeTracingLogger::HandleTypeKernel( ...@@ -411,6 +415,15 @@ void ChromeTracingLogger::HandleTypeKernel(
blocks_per_sm * blocks_per_sm *
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
threads_per_warp; threads_per_warp;
#ifdef PADDLE_WITH_HIP
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
kernel_info.kernelFunc,
kernel_info.launchType);
#else
occupancy = CalculateEstOccupancy(device_node.DeviceId(), occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.registers_per_thread, kernel_info.registers_per_thread,
kernel_info.static_shared_memory, kernel_info.static_shared_memory,
...@@ -419,6 +432,8 @@ void ChromeTracingLogger::HandleTypeKernel( ...@@ -419,6 +432,8 @@ void ChromeTracingLogger::HandleTypeKernel(
kernel_info.block_y, kernel_info.block_y,
kernel_info.block_z, kernel_info.block_z,
blocks_per_sm); blocks_per_sm);
#endif // PADDLE_WITH_HIP
#endif #endif
float dur = nsToMsFloat(device_node.Duration()); float dur = nsToMsFloat(device_node.Duration());
std::string dur_display; std::string dur_display;
......
...@@ -52,6 +52,10 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, ...@@ -52,6 +52,10 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel,
event.kernel_info.queued = kernel->queued; event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted; event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->completed; event.kernel_info.completed = kernel->completed;
#ifdef PADDLE_WITH_HIP
event.kernel_info.kernelFunc = kernel->kernelFunc;
event.kernel_info.launchType = kernel->launchType;
#endif
collector->AddDeviceEvent(std::move(event)); collector->AddDeviceEvent(std::move(event));
} }
...@@ -279,7 +283,11 @@ void AddApiRecord(const CUpti_ActivityAPI* api, ...@@ -279,7 +283,11 @@ void AddApiRecord(const CUpti_ActivityAPI* api,
} else { } else {
tid = iter->second; tid = iter->second;
} }
#ifdef PADDLE_WITH_HIP
event.thread_id = api->threadId;
#else
event.thread_id = tid; event.thread_id = tid;
#endif
event.correlation_id = api->correlationId; event.correlation_id = api->correlationId;
event.callback_id = api->cbid; event.callback_id = api->cbid;
collector->AddRuntimeEvent(std::move(event)); collector->AddRuntimeEvent(std::move(event));
......
...@@ -105,6 +105,10 @@ struct KernelEventInfo { ...@@ -105,6 +105,10 @@ struct KernelEventInfo {
uint64_t submitted; uint64_t submitted;
// The completed timestamp for the kernel execution, in ns. // The completed timestamp for the kernel execution, in ns.
uint64_t completed; uint64_t completed;
#ifdef PADDLE_WITH_HIP
void* kernelFunc;
uint8_t launchType;
#endif
}; };
static constexpr size_t kMemKindMaxLen = 50; static constexpr size_t kMemKindMaxLen = 50;
......
...@@ -43,6 +43,58 @@ std::string json_vector<std::string>( ...@@ -43,6 +43,58 @@ std::string json_vector<std::string>(
} }
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
#ifdef PADDLE_WITH_HIP
#include "hip/hip_runtime.h"
/**
 * @brief Estimate the occupancy of a kernel launch on a HIP (ROCm/DCU) device.
 *
 * Asks the HIP runtime how many blocks of the given size can be resident on
 * one multiprocessor for this kernel, then expresses the resulting resident
 * thread count as a fraction of the device's maxThreadsPerMultiProcessor.
 *
 * @param DeviceId            Device index; must be smaller than the number of
 *                            entries returned by GetSelectedDevices().
 * @param DynamicSharedMemory Dynamic shared memory size forwarded to the HIP
 *                            occupancy query.
 * @param BlockX,BlockY,BlockZ Block dimensions; their product is the block
 *                            size passed to the HIP occupancy query.
 * @param kernelFunc          Kernel handle. Passed as-is when launchType is 0
 *                            and reinterpreted as hipFunction_t when it is 100.
 * @param launchType          Selects the occupancy API: 0 uses
 *                            hipOccupancyMaxActiveBlocksPerMultiprocessor,
 *                            100 uses
 *                            hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.
 *                            NOTE(review): presumably these are the tracer's
 *                            codes for runtime-API vs. module-API launches --
 *                            confirm against the tracer headers.
 * @return Estimated occupancy, or 0 when the device id is out of range, the
 *         launch type is not recognized, or the HIP query fails.
 */
float CalculateEstOccupancy(uint32_t DeviceId,
                            int32_t DynamicSharedMemory,
                            int32_t BlockX,
                            int32_t BlockY,
                            int32_t BlockZ,
                            void* kernelFunc,
                            uint8_t launchType) {
  float occupancy = 0.0;
  const std::vector<int> device_ids = GetSelectedDevices();
  if (DeviceId >= device_ids.size()) {
    return occupancy;
  }

  const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId);
  const int blockSize = BlockX * BlockY * BlockZ;
  int numBlock = 0;
  hipError_t status = hipSuccess;

  if (launchType == 0) {
    status = hipOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlock, kernelFunc, blockSize, DynamicSharedMemory);
  } else if (launchType == 100) {
    status = hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlock,
        reinterpret_cast<hipFunction_t>(kernelFunc),
        blockSize,
        DynamicSharedMemory);
  } else {
    // uint8_t is a character type for operator<<; cast so the numeric code is
    // logged rather than a (possibly unprintable) raw byte.
    LOG(WARNING) << "Failed to calculate estimated occupancy, can not "
                    "recognize launchType : "
                 << static_cast<int>(launchType);
    return occupancy;
  }

  if (status != hipSuccess) {
    // The logging macro terminates the line itself, so no std::endl here.
    LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
                 << status;
    return occupancy;
  }

  // Resident threads per multiprocessor relative to the hardware maximum.
  occupancy = static_cast<double>(numBlock) * blockSize /
              device_property.maxThreadsPerMultiProcessor;
  return occupancy;
}
#else
float CalculateEstOccupancy(uint32_t DeviceId, float CalculateEstOccupancy(uint32_t DeviceId,
uint16_t RegistersPerThread, uint16_t RegistersPerThread,
int32_t StaticSharedMemory, int32_t StaticSharedMemory,
...@@ -88,7 +140,9 @@ float CalculateEstOccupancy(uint32_t DeviceId, ...@@ -88,7 +140,9 @@ float CalculateEstOccupancy(uint32_t DeviceId,
} }
return occupancy; return occupancy;
} }
#endif #endif // PADDLE_WITH_HIP
#endif // PADDLE_WITH_CUPTI
const char* StringTracerMemEventType(TracerMemEventType type) { const char* StringTracerMemEventType(TracerMemEventType type) {
static const char* categary_name_[] = { static const char* categary_name_[] = {
......
...@@ -125,6 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) { ...@@ -125,6 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
} }
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
#ifdef PADDLE_WITH_HIP
// Estimates kernel occupancy on a HIP device.
//
// The block size is BlockX * BlockY * BlockZ. The HIP runtime is queried for
// the number of such blocks that can be active per multiprocessor, and the
// result is scaled by the block size and divided by the device's
// maxThreadsPerMultiProcessor.
//
// launchType selects the query: 0 passes kernelFunc to
// hipOccupancyMaxActiveBlocksPerMultiprocessor; 100 reinterprets kernelFunc
// as a hipFunction_t for hipModuleOccupancyMaxActiveBlocksPerMultiprocessor.
//
// Returns 0 when DeviceId is not below the number of selected devices, when
// launchType is any other value, or when the HIP query does not succeed.
float CalculateEstOccupancy(uint32_t DeviceId,
int32_t DynamicSharedMemory,
int32_t BlockX,
int32_t BlockY,
int32_t BlockZ,
void* kernelFunc,
uint8_t launchType);
#else
float CalculateEstOccupancy(uint32_t deviceId, float CalculateEstOccupancy(uint32_t deviceId,
uint16_t registersPerThread, uint16_t registersPerThread,
int32_t staticSharedMemory, int32_t staticSharedMemory,
...@@ -133,7 +142,8 @@ float CalculateEstOccupancy(uint32_t deviceId, ...@@ -133,7 +142,8 @@ float CalculateEstOccupancy(uint32_t deviceId,
int32_t blockY, int32_t blockY,
int32_t blockZ, int32_t blockZ,
float blocksPerSm); float blocksPerSm);
#endif #endif // PADDLE_WITH_HIP
#endif // PADDLE_WITH_CUPTI
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -35,6 +35,9 @@ if(NOT APPLE) ...@@ -35,6 +35,9 @@ if(NOT APPLE)
if(WITH_RCCL) if(WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc) list(APPEND HIP_SRCS rccl.cc)
endif() endif()
if(CUPTI_FOUND)
list(APPEND HIP_SRCS cupti.cc)
endif()
endif() endif()
endif() endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册