未验证 提交 8037901b 编写于 作者: Y yuguo 提交者: GitHub

Add code of occupancy computing on DCU and avoid threadID bug for DCU profiler (#44520)

上级 fcfaa104
......@@ -435,6 +435,7 @@ endif()
if(WITH_ROCM)
include(hip)
include(miopen) # set miopen libraries, must before configure
include(cupti)
endif()
if(WITH_XPU_KP)
......
......@@ -178,6 +178,13 @@ elseif(WITH_ROCM)
add_definitions(-DEIGEN_USE_GPU)
add_definitions(-DEIGEN_USE_HIP)
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
else()
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
if(NOT MIOPEN_FOUND)
message(FATAL_ERROR "Paddle needs MIOpen to compile")
endif()
......
if(NOT WITH_GPU)
if(NOT WITH_GPU AND NOT WITH_ROCM)
return()
endif()
......
......@@ -35,6 +35,9 @@ if(NOT APPLE)
if(WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
if(CUPTI_FOUND)
list(APPEND HIP_SRCS cupti.cc)
endif()
endif()
endif()
......
......@@ -401,7 +401,11 @@ void ChromeTracingLogger::HandleTypeKernel(
float warps_per_sm = 0.0;
float occupancy = 0.0;
#if defined(PADDLE_WITH_CUPTI)
#ifdef PADDLE_WITH_HIP
constexpr int threads_per_warp = 64;
#else
constexpr int threads_per_warp = 32;
#endif
const gpuDeviceProp& device_property =
GetDeviceProperties(device_node.DeviceId());
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
......@@ -411,6 +415,15 @@ void ChromeTracingLogger::HandleTypeKernel(
blocks_per_sm *
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
threads_per_warp;
#ifdef PADDLE_WITH_HIP
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
kernel_info.kernelFunc,
kernel_info.launchType);
#else
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory,
......@@ -419,6 +432,8 @@ void ChromeTracingLogger::HandleTypeKernel(
kernel_info.block_y,
kernel_info.block_z,
blocks_per_sm);
#endif // PADDLE_WITH_HIP
#endif
float dur = nsToMsFloat(device_node.Duration());
std::string dur_display;
......
......@@ -52,6 +52,10 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel,
event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->completed;
#ifdef PADDLE_WITH_HIP
event.kernel_info.kernelFunc = kernel->kernelFunc;
event.kernel_info.launchType = kernel->launchType;
#endif
collector->AddDeviceEvent(std::move(event));
}
......@@ -279,7 +283,11 @@ void AddApiRecord(const CUpti_ActivityAPI* api,
} else {
tid = iter->second;
}
#ifdef PADDLE_WITH_HIP
event.thread_id = api->threadId;
#else
event.thread_id = tid;
#endif
event.correlation_id = api->correlationId;
event.callback_id = api->cbid;
collector->AddRuntimeEvent(std::move(event));
......
......@@ -105,6 +105,10 @@ struct KernelEventInfo {
uint64_t submitted;
// The completed timestamp for the kernel execution, in ns.
uint64_t completed;
#ifdef PADDLE_WITH_HIP
void* kernelFunc;
uint8_t launchType;
#endif
};
static constexpr size_t kMemKindMaxLen = 50;
......
......@@ -43,6 +43,58 @@ std::string json_vector<std::string>(
}
#ifdef PADDLE_WITH_CUPTI
#ifdef PADDLE_WITH_HIP
#include "hip/hip_runtime.h"
// Estimates the theoretical occupancy of a HIP/DCU kernel launch: the ratio
// of threads the launch configuration can keep resident on one compute unit
// to the device's maxThreadsPerMultiProcessor. Returns 0.0 when the device id
// is out of range, the launch type is unrecognized, or the HIP query fails.
//
// DeviceId            - profiler device index (validated against the selected
//                       device list before querying properties).
// DynamicSharedMemory - dynamic LDS bytes requested at launch.
// BlockX/Y/Z          - thread-block dimensions.
// kernelFunc          - kernel handle recorded by the activity tracer; a host
//                       function pointer for runtime-API launches, a
//                       hipFunction_t for module-API launches.
// launchType          - 0 for runtime-API launches, 100 for module-API
//                       (hipModuleLaunchKernel) launches.
float CalculateEstOccupancy(uint32_t DeviceId,
                            int32_t DynamicSharedMemory,
                            int32_t BlockX,
                            int32_t BlockY,
                            int32_t BlockZ,
                            void* kernelFunc,
                            uint8_t launchType) {
  float occupancy = 0.0;
  std::vector<int> device_ids = GetSelectedDevices();
  if (DeviceId < device_ids.size()) {
    const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId);
    const int block_size = BlockX * BlockY * BlockZ;
    int num_blocks = 0;
    hipError_t status = hipSuccess;
    // Pick the occupancy query matching how the kernel was launched; the
    // arithmetic below is identical for both paths.
    if (launchType == 0) {
      status = hipOccupancyMaxActiveBlocksPerMultiprocessor(
          &num_blocks, kernelFunc, block_size, DynamicSharedMemory);
    } else if (launchType == 100) {
      status = hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
          &num_blocks,
          reinterpret_cast<hipFunction_t>(kernelFunc),
          block_size,
          DynamicSharedMemory);
    } else {
      // Cast to int: streaming a uint8_t would print a raw character,
      // not the numeric launch type.
      LOG(WARNING) << "Failed to calculate estimated occupancy, can not "
                      "recognize launchType : "
                   << static_cast<int>(launchType);
      return occupancy;
    }
    if (status == hipSuccess) {
      occupancy = static_cast<double>(num_blocks) * block_size /
                  device_property.maxThreadsPerMultiProcessor;
    } else {
      // glog terminates each record with a newline; no std::endl needed.
      LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
                   << status;
    }
  }
  return occupancy;
}
#else
float CalculateEstOccupancy(uint32_t DeviceId,
uint16_t RegistersPerThread,
int32_t StaticSharedMemory,
......@@ -88,7 +140,9 @@ float CalculateEstOccupancy(uint32_t DeviceId,
}
return occupancy;
}
#endif
#endif // PADDLE_WITH_HIP
#endif // PADDLE_WITH_CUPTI
const char* StringTracerMemEventType(TracerMemEventType type) {
static const char* categary_name_[] = {
......
......@@ -125,6 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
}
#ifdef PADDLE_WITH_CUPTI
#ifdef PADDLE_WITH_HIP
// Estimates the theoretical occupancy of a HIP/DCU kernel launch (resident
// threads per compute unit divided by maxThreadsPerMultiProcessor).
// kernelFunc is the kernel handle recorded by the activity tracer;
// launchType distinguishes runtime-API (0) from module-API (100) launches.
// Returns 0.0 on an invalid device id, unknown launch type, or query failure.
float CalculateEstOccupancy(uint32_t DeviceId,
int32_t DynamicSharedMemory,
int32_t BlockX,
int32_t BlockY,
int32_t BlockZ,
void* kernelFunc,
uint8_t launchType);
#else
float CalculateEstOccupancy(uint32_t deviceId,
uint16_t registersPerThread,
int32_t staticSharedMemory,
......@@ -133,7 +142,8 @@ float CalculateEstOccupancy(uint32_t deviceId,
int32_t blockY,
int32_t blockZ,
float blocksPerSm);
#endif
#endif // PADDLE_WITH_HIP
#endif // PADDLE_WITH_CUPTI
} // namespace platform
} // namespace paddle
......@@ -35,6 +35,9 @@ if(NOT APPLE)
if(WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
if(CUPTI_FOUND)
list(APPEND HIP_SRCS cupti.cc)
endif()
endif()
endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册