From 8037901bfcd0b2c7c9a1352bd699cd0aecf046d9 Mon Sep 17 00:00:00 2001 From: yuguo <948529990@qq.com> Date: Fri, 22 Jul 2022 16:13:18 +0800 Subject: [PATCH] Add code of occupancy computing on DCU and avoid threadID bug for DCU profiler (#44520) --- CMakeLists.txt | 1 + cmake/configure.cmake | 7 +++ cmake/cupti.cmake | 2 +- paddle/fluid/platform/dynload/CMakeLists.txt | 3 + .../platform/profiler/chrometracing_logger.cc | 15 +++++ .../platform/profiler/cupti_data_process.cc | 8 +++ paddle/fluid/platform/profiler/trace_event.h | 4 ++ paddle/fluid/platform/profiler/utils.cc | 56 ++++++++++++++++++- paddle/fluid/platform/profiler/utils.h | 12 +++- paddle/phi/backends/dynload/CMakeLists.txt | 3 + 10 files changed, 108 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 78ebbccfb2e..ea0011762df 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -435,6 +435,7 @@ endif() if(WITH_ROCM) include(hip) include(miopen) # set miopen libraries, must before configure + include(cupti) endif() if(WITH_XPU_KP) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index f84bb15d592..41b4a90c20f 100755 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -178,6 +178,13 @@ elseif(WITH_ROCM) add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_HIP) + if(CUPTI_FOUND) + include_directories(${CUPTI_INCLUDE_DIR}) + add_definitions(-DPADDLE_WITH_CUPTI) + else() + message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") + endif() + if(NOT MIOPEN_FOUND) message(FATAL_ERROR "Paddle needs MIOpen to compile") endif() diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake index 6bf0141c208..a6bab6a3951 100644 --- a/cmake/cupti.cmake +++ b/cmake/cupti.cmake @@ -1,4 +1,4 @@ -if(NOT WITH_GPU) +if(NOT WITH_GPU AND NOT WITH_ROCM) return() endif() diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index fbccfe5265a..448d2f7e99b 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -35,6 +35,9 @@ if(NOT APPLE) if(WITH_RCCL) list(APPEND HIP_SRCS rccl.cc) endif() + if(CUPTI_FOUND) + list(APPEND HIP_SRCS cupti.cc) + endif() endif() endif() diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 2d461904c3a..8d6165e5376 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -401,7 +401,11 @@ void ChromeTracingLogger::HandleTypeKernel( float warps_per_sm = 0.0; float occupancy = 0.0; #if defined(PADDLE_WITH_CUPTI) +#ifdef PADDLE_WITH_HIP + constexpr int threads_per_warp = 64; +#else constexpr int threads_per_warp = 32; +#endif const gpuDeviceProp& device_property = GetDeviceProperties(device_node.DeviceId()); blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * @@ -411,6 +415,15 @@ void ChromeTracingLogger::HandleTypeKernel( blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / threads_per_warp; +#ifdef PADDLE_WITH_HIP + occupancy = CalculateEstOccupancy(device_node.DeviceId(), + kernel_info.dynamic_shared_memory, + kernel_info.block_x, + kernel_info.block_y, + kernel_info.block_z, + kernel_info.kernelFunc, + kernel_info.launchType); +#else occupancy = CalculateEstOccupancy(device_node.DeviceId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory, @@ -419,6 +432,8 @@ void ChromeTracingLogger::HandleTypeKernel( kernel_info.block_y, kernel_info.block_z, blocks_per_sm); +#endif // PADDLE_WITH_HIP + #endif float dur = nsToMsFloat(device_node.Duration()); std::string dur_display; diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index df9bc861f3b..cf296fe197a 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -52,6 +52,10 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, event.kernel_info.queued = kernel->queued; event.kernel_info.submitted = kernel->submitted; event.kernel_info.completed = kernel->completed; +#ifdef PADDLE_WITH_HIP + event.kernel_info.kernelFunc = kernel->kernelFunc; + event.kernel_info.launchType = kernel->launchType; +#endif collector->AddDeviceEvent(std::move(event)); } @@ -279,7 +283,11 @@ void AddApiRecord(const CUpti_ActivityAPI* api, } else { tid = iter->second; } +#ifdef PADDLE_WITH_HIP + event.thread_id = api->threadId; +#else event.thread_id = tid; +#endif event.correlation_id = api->correlationId; event.callback_id = api->cbid; collector->AddRuntimeEvent(std::move(event)); diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 62d82c19d17..cdd302494c4 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -105,6 +105,10 @@ struct KernelEventInfo { uint64_t submitted; // The completed timestamp for the kernel execution, in ns. uint64_t completed; +#ifdef PADDLE_WITH_HIP + void* kernelFunc; + uint8_t launchType; +#endif }; static constexpr size_t kMemKindMaxLen = 50; diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 11035867416..006170caa4d 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -43,6 +43,58 @@ std::string json_vector( } #ifdef PADDLE_WITH_CUPTI + +#ifdef PADDLE_WITH_HIP + +#include "hip/hip_runtime.h" +float CalculateEstOccupancy(uint32_t DeviceId, + int32_t DynamicSharedMemory, + int32_t BlockX, + int32_t BlockY, + int32_t BlockZ, + void* kernelFunc, + uint8_t launchType) { + float occupancy = 0.0; + std::vector device_ids = GetSelectedDevices(); + if (DeviceId < device_ids.size()) { + const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId); + int blockSize = BlockX * BlockY * BlockZ; + int numBlock = 0; + hipError_t status; + if (launchType == 0) { + status = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlock, kernelFunc, blockSize, DynamicSharedMemory); + if (status == hipSuccess) { + occupancy = static_cast(numBlock) * blockSize / + device_property.maxThreadsPerMultiProcessor; + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } else if (launchType == 100) { + status = hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlock, + reinterpret_cast(kernelFunc), + blockSize, + DynamicSharedMemory); + if (status == hipSuccess) { + occupancy = static_cast(numBlock) * blockSize / + device_property.maxThreadsPerMultiProcessor; + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, can not " + "recognize launchType : " + << launchType << std::endl; + } + } + return occupancy; +} + +#else + float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, int32_t StaticSharedMemory, @@ -88,7 +140,9 @@ float CalculateEstOccupancy(uint32_t DeviceId, } return occupancy; } -#endif +#endif // PADDLE_WITH_HIP + +#endif // PADDLE_WITH_CUPTI const char* StringTracerMemEventType(TracerMemEventType type) { static const char* categary_name_[] = { diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index dcbe230d5e9..c9437e0e779 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -125,6 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) { } #ifdef PADDLE_WITH_CUPTI +#ifdef PADDLE_WITH_HIP +float CalculateEstOccupancy(uint32_t DeviceId, + int32_t DynamicSharedMemory, + int32_t BlockX, + int32_t BlockY, + int32_t BlockZ, + void* kernelFunc, + uint8_t launchType); +#else float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, int32_t staticSharedMemory, @@ -133,7 +142,8 @@ float CalculateEstOccupancy(uint32_t deviceId, int32_t blockY, int32_t blockZ, float blocksPerSm); -#endif +#endif // PADDLE_WITH_HIP +#endif // PADDLE_WITH_CUPTI } // namespace platform } // namespace paddle diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 308e3b0cf9d..49ab8d4f0c9 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -35,6 +35,9 @@ if(NOT APPLE) if(WITH_RCCL) list(APPEND HIP_SRCS rccl.cc) endif() + if(CUPTI_FOUND) + list(APPEND HIP_SRCS cupti.cc) + endif() endif() endif() -- GitLab