diff --git a/CMakeLists.txt b/CMakeLists.txt index 78ebbccfb2e7ace9fb605b55c603d78ea152f5db..ea0011762df53044ecf5e7836eb7b67c0cf40f40 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -435,6 +435,7 @@ endif() if(WITH_ROCM) include(hip) include(miopen) # set miopen libraries, must before configure + include(cupti) endif() if(WITH_XPU_KP) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index f84bb15d5922b8c6955c752f1761e8f8fc786ca4..41b4a90c20f201c93f6f6f4ba3c334fe538104f4 100755 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -178,6 +178,13 @@ elseif(WITH_ROCM) add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_HIP) + if(CUPTI_FOUND) + include_directories(${CUPTI_INCLUDE_DIR}) + add_definitions(-DPADDLE_WITH_CUPTI) + else() + message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") + endif() + if(NOT MIOPEN_FOUND) message(FATAL_ERROR "Paddle needs MIOpen to compile") endif() diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake index 6bf0141c208c7e6ba640881fd7509185c1b1ac29..a6bab6a39512a347b9ca284887353523d2d77e76 100644 --- a/cmake/cupti.cmake +++ b/cmake/cupti.cmake @@ -1,4 +1,4 @@ -if(NOT WITH_GPU) +if(NOT WITH_GPU AND NOT WITH_ROCM) return() endif() diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index fbccfe5265a71cb8f764a4f0fc6dcd8299e7babf..448d2f7e99b5a9c1b304d5c496051d6210da180d 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -35,6 +35,9 @@ if(NOT APPLE) if(WITH_RCCL) list(APPEND HIP_SRCS rccl.cc) endif() + if(CUPTI_FOUND) + list(APPEND HIP_SRCS cupti.cc) + endif() endif() endif() diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 2d461904c3ad0cf2168f85476d9e7469a47e0be1..8d6165e53766a044bcec9c0551c5af0527b97f88 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -401,7 +401,11 @@ void ChromeTracingLogger::HandleTypeKernel( float warps_per_sm = 0.0; float occupancy = 0.0; #if defined(PADDLE_WITH_CUPTI) +#ifdef PADDLE_WITH_HIP + constexpr int threads_per_warp = 64; +#else constexpr int threads_per_warp = 32; +#endif const gpuDeviceProp& device_property = GetDeviceProperties(device_node.DeviceId()); blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * @@ -411,6 +415,15 @@ void ChromeTracingLogger::HandleTypeKernel( blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / threads_per_warp; +#ifdef PADDLE_WITH_HIP + occupancy = CalculateEstOccupancy(device_node.DeviceId(), + kernel_info.dynamic_shared_memory, + kernel_info.block_x, + kernel_info.block_y, + kernel_info.block_z, + kernel_info.kernelFunc, + kernel_info.launchType); +#else occupancy = CalculateEstOccupancy(device_node.DeviceId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory, @@ -419,6 +432,8 @@ void ChromeTracingLogger::HandleTypeKernel( kernel_info.block_y, kernel_info.block_z, blocks_per_sm); +#endif // PADDLE_WITH_HIP + #endif float dur = nsToMsFloat(device_node.Duration()); std::string dur_display; diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index df9bc861f3b440c12a07dadfb892ec1ec2d7a4e1..cf296fe197ab8493a90ab41c1bbec0fc065e02f9 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -52,6 +52,10 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, event.kernel_info.queued = kernel->queued; event.kernel_info.submitted = kernel->submitted; event.kernel_info.completed = kernel->completed; +#ifdef PADDLE_WITH_HIP + event.kernel_info.kernelFunc = kernel->kernelFunc; + event.kernel_info.launchType = kernel->launchType; +#endif collector->AddDeviceEvent(std::move(event)); } @@ -279,7 +283,11 @@ void AddApiRecord(const CUpti_ActivityAPI* api, } else { tid = iter->second; } +#ifdef PADDLE_WITH_HIP + event.thread_id = api->threadId; +#else event.thread_id = tid; +#endif event.correlation_id = api->correlationId; event.callback_id = api->cbid; collector->AddRuntimeEvent(std::move(event)); diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 62d82c19d1796aa84573dff40928b81fd0411d85..cdd302494c47afb2212243176a5d5522d2b46c78 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -105,6 +105,10 @@ struct KernelEventInfo { uint64_t submitted; // The completed timestamp for the kernel execution, in ns. uint64_t completed; +#ifdef PADDLE_WITH_HIP + void* kernelFunc; + uint8_t launchType; +#endif }; static constexpr size_t kMemKindMaxLen = 50; diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 11035867416b8b8b8d2bd0d2df019f6dd6d688d1..006170caa4dd7c30dfcf5c11eb360412f1cf736d 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -43,6 +43,58 @@ std::string json_vector( } #ifdef PADDLE_WITH_CUPTI + +#ifdef PADDLE_WITH_HIP + +#include "hip/hip_runtime.h" +float CalculateEstOccupancy(uint32_t DeviceId, + int32_t DynamicSharedMemory, + int32_t BlockX, + int32_t BlockY, + int32_t BlockZ, + void* kernelFunc, + uint8_t launchType) { + float occupancy = 0.0; + std::vector device_ids = GetSelectedDevices(); + if (DeviceId < device_ids.size()) { + const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId); + int blockSize = BlockX * BlockY * BlockZ; + int numBlock = 0; + hipError_t status; + if (launchType == 0) { + status = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlock, kernelFunc, blockSize, DynamicSharedMemory); + if (status == hipSuccess) { + occupancy = static_cast(numBlock) * blockSize / + device_property.maxThreadsPerMultiProcessor; + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } else if (launchType == 100) { + status = hipModuleOccupancyMaxActiveBlocksPerMultiprocessor( + &numBlock, + reinterpret_cast(kernelFunc), + blockSize, + DynamicSharedMemory); + if (status == hipSuccess) { + occupancy = static_cast(numBlock) * blockSize / + device_property.maxThreadsPerMultiProcessor; + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, can not " + "recognize launchType : " + << launchType << std::endl; + } + } + return occupancy; +} + +#else + float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, int32_t StaticSharedMemory, @@ -88,7 +140,9 @@ float CalculateEstOccupancy(uint32_t DeviceId, } return occupancy; } -#endif +#endif // PADDLE_WITH_HIP + +#endif // PADDLE_WITH_CUPTI const char* StringTracerMemEventType(TracerMemEventType type) { static const char* categary_name_[] = { diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index dcbe230d5e96c421c3e029d62d7b5beed1779567..c9437e0e7793a3ba9691672d14c8866b61554b36 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -125,6 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) { } #ifdef PADDLE_WITH_CUPTI +#ifdef PADDLE_WITH_HIP +float CalculateEstOccupancy(uint32_t DeviceId, + int32_t DynamicSharedMemory, + int32_t BlockX, + int32_t BlockY, + int32_t BlockZ, + void* kernelFunc, + uint8_t launchType); +#else float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, int32_t staticSharedMemory, @@ -133,7 +142,8 @@ float CalculateEstOccupancy(uint32_t deviceId, int32_t blockY, int32_t blockZ, float blocksPerSm); -#endif +#endif // PADDLE_WITH_HIP +#endif // PADDLE_WITH_CUPTI } // namespace platform } // namespace paddle diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 308e3b0cf9dfa895a13938a6e3c44bade4096e1f..49ab8d4f0c91a8bbbf6c76fa13c090a4aab5ffaa 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -35,6 +35,9 @@ if(NOT APPLE) if(WITH_RCCL) list(APPEND HIP_SRCS rccl.cc) endif() + if(CUPTI_FOUND) + list(APPEND HIP_SRCS cupti.cc) + endif() endif() endif()