diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f6e15758379ada165a9dc0e31273a533b06ad2df..ebb686d8ad0f31917e64161d6f7d2ecd4644fadd 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -134,8 +134,8 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -197,92 +197,88 @@ function(kernel_library TARGET) # kernel source file level # level 1: base device kernel - # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs # level 2: device-independent kernel # - common_srcs # level 3: Kernel implemented by reusing device-independent kernel # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # 1. Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. - elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. - elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/softmax_kernel.cu