# Build configuration for the phi kernel sources found under this directory.
#
# In order, this file:
#   1. Starts a fresh kernel declarations header in a ".tmp" file and records
#      its path (and the path of a ".prune" variant) in INTERNAL cache
#      variables.
#   2. Adds the `funcs` and `autotune` subdirectories.
#   3. Globs the GPU, CPU and XPU kernel sources, optionally running the
#      cutlass code generators first.
#   4. Hands the resulting lists to the project helpers collect_srcs(),
#      collect_generated_srcs() and kernel_declare(), which are defined
#      elsewhere in the project.

# ---------------------------------------------------------------------------
# Generated declarations header
# ---------------------------------------------------------------------------
set(kernel_declare_file
    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp
    CACHE INTERNAL "declarations.h file")
set(kernel_declare_file_final
    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h)
# file(WRITE) truncates, so every configure run starts from an empty header.
file(
  WRITE ${kernel_declare_file}
  "// Generated by the paddle/phi/kernels/CMakeLists.txt. DO NOT EDIT!\n\n#pragma once\n\n"
)
file(APPEND ${kernel_declare_file}
     "#include \"paddle/phi/core/kernel_registry.h\"\n\n")
set(kernel_declare_file_prune
    ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.prune
    CACHE INTERNAL "declarations.h file")

# phi functors and functions called by kernels
add_subdirectory(funcs)

# kernel autotune
add_subdirectory(autotune)

# copy_if_different() is a project helper; presumably it publishes the .tmp
# header as declarations.h only when the content changed -- verify against its
# definition.
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})

# ---------------------------------------------------------------------------
# Header lists
# ---------------------------------------------------------------------------
file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h")
file(GLOB kernel_primitive_h "primitive/*.h")

# ---------------------------------------------------------------------------
# GPU kernel sources (paths relative to this directory)
# ---------------------------------------------------------------------------
# fusion ops would be included here
file(
  GLOB kernel_cu
  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
  "gpu/*.cu"
  "gpu/*.cu.cc"
  "gpudnn/*.cu"
  "kps/*.cu"
  "legacy/kps/*.cu"
  "legacy/gpu/*.cu"
  "selected_rows/gpu/*.cu"
  "sparse/gpu/*.cu"
  "strings/gpu/*.cu"
  "fusion/gpu/*.cu")

if(APPLE OR WIN32)
  list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")
endif()

if(NOT WITH_DGC)
  list(REMOVE_ITEM kernel_cu "gpu/dgc_kernel.cu")
endif()

if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
  # Drop gradient kernels from the GPU list.
  # NOTE(review): names ending in "_grad_kernel.cu.cc" match neither pattern
  # below -- confirm whether those files are meant to be kept.
  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
endif()

# ---------------------------------------------------------------------------
# Cutlass code generation (configure time)
# ---------------------------------------------------------------------------
if(WITH_CUTLASS)
  # execute_process() runs multiple COMMANDs of a single call concurrently as
  # a pipeline, so the output directory and each generator script get their
  # own call here: this guarantees "generated/" exists before any script
  # starts and lets every step's exit status be checked.
  execute_process(
    COMMAND
      ${CMAKE_COMMAND} -E make_directory
      "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d/generated"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d"
    RESULT_VARIABLE cutlass_conv2d_gen_res)
  if(NOT cutlass_conv2d_gen_res EQUAL 0)
    message(
      FATAL_ERROR
        "Failed to create the cutlass conv2d generated directory: ${cutlass_conv2d_gen_res}"
    )
  endif()

  foreach(cutlass_conv2d_script "conv2d_bias_act.py" "conv2d_bias_residual.py"
                                "conv2d_depthwise_bias_act.py")
    # The scripts are addressed relative to WORKING_DIRECTORY, as before.
    execute_process(
      COMMAND ${PYTHON_EXECUTABLE} "${cutlass_conv2d_script}"
      WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d"
      RESULT_VARIABLE cutlass_conv2d_gen_res)
    if(NOT cutlass_conv2d_gen_res EQUAL 0)
      message(
        FATAL_ERROR
          "The cutlass conv2d kernel generation script ${cutlass_conv2d_script} failed: ${cutlass_conv2d_gen_res}"
      )
    endif()
  endforeach()

  # Check the result after EACH generator. Both runs store into the same
  # variable, so a single check after the second run would hide a failure of
  # the first one.
  foreach(mea_generator "generate_kernels.py"
                        "generate_variable_forward_kernels.py")
    execute_process(
      COMMAND
        ${PYTHON_EXECUTABLE}
        ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/${mea_generator}
        --cuda_arch "${NVCC_ARCH_BIN}"
      RESULT_VARIABLE memory_efficient_attention_gen_res)
    if(NOT memory_efficient_attention_gen_res EQUAL 0)
      message(
        FATAL_ERROR
          "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
      )
    endif()
  endforeach()

  # Pick up both hand-written and freshly generated cutlass sources.
  file(
    GLOB cutlass_cu
    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
    "fusion/cutlass/conv2d/generated/*.cu"
    "fusion/cutlass/conv2d/*.cu"
    "fusion/cutlass/*.cu"
    "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
    "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu")
  list(APPEND kernel_cu ${cutlass_cu})
endif()

# ---------------------------------------------------------------------------
# CPU kernel sources (paths relative to this directory)
# ---------------------------------------------------------------------------
set(cc_search_pattern
    "*.cc"
    "cpu/*.cc"
    "legacy/*.cc"
    "legacy/cpu/*.cc"
    "selected_rows/*.cc"
    "selected_rows/cpu/*.cc"
    "sparse/*.cc"
    "sparse/cpu/*.cc"
    "strings/*.cc"
    "strings/cpu/*.cc"
    "fusion/*.cc"
    "stride/*.cc"
    "fusion/cpu/*.cc")

if(WITH_MKLDNN)
  list(APPEND cc_search_pattern "legacy/onednn/*.cc" "onednn/*.cc"
       "fusion/onednn/*.cc")
endif()

file(
  GLOB kernel_cc
  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
  ${cc_search_pattern})

if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
  list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$")
endif()

# ---------------------------------------------------------------------------
# XPU kernel sources (paths relative to this directory)
# ---------------------------------------------------------------------------
file(
  GLOB kernel_xpu
  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
  "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc"
  "sparse/xpu/*.cc")

# ---------------------------------------------------------------------------
# Register the sources with the project helpers
# ---------------------------------------------------------------------------
if(WITH_GPU OR WITH_ROCM)
  collect_srcs(kernels_srcs SRCS ${kernel_cu})
  kernel_declare("${kernel_cu}")
endif()

if(WITH_XPU)
  if(WITH_XPU_KP)
    # Stage kps/ and legacy/kps/ into one kps/ folder in the binary tree.
    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/
         DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)
    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/legacy/kps/
         DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps/)

    # Rename every staged *.cu to *.kps, then collect the renamed files.
    file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.cu")
    foreach(kernel ${kernel_xpu_kps})
      get_filename_component(name "${kernel}" NAME_WE)
      file(RENAME "${kernel}" "${CMAKE_CURRENT_BINARY_DIR}/kps/${name}.kps")
    endforeach()
    file(GLOB kernel_xpu_kps "${CMAKE_CURRENT_BINARY_DIR}/kps/*.kps")
    collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_kps})

    # Mirror the CPU sources into the binary tree and collect them from
    # there (recursively, so anything else ending in .cc below the binary
    # directory is picked up as well).
    foreach(kernel ${kernel_cc})
      configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${kernel}"
                     "${CMAKE_CURRENT_BINARY_DIR}/${kernel}" COPYONLY)
    endforeach()
    file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc")
    collect_generated_srcs(kernels_srcs SRCS ${kernel_xpu_cc})

    # The CPU sources were registered through their binary-tree copies
    # above; clear the list so the unconditional collect_srcs() /
    # kernel_declare() calls at the end of this file receive nothing.
    set(kernel_cc "")
  endif()

  collect_srcs(kernels_srcs SRCS ${kernel_xpu})
  kernel_declare("${kernel_xpu}")
  # These two lists are only populated in the WITH_XPU_KP branch above.
  kernel_declare("${kernel_xpu_kps}")
  kernel_declare("${kernel_xpu_cc}")
endif()

collect_srcs(kernels_srcs SRCS ${kernel_cc})
kernel_declare("${kernel_cc}")

if(NOT "${KERNEL_LIST}" STREQUAL "")
  prune_declaration_h()
endif()