CMakeLists.txt 7.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
# Paths of the generated kernel declaration header:
#   kernel_declare_file       - scratch (.tmp) copy written during configure
#   kernel_declare_file_final - the declarations.h path itself
#   kernel_declare_file_prune - .prune variant (presumably consumed by
#                               prune_declaration_h(); confirm in its definition)
set(kernel_declare_file
    "${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.tmp"
    CACHE INTERNAL "declarations.h file")
set(kernel_declare_file_final
    "${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h")
set(kernel_declare_file_prune
    "${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.prune"
    CACHE INTERNAL "declarations.h file")

# Start the scratch header from a clean state on every configure run: banner,
# include guard pragma, then the kernel registry include. file(WRITE)
# concatenates all content arguments, so this produces one contiguous prologue.
file(
  WRITE "${kernel_declare_file}"
  "// Generated by the paddle/phi/kernels/CMakeLists.txt.  DO NOT EDIT!\n\n#pragma once\n\n"
  "#include \"paddle/phi/core/kernel_registry.h\"\n\n")
15

16
# Functors and helper functions that the phi kernels call into.
add_subdirectory(funcs)

# Kernel auto-tuning support.
add_subdirectory(autotune)

# Reset the PHI_KERNELS global property to empty. It is assigned the kernel
# library target(s) further down, once the backend library has been created.
set_property(GLOBAL PROPERTY PHI_KERNELS "")
24

25
# [ 1. Common kernel compilation dependencies ]
# Tensor types and kernel-framework targets that every kernel library links.
set(COMMON_KERNEL_DEPS
    dense_tensor
    string_tensor
    sparse_coo_tensor
    sparse_csr_tensor
    tensor_array
    int_array
    scalar
    kernel_context
    kernel_factory
    arg_map_context
    convert_utils
    lod_utils
    custom_kernel
    string_infermeta
    phi_tensor_utils)

# Basic math / layout functors.
list(
  APPEND
  COMMON_KERNEL_DEPS
  eigen_function
  blas
  math_function
  im2col
  vol2col
  concat_and_split_functor
  selected_rows_functor)

# remove this dep after removing fluid deps on tensor creation
# (lod_utils already appears in the initial list above; the second entry is
# kept so the resulting dependency list is unchanged.)
list(APPEND COMMON_KERNEL_DEPS lod_utils)

# Shape/dtype inference targets.
list(APPEND COMMON_KERNEL_DEPS infermeta infermeta_utils sparse_infermeta)

# Auto-tune switch.
list(APPEND COMMON_KERNEL_DEPS switch_autotune)
56

57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
# Additional function/functor and dynamic-loader targets shared by the kernels.
# The order of entries is kept exactly as before.
list(
  APPEND
  COMMON_KERNEL_DEPS
  threadpool
  jit_kernel_helper
  softmax
  cross_entropy
  matrix_bit_code
  lapack_function
  lstm_compute
  gru_compute
  deformable_conv_functor
  matrix_reduce
  segment_pooling
  pooling
  maxouting
  matrix_inverse
  matrix_solve
  phi_dynload_warpctc
  phi_dynload_warprnnt
  sequence_padding
  sequence_pooling
  sequence_scale
  fft
  phi_data_layout_transform
  gpc
  utf8proc
  gather_scatter_functor)

# Collective-communication process group.
list(APPEND COMMON_KERNEL_DEPS process_group)

87 88 89 90
# Optional dependencies, each gated by its build option.

# Dynamic loader for flash attention.
if(WITH_FLASHATTN)
  list(APPEND COMMON_KERNEL_DEPS phi_dynload_flashattn)
endif()

# NCCL/RCCL communication context.
if(WITH_NCCL OR WITH_RCCL)
  list(APPEND COMMON_KERNEL_DEPS nccl_comm_context)
endif()

# Gloo communication context.
if(WITH_GLOO)
  list(APPEND COMMON_KERNEL_DEPS gloo_comm_context)
endif()

# cuDNN frontend.
if(WITH_CUDNN_FRONTEND)
  list(APPEND COMMON_KERNEL_DEPS cudnn-frontend)
endif()
100
# Hand the scratch declarations header and the final header path to
# copy_if_different(), which is defined outside this file.
# NOTE(review): kernel_declare() is only called further down; this presumably
# relies on copy_if_different() doing its copy at build time -- confirm.
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})

# Kernel headers: public declarations, implementation headers and primitives.
file(
  GLOB
  kernel_h
  "*.h"
  "selected_rows/*.h"
  "sparse/*.h"
  "strings/*.h")
file(
  GLOB
  kernel_impl_h
  "impl/*.h"
  "selected_rows/impl/*.h")
file(
  GLOB
  kernel_primitive_h
  "primitive/*.h")

# GPU kernel sources. Fusion ops are picked up here as well ("fusion/gpu").
file(
  GLOB
  kernel_cu
  "gpu/*.cu"
  "gpu/*.cu.cc"
  "gpudnn/*.cu"
  "kps/*.cu"
  "legacy/kps/*.cu"
  "legacy/gpu/*.cu"
  "selected_rows/gpu/*.cu"
  "sparse/gpu/*.cu"
  "strings/gpu/*.cu"
  "fusion/gpu/*.cu")
120

121 122 123 124 125
# When REDUCE_INFERENCE_LIB_SIZE is defined, drop gradient kernels from the GPU
# source list.
#
# kernel_cu is globbed from "*.cu" and "gpu/*.cu.cc" patterns only, so a filter
# anchored on "_grad_kernel\.cc$" can never match an entry and
# "*_grad_kernel.cu.cc" files would slip through. A single expression covering
# the ".cc", ".cu" and ".cu.cc" suffixes removes all of them.
#
# Note: if(DEFINED ...) is true whenever the variable exists, regardless of
# its value (even OFF/0).
if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.(cc|cu|cu\\.cc)$")
endif()

126
# CUTLASS-based fused kernels: run the Python code generators at configure time
# and add the generated .cu files to kernel_cu.
if(WITH_CUTLASS)
  set(cutlass_conv2d_dir "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d")

  # execute_process() runs multiple COMMANDs concurrently as a pipeline (stdout
  # of one piped into the next), so the output directory must be created
  # up-front and each generator run in its own call. Otherwise the scripts can
  # race with directory creation and with each other, and only the last
  # command's output is visible.
  file(MAKE_DIRECTORY "${cutlass_conv2d_dir}/generated")
  foreach(cutlass_conv2d_script conv2d_bias_act.py conv2d_bias_residual.py
                                conv2d_depthwise_bias_act.py)
    # The scripts are invoked by relative name from their own directory, as
    # before.
    execute_process(
      COMMAND ${PYTHON_EXECUTABLE} "${cutlass_conv2d_script}"
      WORKING_DIRECTORY "${cutlass_conv2d_dir}"
      RESULT_VARIABLE cutlass_conv2d_gen_res)
    # Fail at configure time instead of later with missing generated sources.
    if(NOT cutlass_conv2d_gen_res EQUAL 0)
      message(
        FATAL_ERROR
          "The cutlass conv2d kernel generation failed in ${cutlass_conv2d_script} (result: ${cutlass_conv2d_gen_res})"
      )
    endif()
  endforeach()

  # Generate the memory efficient attention kernels for the requested CUDA
  # architectures.
  execute_process(
    COMMAND
      ${PYTHON_EXECUTABLE}
      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
      --cuda_arch "${NVCC_ARCH_BIN}"
    RESULT_VARIABLE memory_efficient_attention_gen_res)

  if(NOT memory_efficient_attention_gen_res EQUAL 0)
    message(
      FATAL_ERROR
        "The memory efficient attention kernel generation errors with NVCC_ARCH_BIN=${NVCC_ARCH_BIN}"
    )
  endif()

  # Collect hand-written and generated CUTLASS sources.
  file(GLOB cutlass_cu "fusion/cutlass/conv2d/generated/*.cu"
       "fusion/cutlass/conv2d/*.cu" "fusion/cutlass/*.cu"
       "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu")
  # Directory-scoped definition: applies to targets created in this directory.
  add_definitions("-DPADDLE_WITH_MEMORY_EFFICIENT_ATTENTION")
  list(APPEND kernel_cu ${cutlass_cu})
endif()

156 157 158 159 160
# fusion_group_kernel.cu is excluded from the GPU sources on Windows and on
# Apple platforms. list(REMOVE_ITEM) needs the absolute path because
# file(GLOB) returns absolute paths.
if(WIN32 OR APPLE)
  list(
    REMOVE_ITEM
    kernel_cu
    "${CMAKE_CURRENT_SOURCE_DIR}/fusion/gpu/fusion_group_kernel.cu")
endif()

161 162 163 164 165 166
# CPU kernel sources. The pattern set is shared by all builds; the oneDNN
# directories are only added when WITH_MKLDNN is enabled. All patterns go
# through one file(GLOB) call, whose result is sorted lexicographically, so the
# position of the oneDNN patterns in the list does not affect kernel_cc.
set(kernel_cc_patterns
    "*.cc"
    "cpu/*.cc"
    "legacy/*.cc"
    "legacy/cpu/*.cc"
    "selected_rows/*.cc"
    "selected_rows/cpu/*.cc"
    "sparse/*.cc"
    "sparse/cpu/*.cc"
    "strings/*.cc"
    "strings/cpu/*.cc"
    "fusion/*.cc"
    "fusion/cpu/*.cc")
if(WITH_MKLDNN)
  list(APPEND kernel_cc_patterns "legacy/onednn/*.cc" "onednn/*.cc"
       "fusion/onednn/*.cc")
endif()
file(GLOB kernel_cc ${kernel_cc_patterns})

# When REDUCE_INFERENCE_LIB_SIZE is defined, leave gradient kernels out of the
# CPU source list.
if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
  list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$")
endif()

202 203 204 205 206 207 208 209
# XPU kernel sources.
file(GLOB kernel_xpu "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc"
     "fusion/xpu/*.cc" "sparse/xpu/*.cc")

# oneDNN builds additionally link get_kerneltype_forvar_utils.
if(WITH_MKLDNN)
  list(APPEND COMMON_KERNEL_DEPS get_kerneltype_forvar_utils)
endif()
214

215 216
# Create exactly one kernel library depending on the backend:
#   phi_gpu (CUDA or ROCm), phi_xpu (XPU, optionally with KP), or phi_cpu.
# The library name is recorded in ADD_PHI_KERNELS. kernel_declare(),
# hip_add_library() and xpu_add_library() are defined outside this file.
if(WITH_GPU OR WITH_ROCM)
  if(WITH_GPU)
    # CUDA build (takes precedence if both WITH_GPU and WITH_ROCM are set).
    add_library(phi_gpu ${kernel_cu} ${kernel_cc})
    if(WITH_CUTLASS)
      add_dependencies(phi_gpu cutlass_codegen)
    endif()
  else()
    # WITH_GPU is off, so this branch was entered through WITH_ROCM.
    hip_add_library(phi_gpu STATIC ${kernel_cu} ${kernel_cc})
  endif()
  kernel_declare("${kernel_cu}")
  kernel_declare("${kernel_cc}")
  target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS})
  set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_gpu)
elseif(WITH_XPU)
  if(WITH_XPU_KP)
    # Stage the KP kernels (kps/ and legacy/kps/) in the binary directory and
    # give every staged .cu file a .kps extension.
    set(xpu_kps_dir "${CMAKE_CURRENT_BINARY_DIR}/kps")
    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/ DESTINATION ${xpu_kps_dir}/)
    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/legacy/kps/
         DESTINATION ${xpu_kps_dir}/)
    file(GLOB kernel_xpu_kps "${xpu_kps_dir}/*.cu")
    foreach(kps_source ${kernel_xpu_kps})
      get_filename_component(name ${kps_source} NAME_WE)
      file(RENAME ${kps_source} "${xpu_kps_dir}/${name}.kps")
    endforeach()
    file(GLOB kernel_xpu_kps "${xpu_kps_dir}/*.kps")

    # Copy the CPU kernel sources into the binary directory, then pick them up
    # again with a recursive glob.
    # NOTE(review): file(COPY) treats DESTINATION as a directory, so each file
    # ends up at <binary dir>/<relative path>/<file name>; the recursive glob
    # below still finds it. The glob also matches any other .cc file under the
    # binary directory -- confirm that is intended.
    file(
      GLOB kernel_cc_relative
      RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
      "*.cc"
      "cpu/*.cc"
      "legacy/*.cc"
      "legacy/cpu/*.cc"
      "selected_rows/*.cc"
      "selected_rows/cpu/*.cc"
      "sparse/*.cc"
      "sparse/cpu/*.cc"
      "strings/*.cc"
      "strings/cpu/*.cc"
      "fusion/*.cc"
      "fusion/cpu/*.cc")
    foreach(relative_cc ${kernel_cc_relative})
      file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/${relative_cc}
           DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/${relative_cc})
    endforeach()
    file(GLOB_RECURSE kernel_xpu_cc "${CMAKE_CURRENT_BINARY_DIR}/*.cc")

    xpu_add_library(
      phi_xpu
      STATIC
      ${kernel_xpu}
      ${kernel_xpu_kps}
      ${kernel_xpu_cc}
      DEPENDS
      ${COMMON_KERNEL_DEPS})
    kernel_declare("${kernel_xpu_cc}")
  else()
    # Plain XPU build: XPU kernels plus the regular CPU kernels.
    add_library(phi_xpu ${kernel_xpu} ${kernel_cc})
    kernel_declare("${kernel_cc}")
  endif()
  # Common to both XPU flavours. kernel_xpu_kps is only set in the KP branch
  # above.
  kernel_declare("${kernel_xpu}")
  kernel_declare("${kernel_xpu_kps}")

  target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS})
  set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu)
else()
  # CPU-only build. Unlike the branches above, ADD_PHI_KERNELS is overwritten
  # here rather than appended to.
  add_library(phi_cpu ${kernel_cc})
  target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS})
  kernel_declare("${kernel_cc}")
  set(ADD_PHI_KERNELS phi_cpu)
endif()

# Publish the kernel library target(s) collected in ADD_PHI_KERNELS through
# the PHI_KERNELS global property.
set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS})

# Run prune_declaration_h() (defined outside this file) only when KERNEL_LIST
# is non-empty.
if(NOT "${KERNEL_LIST}" STREQUAL "")
  prune_declaration_h()
endif()