file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n") function(op_library TARGET) # op_library is a function to create op library. The interface is same as # cc_library. But it handle split GPU/CPU code and link some common library # for ops. set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) set(miopen_hip_cc_srcs) set(cu_cc_srcs) set(cudnn_cu_cc_srcs) set(CUDNN_FILE) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) set(pybind_flag 0) cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) list(LENGTH op_library_SRCS op_library_SRCS_len) if (${op_library_SRCS_len} EQUAL 0) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) list(APPEND cc_srcs ${TARGET}.cc) endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) list(APPEND cu_cc_srcs ${TARGET}.cu.cc) endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) list(APPEND hip_cu_srcs ${TARGET}.hip.cu) endif() string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) endif() if(WITH_AMD_GPU) string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) endif() endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc) endif() endif() else() foreach(src ${op_library_SRCS}) if (${src} MATCHES ".*\\.hip.cu$") list(APPEND hip_cu_srcs ${src}) elseif (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") list(APPEND cudnn_cu_cc_srcs ${src}) elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") list(APPEND miopen_hip_cc_srcs ${src}) elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") endif() endforeach() endif() list(LENGTH cc_srcs cc_srcs_len) if (${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() endforeach() endif(WIN32) set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) list(LENGTH op_library_DEPS op_library_DEPS_len) if (${op_library_DEPS_len} GREATER 0) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) elseif (WITH_AMD_GPU) hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() endforeach() # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") string(REPLACE "_op" "" TARGET "${TARGET}") else () string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") string(REPLACE "," "" TARGET "${TARGET}") endif() # pybind USE_NO_KERNEL_OP # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") set(pybind_flag 1) endif() # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH hip_cu_srcs hip_cu_srcs_len) list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() # pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() # pybind USE_OP_DEVICE_KERNEL for MIOPEN if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") endif() # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") endif() endif() # pybind USE_OP if (${pybind_flag} EQUAL 0) # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") elseif(${TARGET} STREQUAL "tensorrt_engine_op") message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") elseif(${TARGET} STREQUAL "fc") # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") else() file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") endif() endif() endfunction() add_subdirectory(math) if (NOT WIN32) add_subdirectory(nccl) if(WITH_GPU) op_library(nccl_op DEPS nccl_common) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() endif() # NOT WIN32 set(DISTRIBUTE_DEPS "") if(WITH_DISTRIBUTE) add_subdirectory(distributed) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY}) find_library(RDMACM_LIBRARY NAMES rdmacm) ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY}) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm) endif() endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op") op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach() #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU AND NOT WIN32) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL) if(WITH_GRPC) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) else() op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc) endif() set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() # WITH_GPU AND NOT WIN32 else() set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) if(WITH_GPU) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax cub) op_library(sequence_softmax_op DEPS cub) else() op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) endif() op_library(softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() op_library(hash_op DEPS xxhash) op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) op_library(print_op DEPS lod_tensor) op_library(adagrad_op DEPS selected_rows_functor) op_library(maxout_op DEPS maxouting) op_library(unpool_op DEPS unpooling) op_library(pool_op DEPS pooling) op_library(pool_with_index_op DEPS pooling) op_library(lod_rank_table_op DEPS lod_rank_table) op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) op_library(max_sequence_len_op DEPS lod_rank_table) op_library(sequence_conv_op DEPS context_project) op_library(sequence_pool_op DEPS sequence_pooling) if (NOT WIN32) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(hierarchical_sigmoid_op DEPS matrix_bit_code) op_library(lstmp_op DEPS sequence2batch lstm_compute) op_library(gru_op DEPS sequence2batch gru_compute) endif(NOT WIN32) op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) op_library(unsqueeze_op DEPS reshape_op) op_library(squeeze_op DEPS reshape_op) op_library(extract_rows_op DEPS memory) op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) op_library(fusion_lstm_op DEPS jit_kernel) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) op_library(reduce_mean_op DEPS cub) op_library(affine_channel_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() op_library(conv_transpose_op DEPS vol2col im2col) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) op_library(concat_op DEPS concat_and_split) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") if (NOT WIN32) add_subdirectory(reader) endif(NOT WIN32) foreach(src ${READER_LIBRARY}) set(OP_LIBRARY ${src} ${OP_LIBRARY}) endforeach() add_subdirectory(detection) foreach(src ${DETECTION_LIBRARY}) set(OP_LIBRARY ${src} ${OP_LIBRARY}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)