include(operators)

# clean cache and pybind_file content first when rebuild
unset(GLOB_OP_LIB CACHE)
unset(OP_LIBRARY CACHE)
set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h CACHE INTERNAL "pybind.h file")
file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")

add_subdirectory(math)
add_subdirectory(controlflow)
add_subdirectory(csp)
add_subdirectory(detection)
add_subdirectory(elementwise)
add_subdirectory(fused)
add_subdirectory(metrics)
add_subdirectory(optimizers)
add_subdirectory(reduce_ops)
add_subdirectory(sequence_ops)

if(WITH_DISTRIBUTE)
    add_subdirectory(distributed)
    add_subdirectory(distributed_ops)
endif()

add_subdirectory(reader)

if (NOT WIN32)
    add_subdirectory(nccl)
endif()

if (WITH_GPU AND TENSORRT_FOUND)
    add_subdirectory(tensorrt)
endif()

SET(OP_HEADER_DEPS xxhash)
if (WITH_GPU)
    SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()

SET(OP_PREFETCH_DEPS "")
if (WITH_DISTRIBUTE)
    SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()

register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})


# warpctc_op needs cudnn 7 above
if (WITH_GPU AND NOT WIN32)
    if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
        op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
    else()
        op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
    endif()
    # conv_fusion_op needs cudnn 7 above
    if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
        op_library(conv_fusion_op)
        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
    endif()
else()
    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()

set(COMMON_OP_DEPS ${OP_HEADER_DEPS})

set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU)
  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
endif()

# FIXME(typhoonzero): operator deps may not needed.
# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
# op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
# op_library(unsqueeze_op DEPS reshape_op)
# op_library(squeeze_op DEPS reshape_op)
# op_library(flatten_op DEPS reshape_op)
# op_library(unstack_op DEPS stack_op)
# op_library(tensor_array_to_tensor_op DEPS concat_op)

set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS})
set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies")

cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)

set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")