diff --git a/CMakeLists.txt b/CMakeLists.txt
index d40491f3eecbea5d4da5817c07be9cb27b8ce25e..aefe8cc19c586381aea83645e80b1fd700959bbc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,7 +59,9 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
 lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
 lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
 lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
+lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
 lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
+lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
 lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
 lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
 lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
@@ -177,6 +179,10 @@ if(LITE_WITH_XPU)
     include(device/xpu)
 endif()
 
+if(LITE_WITH_MLU)
+    include(mlu)
+endif()
+
 include(external/mklml)    # download mklml package
 include(external/xbyak)    # download xbyak package
 include(external/libxsmm)  # download, build, install libxsmm
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 0d60c578685cd3d3f3adbeac9fc75d1cdcc78c51..caf456367047277344f0353b6c142b039a81b12c 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -136,6 +136,9 @@ endif()
 
 if (LITE_WITH_XPU)
     add_definitions("-DLITE_WITH_XPU")
+    if (LITE_WITH_XTCL)
+        add_definitions("-DLITE_WITH_XTCL")
+    endif()
 endif()
 
 if (LITE_WITH_OPENCL)
@@ -150,6 +153,10 @@ if (LITE_WITH_BM)
     add_definitions("-DLITE_WITH_BM")
 endif()
 
+if (LITE_WITH_MLU)
+    add_definitions("-DLITE_WITH_MLU")
+endif()
+
 if (LITE_WITH_PROFILE)
     add_definitions("-DLITE_WITH_PROFILE")
 endif()
diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake
index 099833ee4cf80968671036cffe89329506bbf091..823048552f3cb5f05375e97e94cd5b5ad63e7563 100644
--- a/cmake/device/xpu.cmake
+++ b/cmake/device/xpu.cmake
@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
         message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
     endif()
 endif()
-
 message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
 
-find_path(XPU_SDK_INC NAMES xtcl.h
-  PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_INC)
-  message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
-endif()
-include_directories("${XPU_SDK_ROOT}/XTCL/include")
 include_directories("${XPU_SDK_ROOT}/XTDK/include")
 
-find_library(XPU_SDK_XTCL_FILE NAMES xtcl
-  PATHS ${XPU_SDK_ROOT}/XTCL/so
-  NO_DEFAULT_PATH)
-
-if(NOT XPU_SDK_XTCL_FILE)
-  message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
-  add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
-endif()
-
-find_library(XPU_SDK_TVM_FILE NAMES tvm
-  PATHS ${XPU_SDK_ROOT}/XTCL/so
-  NO_DEFAULT_PATH)
-
-if(NOT XPU_SDK_TVM_FILE)
-  message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
-  add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
-endif()
-
 find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
   PATHS ${XPU_SDK_ROOT}/XTDK/shlib
   NO_DEFAULT_PATH)
@@ -82,23 +50,55 @@ else()
   set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
 endif()
 
-find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
-  NO_DEFAULT_PATH)
-
-find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
-  NO_DEFAULT_PATH)
-
-if(NOT XPU_SDK_LLVM_FILE)
-  message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
-  add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
+set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
+set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")
+
+if(LITE_WITH_XTCL)
+  find_path(XPU_SDK_INC NAMES xtcl.h
+    PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_INC)
+    message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
+  endif()
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+
+  find_library(XPU_SDK_XTCL_FILE NAMES xtcl
+    PATHS ${XPU_SDK_ROOT}/XTCL/so
+    NO_DEFAULT_PATH)
+
+  if(NOT XPU_SDK_XTCL_FILE)
+    message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
+    add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
+  endif()
+
+  find_library(XPU_SDK_TVM_FILE NAMES tvm
+    PATHS ${XPU_SDK_ROOT}/XTCL/so
+    NO_DEFAULT_PATH)
+
+  if(NOT XPU_SDK_TVM_FILE)
+    message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
+    add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
+  endif()
+
+  find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
+    PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+    NO_DEFAULT_PATH)
+
+  if(NOT XPU_SDK_LLVM_FILE)
+    message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
+    add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
+  endif()
+
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
+
+  set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
+  set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
 endif()
-
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
-
-set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
-set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index 780cdea445cf10897ee71c85a939a64406b59c96..a07edaa57533e35943aedc5dbf812598d6215714 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   set(deps ${lite_deps_DEPS})
@@ -100,6 +100,12 @@ function (lite_deps TARGET)
     endforeach(var)
   endif()
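+  # MLU deps are collected under LITE_WITH_MLU, mirroring the other
+  # device-specific *_DEPS lists handled above.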
+  if (LITE_WITH_MLU)
+    foreach(var ${lite_deps_MLU_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
 
   set(${TARGET} ${deps} PARENT_SCOPE)
 endfunction()
 
@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
     set(options SHARED shared STATIC static MODULE module)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
       HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
             PROFILE_DEPS ${args_PROFILE_DEPS}
             LIGHT_DEPS ${args_LIGHT_DEPS}
             HVY_DEPS ${args_HVY_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
             )
 
     if (args_SHARED OR ARGS_shared)
@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
        set(options " -g ")
     endif()
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
     cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
             LIGHT_DEPS ${args_LIGHT_DEPS}
             HVY_DEPS ${args_HVY_DEPS}
             CV_DEPS ${CV_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
             )
     cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
     target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
     endif()
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS
       COMPILE_LEVEL # (basic|extra)
       )
    cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
             LIGHT_DEPS ${args_LIGHT_DEPS}
             HVY_DEPS ${args_HVY_DEPS}
             CV_DEPS ${args_CV_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
             )
     _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
     # strip binary target to reduce size
@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
+set(mlu_kernels CACHE INTERNAL "mlu kernels")
 set(bm_kernels CACHE INTERNAL "bm kernels")
 set(opencl_kernels CACHE INTERNAL "opencl kernels")
 set(host_kernels CACHE INTERNAL "host kernels")
@@ -285,12 +295,12 @@ if(LITE_BUILD_TAILOR)
   file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
 endif()
 # add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
+# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
 # level: one of (basic, extra)
 function(add_kernel TARGET device level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}"
                         "${multiValueArgs}" ${ARGN})
@@ -369,6 +379,12 @@ function(add_kernel TARGET device level)
     endif()
     set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
   endif()
+  if ("${device}" STREQUAL "MLU")
+    if (NOT LITE_WITH_MLU)
+      return()
+    endif()
+    set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
+  endif()
   if ("${device}" STREQUAL "OPENCL")
     if (NOT LITE_WITH_OPENCL)
       foreach(src ${args_SRCS})
@@ -409,6 +425,7 @@ function(add_kernel TARGET device level)
             NPU_DEPS ${args_NPU_DEPS}
             XPU_DEPS ${args_XPU_DEPS}
             BM_DEPS ${args_BM_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
             PROFILE_DEPS ${args_PROFILE_DEPS}
             LIGHT_DEPS ${args_LIGHT_DEPS}
             HVY_DEPS ${args_HVY_DEPS}
@@ -427,7 +444,7 @@ endif()
 function(add_operator TARGET level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -462,6 +479,7 @@ function(add_operator TARGET level)
             NPU_DEPS ${args_NPU_DEPS}
             XPU_DEPS ${args_XPU_DEPS}
             BM_DEPS ${args_BM_DEPS}
+            MLU_DEPS ${args_MLU_DEPS}
             PROFILE_DEPS ${args_PROFILE_DEPS}
             LIGHT_DEPS ${args_LIGHT_DEPS}
             HVY_DEPS ${args_HVY_DEPS}
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index c4dd769b4c9dbda8379aef631b6f44ce3aea9d22..e7c4e5fcc5c00929058f11160d0f87d13cbe7f4b 100644
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -8,7 +8,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
 message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
+message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
+message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
 message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 5980e5ebe0b922e68a06bf84b51c33271d13e360..4ce95776f3c92ec474e45869570acbbe207e6b05 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -10,6 +10,7 @@ if (LITE_ON_TINY_PUBLISH)
 endif()
 set(light_lib_DEPS light_api paddle_api paddle_api_light)
 
+
 if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
     #full api dynamic library
     lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
@@ -19,7 +20,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
     if(LITE_WITH_X86)
         add_dependencies(paddle_full_api_shared xxhash)
         target_link_libraries(paddle_full_api_shared xxhash)
-       if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
+        if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
            add_dependencies(paddle_full_api_shared dynload_mklml)
        endif()
     endif()
@@ -66,7 +67,8 @@ if (WITH_TESTING)
       CUDA_DEPS ${cuda_kernels}
       X86_DEPS ${x86_kernels}
       XPU_DEPS ${xpu_kernels}
-      BM_DEPS ${bm_kernels})
+      BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels})
 endif()
 if(LITE_WITH_FPGA)
     set(light_api_deps ${light_api_deps} ${fpga_deps})
@@ -88,6 +90,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
 message(STATUS "get XPU kernels ${xpu_kernels}")
 message(STATUS "get FPGA kernels ${fpga_kernels}")
${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") +message(STATUS "get MLU kernels ${mlu_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -125,7 +128,8 @@ lite_cc_library(light_api SRCS light_api.cc XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels}) + BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -144,6 +148,7 @@ if(WITH_TESTING) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -264,8 +269,6 @@ if (NOT LITE_ON_TINY_PUBLISH) NPU_DEPS ${npu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels} BM_DEPS ${bm_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) @@ -292,6 +295,7 @@ lite_cc_test(test_apis SRCS apis_test.cc XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -329,6 +333,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -342,6 +347,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} @@ -354,6 +360,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} @@ -366,6 +373,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} @@ -378,6 +386,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} @@ -389,6 +398,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index e63893cb91e112beb6be50bd661a57b9738e5fb1..094ba5b8d79501ed08673b2e63c290b52f8dade8 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -43,6 +43,16 @@ class LITE_API Predictor { public: // Create an empty predictor. Predictor() { scope_ = std::make_shared(); } + ~Predictor() { +#ifdef LITE_WITH_OPENCL + CLRuntime::Global()->ReleaseResources(); +#endif + scope_.reset(); + exec_scope_ = nullptr; + program_.reset(); + input_names_.clear(); + output_names_.clear(); + } // Create a predictor with the weight variable scope set. 
   explicit Predictor(const std::shared_ptr<Scope>& root_scope) : scope_(root_scope) {}
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index 133b7f7ccf254ca13ab445b0116a684de610896b..ccd7c981385ff776c47c01fbfdd058001341dff6 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -42,6 +42,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
     }
   }
 #endif
+#ifdef LITE_WITH_MLU
+  Env<TARGET(kMLU)>::Init();
+  lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
+                                           config.mlu_core_number(),
+                                           config.mlu_use_first_conv(),
+                                           config.mlu_first_conv_mean(),
+                                           config.mlu_first_conv_std(),
+                                           config.mlu_input_layout());
+#endif  // LITE_WITH_MLU
   std::vector<std::string> passes{};
   auto use_layout_preprocess_pass =
       config.model_dir().find("OPENCL_PRE_PRECESS");
diff --git a/lite/api/light_api.h b/lite/api/light_api.h
index aa25ea81c7b62238211f96265a4edc49f2d065a1..e21618449ca65f86f389cfff20d8e619de7c316d 100644
--- a/lite/api/light_api.h
+++ b/lite/api/light_api.h
@@ -107,6 +107,8 @@ class LightPredictorImpl : public lite_api::PaddlePredictor {
  public:
   LightPredictorImpl() = default;
 
+  ~LightPredictorImpl();
+
   std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
 
   std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc
index cdf5b7fb06df35b2e7fb72fc4e33ccb721a0f7f7..c5ec042dfa7864f78b780eab05f3a2b4b132e4b3 100644
--- a/lite/api/light_api_impl.cc
+++ b/lite/api/light_api_impl.cc
@@ -21,6 +21,13 @@
 namespace paddle {
 namespace lite {
 
+LightPredictorImpl::~LightPredictorImpl() {
+  raw_predictor_.reset();
+#ifdef LITE_WITH_OPENCL
+  CLRuntime::Global()->ReleaseResources();
+#endif
+}
+
 void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
   // LightPredictor Only support NaiveBuffer backend in publish lib
   if (config.lite_model_file().empty()) {
diff --git a/lite/api/opt.cc b/lite/api/opt.cc
index 12003050af864da7d88d335553d71007cf5ed9c5..7a8cd7f1ef1234269c986b781f0546b26df53c4b 100644
--- a/lite/api/opt.cc
+++ b/lite/api/opt.cc
@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
     valid_places.emplace_back(TARGET(kNPU));
   } else if (target_repr == "xpu") {
     valid_places.emplace_back(TARGET(kXPU));
+  } else if (target_repr == "mlu") {
+    valid_places.emplace_back(TARGET(kMLU));
   } else {
     LOG(FATAL) << lite::string_format(
         "Wrong target '%s' found, please check the command flag "
diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 2cb2064da518bca442e882d0733c5c6966c4fac0..daef2c66dda5188a1eec25c3d5f045f1fa705e1e 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
#include "lite/api/paddle_api.h" +#include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" @@ -203,6 +204,58 @@ void ConfigBase::set_threads(int threads) { #endif } +#ifdef LITE_WITH_MLU +void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) { + mlu_core_version_ = core_version; +} +void CxxConfig::set_mlu_core_number(int core_number) { + mlu_core_number_ = core_number; +} +void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { + mlu_input_layout_ = layout; +} +void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { + mlu_use_first_conv_ = use_first_conv; +} +void CxxConfig::set_mlu_first_conv_mean(const std::vector &mean) { + mlu_first_conv_mean_ = mean; +} +void CxxConfig::set_mlu_first_conv_std(const std::vector &std) { + mlu_first_conv_std_ = std; +} +lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { + return mlu_core_version_; +} +int CxxConfig::mlu_core_number() const { return mlu_core_number_; } +DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } +bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } +const std::vector &CxxConfig::mlu_first_conv_mean() const { + return mlu_first_conv_mean_; +} +const std::vector &CxxConfig::mlu_first_conv_std() const { + return mlu_first_conv_std_; +} +#endif + +void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { +#ifdef LITE_WITH_XPU + lite::Context::SetWorkspaceL3Size(l3_size); +#else + LOG(WARNING) << "The invoking of the function " + "'set_xpu_workspace_l3_size_per_thread' is ignored, please " + "rebuild it with LITE_WITH_XPU=ON."; +#endif +} + +void CxxConfig::set_xpu_dev_per_thread(int dev_no) { +#ifdef LITE_WITH_XPU + lite::Context::SetDev(dev_no); +#else + LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " + "ignored, please rebuild it with LITE_WITH_XPU=ON."; +#endif +} + // set model data in combined format, `set_model_from_file` refers to loading // model from file, set_model_from_buffer refers to loading model from memory // buffer diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index c445ef641b96d9fbbc5b4123be794976c0cf03c4..ce0f0e15d84835fab733a5114906e0a0df3a0064 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase { #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_MLU + lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; + int mlu_core_number_{1}; + DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; + bool mlu_use_first_conv_{false}; + std::vector mlu_first_conv_mean_; + std::vector mlu_first_conv_std_; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -163,6 +171,37 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif + +#ifdef LITE_WITH_MLU + // set MLU core version, which is used when compiling MLU kernels + void set_mlu_core_version(lite_api::MLUCoreVersion core_version); + // set MLU core number, which is used when compiling MLU kernels + void set_mlu_core_number(int core_number); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); + // whether use MLU's first conv kernel. 
+  // provided by MLU, its input is uint8, and also needs two 3-dimensional
+  // vectors which save all inputs' mean and std values
+  void set_mlu_use_first_conv(bool use_first_conv);
+  // set the 3-dimensional mean vector used by MLU's first conv
+  void set_mlu_first_conv_mean(const std::vector<float>& mean);
+  // set the 3-dimensional std vector used by MLU's first conv
+  void set_mlu_first_conv_std(const std::vector<float>& std);
+
+  lite_api::MLUCoreVersion mlu_core_version() const;
+  int mlu_core_number() const;
+  DataLayoutType mlu_input_layout() const;
+  bool mlu_use_first_conv() const;
+  const std::vector<float>& mlu_first_conv_mean() const;
+  const std::vector<float>& mlu_first_conv_std() const;
+#endif
+
+  // XPU only, set the size of the workspace memory from L3 cache for the
+  // current thread.
+  void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
+  // XPU only, specify the target device ID for the current thread.
+  void set_xpu_dev_per_thread(int dev_no = 0);
 };
 
 /// MobileConfig is the config for the light weight predictor, it will skip
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
index dba65656cbcffb00319c8f6083909e487e3df7a2..aceb047b64f54ac18ac492ef495d32c3180ad4b4 100644
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
                                                  "fpga",
                                                  "npu",
                                                  "xpu",
-                                                 "bm"};
+                                                 "bm",
+                                                 "mlu"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -111,7 +112,8 @@ const std::string& TargetRepr(TargetType target) {
                                                  "kFPGA",
                                                  "kNPU",
                                                  "kXPU",
-                                                 "kBM"};
+                                                 "kBM",
+                                                 "kMLU"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                            TARGET(kNPU),
                                            TARGET(kXPU),
                                            TARGET(kBM),
+                                           TARGET(kMLU),
                                            TARGET(kFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h
index e48686b913cc5b07f87db0a503ce7081bbe7d058..f57b9832f2b35fc3db74232192bd397ec8b4930c 100644
--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -53,8 +53,8 @@ enum class TargetType : int {
   kNPU = 8,
   kXPU = 9,
   kBM = 10,
-  kAny = 6,  // any target
   kMLU = 11,
+  kAny = 6,  // any target
   NUM = 12,  // number of fields.
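+  // Values are explicit and append-only (kAny keeps 6), presumably so that
+  // target ids already serialized in models keep their meaning.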
 };
 
 enum class PrecisionType : int {
@@ -89,6 +89,8 @@ typedef enum {
   LITE_POWER_RAND_LOW = 5
 } PowerMode;
 
+typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
+
 enum class ActivationType : int {
   kIndentity = 0,
   kRelu = 1,
@@ -100,7 +102,9 @@ enum class ActivationType : int {
   kSwish = 7,
   kExp = 8,
   kAbs = 9,
-  NUM = 10,
+  kHardSwish = 10,
+  kReciprocal = 11,
+  NUM = 12,
 };
 
 static size_t PrecisionTypeLength(PrecisionType type) {
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index 25fd9b4eed17ba7e36b4835c7de2c93968d73ef7..3de9de11d0220b7e68a3136d17fcb7015e49ce78 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -45,6 +45,10 @@ USE_MIR_PASS(memory_optimize_pass);
 USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
 USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
+USE_MIR_PASS(mlu_subgraph_pass);
+USE_MIR_PASS(mlu_postprocess_pass);
 USE_MIR_PASS(weight_quantization_preprocess_pass);
 USE_MIR_PASS(quantized_op_attributes_inference_pass);
 USE_MIR_PASS(assign_value_eliminate_pass);
+USE_MIR_PASS(__xpu__resnet_fuse_pass);
+USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc
index e86d570e18b50bdc3d8943ecdd3732f8475ad56c..5512e7bc438eddd6bcd9c8f792fc8507b03bf800 100644
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -47,6 +47,7 @@ using lite_api::TargetType;
 using lite_api::PrecisionType;
 using lite_api::DataLayoutType;
 using lite_api::Place;
+using lite_api::MLUCoreVersion;
 using lite::LightPredictorImpl;
 using lite_api::OptBase;
@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m);
 static void BindLitePowerMode(py::module *m);
 static void BindLitePlace(py::module *m);
 static void BindLiteTensor(py::module *m);
+static void BindLiteMLUCoreVersion(py::module *m);
 
 void BindLiteApi(py::module *m) {
   BindLiteCxxConfig(m);
@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) {
   BindLitePowerMode(m);
   BindLitePlace(m);
   BindLiteTensor(m);
+  BindLiteMLUCoreVersion(m);
 #ifndef LITE_ON_TINY_PUBLISH
   BindLiteCxxPredictor(m);
 #endif
@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) {
       .def("set_power_mode", &CxxConfig::set_power_mode)
       .def("power_mode", &CxxConfig::power_mode);
 #endif
+#ifdef LITE_WITH_MLU
+  cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
+      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
+      .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
+      .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
+      .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
+      .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
+#endif
 }
 
 // TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) {
       .value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
 }
 
+void BindLiteMLUCoreVersion(py::module *m) {
+  py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion")
+      .value("LITE_MLU_220", MLUCoreVersion::MLU_220)
+      .value("LITE_MLU_270", MLUCoreVersion::MLU_270);
+}
+
 void BindLitePlace(py::module *m) {
   // TargetType
   py::enum_<TargetType>(*m, "TargetType")
@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) {
       .value("OpenCL", TargetType::kOpenCL)
      .value("FPGA", TargetType::kFPGA)
      .value("NPU", TargetType::kNPU)
+      .value("MLU", TargetType::kMLU)
      .value("Any", TargetType::kAny);
 
   // PrecisionType
@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) {
     DO_GETTER_ONCE(data_type__, name__##_data)
 
   DATA_GETTER_SETTER_ONCE(int8_t, int8);
+#ifdef LITE_WITH_MLU
+  tensor.def("set_uint8_data",
+             [](Tensor &self,
+                const std::vector<uint8_t> &data,
+                TargetType type = TargetType::kHost) {
+               if (type == TargetType::kHost) {
+                 self.CopyFromCpu(data.data());
+               }
+             },
+             py::arg("data"),
+             py::arg("type") = TargetType::kHost);
+
+  DO_GETTER_ONCE(uint8_t, "uint8_data");
+#endif
   DATA_GETTER_SETTER_ONCE(int32_t, int32);
   DATA_GETTER_SETTER_ONCE(float, float);
 #undef DO_GETTER_ONCE
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index e3517464812a24c9911e824c53841efc05dd2bc5..fb459ae3621d1281f0a2433ca6b237a165d078a1 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -6,4 +6,5 @@ add_subdirectory(fpga)
 add_subdirectory(host)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(mlu)
 add_subdirectory(bm)
diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc
index 9f478eab60538eeca38415afea4e0989eff5a04e..26e63e23f6acb761b61b397bb881d425e3442468 100644
--- a/lite/backends/arm/math/activation.cc
+++ b/lite/backends/arm/math/activation.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
#include "lite/backends/arm/math/activation.h" +#include #include #include "lite/backends/arm/math/funcs.h" @@ -711,6 +712,38 @@ void act_square(const float* din, float* dout, int size, int threads) { } } +template <> +void act_hard_swish(const float* din, + float* dout, + int size, + float threshold, + float scale, + float offset, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) * + ptr_in[0] / scale; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_reciprocal(const float* din, + float* dout, + int size, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = 1.0 / ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + #ifdef LITE_WITH_TRAIN template <> void act_square_grad(const float* din, diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 63f4418d70db25f98dea2a405de1f4bb6b0b9111..ca6b146442a3ec324a9bd244ee4ce6ad0601d4d7 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -72,6 +72,17 @@ void act_rsqrt(const T* din, T* dout, int size, int threads); template void act_square(const T* din, T* dout, int size, int threads); +template +void act_hard_swish(const T* din, + T* dout, + int size, + float threshold, + float scale, + float offset, + int threads); +template +void act_reciprocal(const T* din, T* dout, int size, int threads); + #ifdef LITE_WITH_TRAIN template void act_square_grad( diff --git a/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..72b0b66f9737ce0ca9c740e6d4e399d06eaf2cd8 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+#include <cl_common.h>
+
+__kernel void decode_center_size(__read_only image2d_t prior_box_image,
+                                 __read_only image2d_t prior_box_var_image,
+                                 __read_only image2d_t target_box_image,
+                                 __write_only image2d_t output_image,
+                                 __private const int out_C,
+                                 __private const int out_H){
+  const int out_c = get_global_id(0);
+  const int out_nh = get_global_id(1);
+  const int out_h = out_nh % out_H;
+  const int out_n = 1;
+
+  const int prior_box_n = 1;
+  const int prior_box_c = 0;
+  const int prior_box_h = out_h;
+
+  const int prior_box_var_n = 1;
+  const int prior_box_var_c = 0;
+  const int prior_box_var_h = out_h;
+
+  const int target_box_n = 1;
+  const int target_box_c = out_c;
+  const int target_box_h = out_h;
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  int2 prior_box_pos;
+  int2 prior_box_var_pos;
+  int2 target_box_pos;
+  int2 output_pos;
+
+  prior_box_pos.x = prior_box_c * 4;
+  prior_box_pos.y = prior_box_n * prior_box_h;
+
+  prior_box_var_pos.x = prior_box_var_c * 4;
+  prior_box_var_pos.y = prior_box_var_n * prior_box_var_h;
+
+  target_box_pos.x = target_box_c * 4;
+  target_box_pos.y = target_box_n * target_box_h;
+
+  output_pos.x = out_c * 4;
+  output_pos.y = out_n * out_h;
+
+  CL_DTYPE4 prior_box_input[4];
+  CL_DTYPE4 prior_box_var_input[4];
+  CL_DTYPE4 target_box_input[4];
+
+  prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
+                                     (int2)(prior_box_pos.x + 0, prior_box_pos.y));
+  prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
+                                     (int2)(prior_box_pos.x + 1, prior_box_pos.y));
+  prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
+                                     (int2)(prior_box_pos.x + 2, prior_box_pos.y));
+  prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
+                                     (int2)(prior_box_pos.x + 3, prior_box_pos.y));
+
+  prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
+                                         (int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y));
+  prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
+                                         (int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y));
+  prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
+                                         (int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y));
+  prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
+                                         (int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y));
+
+  target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
+                                      (int2)(target_box_pos.x + 0, target_box_pos.y));
+  target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
+                                      (int2)(target_box_pos.x + 1, target_box_pos.y));
+  target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
+                                      (int2)(target_box_pos.x + 2, target_box_pos.y));
+  target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
+                                      (int2)(target_box_pos.x + 3, target_box_pos.y));
+
+  CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x;
+  CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x;
+  CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2;
+  CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2;
+
+  CL_DTYPE4 target_box_center_x;
+  CL_DTYPE4 target_box_center_y;
+  CL_DTYPE4 target_box_width;
+  CL_DTYPE4 target_box_height;
+  CL_DTYPE4 output[4];
+
+  output[0] = 0.0f;
+  output[1] = 0.0f;
+  output[2] = 0.0f;
+  output[3] = 0.0f;
+
+  target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x;
+  target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y;
+  target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width;
+  target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height;
+
+  output[0].x = target_box_center_x.x - target_box_width.x/(half)2;
+  output[1].x = target_box_center_y.x - target_box_height.x/(half)2;
+  output[2].x = target_box_center_x.x + target_box_width.x/(half)2;
+  output[3].x = target_box_center_y.x + target_box_height.x/(half)2;
+
+  if(out_C - out_c * 4 >= 2){
+    target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x;
+    target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y;
+    target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width;
+    target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height;
+    output[0].y = target_box_center_x.y - target_box_width.y/(half)2;
+    output[1].y = target_box_center_y.y - target_box_height.y/(half)2;
+    output[2].y = target_box_center_x.y + target_box_width.y/(half)2;
+    output[3].y = target_box_center_y.y + target_box_height.y/(half)2;
+  }
+  if(out_C - out_c * 4 >= 3){
+    target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x;
+    target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y;
+    target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width;
+    target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height;
+    output[0].z = target_box_center_x.z - target_box_width.z/(half)2;
+    output[1].z = target_box_center_y.z - target_box_height.z/(half)2;
+    output[2].z = target_box_center_x.z + target_box_width.z/(half)2;
+    output[3].z = target_box_center_y.z + target_box_height.z/(half)2;
+  }
+  if(out_C - out_c * 4 >= 4){
+    target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x;
+    target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y;
+    target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width;
+    target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height;
+    output[0].w = target_box_center_x.w - target_box_width.w/(half)2;
+    output[1].w = target_box_center_y.w - target_box_height.w/(half)2;
+    output[2].w = target_box_center_x.w + target_box_width.w/(half)2;
+    output[3].w = target_box_center_y.w + target_box_height.w/(half)2;
+  }
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
+}
diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc
index 8405fc967239e851705feb96f517b3980192ebef..dc6a16861212cf5a5693ae1779ed2b1c2c26f1ee 100644
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -29,30 +29,38 @@ CLRuntime* CLRuntime::Global() {
 }
 
 CLRuntime::~CLRuntime() {
+  LOG(INFO) << "CLRuntime::~CLRuntime()";
+  // Note: do ReleaseResources() in predictor
+  command_queue_&& clReleaseCommandQueue(command_queue_->get());
+  command_queue_.reset();
+  context_&& clReleaseContext(context_->get());
+  context_.reset();
+  device_.reset();
+  platform_.reset();
+  initialized_ = false;
+}
+
+void CLRuntime::ReleaseResources() {
+  // if (is_resources_released_) {
+  //   return;
+  // }
   if (command_queue_ != nullptr) {
     command_queue_->flush();
     command_queue_->finish();
   }
-
   for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
     clReleaseKernel(kernels_[kidx]->get());
     kernels_[kidx].reset();
   }
   kernels_.clear();
   kernel_offset_.clear();
-
   for (auto& p : programs_) {
     clReleaseProgram(p.second->get());
   }
   programs_.clear();
-
-  // For controlling the destruction order
-  command_queue_&& clReleaseCommandQueue(command_queue_->get());
-  command_queue_.reset();
-  context_&& clReleaseContext(context_->get());
-  context_.reset();
-  device_.reset();
-  platform_.reset();
+  LOG(INFO) << "release resources finished.";
+  is_resources_released_ = true;
 }
 
 bool CLRuntime::Init() {
diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h
index 36e5d64b906ff5c91b2b5cb5e97855d7dff511c4..69f9e3e371d5b55429dd727bb79ad1a9595ab5c5 100644
--- a/lite/backends/opencl/cl_runtime.h
+++ b/lite/backends/opencl/cl_runtime.h
@@ -33,6 +33,8 @@ class CLRuntime {
  public:
   static CLRuntime* Global();
 
+  void ReleaseResources();
+
   bool Init();
 
   cl::Platform& platform();
@@ -116,6 +118,8 @@ class CLRuntime {
   bool initialized_{false};
 
   bool is_init_success_{false};
+
+  bool is_resources_released_{false};
 };
 
 }  // namespace lite
diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc
index 8d61fb3bbb97705c697fba934e6cab9424f85bad..9cf3281152840416dc141f98992499c663783b7a 100644
--- a/lite/backends/x86/math/beam_search.cc
+++ b/lite/backends/x86/math/beam_search.cc
@@ -96,8 +96,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
     //                           : nullptr;
 
     // fill in data
-    std::vector<size_t> low_level;
-    size_t low_offset = 0;
+    std::vector<uint64_t> low_level;
+    uint64_t low_offset = 0;
     for (auto &items : selected_items) {
       low_level.push_back(low_offset);
       for (auto &item : items) {
diff --git a/lite/backends/x86/math/beam_search_test.cc b/lite/backends/x86/math/beam_search_test.cc
index 904870207b08d462025ecb4b84d6cf57f7b13f26..233fa03fbaa31165dae4453affb148276f8c6584 100644
--- a/lite/backends/x86/math/beam_search_test.cc
+++ b/lite/backends/x86/math/beam_search_test.cc
@@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                        paddle::framework::LoDTensor* pre_scores) {
   // lod
   paddle::framework::LoD lod;
-  std::vector<size_t> level0({0, 2, 4});
-  std::vector<size_t> level1({0, 1, 2, 3, 4});
+  std::vector<uint64_t> level0({0, 2, 4});
+  std::vector<uint64_t> level1({0, 1, 2, 3, 4});
   lod.push_back(level0);
   lod.push_back(level1);
   ids->set_lod(lod);
diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h
index 72d0736268f342187f0be8c6348f5bed75df30ea..34b258892be05625ae88076eff175f56a53d3537 100644
--- a/lite/backends/x86/math/blas_impl.h
+++ b/lite/backends/x86/math/blas_impl.h
@@ -483,7 +483,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
                  mat_a.data<T>(),
                  mat_b.data<T>(),
                  beta,
-                 mat_out->mutable_data<T>());
+                 mat_out->template mutable_data<T>());
 }
 
 template <>
@@ -759,7 +759,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
                    mat_a.data<T>(),
                    mat_b.data<T>(),
                    beta,
-                   mat_out->mutable_data<T>());
+                   mat_out->template mutable_data<T>());
   } else {
     PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
                    dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
@@ -773,7 +773,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
                      mat_a.data<T>(),
                      mat_b.data<T>(),
                      beta,
-                     mat_out->mutable_data<T>(),
+                     mat_out->template mutable_data<T>(),
                      dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
                      dim_a.stride_,
                      dim_b.stride_);
diff --git a/lite/backends/x86/math/concat_and_split.cc b/lite/backends/x86/math/concat_and_split.cc
index bec93dde41fdb654cfbfd20f5d9e59d1d372e3a8..df75654aebaba26b9889d97445bd889cdf2f4eb0 100644
--- a/lite/backends/x86/math/concat_and_split.cc
+++ b/lite/backends/x86/math/concat_and_split.cc
@@ -51,7 +51,7 @@ class ConcatFunctor<lite::TargetType::kX86, T> {
     // auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
-    auto output_data = output->mutable_data<T>();
+    auto output_data = output->template mutable_data<T>();
     int col_idx = 0;
     for (int j = 0; j < num; ++j) {
       int col_len = input_cols[j];
@@ -108,7 +108,7 @@ class SplitFunctor<lite::TargetType::kX86, T> {
       int col_len = output_cols[j];
       auto* out_tensor = outputs->at(j);
       if (out_tensor != nullptr) {
-        T* dst_ptr = out_tensor->mutable_data<T>() + k * col_len;
+        T* dst_ptr = out_tensor->template mutable_data<T>() + k * col_len;
         std::copy_n(src_ptr + col_idx, col_len, dst_ptr);
         // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
         //              sizeof(T) * col_len);
diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc
index 366486924a8c4a5eefd6341183b4f1bc1c0277ad..941a34643669f060cdd18f38f92c39e529da7b19 100644
--- a/lite/backends/x86/math/cross_entropy.cc
+++ b/lite/backends/x86/math/cross_entropy.cc
@@ -50,8 +50,8 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
               .reshape(batch_axis_remain)
               .sum(Eigen::DSizes<int, 1>(1)));
     } else {
-      const T* prob_data = prob->data<T>();
-      T* loss_data = out->mutable_data<T>();
+      const T* prob_data = prob->template data<T>();
+      T* loss_data = out->template mutable_data<T>();
 
       const int64_t* label_data = labels->data<int64_t>();
       for (int i = 0; i < batch_size; ++i) {
diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc
index 1c4c6a49f5bb804a57344c59368d18255e8a7912..b916c912ffc2a4d62b63b98fdce150b353ba087e 100644
--- a/lite/backends/x86/math/im2col.cc
+++ b/lite/backends/x86/math/im2col.cc
@@ -99,7 +99,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
     int channels_col = im_channels * filter_height * filter_width;
 
-    T* im_data = im->mutable_data<T>();
+    T* im_data = im->template mutable_data<T>();
     const T* col_data = col.data<T>();
 
     for (int c = 0; c < channels_col; ++c) {
@@ -161,7 +161,7 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
     int col_width = col->dims()[1];
     const T* im_data = im.data<T>();
-    T* col_data = col->mutable_data<T>();
+    T* col_data = col->template mutable_data<T>();
 
     for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
       for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
@@ -235,7 +235,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
-    T* im_data = im->mutable_data<T>();
+    T* im_data = im->template mutable_data<T>();
     const T* col_data = col.data<T>();
 
     for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
diff --git a/lite/backends/x86/math/im2col_cfo_cpu.h b/lite/backends/x86/math/im2col_cfo_cpu.h
index 4623f045bb1cbe67605b36621efcc3285b989ad5..97579647d4ec3a9a95e033a153417cb0aaadbeb6 100644
--- a/lite/backends/x86/math/im2col_cfo_cpu.h
+++ b/lite/backends/x86/math/im2col_cfo_cpu.h
@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im,
   int channels_col = im_channels * filter_height * filter_width;
 
   const T* im_data = im.data<T>();
-  T* col_data = col->mutable_data<T>();
+  T* col_data = col->template mutable_data<T>();
   for (int c = 0; c < channels_col; ++c) {
     int w_offset = c % filter_width;
     int h_offset = (c / filter_width) % filter_height;
@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im,
   int output_width = col->dims()[4];
 
   const T* im_data = im.data<T>();
-  T* col_data = col->mutable_data<T>();
+  T* col_data = col->template mutable_data<T>();
   int col_matrix_width = output_width * output_height;
   int im_size = im_height * im_width;
   size_t copy_size = sizeof(T) * output_width;
@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im,
   constexpr int prw = 1;
 
   const T* im_data = im.data<T>();
-  T* col_data = col->mutable_data<T>();
+  T* col_data = col->template mutable_data<T>();
   int im_size = im_height * im_width;
   int col_matrix_width = output_width * output_height;
   int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc
index a17807e8a997f0ecf908313a4cb205676e4fa4b8..05a10b5a19fbc8e80ee6dd07e67154d9cf6d1b22 100644
--- a/lite/backends/x86/math/math_function.cc
+++ b/lite/backends/x86/math/math_function.cc
@@ -65,7 +65,7 @@ struct TensorSetConstantCPU {
       : tensor_(tensor), value_(value) {}
   template <typename T>
   void apply() const {
-    auto* begin = tensor_->mutable_data<T>(lite::TargetType::kX86);
+    auto* begin = tensor_->template mutable_data<T>(lite::TargetType::kX86);
     std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
   }
   lite::Tensor* tensor_;
@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
 
     const T* input_data = input.data<T>();
     const T* vector_data = vector.data<T>();
-    T* output_data = output->mutable_data<T>();
+    T* output_data = output->template mutable_data<T>();
     for (int64_t i = 0; i < in_dims[0]; ++i) {
       for (int64_t j = 0; j < size; ++j) {
         output_data[i * in_dims[0] + j] =
diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h
index 3aaca2e59370f8f2b922554ec6f378bb2a3de9b5..acfb76759f6fc9fa4122afd2388bc3adf8f5ea22 100644
--- a/lite/backends/x86/math/math_function_impl.h
+++ b/lite/backends/x86/math/math_function_impl.h
@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
     auto size = in_dims[1];
     PADDLE_ENFORCE_EQ(out->numel(), size);
 
-    T* out_buf = out->mutable_data<T>(out->target());
+    T* out_buf = out->template mutable_data<T>(out->target());
     const T* in_buf = input.data<T>();
 
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> {
     auto size = in_dims[1];
     PADDLE_ENFORCE_EQ(out->numel(), height);
     auto inv_size = 1.0 / size;
-    T* out_buf = out->mutable_data<T>(out->target());
+    T* out_buf = out->template mutable_data<T>(out->target());
     const T* in_buf = input.data<T>();
 
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> {
     auto size = in_dims[1];
     PADDLE_ENFORCE_EQ(out->numel(), height);
 
-    T* out_buf = out->mutable_data<T>(out->target());
+    T* out_buf = out->template mutable_data<T>(out->target());
     const T* in_buf = input.data<T>();
 
     for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
diff --git a/lite/backends/x86/math/maxouting.cc b/lite/backends/x86/math/maxouting.cc
index 20b40fe7c5000cc1d0ee80c18efa5d1defc911f0..f97b16f7fb3326a6d2eb186e2984df3dbd0a0a90 100644
--- a/lite/backends/x86/math/maxouting.cc
+++ b/lite/backends/x86/math/maxouting.cc
@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> {
     // c_size means the output size of each sample
     int c_size = fea_size * output_channels;
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
+    T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
 
     for (int i = 0; i < batch_size; ++i) {
       int new_bindex = c_size * i;
@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
+    T* input_grad_data =
+        input_grad->template mutable_data<T>(lite::TargetType::kX86);
 
     for (int i = 0; i < batch_size; ++i) {
       int blen = fea_size * output_channels * i;
diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc
index ab6c1edb481f914d5751149aca2595fee550ca51..4393c42157bb7667ec2218e8b76f05a2c60bcc86 100644
--- a/lite/backends/x86/math/pooling.cc
+++ b/lite/backends/x86/math/pooling.cc
@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
     const int input_stride = input_height * input_width;
     const int output_stride = output_height * output_width;
 
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
+    const T* input_data = input->template data<T>();
+    T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
 
     int hstart, hend;
     int wstart, wend;
@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
+    T* input_grad_data =
+        input_grad->template mutable_data<T>(lite::TargetType::kX86);
 
     int hstart, hend;
     int wstart, wend;
@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
+    T* input_grad_data =
+        input_grad->template mutable_data<T>(lite::TargetType::kX86);
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
     const int output_stride = output_depth * output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
+    T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
 
     int dstart, dend;
     int hstart, hend;
@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
+    T* input_grad_data =
+        input_grad->template mutable_data<T>(lite::TargetType::kX86);
 
     int dstart, dend;
     int hstart, hend;
@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
+    T* input_grad_data =
+        input_grad->template mutable_data<T>(lite::TargetType::kX86);
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
diff --git a/lite/backends/x86/math/sample_prob.h b/lite/backends/x86/math/sample_prob.h
index 5312b3df10a41444c073f0cf61d69bce6fc3859a..4351df68a2630c2b8c6f7285f3955a9b06165f67 100644
--- a/lite/backends/x86/math/sample_prob.h
+++ b/lite/backends/x86/math/sample_prob.h
@@ -58,11 +58,11 @@ class SampleWithProb {
     const int64_t* label_data = L->data<int64_t>();
     // int64_t* samples_data =
     //     S->mutable_data<int64_t>(ret_dim, Target);
-    // T* probabilities_data = P->mutable_data<T>(ret_dim, Target);
+    // T* probabilities_data = P->template mutable_data<T>(ret_dim, Target);
     S->Resize({batch_size, num_sampled_classes});
     auto* samples_data =
        S->mutable_data<int64_t>(Target);
     P->Resize({batch_size, num_sampled_classes});
-    auto* probabilities_data = P->mutable_data<T>(Target);
+    auto* probabilities_data = P->template mutable_data<T>(Target);
 
     // temp sets for unique sampling
     std::unordered_set<int64_t> tmp_samples;
diff --git a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc
index 56fc363cb48ec5c58f4a7ee3e62a2e6bd7355021..014b213d4f10f7161dc1881d582cca93f2be58e5 100644
--- a/lite/backends/x86/math/search_fc.cc
+++ b/lite/backends/x86/math/search_fc.cc
@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> {
     lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
 
     const auto bottom_data = bottom.data<T>();
-    auto top_data = top->mutable_data<T>(lite::TargetType::kX86);
+    auto top_data = top->template mutable_data<T>(lite::TargetType::kX86);
     const auto weights = w.data<T>();
     auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
     call_gemm(blas,
diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc
index f8f1b42361832771ba04d1bdc8b3e2e05f954e29..acb377e31ccac96547fc4f0644332cfad36d66bc 100644
--- a/lite/backends/x86/math/selected_rows_functor.cc
+++ b/lite/backends/x86/math/selected_rows_functor.cc
@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
     PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
     PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
 
-    auto* out_data = out_value->mutable_data<T>();
+    auto* out_data = out_value->template mutable_data<T>();
 
     auto* in1_data = in1_value.data<T>();
     std::copy_n(in1_data, in1_value.numel(), out_data);
@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
     functor(context, output, 0.0);
 
     auto* in1_data = in1_value.data<T>();
-    auto* out_data = output->mutable_data<T>();
+    auto* out_data = output->template mutable_data<T>();
 
     for (size_t i = 0; i < in1_rows.size(); i++) {
       for (int64_t j = 0; j < in1_row_numel; j++) {
@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
     in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
 
     auto* in1_data = in1_value.data<T>();
-    auto* in2_data = in2_value->mutable_data<T>();
+    auto* in2_data = in2_value->template mutable_data<T>();
     std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
   }
 };
@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
     input2->set_rows(in2_rows);
 
     auto* in2_value = input2->mutable_value();
-    T* in2_data = in2_value->mutable_data<T>();
+    T* in2_data = in2_value->template mutable_data<T>();
     auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
     size_t offset = 0u;
     for (size_t i = 0u; i != input1.size(); ++i) {
@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
     PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
 
     auto* in1_data = in1_value.data<T>();
-    auto* input2_data = input2->mutable_data<T>();
+    auto* input2_data = input2->template mutable_data<T>();
 
     for (size_t i = 0; i < in1_rows.size(); i++) {
       for (int64_t j = 0; j < in1_row_numel; j++) {
@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
     lite::DDim dims(std::vector<int64_t>(
         {static_cast<int64_t>(merged_row_set.size()), input_width}));
     out.mutable_value()->Resize(dims);
-    auto* out_data = out.mutable_value()->mutable_data<T>();
+    auto* out_data = out.mutable_value()->template mutable_data<T>();
 
     if (merged_row_set.size() == row_num && !sorted_result) {
       // no duplicated ids, just concat the result together
@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
     PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
 
     auto* in1_data = in1_value.data<T>();
-    auto* input2_data = input2->data<T>();
+    auto* input2_data = input2->template data<T>();
 
     // FIXME(typhoonzero): use macro fix the below messy code.
switch (op) { diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index c12c05414d717dce706590a491ccae2384f3bfe5..aa7aeac532e2fa1f90d452924b364be1896ee862 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor { public: void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index) { - const size_t* index = index_lod.data(); + const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); PADDLE_ENFORCE_EQ( @@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor { auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); + auto* dst_data = dst->template mutable_data(); const int sz = width * sizeof(T); if (is_src_index) { for (int i = 0; i < height; ++i) { diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index a70cc5bf73522f97ab312fc48553b5316dbf8376..63df008b6dfca936265019a71ac0a553c525dc73 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { // The indexed rows are based on the input index. void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index); }; @@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { // batch_lods[2] is the sort order for the input LoDTensor. batch_lods->at(2).resize(seq_info.size()); - size_t* batch_starts = batch_lods->at(0).data(); - size_t* seq2batch_idx = batch_lods->at(1).data(); + auto* batch_starts = batch_lods->at(0).data(); + auto* seq2batch_idx = batch_lods->at(1).data(); batch_starts[0] = 0; for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); @@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { } batch_starts[n + 1] = static_cast(batch_id); } - size_t* seq_order = batch_lods->at(2).data(); + auto* seq_order = batch_lods->at(2).data(); for (size_t i = 0; i < seq_info.size(); ++i) { seq_order[i] = seq_info[i].seq_idx; } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index fbb6c11a5f7a0cbae36d2f8fba0b141dadadf542..eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -22,15 +22,15 @@ namespace math { template void CopyValidData(lite::Tensor* dst_tensor, const lite::Tensor* src_tensor, - const std::vector& seq_offsets, + const std::vector& seq_offsets, int pad_seq_len, int step_width, bool norm_by_len, CopyType type, PadLayout layout) { int seq_num = seq_offsets.size() - 1; - const T* src_data = src_tensor->data(); - T* dst_data = dst_tensor->mutable_data(); + const T* src_data = src_tensor->template data(); + T* dst_data = dst_tensor->template mutable_data(); int seq_cpy_gap = step_width; int pad_cpy_gap = @@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor { "'step_width'."); // fill padding value - T* pad_data = pad_tensor->mutable_data(); + T* pad_data = pad_tensor->template mutable_data(); const T* pad_value_data = pad_value.data(); if (pad_value.numel() == 1) { fast_mem_init( diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 
a3f4512042de4c7a2fc665f2fd41777d472225f5..43407014dea0ed0c78ab29da7fb8ebb0e0310566 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; enum CopyType { kSeqToPad, kPadToSeq }; -inline static size_t MaximumSequenceLength( - const std::vector& seq_offset) { - size_t seq_num = seq_offset.size() - 1; - size_t max_seq_len = 0; +inline static uint64_t MaximumSequenceLength( + const std::vector& seq_offset) { + uint64_t seq_num = seq_offset.size() - 1; + uint64_t max_seq_len = 0; for (size_t i = 0; i < seq_num; ++i) { max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } @@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( inline static void CheckDims(const lite::DDim& seq_tensor_dims, const lite::DDim& pad_tensor_dims, - const std::vector& seq_offset, + const std::vector& seq_offset, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 186b8b5543c7132867093616c83b45ae8ff27d3c..34c55c5714e467954bc1bb79d9b1385ef5cfe497 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int* max_index = index->mutable_data(); int64_t num_seq = out_dims[0]; @@ -103,7 +103,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int64_t num_seq = out_dims[0]; int64_t dim = output->numel() / num_seq; @@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { const T* og_data = out_grad.data(); const int* max_index = index.data(); - T* ig_data = in_grad->mutable_data(); + T* ig_data = in_grad->template mutable_data(); SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); @@ -170,7 +170,7 @@ class LastSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { int64_t in_w = in_grad->numel() / in_grad->dims()[0]; PADDLE_ENFORCE(in_w == out_w); const T* out_g_data = out_grad.data(); - T* in_g_data = in_grad->mutable_data(TARGET(kX86)); + T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); @@ -288,7 +288,7 @@ class SequencePoolFunctor { auto lod = input.lod()[0]; if (pooltype == "SUM") { const T* src = input.data(); - T* dst = output->mutable_data(TARGET(kX86)); + T* dst = output->template mutable_data(TARGET(kX86)); jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), 
jit::SeqPoolType::kSum); diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index a73014767345842f09ac2ff0cd5c2e7231c1f90a..b91f43a571994bef95650361a6dc62c0465837a7 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { TEST(SequencePoolingGrad, CPU_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); @@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { #ifdef PADDLE_WITH_CUDA TEST(SequencePoolingGrad, CUDA_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); diff --git a/lite/backends/x86/math/sequence_scale.cc b/lite/backends/x86/math/sequence_scale.cc index fad0628de15379b58847827cc3d48bf6085cbda2..25c7be0d0e2747f4f28c1d82f8855872d57726d1 100644 --- a/lite/backends/x86/math/sequence_scale.cc +++ b/lite/backends/x86/math/sequence_scale.cc @@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor { size_t seq_width = seq->dims()[1]; lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); - T* seq_data = seq->mutable_data(lite::TargetType::kX86); + T* seq_data = seq->template mutable_data(lite::TargetType::kX86); for (size_t i = 0; i < num_seq; ++i) { for (size_t j = lod[level][i] * seq_width; j < lod[level][i + 1] * seq_width; diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc index 035a7923c70f91cf27f1d845f68110f8f33cb73d..97e27fed59f4bc1a4c457ea9cf515da6caca9a1c 100644 --- a/lite/backends/x86/math/sequence_topk_avg_pooling.cc +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor { auto pos_data = pos->mutable_data(lite::TargetType::kX86); int offset = 0; - std::vector vec_out_lod; + std::vector vec_out_lod; vec_out_lod.reserve(batch_size + 1); for (int i = 0; i <= batch_size; ++i) { offset = row_lod[i]; @@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor { out->set_lod(lod_temp); auto in_data = in.data(); - auto out_data = out->mutable_data(lite::TargetType::kX86); + auto out_data = out->template mutable_data(lite::TargetType::kX86); T* sum_data = new T[max_k]; for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h index ec45377bc55154a4a36ebc5c3684ab7efeeef88e..1ba84dda42093155b10fa74a49e953d6663b8c88 100644 --- a/lite/backends/x86/math/softmax_impl.h +++ b/lite/backends/x86/math/softmax_impl.h @@ -108,8 +108,8 @@ class SoftmaxFunctor> { const int num_remain = num_classes / axis_dim; if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* in_data = X->data(); - auto* out_data = Y->mutable_data(); + const T* in_data = X->template data(); + auto* out_data = Y->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T max_val = *std::max_element(in_data, in_data + num_classes); max_val *= static_cast(-1); @@ -219,9 +219,9 @@ class SoftmaxGradFunctor> { const int num_remain = num_classes / axis_dim; if 
(num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->mutable_data(); + const T* out_data = y->template data(); + const T* out_grad = y_grad->template data(); + T* in_grad = x_grad->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T scalar; vec_mul_reduce( diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index 20b913331308c8b8c95d190b6b0b3d76ccac354b..bfc7084c9ff018101ca3dfc1d1748083b1449662 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -104,12 +104,12 @@ class Tree2ColFunctor { patch_size = processing_list.size(); // T *patch_data = - // patch->mutable_data({static_cast(patch_size), + // patch->template mutable_data({static_cast(patch_size), // static_cast(patch_elem_size)}, // cpu_place); patch->Resize({static_cast(patch_size), static_cast(patch_elem_size)}); - auto *patch_data = patch->mutable_data(lite::TargetType::kX86); + auto *patch_data = patch->template mutable_data(lite::TargetType::kX86); constant(context, patch, 0); const T *features = node_features.data(); @@ -166,12 +166,12 @@ class Col2TreeFunctor { } } // T *grad_data = - // in_grad->mutable_data({static_cast(node_count), + // in_grad->template mutable_data({static_cast(node_count), // static_cast(grad_elem_size)}, // cpu_place); in_grad->Resize({static_cast(node_count), static_cast(grad_elem_size)}); - auto *grad_data = in_grad->mutable_data(lite::TargetType::kX86); + auto *grad_data = in_grad->template mutable_data(lite::TargetType::kX86); constant(context, in_grad, 0); const T *out_g = out_grad.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 568f9952cab755c8441695e1a9266a2001d2b9a9..119d7294e9ec21e67f09776ad20d04f15b8b81ce 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -36,7 +36,7 @@ class Unpool2dMaxFunctor { int output_feasize = output_height * output_width; const T* input_data = input.data(); const int* indices_data = indices.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { @@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor { int output_feasize = output_height * output_width; const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 8fd5e8954e2010d5226d56ac4a87a44e6364c8c6..91979bb7fdcfe66d84ded3f9797144ddafc8769e 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -75,7 +75,7 @@ class Vol2ColFunctor { "mismatching."); const T* vol_data = vol.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; @@ -159,7 +159,7 @@ class Col2VolFunctor { output_width, "input_width and output_width are " "mismatching."); - T* vol_data = vol->mutable_data(); + T* vol_data = vol->template mutable_data(); const T* col_data = 
col.data(); for (int c = 0; c < channels_col; ++c) { diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt index 4491fdeaefe9f16265bdee2c07ebb02b86a2b038..85bef0452c41ce35c90d9bd058bb7fdefd030f3a 100644 --- a/lite/backends/xpu/CMakeLists.txt +++ b/lite/backends/xpu/CMakeLists.txt @@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) return() endif() -lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +if(LITE_WITH_XTCL) + lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +endif() +lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h index 6de18d5466da6e6b791363d2e275ea72376c78b8..a2cc3206d3d0391d89690026561f47983e9376c9 100644 --- a/lite/backends/xpu/device.h +++ b/lite/backends/xpu/device.h @@ -14,12 +14,12 @@ #pragma once -#include #include #include #include #include #include +#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { diff --git a/lite/backends/xpu/math.h b/lite/backends/xpu/math.h new file mode 100644 index 0000000000000000000000000000000000000000..48352736d45a20d9abd496d9dd10b000d3f15a28 --- /dev/null +++ b/lite/backends/xpu/math.h @@ -0,0 +1,219 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { +namespace math { + +static inline long round_half_to_even(const float src) { // NOLINT + long ret = llround(src); // NOLINT + if (fabs(fabs(round(src) - src) - 0.5) > 0) { + return ret; + } else { + if (abs(ret) % 2 == 0) { + return ret; + } else { + return ret + (ret > 0 ? -1 : 1); + } + } +} + +static float ieee_compliance_0(float f) { + uint32_t *ptr = reinterpret_cast(&f); + uint32_t sign = (*ptr) & 0x80000000; + uint32_t uf = 0; + // nan -> inf + if (std::isnan(f)) { + uf = (sign | 0x7F800000); + float *ptr = reinterpret_cast(&uf); + return *ptr; + } else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) { + return f; + } else { + // denormal -> +-0 + uf = 0x0; + float *ptr = reinterpret_cast(&uf); + return *ptr; + } +} + +template +static inline T fp32_to_intx(const float f, float max) { + max = ieee_compliance_0(max); + float input = ieee_compliance_0(f); + // +0 and -0 -> +0 + if (input == 0) { + input = 0.0f; + } + + float tmp = RMAX / max; + if (std::isinf(tmp)) { + uint32_t *ptr = reinterpret_cast(&input); + if ((*ptr) >> 31 & 1) { + return T(-RMAX); + } else { + return T(RMAX); + } + } + + tmp = input * tmp; + if (std::isnan(tmp)) { + return T(RMAX); + } + + tmp = ieee_compliance_0(tmp); + // early check to avoid INF or big value get into convertor func. 
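Reviewer note, before `fp32_to_intx` continues below: `round_half_to_even` defined just above is banker's rounding — ties at .5 go to the nearest even integer, which avoids the systematic upward drift of `llround`'s away-from-zero ties when quantizing many values. A hedged, standalone re-implementation with spot checks (illustration only, not the new header itself):

```cpp
#include <cassert>
#include <cmath>

static long long round_half_to_even(float src) {
  long long ret = std::llround(src);  // rounds ties away from zero
  if (std::fabs(std::fabs(std::round(src) - src) - 0.5) > 0) {
    return ret;  // not an exact .5 tie: llround's answer stands
  }
  // exact tie: step back to the even neighbour
  return (ret % 2 == 0) ? ret : ret + (ret > 0 ? -1 : 1);
}

int main() {
  assert(round_half_to_even(0.5f) == 0);    // llround would give 1
  assert(round_half_to_even(1.5f) == 2);    // even neighbour wins
  assert(round_half_to_even(2.5f) == 2);    // llround would give 3
  assert(round_half_to_even(-1.5f) == -2);
  assert(round_half_to_even(1.4f) == 1);    // non-ties are unchanged
}
```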
+  if (tmp > RMAX) {
+    return T(RMAX);
+  }
+  if (tmp < -RMAX) {
+    return T(-RMAX);
+  }
+  T ret = (T)round_half_to_even(tmp);
+  if (ret > RMAX) {
+    ret = T(RMAX);
+  }
+  if (ret < -RMAX) {
+    ret = T(-RMAX);
+  }
+  return ret;
+}
+
+static inline int16_t fp32_to_int16(const float f, float max) {
+  int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
+  return v1;
+}
+
+static inline int ConvertFP32ToInt16(const void *input,
+                                     void *output,
+                                     float max_val,
+                                     int len) {
+  for (int i = 0; i < len; i++) {
+    static_cast<int16_t *>(output)[i] =
+        fp32_to_int16(static_cast<const float *>(input)[i], max_val);
+  }
+  return 0;
+}
+
+static inline float FindMaxAbs(const float *data, int len) {
+  float max_f = 0.0f;
+  for (int i = 0; i < len; ++i) {
+    float max = std::abs(data[i]);
+    if (max > max_f) {
+      max_f = max;
+    }
+  }
+  return max_f;
+}
+
+template <typename T>
+static inline void Transpose(const T *in, T *out, int h, int w) {
+  for (int h1 = 0; h1 < w; ++h1) {
+    for (int w1 = 0; w1 < h; ++w1) {
+      out[h1 * h + w1] = in[w1 * w + h1];
+    }
+  }
+}
+
+/**
+ * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
+ * original x_dim is returned.
+ */
+static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
+  if (x_dim.size() > 1) {
+    return x_dim;
+  }
+  return lite::DDim({1, x_dim[0]});
+}
+
+/**
+ * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
+ * original y_dim is returned.
+ */
+static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
+  if (y_dim.size() > 1) {
+    return y_dim;
+  }
+  return lite::DDim({y_dim[0], 1});
+}
+
+/**
+ * Matrix Descriptor of a memory buffer.
+ *
+ * It is used for Blas::MatMul. The MatMul operator can be batched: if Mat A
+ * is [BatchSize, H, W] and Mat B is [BatchSize, H, W], it becomes
+ * `batch_size` GEMMs. The batched GEMM could be faster based on the
+ * implementation of the blas library. The batch size could be zero. If any
+ * matrix of `matmul` has a batch size, there will be a batched GEMM, too.
+ * e.g., if Mat A is [BatchSize, H1, W2] and Mat B is [H2, W2], the result
+ * matrix will be [BatchSize, H1, W2].
+ *
+ * The boolean flag, `trans`, describes whether the memory is the transpose
+ * of the matrix or not. If trans is true, the last two dims of the matrix
+ * are transposed. The memory layout of the matrix is [Width, Height] or
+ * [BatchSize, Width, Height].
+ *
+ * The MatDescriptor is not only the dimension or shape of a matrix, it also
+ * contains the layout and stride of the matrix. It is clearer to have a
+ * structure than to reuse `DDim`.
+ */ +struct MatDescriptor { + int64_t height_; + int64_t width_; + int64_t stride_{0}; + int64_t batch_size_{0}; + bool trans_; +}; + +static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, + int num_flatten_cols, + bool trans) { + MatDescriptor retv; + if (num_flatten_cols > 1) { + auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); + retv.height_ = flatten_dim[0]; + retv.width_ = flatten_dim[1]; + } else { + if (tensor_dim.size() == 2) { + retv.height_ = tensor_dim[0]; + retv.width_ = tensor_dim[1]; + } else { + auto dim_vec = tensor_dim.Vectorize(); + retv.batch_size_ = 1; + for (size_t i = 0; i < dim_vec.size() - 2; ++i) { + retv.batch_size_ *= dim_vec[i]; + } + retv.height_ = dim_vec[dim_vec.size() - 2]; + retv.width_ = dim_vec[dim_vec.size() - 1]; + retv.stride_ = retv.height_ * retv.width_; + } + } + if (trans) { + std::swap(retv.width_, retv.height_); + } + retv.trans_ = trans; + return retv; +} + +} // namespace math +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dcbc1e275cca8c32003cbef74dfb1f6d4caee93 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/target_wrapper.h" +#include "lite/backends/xpu/xpu_header_sitter.h" + +namespace paddle { +namespace lite { + +void* TargetWrapperXPU::Malloc(size_t size) { + void* ptr{nullptr}; + xpu_malloc(&ptr, size); + return ptr; +} + +void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } + +void TargetWrapperXPU::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + switch (dir) { + case IoDirection::HtoD: + xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + break; + case IoDirection::DtoH: + xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + break; + default: + LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..c42d4139246085d8b9a367b45b60699209d0b668 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperXPU = TargetWrapper; + +template <> +class TargetWrapper { + public: + static size_t num_devices() { return 1; } + static size_t maximum_stream() { return 0; } + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/xpu_header_sitter.h b/lite/backends/xpu/xpu_header_sitter.h new file mode 100644 index 0000000000000000000000000000000000000000..875e67d57d4ba2110bfbffb7ee9d1d6a876060fa --- /dev/null +++ b/lite/backends/xpu/xpu_header_sitter.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#pragma GCC system_header +#include +#include +#include + +#if defined(LITE_WITH_XTCL) +#include +#endif + +namespace paddle { +namespace lite { + +namespace xdnn = baidu::xpu::api; + +} // namespace lite +} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index db8bc29d70d4764f14f24915fcbc254ba2af91df..278f971b0b1ee8a0b941158839fcc6810e25ad67 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc DEPS target_wrapper_host place X86_DEPS target_wrapper_x86 CUDA_DEPS target_wrapper_cuda + XPU_DEPS target_wrapper_xpu CL_DEPS cl_target_wrapper FPGA_DEPS fpga_target_wrapper - BM_DEPS target_wrapper_bm) + BM_DEPS target_wrapper_bm + MLU_DEPS target_wrapper_mlu) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 0f3f36768bd5a079564002cbb6464d61bd5db3aa..afc104073684ff00395fb32335630705ff3f7bc8 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.cc b/lite/core/context.cc index be886168e02e21d192305d701110ce5075ffba63..be41aa6eb0cb986760f38eaa2bb5b7e017cc4edb 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -15,5 +15,11 @@ #include "lite/core/context.h" namespace paddle { -namespace lite {} // 
namespace lite +namespace lite { + +#ifdef LITE_WITH_XPU +thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; +#endif + +} // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index b22b59fbeb6a5e25547e18bcc4f62a263c4f165c..061638d63f5187bbfe296afbc3679d9b390a6457 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -24,6 +24,14 @@ #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" #endif +#ifdef LITE_WITH_MLU +#include +#include +#include "lite/backends/mlu/mlu_utils.h" +#endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/xpu_header_sitter.h" +#endif #include #include @@ -103,11 +111,38 @@ class Context { public: Context() {} explicit Context(const XPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} + void CopySharedTo(XPUContext* ctx) {} + static xdnn::Context* GetRawContext() { + if (_tls_raw_ctx == nullptr) { + _tls_raw_ctx = xdnn::create_context(); + CHECK(_tls_raw_ctx); + } + return _tls_raw_ctx; + } + + static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { + xdnn::set_workspace_l3_size(GetRawContext(), l3_size); + } + + static void SetDev(int dev_no = 0) { + const char* dev_env = getenv("LITE_XPU_DEV"); + if (dev_env) { + xpu_set_device(atoi(dev_env)); + return; + } + + xpu_set_device(dev_no); + } + std::string name() const { return "XPUContext"; } + + private: + static thread_local xdnn::Context* _tls_raw_ctx; }; #endif @@ -172,6 +207,85 @@ class Context { }; #endif +#ifdef LITE_WITH_MLU +template <> +class Context { + public: + typename Env::Devs& devs = Env::Global(); + + void InitOnce() {} + + MLUContext& operator=(const MLUContext& ctx) { + this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + return *this; + } + + void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or current target is not exit!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + SetMluDevice(device_id_); + if (io_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "data queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + io_queue_id = 0; + } + if (exec_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "exec queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + exec_queue_id = 0; + } + io_queue_ = devs[dev_id].io_queues()[io_queue_id]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id]; + + exec_queue_id_ = exec_queue_id; + io_queue_id_ = io_queue_id; + } + + void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } + + const cnrtQueue_t& exec_queue() const { return exec_queue_; } + void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; } + + const cnrtQueue_t& io_queue() const { return io_queue_; } + void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } + + cnmlCoreVersion_t MLUCoreVersion() { + return DeviceInfo::Global().MLUCoreVersion(); + } + + int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } + + u32_t affinity() { return affinity_; } + + cnrtInvokeFuncParam_t forward_param() { return forward_param_; } + + int device_id() { return device_id_; } + + std::string name() const { return "MLUContext"; } + + private: + int device_id_; + // overall information + int exec_queue_id_; + int io_queue_id_; + 
cnrtQueue_t io_queue_; + cnrtQueue_t exec_queue_; + + std::vector input_notifiers_; + std::vector output_notifiers_; + + cnrtInvokeFuncParam_t forward_param_; + u32_t affinity_ = 0x01; +}; +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA // Only works with CUDA kernels. template <> @@ -398,6 +512,16 @@ class ContextScheduler { kernel_contexts_[TargetType::kBM].As().CopySharedTo( &ctx->As()); break; +#endif +#ifdef LITE_WITH_MLU + case TARGET(kMLU): { + int dev_id = TargetWrapper::GetCurDevice(); + auto& context = ctx->As(); + context.Init(dev_id); + kernel_contexts_[TargetType::kMLU].As().CopySharedTo( + &context); + LOG(INFO) << "New Context for MLU"; + } break; #endif default: #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) @@ -439,6 +563,9 @@ class ContextScheduler { #endif #ifdef LITE_WITH_BM InitContext(); +#endif +#ifdef LITE_WITH_MLU + InitContext(); #endif } diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 6e0d743fb9d8d8af5e7168e292c1e85d76844383..29ac96ed744b016833a746b35002dd68109efd8b 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -58,7 +58,7 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) thread_local lite_api::PowerMode DeviceInfo::mode_; thread_local ARMArch DeviceInfo::arch_; thread_local int DeviceInfo::mem_size_; @@ -66,6 +66,15 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; +#ifdef LITE_WITH_MLU +thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; +thread_local int DeviceInfo::mlu_core_number_{1}; +thread_local bool DeviceInfo::use_first_conv_{false}; +thread_local std::vector DeviceInfo::mean_vec_; +thread_local std::vector DeviceInfo::std_vec_; +thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; +#endif + #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { return 0; } +#ifdef LITE_WITH_MLU +void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + use_first_conv_ = use_first_conv; + mean_vec_ = mean_vec; + std_vec_ = std_vec; + input_layout_ = input_layout; +} + +cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } + +int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } + +bool DeviceInfo::UseFirstConv() { return use_first_conv_; } + +const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } + +const std::vector& DeviceInfo::StdVec() const { return std_vec_; } + +DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } + +#endif // LITE_WITH_MLU + void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); @@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) { #endif // LITE_WITH_ARM +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id) { + LOG(INFO) << "Set mlu device " << device_id; + 
cnrtDev_t dev_handle; + CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id)); + CNRT_CALL(cnrtSetCurrentDevice(dev_handle)); +} + +void Device::Init() { + SetMluDevice(idx_); + GetInfo(); + CreateQueue(); +} + +void Device::GetInfo() {} + +void Device::CreateQueue() { + exec_queue_.clear(); + io_queue_.clear(); + for (size_t i = 0; i < max_queue_; ++i) { + cnrtQueue_t exec_queue; + cnrtQueue_t io_queue; + cnrtCreateQueue(&exec_queue); + cnrtCreateQueue(&io_queue); + exec_queue_.push_back(exec_queue); + io_queue_.push_back(io_queue); + + cnrtCreateQueue(&exec_queue); + exec_queue_.push_back(exec_queue); + } +} +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA void Device::Init() { diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 5727933f477ae76fbfa89c9aa3e03aec8763d445..a108ae3d4b564aaac02a63ead9a35eba26a6cf63 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -19,11 +19,14 @@ #include #include "lite/core/tensor.h" #include "lite/utils/cp_logging.h" +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/mlu_utils.h" +#endif namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) typedef enum { kAPPLE = 0, @@ -52,6 +55,20 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); +#ifdef LITE_WITH_MLU + void SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout); + cnmlCoreVersion_t MLUCoreVersion(); + int MLUCoreNumber(); + bool UseFirstConv(); + const std::vector& MeanVec() const; + const std::vector& StdVec() const; + DataLayoutType InputLayout() const; +#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -103,6 +120,15 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; +#ifdef LITE_WITH_MLU + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; +#endif + void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); @@ -134,6 +160,9 @@ class Env { return *devs; } static void Init(int max_stream = 4) { +#ifdef LITE_WITH_MLU + CNRT_CALL(cnrtInit(0)); +#endif Devs& devs = Global(); if (devs.size() > 0) { return; @@ -156,6 +185,41 @@ class Env { } }; +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id); + +template <> +class Device { + public: + Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {} + void Init(); + + int id() { return idx_; } + int max_queue() { return max_queue_; } + void SetId(int idx) { idx_ = idx; } + std::string name() { return "MLU"; } + int core_num() { return 16; } + float max_memory() { return 16 * 1024; } + std::vector io_queues() { return io_queue_; } + std::vector exec_queues() { return exec_queue_; } + + private: + void CreateQueue(); + void GetInfo(); + + private: + int idx_{0}; + int max_queue_; + std::string device_name_; + float max_memory_; + + std::vector io_queue_; + std::vector exec_queue_; +}; + +template class Env; +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA template <> class Device { diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 
18a1243c11652afc181f13f0f5a497858a30885f..ff848dae9e4ad6e8aaef70432301033406633db6 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -83,6 +83,9 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif +#if defined(LITE_WITH_MLU) + WorkSpace::Global_MLU().AllocReset(); +#endif #ifdef LITE_WITH_PROFILE profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 0ee973a8b6412a2fd20e33745b7b86561696efae..1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) { data = TargetWrapper::Malloc(size); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + data = TargetWrapper::Malloc(size); + break; +#endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + data = TargetWrapperXPU::Malloc(size); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown supported target " << TargetToStr(target); } @@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) { TargetWrapper::Free(data); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper::Free(data); + break; +#endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::Free(data); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown type"; } @@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper::MemcpySync(dst, src, size, IoDirection::DtoD); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper::MemcpySync( + dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index 691415aecb53bf7f48faf5fbb4dbca448da04a10..a1013910019251271ddfccfbc700297c45226fe6 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -31,6 +31,14 @@ #include "lite/backends/bm/target_wrapper.h" #endif // LITE_WITH_BM +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif // LITE_WITH_MLU + +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif // LITE_WITH_XPU + namespace paddle { namespace lite { @@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { TargetWrapperCL::MemcpySync(dst, src, size, dir); break; #endif // LITE_WITH_OPENCL +#ifdef LITE_WITH_MLU + case TARGET(kMLU): + TargetWrapperMlu::MemcpySync(dst, src, size, dir); + break; +#endif #ifdef LITE_WITH_FPGA case TARGET(kFPGA): TargetWrapper::MemcpySync(dst, src, size, dir); @@ -126,7 +139,7 @@ class Buffer { const size_t img_h, void* host_ptr = nullptr) { if (target != target_ || cl_image2d_width_ < img_w || - cl_image2d_height_ < img_h) { + cl_image2d_height_ < img_h || host_ptr != nullptr) { CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 0e021fa4441ae1245d0b61835040dc03e0abfd87..d4360b7f3299045d9aec2a8c1c67534a62021eae 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -21,6 +21,8 @@ lite_cc_library(mir_passes fusion/elementwise_add_activation_fuse_pass.cc 
      fusion/quant_dequant_fuse_pass.cc
      fusion/sequence_pool_concat_fuse_pass.cc
+     fusion/__xpu__resnet_fuse_pass.cc
+     fusion/__xpu__multi_encoder_fuse_pass.cc
      elimination/identity_scale_eliminate_pass.cc
      elimination/elementwise_mul_constant_eliminate_pass.cc
      elimination/assign_value_eliminate_pass.cc
@@ -36,6 +38,7 @@ lite_cc_library(mir_passes
      demo_pass.cc
      runtime_context_assign_pass.cc
      memory_optimize_pass.cc
+     mlu_postprocess_pass.cc
      weight_quantization_preprocess_pass.cc
      quantized_op_attributes_inference_pass.cc
      DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
@@ -70,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op)
 if (WITH_TESTING)
   list(APPEND pattern_deps gtest)
 endif()
-lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps})
+lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps})
 
 lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher)
 
-lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher)
+lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher)
 
 # for mobile, unnecessary to compile the following testings.
 
diff --git a/lite/core/mir/dot.h b/lite/core/mir/dot.h
index df70565c0775acdb61cb540598f15b7f84e0119c..a68890910ab33bd32c68efc6f06236db21909a05 100644
--- a/lite/core/mir/dot.h
+++ b/lite/core/mir/dot.h
@@ -27,8 +27,8 @@
 #include "lite/utils/string.h"
 
 namespace paddle {
-namespace inference {
-namespace analysis {
+namespace lite {
+namespace mir {
 
 static size_t dot_node_counter{0};
 
@@ -162,6 +162,6 @@ class Dot {
   std::vector<Attr> attrs_;
 };
 
-}  // namespace analysis
-}  // namespace inference
+}  // namespace mir
+}  // namespace lite
 }  // namespace paddle
diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt
index e65e72cf7b367ee8477f3f783ae4d82372529864..04a36976c7110c64ef781af12fc86fd4853fe583 100644
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ b/lite/core/mir/fusion/CMakeLists.txt
@@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose
                 DEPS pattern_matcher_high_api)
 lite_cc_library(fuse_interpolate
                 SRCS interpolate_fuser.cc
-               DEPS pattern_matcher_high_api)
+                DEPS pattern_matcher_high_api)
 lite_cc_library(fuse_sequence_pool_concat
                 SRCS sequence_pool_concat_fuser.cc
-               DEPS pattern_matcher_high_api)
+                DEPS pattern_matcher_high_api)
 
 set(mir_fusers
     fuse_fc
diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..655274070f1ffcccf39b5f3ff6aaa705c5cbbfda
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
@@ -0,0 +1,637 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
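Reviewer note: `BuildPattern` in the new pass below wires its `VarNode`/`OpNode` handles together with chained `>>` links (e.g. `*input >> *q_mul >> *q_mul_out`). As a rough illustration of how such chaining can work — a toy sketch, not Paddle-Lite's actual `PMNode` implementation:

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Toy pattern node; the real pass chains PMNode pointers the same way.
struct Node {
  explicit Node(std::string n) : name(std::move(n)) {}
  std::string name;
  std::vector<Node*> outlinks;
};

// Link `from` to `to` and return `to`, so a >> b >> c builds a -> b -> c.
Node& operator>>(Node& from, Node& to) {
  from.outlinks.push_back(&to);
  return to;
}

int main() {
  Node input("input"), q_mul("q_mul"), q_mul_out("q_mul_out");
  input >> q_mul >> q_mul_out;  // mirrors `*input >> *q_mul >> *q_mul_out`
  std::cout << input.outlinks[0]->name << "\n";  // prints q_mul
}
```

Returning the right-hand node from `operator>>` is the whole trick: each chain segment links one edge and hands the tail to the next segment, so long pipelines read left to right like the dataflow they describe.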
+ +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUSingleEncoderFuser : public FuseBase { + public: + explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu") + : act_type_(act_type) {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + + auto* q_mul_y = + VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* q_mul = OpNode("q_mul", "mul"); + auto* q_mul_out = VarNode("q_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* q_add_y = VarNode("q_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate(); + auto* q_add_out = VarNode("q_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate(); + auto* q_reshape2_out = VarNode("q_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* q_reshape2_xshape = VarNode("q_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate(); + auto* q_transpose2_out = VarNode("q_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("scale", "X") + ->AsIntermediate(); + auto* q_transpose2_xshape = + VarNode("q_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* q_scale = OpNode("q_scale", "scale")->AsIntermediate(); + auto* q_scale_out = VarNode("q_scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + + auto* k_mul_y = + VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate(); + auto* k_mul_out = VarNode("k_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* k_add_y = VarNode("k_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate(); + auto* k_add_out = VarNode("k_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate(); + auto* k_reshape2_out = VarNode("k_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* k_reshape2_xshape = VarNode("k_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate(); + auto* k_transpose2_out = VarNode("k_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* k_transpose2_xshape = + VarNode("k_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate(); + auto* 
qk_matmul_out = VarNode("qk_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qk_mask = VarNode("qk_mask") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate(); + auto* qk_add_out = VarNode("qk_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("softmax", "X") + ->AsIntermediate(); + auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate(); + auto* qk_softmax_out = VarNode("qk_softmax_out") + ->assert_is_op_output("softmax", "Out") + ->AsIntermediate(); + auto* qk_dropout = OpNode("qk_dropout", "dropout")->AsIntermediate(); + auto* qk_dropout_out = VarNode("qk_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* qk_dropout_mask = VarNode("qk_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* v_mul_y = + VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate(); + auto* v_mul_out = VarNode("v_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* v_add_y = VarNode("v_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate(); + auto* v_add_out = VarNode("v_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate(); + auto* v_reshape2_out = VarNode("v_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* v_reshape2_xshape = VarNode("v_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate(); + auto* v_transpose2_out = VarNode("v_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* v_transpose2_xshape = + VarNode("v_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate(); + auto* qkv_matmul_out = VarNode("qkv_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* qkv_transpose2 = + OpNode("qkv_transpose2", "transpose2")->AsIntermediate(); + auto* qkv_transpose2_out = VarNode("qkv_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* qkv_transpose2_xshape = + VarNode("qkv_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* qkv_reshape2 = OpNode("qkv_reshape2", "reshape2")->AsIntermediate(); + auto* qkv_reshape2_out = VarNode("qkv_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* qkv_mul_y = + VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate(); + auto* qkv_mul_out = VarNode("qkv_mul_out") + ->assert_is_op_output("mul", 
"Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_y = VarNode("qkv_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate(); + auto* qkv_add_out = VarNode("qkv_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout = OpNode("qkv_dropout", "dropout")->AsIntermediate(); + auto* qkv_dropout_out = VarNode("qkv_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_mask = VarNode("qkv_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate(); + auto* qkv_add_2_out = VarNode("qkv_add_2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate(); + auto* qkv_ln_2_out = VarNode("qkv_ln_2_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_2_var = VarNode("qkv_ln_2_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + auto* qkv_mul_3_y = + VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate(); + auto* qkv_mul_3_out = VarNode("qkv_mul_3_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_3_y = VarNode("qkv_add_3_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate(); + auto* qkv_add_3_out = VarNode("qkv_add_3_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate(); + auto* qkv_act_out = VarNode("qkv_act_out") + ->assert_is_op_output(act_type_, "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_mul_4_y = + VarNode("qkv_mul_4_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate(); + auto* qkv_mul_4_out = VarNode("qkv_mul_4_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_4_y = VarNode("qkv_add_4_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate(); + auto* qkv_add_4_out = VarNode("qkv_add_4_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout_4 = OpNode("qkv_dropout_4", "dropout")->AsIntermediate(); + auto* qkv_dropout_4_out = VarNode("qkv_dropout_4_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_4_mask = 
VarNode("qkv_dropout_4_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate(); + auto* qkv_add_5_out = VarNode("qkv_add_5_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate(); + auto* qkv_ln_5_out = VarNode("qkv_ln_5_out") + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_5_var = VarNode("qkv_ln_5_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + // TODO(miaotianxiang): use LinksFrom/LinksTo() instead + *input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >> + *q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >> + *q_scale_out >> *qk_matmul; + *q_mul_y >> *q_mul; + *q_add_y >> *q_add; + *q_reshape2 >> *q_reshape2_xshape; + *q_transpose2 >> *q_transpose2_xshape; + + *input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >> + *k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul; + *k_mul_y >> *k_mul; + *k_add_y >> *k_add; + *k_reshape2 >> *k_reshape2_xshape; + *k_transpose2 >> *k_transpose2_xshape; + + *qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >> + *qk_softmax_out >> *qk_dropout >> *qk_dropout_out >> *qkv_matmul; + *qk_mask >> *qk_add; + *qk_dropout >> *qk_dropout_mask; + + *input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >> + *v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul; + *v_mul_y >> *v_mul; + *v_add_y >> *v_add; + *v_reshape2 >> *v_reshape2_xshape; + *v_transpose2 >> *v_transpose2_xshape; + + *qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >> + *qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >> + *qkv_add >> *qkv_add_out >> *qkv_dropout >> *qkv_dropout_out >> + *qkv_add_2; + *qkv_transpose2 >> *qkv_transpose2_xshape; + *qkv_reshape2 >> *qkv_reshape2_xshape; + *qkv_mul_y >> *qkv_mul; + *qkv_add_y >> *qkv_add; + *qkv_dropout >> *qkv_dropout_mask; + + *input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out; + *qkv_ln_2_scale >> *qkv_ln_2; + *qkv_ln_2_bias >> *qkv_ln_2; + *qkv_ln_2 >> *qkv_ln_2_mean; + *qkv_ln_2 >> *qkv_ln_2_var; + + *qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >> + *qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >> + *qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_dropout_4 >> + *qkv_dropout_4_out >> *qkv_add_5; + *qkv_mul_3_y >> *qkv_mul_3; + *qkv_add_3_y >> *qkv_add_3; + *qkv_mul_4_y >> *qkv_mul_4; + *qkv_add_4_y >> *qkv_add_4; + *qkv_dropout_4 >> *qkv_dropout_4_mask; + + *qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out >> *qkv_ln_5 >> *qkv_ln_5_out; + *qkv_ln_5_scale >> *qkv_ln_5; + *qkv_ln_5_bias >> *qkv_ln_5; + *qkv_ln_5 >> *qkv_ln_5_mean; + *qkv_ln_5 >> *qkv_ln_5_var; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("single_encoder"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name}); + 
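Reviewer note on the attribute distillation a few lines below: `head_num` and `size_per_head` are recovered from the q-branch `reshape2` target shape, whose last two entries are the head count and per-head width. A standalone illustration with hypothetical BERT-base numbers (the real values come from the matched graph):

```cpp
#include <cassert>
#include <vector>

int main() {
  // Hypothetical reshape2 "shape" attr: {batch, seq_len, heads, head_dim}.
  std::vector<int> reshape_dim = {1, 128, 12, 64};
  int head_num = reshape_dim[2];       // what the pass stores as "head_num"
  int size_per_head = reshape_dim[3];  // stored as "size_per_head"
  assert(head_num * size_per_head == 768);  // hidden size for these values
}
```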
op_desc.SetInput("FCWeight", + { + matched.at("q_mul_y")->arg()->name, + matched.at("k_mul_y")->arg()->name, + matched.at("v_mul_y")->arg()->name, + matched.at("qkv_mul_y")->arg()->name, + matched.at("qkv_mul_3_y")->arg()->name, + matched.at("qkv_mul_4_y")->arg()->name, + }); + op_desc.SetInput("FCBias", + { + matched.at("q_add_y")->arg()->name, + matched.at("k_add_y")->arg()->name, + matched.at("v_add_y")->arg()->name, + matched.at("qkv_add_y")->arg()->name, + matched.at("qkv_add_3_y")->arg()->name, + matched.at("qkv_add_4_y")->arg()->name, + }); + op_desc.SetInput("LNScale", + { + matched.at("qkv_ln_2_scale")->arg()->name, + matched.at("qkv_ln_5_scale")->arg()->name, + }); + op_desc.SetInput("LNBias", + { + matched.at("qkv_ln_2_bias")->arg()->name, + matched.at("qkv_ln_5_bias")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + // extra traits to distill + auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info(); + auto reshape_dim = reshape_op_info->GetAttr>("shape"); + op_desc.SetAttr("head_num", reshape_dim[2]); + op_desc.SetAttr("size_per_head", reshape_dim[3]); + op_desc.SetAttr("act_type", act_type_); + + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + auto* single_encoder_stmt = matched.at("q_mul")->stmt(); + fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); + single_encoder_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "qk_mask", + "k_mul_y", + "v_mul_y", + "qkv_mul_y", + "qkv_mul_3_y", + "qkv_mul_4_y", + "q_add_y", + "k_add_y", + "v_add_y", + "qkv_add_y", + "qkv_add_3_y", + "qkv_add_4_y", + "qkv_ln_2_scale", + "qkv_ln_2_bias", + "qkv_ln_5_scale", + "qkv_ln_5_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul")); + } + IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out")); + } + + private: + std::string act_type_; +}; + +class XPUMultiEncoderFuser { + public: + bool IsDirectPredecessorOf(Node* op1, Node* op2) { + for (auto* out : op1->outlinks) { + for (auto* in : op2->inlinks) { + if (out == in) return true; + } + } + return false; + } + + void operator()(SSAGraph* graph) { + std::vector all_encoders; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "single_encoder") { + all_encoders.push_back(node); + } + } + VLOG(3) << "Found " << all_encoders.size() << " single_encoder"; + if (all_encoders.size() == 0) { + return; + } + + // TODO(miaotianxiang): more verification + for (size_t i = 0; i < all_encoders.size() - 1; ++i) { + CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1])); + } + std::string mask_name; + for (auto* encoder : all_encoders) { + auto* op_info = encoder->stmt()->op_info(); + if (mask_name.empty()) { + mask_name = op_info->Input("Mask").front(); + } else { + // CHECK(mask_name == op_info->Input("Mask").front()); + } + } + + std::unordered_set to_remove; + Node* first_encoder = all_encoders[0]; + std::string in_name, out_name; + std::vector arg_names{ + "FCWeight", "FCBias", "LNScale", "LNBias"}; + std::unordered_map> arg_map; + 
for (size_t i = 0; i < all_encoders.size(); ++i) {
+      Node* cur_encoder = all_encoders[i];
+      auto* op_info = cur_encoder->stmt()->op_info();
+      for (auto arg_name : arg_names) {
+        auto real_names = op_info->Input(arg_name);
+        for (auto name : real_names) {
+          auto* arg_node = graph->RetrieveArgument(name);
+          DirectedLink(arg_node, first_encoder);
+          arg_map[arg_name].push_back(name);
+        }
+      }
+
+      auto* cur_out =
+          graph->RetrieveArgument(op_info->Output("Outputs").front());
+      if (i == 0) {
+        // first encoder
+        to_remove.insert(cur_out);
+        in_name = op_info->Input("Inputs").front();
+        mask_name = op_info->Input("Mask").front();
+      } else if (i == all_encoders.size() - 1) {
+        // last encoder
+        to_remove.insert(cur_encoder);
+        DirectedLink(first_encoder, cur_out);
+        out_name = op_info->Output("Outputs").front();
+      } else {
+        to_remove.insert(cur_encoder);
+        to_remove.insert(cur_out);
+      }
+    }
+    GraphSafeRemoveNodes(graph, to_remove);
+
+    auto* multi_encoder_stmt = first_encoder->stmt();
+    cpp::OpDesc op_desc;
+    op_desc.SetType("__xpu__multi_encoder");
+    op_desc.SetInput("Input", {in_name});
+    for (auto kv : arg_map) {
+      op_desc.SetInput(kv.first, kv.second);
+    }
+    op_desc.SetInput("Mask", {mask_name});
+    op_desc.SetOutput("Output", {out_name});
+    op_desc.SetAttr("xpu", 1);
+    auto* first_encoder_op_info = multi_encoder_stmt->op_info();
+    op_desc.SetAttr("head_num",
+                    first_encoder_op_info->GetAttr<int>("head_num"));
+    op_desc.SetAttr("size_per_head",
+                    first_encoder_op_info->GetAttr<int>("size_per_head"));
+    op_desc.SetAttr("n_layers", all_encoders.size());
+    op_desc.SetAttr(
+        "act_type", first_encoder_op_info->GetAttr<std::string>("act_type"));
+
+    auto* scope = multi_encoder_stmt->op()->scope();
+    std::vector<float> fc_weight_max(arg_map["FCWeight"].size());
+    auto& fc_weight_names = arg_map["FCWeight"];
+    for (size_t i = 0; i < fc_weight_names.size(); ++i) {
+      auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]);
+      auto weight_dims = weight_t->dims();
+      int weight_len = weight_t->numel();
+      float* weight_on_host = weight_t->mutable_data<float>();
+      float max_f =
+          paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
+
+      std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
+      std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
+      paddle::lite::xpu::math::ConvertFP32ToInt16(
+          weight_on_host, weight_int16.get(), max_f, weight_len);
+      paddle::lite::xpu::math::Transpose(weight_int16.get(),
+                                         weight_trans_int16.get(),
+                                         weight_dims[0],
+                                         weight_dims[1]);
+      memcpy(weight_on_host,
+             weight_trans_int16.get(),
+             weight_len * sizeof(int16_t));
+      fc_weight_max[i] = max_f;
+    }
+
+    std::string max_name = "encoder_max";
+    auto* max_filter_node = graph->NewArgumentNode(max_name);
+    max_filter_node->arg()->is_weight = true;
+    max_filter_node->arg()->type = LiteType::GetTensorTy(
+        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+    DirectedLink(max_filter_node, first_encoder);
+    auto* max_filter_tensor = scope->NewTensor(max_name);
+    max_filter_tensor->Resize({static_cast<int64_t>(fc_weight_max.size())});
+    memcpy(max_filter_tensor->mutable_data<float>(),
+           &fc_weight_max[0],
+           sizeof(float) * fc_weight_max.size());
+    op_desc.SetInput("FCWeightMax", {max_name});
+
+    auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type());
+    multi_encoder_op->Attach(op_desc, scope);
+    multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places());
+    auto kernels =
+        multi_encoder_op->CreateKernels(multi_encoder_op->valid_places());
+    multi_encoder_stmt->SetOp(multi_encoder_op);
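
The FC-weight preparation above calls `FindMaxAbs`, `ConvertFP32ToInt16` and `Transpose` from `lite/backends/xpu/math.h`. Assuming they implement the conventional symmetric scheme (q = round(x / max_abs * 32767), with `max_abs` kept beside the weights as the `FCWeightMax` tensor so kernels can dequantize), the round trip looks like this sketch; the transpose step is omitted since it only reorders the int16 buffer into the layout the XPU GEMM expects.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> w = {0.5f, -1.25f, 2.0f, -0.75f};

  // FindMaxAbs equivalent: per-tensor absolute maximum.
  float max_abs = 0.0f;
  for (float x : w) max_abs = std::max(max_abs, std::fabs(x));

  // ConvertFP32ToInt16 equivalent: scale into the int16 range.
  std::vector<int16_t> q(w.size());
  for (size_t i = 0; i < w.size(); ++i) {
    q[i] = static_cast<int16_t>(std::round(w[i] / max_abs * 32767.0f));
  }

  // Dequantize to inspect the rounding error the kernel will see.
  for (size_t i = 0; i < w.size(); ++i) {
    float back = q[i] * max_abs / 32767.0f;
    std::cout << w[i] << " -> " << q[i] << " -> " << back << "\n";
  }
  return 0;
}
```
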
multi_encoder_stmt->SetKernels(std::move(kernels));
+
+    // temp remove useless cast
+    std::unordered_set<const Node*> to_remove2;
+    Node* stack = nullptr;
+    for (auto* node : graph->StmtTopologicalOrder()) {
+      CHECK(node->IsStmt());
+      if (node->stmt()->op_info()->Type() == "stack") {
+        stack = node;
+      }
+    }
+    Node* stack_out = stack->outlinks.front();
+    for (Node* cast : stack_out->outlinks) {
+      Node* cast_out = cast->outlinks.front();
+      if (cast_out->outlinks.size() == 0) {
+        // remove
+        to_remove2.insert(cast_out);
+        to_remove2.insert(cast);
+      }
+    }
+    GraphSafeRemoveNodes(graph, to_remove2);
+  }
+};
+
+}  // namespace fusion
+
+class XPUMultiEncoderFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+    // TODO(miaotianxiang): backup graph, recover from failed match
+    std::vector<std::string> act_types{"gelu", "relu"};
+    for (auto& act_type : act_types) {
+      fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type);
+      single_encoder_fuser(graph.get());
+      fusion::XPUMultiEncoderFuser multi_encoder_fuser;
+      multi_encoder_fuser(graph.get());
+    }
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass,
+                  paddle::lite::mir::XPUMultiEncoderFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("matmul");
diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de2210a76ea0647cb02131a088ceb754afd0ef9c
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc
@@ -0,0 +1,951 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
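
Note that `Apply()` above bails out when `XPU_ENABLE_XTCL` is set, and the ResNet fuse pass added in the file below does the same: when the XTCL subgraph path compiles the whole graph, these manual fusers must stay out of the way. A plausible reading of the `GetBoolFromEnv` helper follows; the real implementation lives in `lite/utils/env.h` and may differ in detail.

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Sketch of a GetBoolFromEnv-style helper: unset means the default,
// "0"/"false"/"OFF" mean false, anything else means true.
bool GetBoolFromEnvSketch(const std::string& name, bool default_value = false) {
  const char* raw = std::getenv(name.c_str());
  if (raw == nullptr) return default_value;
  const std::string v(raw);
  return !(v == "0" || v == "false" || v == "OFF");
}

int main() {
  std::cout << "XPU_ENABLE_XTCL="
            << GetBoolFromEnvSketch("XPU_ENABLE_XTCL") << "\n";
  return 0;
}
```
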
+ +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetBlock0Fuser : public FuseBase { + public: + XPUResNetBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + 
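
While reading this long pattern it helps to recall that, at inference time, `batch_norm` computes y = Scale * (x - Mean) / sqrt(Variance + eps) + Bias; the `MeanOut`/`VarianceOut`/`SavedMean`/`SavedVariance` outputs are training-time byproducts, which is why the pattern marks them `AsIntermediate()` and the fused op later keeps only Scale/Bias/Mean/Variance. A one-line numeric check:

```cpp
#include <cmath>
#include <iostream>

int main() {
  // batch_norm inference: y = scale * (x - mean) / sqrt(var + eps) + bias
  const float x = 1.5f, scale = 0.9f, bias = 0.1f, mean = 0.2f, var = 0.25f;
  const float eps = 1e-5f;
  const float y = scale * (x - mean) / std::sqrt(var + eps) + bias;
  std::cout << "bn(1.5) = " << y << "\n";  // ~2.44
  return 0;
}
```
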
auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* 
right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + *left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + matched.at("left_bn3_scale")->arg()->name, + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep 
these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "left_conv2_weight", + "left_conv3_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetBlock1Fuser : public FuseBase { + public: + XPUResNetBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add", "X") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + 
auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 
>> *right_conv3_out >> *right_bn3 >> + *right_bn3_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; + *right_bn2 >> *right_bn2_saved_var; + + *right_conv3_weight >> *right_conv3; + *right_bn3_scale >> *right_bn3; + *right_bn3_bias >> *right_bn3; + *right_bn3_mean >> *right_bn3; + *right_bn3_var >> *right_bn3; + *right_bn3 >> *right_bn3_mean_out; + *right_bn3 >> *right_bn3_var_out; + *right_bn3 >> *right_bn3_saved_mean; + *right_bn3 >> *right_bn3_saved_var; + + *input >> *add; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block1"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("right_conv1_weight")->arg()->name, + matched.at("right_conv2_weight")->arg()->name, + matched.at("right_conv3_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("right_bn1_scale")->arg()->name, + matched.at("right_bn2_scale")->arg()->name, + matched.at("right_bn3_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("right_bn1_bias")->arg()->name, + matched.at("right_bn2_bias")->arg()->name, + matched.at("right_bn3_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("right_bn1_mean")->arg()->name, + matched.at("right_bn2_mean")->arg()->name, + matched.at("right_bn3_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("right_bn1_variance")->arg()->name, + matched.at("right_bn2_variance")->arg()->name, + matched.at("right_bn3_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block1_stmt = matched.at("right_conv1")->stmt(); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNet50Fuser : public xpu::XPUFuseBase { + public: + XPUResNet50Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", 
"resnet_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate(); + auto* bottom_pool_out = VarNode("bottom_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> 
*resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet50"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + "resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + op_desc.SetInput("Filter", filter_name); + op_desc.SetInput("Bias", bias_name); + op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name}); + op_desc.SetAttr("xpu", 1); + + auto* resnet50_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet50_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = scope->FindMutableTensor(mean_name[i]); + auto* var_t = scope->FindMutableTensor(var_name[i]); + + int mean_len = mean_t->numel(); + int filter_len = filter_t->numel(); + int filter_stride = filter_len / mean_len; + + float* filter_on_host = 
filter_t->mutable_data(); + float* scale_on_host = scale_t->mutable_data(); + float* bias_on_host = bias_t->mutable_data(); + float* mean_on_host = mean_t->mutable_data(); + float* var_on_host = var_t->mutable_data(); + + // Perform preprocess + for (int i = 0; i < mean_len; ++i) { + scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f); + } + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; ++j) { + filter_on_host[i * filter_stride + j] *= scale_on_host[i]; + } + } + for (int i = 0; i < mean_len; ++i) { + bias_on_host[i] += -mean_on_host[i] * scale_on_host[i]; + } + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name[i] + "_max"; + max_filter_name.push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + op_desc.SetInput("MaxFilter", max_filter_name); + + auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type()); + resnet50_op->Attach(op_desc, scope); + resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places()); + auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places()); + resnet50_stmt->SetOp(resnet50_op); + resnet50_stmt->SetKernels(std::move(kernels)); + + IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv")); + for (auto* node : extra_input_vars) { + IR_NODE_LINK_TO(node, matched.at("top_conv")); + } + IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out")); + } +}; + +} // namespace fusion + +class XPUResNet50FusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + fusion::XPUResNetBlock0Fuser block0_fuser; + block0_fuser(graph.get()); + fusion::XPUResNetBlock1Fuser block1_fuser; + block1_fuser(graph.get()); + fusion::XPUResNet50Fuser resnet50_fuser; + resnet50_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__resnet_fuse_pass, + paddle::lite::mir::XPUResNet50FusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("conv2d"); diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 28ec814fa85451b5292bfde6bddc6b64b57b2f08..a32c9c05f69e5c31b77bc0d2ff976560f29b9bec 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -26,15 +26,13 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - void GraphVisualizePass::Apply(const std::unique_ptr& graph) { VLOG(5) << "\n" << Visualize(graph.get()); } std::string Visualize(mir::SSAGraph* graph) { std::ostringstream os; - inference::analysis::Dot dot; + Dot dot; auto string_trunc = [](const std::string& str) -> std::string { const int max_disp_size = 100; if (str.length() > 
max_disp_size) diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index d6240888d0806486f478511ef81ba8179b46ab43..15f62f36b0f026dc42ecbb274c946e294c7fc44e 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -15,7 +15,6 @@ #include "lite/core/mir/mlu_postprocess_pass.h" #include #include -#include #include #include #include @@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, op_desc.SetAttr("out_dtype", 4); // FP16 op_desc.SetInput("X", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cast_arg_name}); - } else if (op_type == "transpose") { + } else if (op_type == "layout") { // NCHW -> NHWC - op_desc.SetAttr>("axis", {0, 2, 3, 1}); - op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetInput("Input", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cast_arg_name}); } else if (op_type == "io_copy") { op_desc.SetInput("Input", {cur_node->AsArg().name}); @@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { is_found = true; } - } else if (op_type == "transpose") { - is_found = true; + } else if (op_type == "layout") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*out_arg_ty, *cast_type) && + // for first conv + PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); @@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } break; } } @@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cast_arg->AsArg().type = cast_type; auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); // for CastAfter manully set the tensor's type - var->GetMutable<::paddle::lite::Tensor>(); + var->GetMutable(); // create the stmt node auto* cast_inst = graph->NewInstructNode(); @@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, op_desc.SetAttr("out_dtype", 5); // FP16 op_desc.SetInput("X", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); - } else if (op_type == "transpose") { + } else if (op_type == "layout") { // NHWC -> NCHW - op_desc.SetAttr>("axis", {0, 3, 1, 2}); - op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetInput("Input", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); } else if (op_type == "io_copy") { op_desc.SetInput("Input", {cast_arg_name}); @@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { is_found = true; } - } else if (op_type == "transpose") { - is_found = true; + } else if (op_type == "layout") { 
+ const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cast_type) && + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); @@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } break; } } @@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, auto* cur_node = head_node; const auto name_prefix = head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + bool is_first_conv_head = + std::find(first_conv_nodes_.begin(), + first_conv_nodes_.end(), + head_node->AsArg().name) != first_conv_nodes_.end(); - // layout cast node - if (head_type->layout() != inst_type->layout()) { + // precision cast node + if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { cur_node = InsertCastBefore( - "transpose", - name_prefix + "transpose", + "cast", + name_prefix + "cast", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), head_type->precision(), inst_type->layout())); + head_type->target(), inst_type->precision(), head_type->layout())); } - // precision cast node - if (head_type->precision() != inst_type->precision()) { + // layout cast node + if (head_type->layout() != inst_type->layout()) { cur_node = InsertCastBefore( - "cast", - name_prefix + "cast", + "layout", + name_prefix + "layout", graph, cur_node, inst_node, @@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, // get subgraph's valid precision const auto& places = graph->valid_places(); - std::set<::paddle::lite_api::PrecisionType> prec_set; + std::set prec_set; for (const auto& place : places) { if (place.target == TARGET(kMLU)) { prec_set.insert(place.precision); @@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, const auto name_prefix = tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; - // layout cast node - if (tail_type->layout() != inst_type->layout()) { + // precision cast node + if (tail_type->precision() != inst_type->precision()) { cur_node = InsertCastAfter( - "transpose", - name_prefix + "transpose", + "cast", + name_prefix + "cast", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), tail_type->precision(), inst_type->layout())); + tail_type->target(), inst_type->precision(), tail_type->layout())); } - // precision cast node - if (tail_type->precision() != inst_type->precision()) { + // layout cast node + if (tail_type->layout() != inst_type->layout()) { cur_node = InsertCastAfter( - "cast", - name_prefix + "cast", + "layout", + name_prefix + "layout", graph, cur_node, inst_node, @@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, auto* sub_block_op_desc = sub_block_desc->GetOp(i); UpdateOutputTo( sub_block_op_desc, tail_node->AsArg().name, 
cur_node->AsArg().name); + /* graph like this + * subgraph_op_0 + * / \ + * / \ + * subgraph_op_1 host_op + */ + UpdateInputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { } } +bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { + auto* block_desc = + static_cast(inst->AsStmt().op().get()) + ->GetSubBlock(); + for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { + auto op_desc = block_desc->GetOp(op_idx); + CHECK(op_desc); + if (op_desc->Type() == "conv2d") { + for (auto& names : op_desc->inputs()) { + if (std::find(names.second.begin(), + names.second.end(), + arg_node->AsArg().name) != names.second.end()) { + return true; + } + } + } + } + return false; +} + +bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) { + CHECK(arg_node->IsArg()); + for (auto& inst : arg_node->outlinks) { + if (inst->AsStmt().op_type() == "subgraph") { + return IsFirstConvInSubgraph(arg_node, inst); + } + } + return false; +} + +void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + if (IsFirstConvNode(out)) { + first_conv_nodes_.insert(out->AsArg().name); + // modify first conv nodes' type + const auto* old_type = out->AsArg().type; + out->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + paddle::lite_api::PrecisionType::kInt8, + old_type->layout(), + old_type->device()); + } + } + } + } +} + void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; @@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { out->AsArg().type = LiteType::GetTensorTy(old_type->target(), old_type->precision(), - ::paddle::lite_api::DataLayoutType::kNHWC, + paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); } } @@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { inp->AsArg().type = LiteType::GetTensorTy(old_type->target(), old_type->precision(), - ::paddle::lite_api::DataLayoutType::kNHWC, + paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); } } @@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { - // currently for non-persistent input and output args, mlu subgraph op - // only support float16/float32 data type - - // in two situations as folllows: - // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; - // arg_in and arg_out are assumed to be NHWC which user should be aware of. - // Thus here we change these args' layout to NHWC - ModifyLayout(graph.get()); +// currently for non-persistent input and output args, mlu subgraph op +// only support float16/float32 data type + +// in two situations as folllows: +// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; +// arg_in and arg_out are assumed to be NHWC which user should be aware of. 
+// Thus here we change these args' layout to NHWC +#ifdef LITE_WITH_MLU + if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyLayout(graph.get()); + } + + if (lite::DeviceInfo::Global().UseFirstConv()) { + GatherAndModifyFirstConvNodes(graph.get()); + } +#endif // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h index 8ffcbc952a44abea272bdd22467d86cd04baa207..688dd06fb5fbec0c8e1c53acfe4215456ddb4192 100644 --- a/lite/core/mir/mlu_postprocess_pass.h +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include "lite/core/mir/pass.h" @@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass { const Type* cast_type); void RecreateOp(Node* inst_node, SSAGraph* graph); + + void GatherAndModifyFirstConvNodes(SSAGraph* graph); + + bool IsFirstConvNode(Node* arg_node); + + bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + + private: + std::set first_conv_nodes_; }; } // namespace mir diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc index b625919cbfb6d26ecbbd1bad36772aff86bee087..aaebf852b2ec519515e59655a57600f59ec6a2c3 100644 --- a/lite/core/mir/pattern_matcher.cc +++ b/lite/core/mir/pattern_matcher.cc @@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector *subgraphs) { } std::string PMPattern::DotString() const { - using inference::analysis::Dot; Dot dot; int id = 0; // Create Nodes diff --git a/lite/core/mir/pattern_matcher_high_api.h b/lite/core/mir/pattern_matcher_high_api.h index e62a4fc7494d750b2b5331c4b54b787df239ceee..3ac8e331aacb28044fca7f328319de37b27950bf 100644 --- a/lite/core/mir/pattern_matcher_high_api.h +++ b/lite/core/mir/pattern_matcher_high_api.h @@ -64,7 +64,6 @@ class FuseBase { protected: virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0; - private: void PerformPatternMatcher(SSAGraph* graph); // Delete nodes that are marked as Intermediate diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 6c45ce828249c3e236706c297db3d434c71c351a..54f5f4d46ce465d9db78b43f339296a3135c9507 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -64,6 +64,26 @@ std::map> SSAGraph::BuildOperationAdjList() { return adj_list; } +std::map> SSAGraph::BuildNodeAdjList() { + std::map> adj_list; + + for (auto &n : mutable_nodes()) { + if (adj_list.find(&n) == adj_list.end()) { + adj_list[&n] = std::set(); + } + std::vector nodes; + for (auto &var : n.inlinks) { + nodes.push_back(var); + } + std::sort(nodes.begin(), + nodes.end(), + [](mir::Node *node1, mir::Node *node2) { return node1 > node2; }); + adj_list[&n].insert(std::make_move_iterator(nodes.begin()), + std::make_move_iterator(nodes.end())); + } + return adj_list; +} + void SSAGraph::SortHelper( const std::map> &adj_list, mir::Node *node, @@ -98,6 +118,24 @@ std::vector SSAGraph::StmtTopologicalOrder() { return res; } +std::vector SSAGraph::NodeTopologicalOrder() { + CheckBidirectionalConnection(); + + std::stack stack; + std::set visited; + std::vector res; + + auto adj_list = BuildNodeAdjList(); + + for (auto adj : adj_list) { + if (visited.find(adj.first) == visited.end()) { + SortHelper(adj_list, adj.first, &visited, &res); + } + } + + return res; +} + Node *SSAGraph::GraphCreateInstructNode( const std::shared_ptr &op, const std::vector &valid_places) { 
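
The comment above is the motivation for the new `NodeTopologicalOrder()`: iterating a container keyed by raw node pointers makes subgraph numbering depend on heap addresses, which vary from run to run. A small illustration of the hazard, with types simplified:

```cpp
#include <iostream>
#include <map>
#include <string>

struct Node { std::string name; };

int main() {
  // Heap addresses vary between runs, so a pointer-keyed map may visit
  // "bn" before "conv" in one run and after it in another.
  Node* a = new Node{"conv"};
  Node* b = new Node{"bn"};
  std::map<Node*, int> by_address{{a, 0}, {b, 1}};
  for (const auto& kv : by_address) std::cout << kv.first->name << "\n";
  delete a;
  delete b;
  return 0;
}
```

Visiting nodes in topological order instead makes the partition reproducible across runs.
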
 node_storage_.emplace_back();
@@ -213,9 +251,10 @@ std::vector<mir::Node *> SSAGraph::outputs() {
 }
 
 mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) {
-  auto it = arguments_.find(arg);
-  if (it != arguments_.end()) {
-    return it->second;
+  for (auto &node : node_storage_) {
+    if (node.IsArg() && node.arg()->name == arg) {
+      return &node;
+    }
   }
   return nullptr;
 }
diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h
index b5b9fb1cb28a35f37d51e4e63eb7512354d0547b..e2967cf96a6b00ccc225ce05b043cb94f161b1d6 100644
--- a/lite/core/mir/ssa_graph.h
+++ b/lite/core/mir/ssa_graph.h
@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {
 
   std::vector<mir::Node *> StmtTopologicalOrder();
 
+  std::vector<mir::Node *> NodeTopologicalOrder();
+
   // The inputs of the graph.
   std::vector<mir::Node *> inputs();
 
@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
   // Build operator inlink edge table.
   std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
 
+  // Build node inlink edge table.
+  std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
+
   void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
                   mir::Node *node,
                   std::set<mir::Node *> *visited,
diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc
index 6844fd96688d5086b47d66a32f770a757f56fda4..b61f7f365f51a32e267dd12943be5fcfadb3e08a 100644
--- a/lite/core/mir/subgraph/subgraph_detector.cc
+++ b/lite/core/mir/subgraph/subgraph_detector.cc
@@ -30,10 +30,8 @@ namespace paddle {
 namespace lite {
 namespace mir {
 
-using inference::analysis::Dot;
-
 std::string SubgraphVisualizer::operator()() {
-  inference::analysis::Dot dot;
+  Dot dot;
   const std::vector<std::string> subgraph_colors{
       "red",        "green",         "cyan",       "bisque3",
       "coral",      "darkseagreen1", "goldenrod1", "darkorchid",
@@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
 
 std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
    node_map_t *nodes) {
-  for (auto &it : *nodes) {
-    node_dat_t *node = it.second;
+  for (auto &ordered_node : graph_->NodeTopologicalOrder()) {
+    // Different node-traversal orders may lead to different subgraph
+    // divisions, which can produce different results on devices such as
+    // MLU. These results are all "right", but confusingly inconsistent, so
+    // the topological order is used here instead of the order of the node
+    // addresses in the graph.
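+    // e.g. two independent qualifying ops could otherwise be visited in
+    // whichever order the pointer-keyed node map happens to iterate, and
+    // the node visited first seeds the subgraph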
+ CHECK(nodes->find(ordered_node) != nodes->end()); + node_dat_t *node = (*nodes)[ordered_node]; if (!node->marked) { continue; } @@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector &op_nodes, unused_var_nodes->insert(var_node); continue; } - // Var can have more than one next op node, So, if any one in the - // op_nodes then continue - bool next_op_in_nodes = false; + // Var can have more than one next op node, So, if all next nodes are in + // op_nodes then it should be put into local_var_nodes + bool next_op_in_nodes = true; for (auto &next_op_node : var_node->outlinks) { - if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) != + if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) == op_nodes.end()) { - next_op_in_nodes = true; + next_op_in_nodes = false; + break; } } if (next_op_in_nodes) { diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 974772a9839c1e089359be3ae98e1833645ccd7a..1e54e1497b5d49754a705340aafa30ded1c2a727 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, #endif }); diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index 5e2cecd277820ab39b5a25db6159591157982d01..eecd9348ae684929d3f55dee2a94921a078f148c 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -20,6 +20,7 @@ #include #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/subgraph/subgraph_detector.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { } void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { + if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" @@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/mlu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + } // namespace mir } // namespace lite } // namespace paddle @@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) + .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index 1ba0f2ab4aa52c384f4175de0eb34475b34fb94c..f83448df42ffe6d6d8c5b37503b5127290037dce 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { void 
Apply(const std::unique_ptr& graph) override; }; +class MLUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + } // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 7117e1b3399fe823194f7f1a4d4c239099580955..a2369adc5d882310503cbf52fa5394098d824b40 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif auto tar_predictor = TestModel(FLAGS_model_dir, diff --git a/lite/core/mir/subgraph_cast_display_pass.cc b/lite/core/mir/subgraph_cast_display_pass.cc deleted file mode 100644 index 3a2c94d23298fcb607de0bf821d0dc92c95da7bb..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph_cast_display_pass.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class SubgraphCastDisplayPass : public DebugPass { - public: - void Apply(const std::unique_ptr& graph) override { - VLOG(3) << "== Argument types =="; - for (auto& node : graph->mutable_nodes()) { - if (!node.IsArg()) continue; - - auto* type = node.AsArg().type; - if (type) { - VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; - } else { - VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; - } - } - VLOG(3) << "---------------------"; - - // - VLOG(0) << "== SubgraphOp Debug Info =="; - for (auto& node : graph->mutable_nodes()) { - if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { - VLOG(0) << "FOUND SUBGRAPH OP"; - display_debug_info(node, "subgraph"); - break; - } - } - VLOG(0) << "---------------------"; - } - - void display_debug_info(const Node& node, - std::string op_type, - bool display_in_nodes = true, - bool display_out_nodes = true) { - CHECK(node.IsStmt()); - VLOG(0) << node.AsStmt(); - if (display_in_nodes) { - for (auto p_in_arg_node : node.inlinks) { - CHECK(p_in_arg_node->IsArg()); - VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name - << " type: " << *p_in_arg_node->AsArg().type - << " is_weight: " << p_in_arg_node->AsArg().is_weight - << " is_persist: " << p_in_arg_node->AsArg().is_persist - << " input_count: " << p_in_arg_node->inlinks.size(); - if (p_in_arg_node->inlinks.size() == 0) { - VLOG(0) << "** END with No Op"; - } - for (auto p_in_stmt_node : p_in_arg_node->inlinks) { - CHECK(p_in_stmt_node->IsStmt()); - std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type(); - if (stmt_op_type == "cast" || stmt_op_type == "transpose" || - stmt_op_type == "io_copy") { - display_debug_info(*p_in_stmt_node, stmt_op_type, true, false); - } else { - VLOG(0) << "** END with op type: " << stmt_op_type; - } - } - } - } - if (display_out_nodes) { - for (auto p_out_arg_node : node.outlinks) { - CHECK(p_out_arg_node->IsArg()); - VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name - << " type: " << *p_out_arg_node->AsArg().type - << " is_weight: " << p_out_arg_node->AsArg().is_weight - << " is_persist: " << p_out_arg_node->AsArg().is_persist - << " output_count: " << p_out_arg_node->outlinks.size(); - if (p_out_arg_node->outlinks.size() == 0) { - VLOG(0) << "** END with No Op"; - } - for (auto p_out_stmt_node : p_out_arg_node->outlinks) { - CHECK(p_out_stmt_node->IsStmt()); - std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type(); - if (stmt_op_type == "cast" || stmt_op_type == "transpose" || - stmt_op_type == "io_copy") { - display_debug_info(*p_out_stmt_node, stmt_op_type, false, true); - } else { - VLOG(0) << "** END with op type: " << stmt_op_type; - } - } - } - } - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(subgraph_cast_display_pass, - paddle::lite::mir::SubgraphCastDisplayPass) - .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 75d8022d5f5f9d8572a5e020c11ae5d8cf630c10..aca7343c8af39f767c2a336e0b298995731b755f 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "picked, opencl found"; is_found = true; } else if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { + TargetCompatibleTo(*out_arg_ty, to)) { VLOG(4) << 
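+
+// XPUPatternMatcher is a variant of PatternMatcher that, in addition to
+// matching, records for each matched subgraph the argument nodes feeding
+// the pattern from outside (extra_input_vars_), so XPU fusers can re-wire
+// those extra inputs when inserting the fused op.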
"picked"; is_found = true; } diff --git a/lite/core/mir/xpu_pattern_matcher.cc b/lite/core/mir/xpu_pattern_matcher.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f268e7af8a55d22163d52c7f8824406f58bd17b --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "lite/core/mir/dot.h" +#include "lite/core/mir/xpu_pattern_matcher.h" +#include "lite/core/op_lite.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUPatternMatcher::operator()(SSAGraph *graph, + XPUPatternMatcher::handle_t handler) { + if (!MarkPMNodesInGraph(graph)) { + return; + } + + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + LOG(INFO) << "detected " << subgraphs.size() << " subgraph"; + int id = 0; + for (auto &g : subgraphs) { + VLOG(3) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) { + VLOG(3) << "mark pmnodes in graph"; + if (graph->nodes().empty()) return false; + for (auto &node : graph->mutable_nodes()) { + for (const auto &pmnode : pattern_.nodes()) { + if (pmnode->Tell(&node)) { + pmnodes2nodes_[pmnode.get()].insert(&node); + } + } + } + // Check to early stop if some PMNode can't find matched Node. + for (auto &pmnode : pattern_.nodes()) { + if (!pmnodes2nodes_.count(pmnode.get())) { + VLOG(4) << pmnode->name() << " can't find matched Node, early stop"; + // return false; + } + } + VLOG(3) << pmnodes2nodes_.size() << " nodes marked"; + + return !pmnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void XPUPatternMatcher::ValidateByNodeRole( + std::vector *subgraphs) { + subgraphs->erase( + std::remove_if(subgraphs->begin(), + subgraphs->end(), + [](const XPUPatternMatcher::subgraph_t &subgraph) -> bool { + // Collect the inlinks and outlinks. 
+                       std::unordered_set<Node *> ios;
+                       for (auto &item : subgraph) {
+                         ios.insert(item.second);
+                       }
+                       for (auto &item : subgraph) {
+                         if (item.first->IsIntermediate()) {
+                           for (auto *x : item.second->outlinks) {
+                             if (!ios.count(x)) {
+                               return true;
+                             }
+                           }
+                         }
+                       }
+                       return false;
+                     }),
+      subgraphs->end());
+
+  for (auto &subgraph : *subgraphs) {
+    std::unordered_set<Node *> ios;
+    for (auto &item : subgraph) {
+      ios.insert(item.second);
+    }
+    extra_input_vars_.emplace_back();
+    for (auto &item : subgraph) {
+      for (auto *x : item.second->inlinks) {
+        if (x->IsArg() && ios.count(x) == 0) {
+          // extra weight var
+          extra_input_vars_.back().push_back(x);
+        }
+      }
+    }
+  }
+}
+
+struct HitGroup {
+  std::unordered_map<PMNode *, Node *> roles;
+
+  bool Match(Node *node, PMNode *pat) {
+    if (nodes_.count(node)) {
+      if (roles.count(pat) && roles[pat] == node) return true;
+      return false;
+    } else {
+      if (roles.count(pat) && roles[pat] != node) return false;
+      return true;
+    }
+  }
+
+  void Register(Node *node, PMNode *pat) {
+    roles[pat] = node;
+    nodes_.insert(node);
+  }
+
+ private:
+  std::unordered_set<Node *> nodes_;
+};
+
+// Tell whether Node a links to b.
+bool IsNodesLink(Node *a, Node *b) {
+  for (auto *node : a->outlinks) {
+    if (b == node) {
+      return true;
+    }
+  }
+  return false;
+}
+
+std::vector<XPUPatternMatcher::subgraph_t> XPUPatternMatcher::DetectPatterns() {
+  // Init empty subgraphs.
+  std::vector<XPUPatternMatcher::subgraph_t> result;
+  std::vector<HitGroup> init_groups;
+  std::array<std::vector<HitGroup>, 2> bi_records;
+  auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
+                                               : pattern_.edges().front().first;
+  if (!pmnodes2nodes_.count(first_pnode)) return result;
+  for (auto *node : pmnodes2nodes_[first_pnode]) {
+    HitGroup group;
+    group.roles[first_pnode] = node;
+    init_groups.emplace_back(group);
+  }
+
+  int step = 0;
+  bi_records[0] = std::move(init_groups);
+
+  // Extend a PMNode to subgraphs by deducing the connection relations defined
+  // in edges of PMNodes.
+  for (const auto &edge : pattern_.edges()) {
+    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
+    // TODO(Superjomn) Fix bug here, the groups might be duplicate here.
+    // Each edge has two PMNodes, which indicate two roles.
+    // Detect two Nodes that can match these two roles and are connected.
+    auto &pre_groups = bi_records[step % 2];
+    auto &cur_groups = bi_records[1 - (step++ % 2)];
+    cur_groups.clear();
+    if (pre_groups.empty()) break;
+    // source -> target
+    for (Node *source : pmnodes2nodes_[edge.first]) {
+      for (Node *target : pmnodes2nodes_[edge.second]) {
+        // TODO(Superjomn) add some prune strategies.
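+        // try to extend each partial match accepted so far with this
+        // (source, target) pair; a group survives only if both endpoints
+        // stay consistent with the roles it has already registered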
+        for (const auto &group : pre_groups) {
+          if (IsNodesLink(source, target)) {
+            HitGroup new_group = group;
+            bool flag = new_group.Match(source, edge.first) &&
+                        new_group.Match(target, edge.second);
+            if (flag) {
+              new_group.Register(source, edge.first);
+              new_group.Register(target, edge.second);
+              cur_groups.push_back(new_group);
+              // TODO(Superjomn) need to unique
+            }
+          }
+        }
+      }
+    }
+    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
+  }
+
+  for (auto &group : bi_records[step % 2]) {
+    XPUPatternMatcher::subgraph_t subgraph;
+    for (auto &role : group.roles) {
+      subgraph.emplace(role.first, role.second);
+    }
+    result.emplace_back(subgraph);
+  }
+  return result;
+}
+
+struct GraphItemLessThan {
+  bool operator()(const std::pair<PMNode *, Node *> &a,
+                  const std::pair<PMNode *, Node *> &b) {
+    if (a.first != b.first) {
+      return a.first < b.first;
+    } else {
+      return a.second < b.second;
+    }
+  }
+};
+
+// TODO(Superjomn) enhance the function as it marks unique subgraphs as
+// duplicates, see https://github.com/PaddlePaddle/Paddle/issues/13550
+void XPUPatternMatcher::UniquePatterns(
+    std::vector<XPUPatternMatcher::subgraph_t> *subgraphs) {
+  if (subgraphs->empty()) return;
+  std::vector<XPUPatternMatcher::subgraph_t> result;
+
+  std::unordered_set<size_t> set;
+  std::hash<std::string> hasher;
+  for (auto &g : *subgraphs) {
+    // Sort the items in the sub-graph, and transform to a string key.
+    std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
+    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
+    STL::stringstream ss;
+    for (auto &item : sorted_keys) {
+      ss << reinterpret_cast<void *>(item.first) << ":"
+         << reinterpret_cast<void *>(item.second);
+    }
+    auto key = hasher(ss.str());
+    if (!set.count(key)) {
+      result.emplace_back(g);
+      set.insert(key);
+    }
+  }
+  *subgraphs = result;
+}
+
+void XPUPatternMatcher::RemoveOverlappedMatch(
+    std::vector<XPUPatternMatcher::subgraph_t> *subgraphs) {
+  std::vector<XPUPatternMatcher::subgraph_t> result;
+  std::unordered_set<Node *> node_set;
+
+  for (const auto &subgraph : *subgraphs) {
+    bool valid = true;
+    for (auto &item : subgraph) {
+      if (item.first->IsIntermediate() && node_set.count(item.second)) {
+        valid = false;
+        break;
+      }
+    }
+    if (valid) {
+      for (auto &item : subgraph) {
+        node_set.insert(item.second);
+      }
+      result.push_back(subgraph);
+    }
+  }
+  *subgraphs = result;
+}
+
+}  // namespace xpu
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/xpu_pattern_matcher.h b/lite/core/mir/xpu_pattern_matcher.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ac03718f32a859ff6888e63e57fd4098e435e06
--- /dev/null
+++ b/lite/core/mir/xpu_pattern_matcher.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "lite/core/mir/pattern_matcher.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace xpu {
+
+/*
+ * PatternMatcher helps to detect the specific patterns in the graph.
+ * Input a pattern, output a list of the matched subgraphs/nodes.
+ * This helper can be used to support fusion (e.g. conv + batchnorm => batchnorm).
+ *
+ * The algorithm has three phases:
+ *   1. Mark the nodes that match the defined PMNodes in a PMPattern,
+ *   2. Extend a PMNode to subgraphs by deducing the connection relations
+ *      defined in the PMPattern (the edges),
+ *   3. Get the filtered subgraphs and treat them with a pre-defined handler.
+ *
+ * Usage:
+ *    // Create a matcher
+ *    XPUPatternMatcher matcher;
+ *    // Define the matcher's pattern, by adding PMNodes and defining edges.
+ *    auto* node0 = matcher.mutable_pattern().AddNode(...)
+ *    auto* node1 = matcher.mutable_pattern().AddNode(...)
+ *    node0->teller = some lambda.
+ *    node1->teller = some lambda.
+ *    matcher.mutable_pattern().AddEdge(node0, node1);
+ *    // Create a handler, to define the behavior of treating the filtered
+ *    // subgraphs that comply with the patterns.
+ *    XPUPatternMatcher::handle_t handler = some lambda
+ *    // Execute the matcher.
+ *    matcher(&graph, handler);
+ */
+struct XPUPatternMatcher {
+  using subgraph_t = std::unordered_map<PMNode*, Node*>;
+
+  // Operate on the detected pattern.
+  using handle_t =
+      std::function<void(const subgraph_t& /*hitted pattern*/, SSAGraph*)>;
+
+  void operator()(SSAGraph* graph, handle_t handler);
+
+  const PMPattern& pattern() const { return pattern_; }
+  PMPattern* mutable_pattern() { return &pattern_; }
+
+  // Mark the nodes that fit the pattern.
+  bool MarkPMNodesInGraph(SSAGraph* graph);
+
+  // Detect all the patterns and output the hit records.
+  std::vector<subgraph_t> DetectPatterns();
+
+  // Remove duplicate patterns.
+  void UniquePatterns(std::vector<subgraph_t>* subgraphs);
+
+  // Remove overlapped match subgraphs; when overlapped, keep the previous
+  // one. The intermediate PMNodes will be removed, so they can't be shared
+  // by multiple patterns.
+  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
+
+  // Validate whether the intermediate nodes are linked by external nodes.
+  void ValidateByNodeRole(std::vector<subgraph_t>* subgraphs);
+
+  using hit_rcd_t =
+      std::pair<Node* /*node in graph*/, PMNode* /*node in pattern*/>;
+  PMPattern pattern_;
+  std::unordered_map<const PMNode*, std::unordered_set<Node*>> pmnodes2nodes_;
+  std::vector<std::vector<Node*>> extra_input_vars_;
+};
+
+}  // namespace xpu
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.cc b/lite/core/mir/xpu_pattern_matcher_high_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ffc496d1593d15f02d82e824c06443e7b3e01c9
--- /dev/null
+++ b/lite/core/mir/xpu_pattern_matcher_high_api.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
+#include <set>
+#include <unordered_set>
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace xpu {
+
+void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) {
+  VLOG(4) << "\n" << matcher_.pattern().DotString();
+  // Get subgraphs and record the mir::Node pointers for each PMNode.
+  auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) {
+    // get all the registered nodes.
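+    // one key -> mir::Node map is appended per matched subgraph, in the
+    // order the matches are reported by the matcher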
+ key2nodes_.emplace_back(); + for (auto &item : nodes_) { + key2nodes_.back()[item.first] = subgraph.at(item.second); + } + }; + + matcher_(graph, handler); +} + +void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) { + std::set keys; + for (auto &node : nodes_) { + if (node.second->IsIntermediate()) { + keys.insert(node.first); + } + } + + VLOG(4) << "keys: " << key2nodes_.size(); + std::unordered_set nodes2rm; + for (auto &matched : key2nodes_) { + for (const auto &key : keys) { + nodes2rm.insert(matched.at(key)); + } + } + + VLOG(3) << "clean nodes " << nodes2rm.size(); + GraphSafeRemoveNodes(graph, nodes2rm); +} + +PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) { + auto it = nodes_.find(key); + if (it != nodes_.end()) { + return it->second; + } + nodes_.emplace(key, + matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key))); + it = nodes_.find(key); + return it->second; +} + +PMNode *XPUFuseBase::OpNode(const std::string &key, + const std::string &op_type) { + GetOrCreateNode(key)->set_op_type(op_type); + GetOrCreateNode(key)->AsOp(op_type); + return GetOrCreateNode(key); +} + +PMNode *XPUFuseBase::VarNode(const std::string &key) { + GetOrCreateNode(key)->AsVar(); + return GetOrCreateNode(key); +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.h b/lite/core/mir/xpu_pattern_matcher_high_api.h new file mode 100644 index 0000000000000000000000000000000000000000..3302bcb6137f16afcf82269af91c8a13558da2b9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.h @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/core/mir/xpu_pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +class XPUFuseBase { + public: + using key2nodes_t = std::map; + + virtual ~XPUFuseBase() = default; + + void operator()(SSAGraph* graph) { + BuildPattern(); + PerformPatternMatcher(graph); + + for (size_t i = 0; i < key2nodes_.size(); ++i) { + InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]); + } + + DeleteInterNodes(graph); + } + + // Build a PMPattern using PMNode. + virtual void BuildPattern() = 0; + + // Generate an operator desc with a matched subgraph. 
+ virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) { + return cpp::OpDesc(); + } + + PMNode* OpNode(const std::string& key) { + return GetOrCreateNode(key)->assert_is_op(); + } + + PMNode* OpNode(const std::string& key, const std::string& op_type); + + PMNode* VarNode(const std::string& key); + + protected: + virtual void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) = 0; + + void PerformPatternMatcher(SSAGraph* graph); + + // Delete nodes that are marked as Intermediate + void DeleteInterNodes(SSAGraph* graph); + + PMNode* GetOrCreateNode(const std::string& key); + + protected: + XPUPatternMatcher matcher_; + std::map nodes_; + std::vector key2nodes_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index a9ccd1b9ae9a5d45f8d0e5638b3aab1d73d1903c..f8a706179374a0c86e28cf9a3638f5df2c932540 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -157,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope, return var->GetMutable(); } +void OpLite::AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var) { + bool is_have_input = + op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0; + CHECK(is_dispensable || is_have_input); + if (is_have_input) { + std::string input_var_name = op_desc.Input(input_name).front(); + *input_var = scope->FindVar(input_var_name)->GetMutable(); + } +} + +void OpLite::AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var) { + bool is_have_output = + op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0; + CHECK(is_dispensable || is_have_output); + if (is_have_output) { + std::string output_var_name = op_desc.Output(output_name).front(); + *output_var = scope->FindVar(output_var_name)->GetMutable(); + } +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 1cdc33825cb4ffb758b46ac4b9bee968b3fca055..428b188c468ded790e74c9cc4f5da5c7efe2fd00 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -105,6 +105,20 @@ class OpLite : public Registry { return kernel_.get(); } + // Attach input variable from scope by op_desc and input name + void AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var); + + // Attach output variable from scope by op_desc and output name + void AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var); + virtual ~OpLite() = default; protected: diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index fe1dff3c99c1d2413888e78c89c999caea0ab030..84f54b57b86c012ac72e367d657263b156e6c301 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -152,6 +152,8 @@ KernelRegistry::KernelRegistry() INIT_FOR(kMLU, kInt16, kNCHW); INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kHost, kInt32, kNCHW); + INIT_FOR(kHost, kInt64, kNCHW); INIT_FOR(kHost, kAny, kNCHW); INIT_FOR(kHost, kFloat, kNHWC); INIT_FOR(kHost, kFloat, kAny); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 3c41c1fd8af240401c3edf0343433f8d8d9c85db..96c9fc2358199594cf9590385c2efdaf1c671425 100644 --- a/lite/core/op_registry.h +++ 
b/lite/core/op_registry.h @@ -135,6 +135,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index e2463ec9b6d93e08bb64e9d91763b22d30725661..82348dc3c3fb82fb61cc99d0e1083dac7fb9840c 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -76,6 +76,8 @@ class Optimizer { (defined LITE_WITH_ARM) "lite_elementwise_add_activation_fuse_pass", // #endif + "__xpu__resnet_fuse_pass", + "__xpu__multi_encoder_fuse_pass", "quantized_op_attributes_inference_pass", // Only for fully // quantized model, infer // the output scale and @@ -116,9 +118,15 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", + "mlu_subgraph_pass", + "runtime_context_assign_pass", "argument_type_display_pass", + + "mlu_postprocess_pass", + "memory_optimize_pass"}}; + if (passes.size() == 1) { passes_local.push_back(passes[0]); } diff --git a/lite/core/workspace.h b/lite/core/workspace.h index 117b80aaa7863719536d8dbec70cf38c7ba04efc..54efb6699ac6df63286b26843f8d79b7c84949f1 100644 --- a/lite/core/workspace.h +++ b/lite/core/workspace.h @@ -69,6 +69,13 @@ class WorkSpace { } #endif +#if defined(LITE_WITH_MLU) + static WorkSpace& Global_MLU() { + thread_local std::unique_ptr x(new WorkSpace(TARGET(kMLU))); + return *x; + } +#endif + private: explicit WorkSpace(TargetType x) : target_(x) {} diff --git a/lite/fluid/lod.h b/lite/fluid/lod.h index 36386f7eb967f31ec258681fe17222a928aa7b4b..b1f2f14a0a4534e588d18237826858812740db69 100644 --- a/lite/fluid/lod.h +++ b/lite/fluid/lod.h @@ -19,7 +19,7 @@ namespace paddle { namespace lite { namespace fluid { -using LoD = std::vector>; +using LoD = std::vector>; static LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 4e0092b392eb31ce81f2a410ea86002b343f0aec..78bb8d10b798b73861ddbf25e427289fc2984a55 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -10,4 +10,5 @@ add_subdirectory(opencl) add_subdirectory(fpga) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(mlu) add_subdirectory(bm) diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index d609716ee53ec584b8340e9b72498ed95afd5820..ea60cf528ea71f0bc0ba0a162063bd76899622f9 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -179,6 +179,34 @@ void SquareCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void HardSwishCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + float threshold = param.hard_swish_threshold; + float scale = param.hard_swish_scale; + float offset = param.hard_swish_offset; + lite::arm::math::act_hard_swish(x_data, + output_data, + x_dims.production(), + threshold, + scale, + offset, + ctx.threads()); +} + +void ReciprocalCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_reciprocal( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -275,3 +303,21 @@ 
REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL(hard_swish, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::HardSwishCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +REGISTER_LITE_KERNEL(reciprocal, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ReciprocalCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index 476d7bb0a32db193d9afb1451507699d0af71736..2e8deda786a1ea9af70499c7b33c8aa1c6e19370 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -148,6 +148,24 @@ class SquareCompute : public KernelLite { virtual ~SquareCompute() = default; }; +class HardSwishCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~HardSwishCompute() = default; +}; + +class ReciprocalCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~ReciprocalCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index a52428aa097099150139de82627d5770c9b9071c..94fe384d0414d87f38fb0d1ab3e8ac1033423702 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -5,3 +5,4 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${lite_kernel_deps}) diff --git a/lite/kernels/host/ctc_align_compute.cc b/lite/kernels/host/ctc_align_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..a62c2ee15ac2752d5d3349fbaaeb18f31ac4c5a0 --- /dev/null +++ b/lite/kernels/host/ctc_align_compute.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
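+
+// ctc_align strips blank labels from CTC predictions and, when
+// merge_repeated is set, collapses consecutive repeated tokens; both padded
+// inputs (via InputLength/OutputLength) and LoD inputs are handled below.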
+ +#include "lite/kernels/host/ctc_align_compute.h" +#include +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +LoD ToAbs(const LoD& in) { + if (in.empty()) return in; + LoD result; + for (auto& src : in) { + std::vector dest(src.size() + 1, 0); + for (int i = 0; i < src.size(); i++) { + dest[i + 1] = dest[i] + src[i]; + } + result.emplace_back(dest); + } + return result; +} + +LoD ToNorm(const LoD& in) { + if (in.empty()) return in; + LoD result; + for (auto& src : in) { + std::vector dest(src.size() - 1, 0); + for (int i = 0; i < dest.size(); i++) { + dest[i] = src[i + 1] - src[i]; + } + result.emplace_back(dest); + } + return result; +} + +LoD ToAbsOffset(const LoD& in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; + } + } + return result; +} + +template +void CtcAlignCompute::Run() { + auto& param = this->template Param(); + auto* input = param.input; + auto* output = param.output; + size_t blank = static_cast(param.blank); + bool merge_repeated = param.merge_repeated; + size_t padding_value = static_cast(param.padding_value); + + const auto* input_data = input->template data(); + auto input_dims = input->dims(); + auto* output_data = output->template mutable_data(); + + if (input->lod().empty()) { + auto* input_length = param.input_length; + auto* output_length = param.output_length; + CHECK(input_length != nullptr); + CHECK(output_length != nullptr); + const auto* input_length_data = input_length->template data(); + auto* output_length_data = output_length->template mutable_data(); + + for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0]; batch_id++) { + T prev_token = -1; + size_t output_idx = 0; + for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) { + size_t input_ind = batch_id * input_dims[1] + i; + if ((unsigned)input_data[input_ind] != blank && + !(merge_repeated && input_data[input_ind] == prev_token)) { + output_data[batch_id * input_dims[1] + output_idx] = + input_data[input_ind]; + ++output_idx; + } + prev_token = input_data[input_ind]; + } + output_length_data[batch_id] = output_idx; + for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++) + output_data[batch_id * input_dims[1] + j] = padding_value; + } + } else { + const size_t level = 0; + + auto input_lod = input->lod(); + input_lod = ToAbs(input->lod()); + input_lod = ToAbsOffset(input_lod); + CHECK_EQ(input_dims[0], static_cast(input_lod[level].back())); + + const size_t num_sequences = input_lod[level].size() - 1; + // merge repeated tokens and delete blank + size_t output_idx = 0; + std::vector output_lod0(1, 0); + for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; + for (size_t i = input_lod[level][seq_idx]; + i < input_lod[level][seq_idx + 1]; + ++i) { + if ((unsigned)input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + output_data[output_idx] = input_data[i]; + ++output_idx; + } + prev_token = input_data[i]; + } + output_lod0.push_back(static_cast(output_idx)); + } + + LoD output_lod; + output_lod.push_back(output_lod0); + output_lod = ToNorm(output_lod); + output->set_lod(output_lod); + output->Resize({static_cast(output_lod0.back()), 1}); + if (output_lod0.back() == 0) { + 
output->Resize({1, 1}); + output_data = output->template mutable_data(); + output_data[0] = -1; + } + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle +using ctc_align_int64 = + paddle::lite::kernels::host::CtcAlignCompute; +REGISTER_LITE_KERNEL(ctc_align, kHost, kInt64, kNCHW, ctc_align_int64, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindInput("InputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindOutput("OutputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .Finalize(); + +using ctc_align_int32 = + paddle::lite::kernels::host::CtcAlignCompute; +REGISTER_LITE_KERNEL(ctc_align, kHost, kInt32, kNCHW, ctc_align_int32, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("InputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("OutputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .Finalize(); diff --git a/lite/kernels/host/ctc_align_compute.h b/lite/kernels/host/ctc_align_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..737fb3be6c96d91a3cde4a8f9053c6f7b9c7ec69 --- /dev/null +++ b/lite/kernels/host/ctc_align_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +class CtcAlignCompute : public KernelLite { + public: + void Run() override; + + virtual ~CtcAlignCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt index 1c41f05ca0cb23013418654f195394f88adf05b1..f9395d45ccecccaf3f873797d0c2d71eda266319 100644 --- a/lite/kernels/mlu/CMakeLists.txt +++ b/lite/kernels/mlu/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(bridges) add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) +add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt index 302d580ee1594f983e516d42da6f57221b3b33c8..82510ab9b6a794f5c6b1ffb43d2d3c55db3a5514 100644 --- a/lite/kernels/mlu/bridges/CMakeLists.txt +++ b/lite/kernels/mlu/bridges/CMakeLists.txt @@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu}) set(mlu_subgraph_bridges subgraph_bridge_registry subgraph_bridge_utility_mlu @@ -26,16 +29,20 @@ set(mlu_subgraph_bridges subgraph_bridge_softmax_op_mlu subgraph_bridge_fc_op_mlu subgraph_bridge_batch_norm_op_mlu + subgraph_bridge_scale_op_mlu + subgraph_bridge_interp_op_mlu + subgraph_bridge_concat_op_mlu CACHE INTERNAL "mlu_subgraph_bridges") - -# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) -# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# 
lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) - +lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) +lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc index 50291ec297f9d035f8a7fbe1b525f8ece27bfeb6..286195d9d5f961288dd0156db31ff8aacae58227 100644 --- a/lite/kernels/mlu/bridges/act_op.cc +++ b/lite/kernels/mlu/bridges/act_op.cc @@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { VLOG(3) << "[MLU] Converting " + op_type + "..."; // Create act node and set params from op + auto fp_type = graph->FPType(); auto x_var_name = op_info->Input("X").front(); auto out_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(out_var_name)->GetMutable(); auto output_dims = output->dims().Vectorize(); auto output_tensor = graph->AddNode( - out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); CHECK(graph->HasNode(x_var_name)); auto input_tensor = graph->GetNode(x_var_name); - cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type); cnmlBaseOp_t activation_op; - 
CNML_CALL(cnmlCreateActiveOp(&activation_op, - act_type, - input_tensor->mlu_tensor(), - output_tensor->mlu_tensor())); + if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + std::vector shape = {1, 1, 1, 1}; + std::string alpha_var_name = string_format("leaky_relu_alpha_%p", op); + auto alpha_tensor = + graph->AddNode(alpha_var_name, shape, CNML_CONST, CNML_NHWC, fp_type); + graph->BindConstRawData(alpha_var_name, &alpha, 1, true); + CNML_CALL(cnmlCreatePreluOp(&activation_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor())); + } else { + cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type); + CNML_CALL(cnmlCreateActiveOp(&activation_op, + act_type, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + } graph->FuseOp(activation_op); return SUCCESS; } @@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle +REGISTER_SUBGRAPH_BRIDGE(sigmoid, + kMLU, + paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(leaky_relu, + kMLU, + paddle::lite::subgraph::mlu::ActConverter); diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc index 51cdc52dc6da764ab0c2d720b9159fd8b0a2c0df..7cec0529e49e694c362b3e0a550948f7855c85a2 100644 --- a/lite/kernels/mlu/bridges/act_op_test.cc +++ b/lite/kernels/mlu/bridges/act_op_test.cc @@ -25,8 +25,6 @@ namespace lite { namespace subgraph { namespace mlu { -int ActConverter(void* ctx, OpLite* op); - template void FillTensor(Tensor* x, float lower = -2, float upper = -2); @@ -136,7 +134,7 @@ void test_act(std::vector x_shape, std::string op_type) { TEST(MLUBridges, activation) { std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; - std::vector types{"sigmoid", "relu", "tanh"}; + std::vector types{"sigmoid", "relu", "tanh", "leaky_relu"}; for (auto x_shape : shapes) { for (auto op_type : types) { test_act(x_shape, op_type); @@ -149,8 +147,7 @@ TEST(MLUBridges, activation) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - sigmoid, - paddle::lite::subgraph::mlu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter); +USE_SUBGRAPH_BRIDGE(sigmoid, kMLU) +USE_SUBGRAPH_BRIDGE(relu, kMLU) +USE_SUBGRAPH_BRIDGE(tanh, kMLU) +USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU) diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc index d95a5115c96c10a8881f50c44fee9881c6a9e218..7353a685dd5fd3a5bcc8c88def8ffb8b96fdde55 100644 --- a/lite/kernels/mlu/bridges/batch_norm_op.cc +++ b/lite/kernels/mlu/bridges/batch_norm_op.cc @@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto output = scope->FindVar(y_var_name)->GetMutable(); auto output_dims = output->dims().Vectorize(); auto output_tensor = graph->AddNode( - y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + y_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); CHECK(graph->HasNode(x_var_name)); diff --git a/lite/kernels/mlu/bridges/batch_norm_op_test.cc b/lite/kernels/mlu/bridges/batch_norm_op_test.cc index 47e291bf3d83e8ce85216e86505817be6ed8b106..65b24a0a72a48a306b6a8976efd8839679d58038 100644 --- 
a/lite/kernels/mlu/bridges/batch_norm_op_test.cc +++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc @@ -23,8 +23,6 @@ namespace lite { namespace subgraph { namespace mlu { -int BatchNormConverter(void* ctx, OpLite* op); - template void batch_norm_ref(const std::shared_ptr op) { Scope* scope = op->scope(); @@ -139,9 +137,7 @@ void test_batch_norm( {bs, ic, ih, iw}, {0, 2, 3, 1}); - out->Resize({bs, ih, iw, ic}); x->CopyDataFrom(input_trans); - x->Resize({bs, ih, iw, ic}); LaunchOp(op, {x_var_name}, {out_var_name}); @@ -181,6 +177,4 @@ TEST(MLUBridges, batch_norm) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - batch_norm, - paddle::lite::subgraph::mlu::BatchNormConverter); +USE_SUBGRAPH_BRIDGE(batch_norm, kMLU) diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..14f0da746a00c1ea10ffae824217dbb2df84df55 --- /dev/null +++ b/lite/kernels/mlu/bridges/concat_op.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X"); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto param_axis = op_info->GetAttr("axis"); + + std::vector input_tensor; + for (auto x_name : x_var_name) { + CHECK(graph->HasNode(x_name)); + input_tensor.push_back(graph->GetNode(x_name)->mlu_tensor()); + } + + auto dims = output_dims.size(); + int axis = (param_axis < 0) ? 
(param_axis + dims) : param_axis; + CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; + int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2}; + int nhwc_axis = nchw_to_nhwc_axis_map[axis]; + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + cnmlBaseOp_t concat_op; + cnmlTensor_t outputs = output_tensor->mlu_tensor(); + CNML_CALL(cnmlCreateNdConcatOp(&concat_op, + nhwc_axis, + input_tensor.data(), + x_var_name.size(), + &outputs, + 1)); + graph->FuseOp(concat_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(concat, + kMLU, + paddle::lite::subgraph::mlu::ConcatConverter); diff --git a/lite/kernels/mlu/bridges/concat_op_test.cc b/lite/kernels/mlu/bridges/concat_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4b48a9ef45430ec5867d231bbc2d0a798ec66d0 --- /dev/null +++ b/lite/kernels/mlu/bridges/concat_op_test.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/concat_op.h" +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void concat_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = op_info->Input("X"); + std::vector inputs; + for (auto var : x) { + inputs.push_back(scope->FindVar(var)->GetMutable()); + } + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + std::vector inputs_concat(inputs.size()); + for (int j = 0; j < inputs.size(); ++j) { + inputs_concat[j] = inputs[j]; + } + size_t num = inputs.size(); + int rows = 1; + auto dim_0 = inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + std::vector inputs_cols(inputs.size()); + for (int i = 0; i < num; ++i) { + int t_cols = inputs[i]->numel() / rows; + out_cols += t_cols; + inputs_cols[i] = t_cols; + } + for (int k = 0; k < out_rows; ++k) { + float* dst_ptr = out->mutable_data() + k * out_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = inputs_cols[j]; + const float* src_prt = inputs[j]->data() + k * col_len; + std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len); + col_idx += col_len; + } + } +} + +void test_concat(std::vector> input, int axis) { + std::string x_var_name = "x"; + std::string y_var_name = "y"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + + // prepare input&output variables + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + x->Resize(DDim(input[0])); + y->Resize(DDim(input[1])); + auto* 
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  CHECK_EQ(out->dims(), out_ref->dims());
+
+  // initialize input&output data
+  FillTensor<float>(x);
+  FillTensor<float>(y);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("concat");
+  opdesc.SetInput("X", {x_var_name, y_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("axis", axis);
+
+  auto op = CreateOp<operators::ConcatOpLite>(opdesc, &scope);
+  concat_ref(op);
+  out_ref->CopyDataFrom(*out);
+
+  Tensor input_x, input_y;
+  input_x.Resize(DDim(input[0]));
+  input_y.Resize(DDim(input[1]));
+  transpose(x->mutable_data<float>(),
+            input_x.mutable_data<float>(),
+            {static_cast<int>(input[0][0]),
+             static_cast<int>(input[0][1]),
+             static_cast<int>(input[0][2]),
+             static_cast<int>(input[0][3])},
+            {0, 2, 3, 1});
+  transpose(y->mutable_data<float>(),
+            input_y.mutable_data<float>(),
+            {static_cast<int>(input[1][0]),
+             static_cast<int>(input[1][1]),
+             static_cast<int>(input[1][2]),
+             static_cast<int>(input[1][3])},
+            {0, 2, 3, 1});
+  x->CopyDataFrom(input_x);
+  y->CopyDataFrom(input_y);
+
+  LaunchOp(op, {x_var_name, y_var_name}, {out_var_name});
+
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+
+  Tensor output_trans;
+  output_trans.Resize(out->dims());
+  auto os = out->dims();
+  transpose(out_data,
+            output_trans.mutable_data<float>(),
+            {static_cast<int>(os[0]),
+             static_cast<int>(os[2]),
+             static_cast<int>(os[3]),
+             static_cast<int>(os[1])},
+            {0, 3, 1, 2});
+  out_data = output_trans.mutable_data<float>();
+
+  for (int i = 0; i < out->dims().production(); i++) {
+    VLOG(5) << i;
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4);
+  }
+}
+
+TEST(MLUBridges, concat) {
+  test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0);
+  test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1);
+  test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2);
+  test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3);
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+USE_SUBGRAPH_BRIDGE(concat, kMLU);
diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc
index e9fdacdca92398cee9f5e01b3f34e41e672274b5..6a7ef408eb7432950d5a0985dd6e174236e937e0 100644
--- a/lite/kernels/mlu/bridges/conv_op.cc
+++ b/lite/kernels/mlu/bridges/conv_op.cc
@@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   const auto* scope = op->scope();
   VLOG(3) << "[MLU] Converting " << op_info->Type() << "...
"; - // Get input, filter and op attributes + // get input, filter and op attributes const auto input_var_name = op_info->Input("Input").front(); - const auto& input_dims_nhwc = + const auto& input_dims = scope->FindVar(input_var_name)->GetMutable()->dims(); - const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); const auto filter_var_name = op_info->Input("Filter").front(); auto* filter = scope->FindVar(filter_var_name)->GetMutable(); const auto& filter_dims = filter->dims(); const auto output_var_name = op_info->Output("Output").front(); + auto* output = scope->FindVar(output_var_name)->GetMutable(); + const auto output_shape = output->dims().Vectorize(); const auto bs = input_dims[0]; const auto oc = filter_dims[0]; CHECK_EQ(input_dims.size(), 4); @@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_dims, filter_dims); - std::vector output_shape({bs, oc}); - for (size_t i = 0; i < 2; i++) { - const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; - output_shape.push_back( - (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / - strides[i] + - 1); - } - - const auto output_shape_nhwc = DimNCHW2NHWC(output_shape); - const auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, - CNML_TENSOR, - CNML_NHWC, - graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); + const auto output_tensor = graph->AddNode( + output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); // Create filter node const auto filter_tensor = graph->AddNode(filter_var_name, @@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(FATAL) << "UnSupported weight precision!"; } - cnmlConvOpParam_t conv_param; - CNML_CALL(cnmlCreateConvOpParam(&conv_param, - strides[0], - strides[1], - dilations[0], - dilations[1], - paddings[0] * 2, - paddings[2] * 2)); std::string bias_var_name; std::shared_ptr bias_tensor; if (HasInputArg(op_info, scope, "Bias")) { @@ -160,15 +137,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->FPType()); graph->BindConstData(bias_var_name, bias); } - cnmlBaseOp_t conv_op; + const auto input_scale = op_info->GetAttr("input_scale"); - CNML_CALL(cnmlCreateConvOpForward( - &conv_op, - conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), - output_tensor->mlu_tensor(), - filter_tensor->mlu_tensor(), - bias_tensor ? 
bias_tensor->mlu_tensor() : nullptr)); + + bool use_first_conv = false; + if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) { + use_first_conv = true; + } + + cnmlBaseOp_t conv_op; + if (use_first_conv) { + cnmlConvFirstOpParam_t conv_param; + CNML_CALL(cnmlCreateConvFirstOpParam_V2(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[2], + paddings[2], + paddings[0], + paddings[0])); + const auto mean_tensor = graph->AddNode("first_conv_mean_tensor", + std::vector{3}, + CNML_CONST, + CNML_CNHW, + graph->FPType()); + const auto std_tensor = graph->AddNode("first_conv_std_tensor", + std::vector{3}, + CNML_CONST, + CNML_CNHW, + graph->FPType()); + + graph->BindConstRawData("first_conv_mean_tensor", + lite::DeviceInfo::Global().MeanVec().data(), + 3, + false); + graph->BindConstRawData("first_conv_std_tensor", + lite::DeviceInfo::Global().StdVec().data(), + 3, + false); + + graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8); + CNML_CALL(cnmlCreateConvFirstOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + mean_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr, + std_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param)); + } else { + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + CNML_CALL(cnmlCreateConvOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); + } graph->SetComputingDataType( conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); @@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->BindConstData(filter_var_name, filter); graph->FuseOp(conv_op); - CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc index e8ef9ba04fd6126f00f4ee2ff869495929bfdc9a..e34dd7c2a85dbda62596b6e82d820fc437bfd194 100644 --- a/lite/kernels/mlu/bridges/conv_op_test.cc +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -25,8 +25,6 @@ namespace lite { namespace subgraph { namespace mlu { -int ConvConverter(void* ctx, OpLite* op); - void conv_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -246,10 +244,6 @@ void test_conv(int bs, } } - input->Resize({bs, ih, iw, ic}); - output->Resize( - {output_shape[0], output_shape[2], output_shape[3], output_shape[1]}); - // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc_mlu, &scope); LaunchOp(op, {input_var_name}, {output_var_name}); @@ -342,9 +336,5 @@ TEST(MLUBridges, conv) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - conv2d, - paddle::lite::subgraph::mlu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - depthwise_conv2d, - paddle::lite::subgraph::mlu::ConvConverter); +USE_SUBGRAPH_BRIDGE(conv2d, kMLU) +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU) diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc index 4ef949925d20e0a2cb1c7f25d840e2041d79dd7a..41526a0100ba71be9eda25983cb96aa888d6cf4d 100644 --- 
a/lite/kernels/mlu/bridges/elementwise_ops.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto output_tensor = graph->AddNode(out_var_name, x->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); cnmlBaseOp_t elementwise_op; @@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mid_tensor = graph->AddNode(out_var_name + "_mid", x->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op, x_tensor->mlu_tensor(), diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc index 388aa68600e180945d19e1a4e4728cf26bf801e1..e5087dd708eee3ba255fbfa0383d31b12a6b6870 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int ElementwiseConverter(void* ctx, OpLite* op); - template void elementwise_add_ref(const std::shared_ptr op) { Scope* scope = op->scope(); @@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_add, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_sub, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_mul, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_div, - paddle::lite::subgraph::mlu::ElementwiseConverter); +USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU) diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc index 43a75daa2b3d2d6200f3607e213ab62ee6ba3cdb..286feec8d4d44eaa025f333d559c32ca72f042ff 100644 --- a/lite/kernels/mlu/bridges/fc_op.cc +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); auto x = scope->FindVar(x_var_name)->GetMutable(); auto w = scope->FindVar(w_var_name)->GetMutable(); + auto output = scope->FindVar(output_var_name)->GetMutable(); auto x_dims = x->dims(); auto w_dims = w->dims(); @@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto input_scale = op_info->GetAttr("input_scale"); - std::vector output_shape_nhwc({1, 1, 1, w_dims[1]}); auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, + output->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); std::string bias_var_name; std::shared_ptr bias_tensor; diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc index 7e5cfdb32e7d993f32403dc764462575181f9d4d..8f92b6abad97650100d0862d49550abaf62daac9 100644 --- a/lite/kernels/mlu/bridges/fc_op_test.cc +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int FCConverter(void* ctx, OpLite* op); - void fc_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -141,15 +139,34 @@ void test_fc(const std::vector& 
input_shape, } auto fc_op_mlu = CreateOp(fc_op_desc_mlu, &scope); - input->Resize({static_cast(input_shape[0]), - static_cast(input_shape[2]), - static_cast(input_shape[3]), - static_cast(input_shape[1])}); - out->Resize({static_cast(input_shape[0]), static_cast(w_shape[1])}); + + Tensor input_tmp, out_tmp; + input_tmp.Resize(input_shape); + transpose(input->mutable_data(), + input_tmp.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + input->CopyDataFrom(input_tmp); + LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name}); - // compare results + auto os = out->dims(); + out_tmp.Resize(os); auto* out_data = out->mutable_data(); + // transpose(out_data, + // out_tmp.mutable_data(), + // {static_cast(os[0]), + // static_cast(os[2]), + // static_cast(os[3]), + // static_cast(os[1])}, + // {0, 3, 1, 2}); + // + // out_data = out_tmp.mutable_data(); + + // compare results auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); @@ -170,4 +187,4 @@ TEST(MLUBridges, fc) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter); +USE_SUBGRAPH_BRIDGE(fc, kMLU); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc index 27c6ab2597fa6930b14c4c4e34750030608167b6..65c2f8214c13ee8d004dbe4b2e706523d007469c 100644 --- a/lite/kernels/mlu/bridges/graph.cc +++ b/lite/kernels/mlu/bridges/graph.cc @@ -25,12 +25,12 @@ namespace mlu { std::shared_ptr Graph::AddNode(const std::string& name, std::vector shape, cnmlTensorType_t tensor_type, - cnmlDataOrder_t data_order, + cnmlDataOrder_t shape_order, cnmlDataType_t mlu_dtype, void* raw_ptr) { CHECK(!HasNode(name)); auto node = std::shared_ptr( - new MLUTensor(shape, tensor_type, data_order, mlu_dtype)); + new MLUTensor(shape, tensor_type, shape_order, mlu_dtype)); node->set_mlu_ptr(raw_ptr); nodes_.insert(std::make_pair(name, node)); return node; diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index 140900a2dde004281945e50fb1c72d09b58befa1..b846d15af06c683ad685b04da5588f7ecedd0d38 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -23,6 +23,12 @@ #include "lite/core/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h" +#define PRINT_HW_TIME false + +#if PRINT_HW_TIME +#include //NOLINT +#endif + namespace paddle { namespace lite { namespace subgraph { @@ -32,13 +38,30 @@ namespace mlu { // to the MLU IR graph class Graph { public: - Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); } + Graph() { + CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); +#if PRINT_HW_TIME + CNRT_CALL(cnrtCreateNotifier(¬ifier_start_)); + CNRT_CALL(cnrtCreateNotifier(¬ifier_end_)); +#endif + } ~Graph() { + FreeConstData(); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); for (auto op : ops_) { CNML_CALL(cnmlDestroyBaseOp(&op)); } +#if PRINT_HW_TIME + CNRT_CALL(cnrtDestroyNotifier(¬ifier_start_)); + CNRT_CALL(cnrtDestroyNotifier(¬ifier_end_)); + double total_time = 0; + for (auto& f : time_log_) { + total_time += f; + } + std::cout << "cnml hardware time for " << time_log_.size() + << " process:" << total_time / time_log_.size() << std::endl; +#endif } // Data node @@ -89,6 +112,10 @@ class Graph { } void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { +#if PRINT_HW_TIME + thread_local float hw_time; + 
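+    // Bracket the fused-op launch with a notifier pair; once the queue is
+    // synced, cnrtNotifierDuration() reports the device-side time of this
+    // Compute() call (hence the /1000.0f conversion to ms further down).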
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); +#endif CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, input_addrs_.data(), input_addrs_.size(), @@ -96,7 +123,61 @@ class Graph { output_addrs_.size(), &forward_param, que)); +#if PRINT_HW_TIME + CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); +#endif + CNRT_CALL(cnrtSyncQueue(que)); +#if PRINT_HW_TIME + CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); + hw_time /= 1000.0f; + DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; + std::lock_guard lk(time_mut_); + time_log_.push_back(hw_time); +#endif + } + + template + void* RegisterConstData(size_t len) { + void* addr = malloc(len * sizeof(T)); + const_data_storage_.push_back(addr); + return addr; + } + + void FreeConstData() { + for (auto& addr : const_data_storage_) { + free(addr); + } + } + + void BindConstRawData(std::string tensor_name, + const float* data, + size_t len, + bool alloc = true) { + void* alloc_data; + if (fp_type_ == CNML_DATA_FLOAT32) { + if (alloc) { + alloc_data = RegisterConstData(len); + memcpy(alloc_data, data, len * sizeof(float)); + } else { + alloc_data = const_cast(static_cast(data)); + } + CNML_CALL(cnmlBindConstData_V2( + nodes_[tensor_name]->mlu_tensor(), alloc_data, false)); + } else if (fp_type_ == CNML_DATA_FLOAT16) { + void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len); + CNRT_CALL( + cnrtCastDataType(const_cast(static_cast(data)), + CNRT_FLOAT32, + data_fp16, + CNRT_FLOAT16, + len, + nullptr)); + CNML_CALL(cnmlBindConstData_V2( + nodes_[tensor_name]->mlu_tensor(), data_fp16, false)); + } else { + CHECK(0); + } } void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { @@ -158,6 +239,12 @@ class Graph { std::vector> output_tensors_; std::vector ops_; cnmlFusionOp_t fusion_op_; + std::vector const_data_storage_; +#if PRINT_HW_TIME + cnrtNotifier_t notifier_start_{}, notifier_end_{}; + std::mutex time_mut_; + std::vector time_log_; +#endif }; } // namespace mlu diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e201199824d8042abd6002ccbe5bb659a9ca2898 --- /dev/null +++ b/lite/kernels/mlu/bridges/interpolate_op.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
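+// MLU bridge for Paddle's nearest_interp op: the target size is resolved
+// from op attributes only (the SizeTensor/OutSize/Scale tensor inputs bail
+// out below), then a cnmlNearestNeighborOp is emitted into the fusion graph.
+// For example, a 1x3x8x8 input with scale = 0.5 yields out_h = out_w = 4.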
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto out = scope->FindVar(out_var_name)->GetMutable(); + auto x_dims = x->dims(); + CHECK_EQ(x_dims.size(), 4); + auto scale = op_info->GetAttr("scale"); + auto out_w = op_info->GetAttr("out_w"); + auto out_h = op_info->GetAttr("out_h"); + auto align_corners = op_info->GetAttr("align_corners"); + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto in_h = x_dims[2]; + auto in_w = x_dims[3]; + + // Priority: SizeTensor > OutSize > Scale > scale > out_h/out_w + if (HasInputArg(op_info, scope, "SizeTensor")) { + LOG(ERROR) << "Not support SizeTensor input now"; + CHECK(0); + } else { + if (HasInputArg(op_info, scope, "Scale")) { + LOG(ERROR) << "Not support Scale input now"; + CHECK(0); + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + out_h = out_h > 0 ? out_h : -1; + out_w = out_w > 0 ? out_w : -1; + } + if (HasInputArg(op_info, scope, "OutSize")) { + LOG(ERROR) << "Not support OutSize input now"; + CHECK(0); + } + } + + auto output_tensor = graph->AddNode(out_var_name, + out->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + + cnmlBaseOp_t interp_op; + cnmlNearestNeighborOpParam_t nn_param; + CNML_CALL(cnmlCreateNearestNeighborOpParam(&nn_param, out_w, out_h)); + CNML_CALL(cnmlSetNearestNeighborAlignCorner(&nn_param, align_corners)); + CNML_CALL(cnmlCreateNearestNeighborOp(&interp_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + nn_param)); + CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param)); + graph->FuseOp(interp_op); + + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(nearest_interp, + kMLU, + paddle::lite::subgraph::mlu::InterpolateConverter); diff --git a/lite/kernels/mlu/bridges/interpolate_op_test.cc b/lite/kernels/mlu/bridges/interpolate_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e99da64358e6590af0b8e57dc3ddec142c8d0f0 --- /dev/null +++ b/lite/kernels/mlu/bridges/interpolate_op_test.cc @@ -0,0 +1,406 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
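+// Host references for the MLU interpolate bridge test. ResizeNearestAlign
+// and BilinearInterpRef compute the expected NCHW results; Execute() then
+// feeds the MLU path NHWC data (transpose {0, 2, 3, 1}) and transposes the
+// result back ({0, 3, 1, 2}) before the element-wise EXPECT_NEAR comparison.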
+ +#include "lite/operators/interpolate_op.h" +#include +#include +#include "lite/core/device_info.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void ResizeNearestAlign(const lite::Tensor* x, + lite::Tensor* out, + bool with_align) { + auto x_dims = x->dims(); + int num = x_dims[0]; + int channels = x_dims[1]; + int hin = x_dims[2]; + int win = x_dims[3]; + int hout = out->dims()[2]; + int wout = out->dims()[3]; + dtype scale_w = (with_align) ? (static_cast(win - 1) / (wout - 1)) + : (static_cast(win) / (wout)); + dtype scale_h = (with_align) ? (static_cast(hin - 1) / (hout - 1)) + : (static_cast(hin) / (hout)); + const dtype* src = x->data(); + dtype* dst = out->mutable_data(); + int dst_stride_w = 1; + int dst_stride_h = wout; + int dst_stride_c = wout * hout; + int dst_stride_batch = wout * hout * channels; + int src_stride_w = 1; + int src_stride_h = win; + int src_stride_c = win * hin; + int src_stride_batch = win * hin * channels; + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + int src_index = n * src_stride_batch + c * src_stride_c; + for (int h = 0; h < hout; ++h) { + for (int w = 0; w < wout; ++w) { + int fw = (with_align) ? static_cast(scale_w * w + 0.5) + : static_cast(scale_w * w); + fw = (fw < 0) ? 0 : fw; + int fh = (with_align) ? static_cast(scale_h * h + 0.5) + : static_cast(scale_h * h); + fh = (fh < 0) ? 0 : fh; + int w_start = static_cast(fw); + int h_start = static_cast(fh); + int dst_index = n * dst_stride_batch + c * dst_stride_c + + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = + src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + } + } + } + } +} + +template +void BilinearInterpRef(const lite::Tensor* x, + lite::Tensor* out, + bool align_corners, + int align_mode) { + auto x_dims = x->dims(); + int batch_size = x_dims[0]; + int channel_size = x_dims[1]; + auto x_h = x_dims[2]; + auto x_w = x_dims[3]; + CHECK_EQ(x_dims.size(), 4); + + auto out_dims = out->dims(); + int out_h = out_dims[2]; + int out_w = out_dims[3]; + + // copy from x if no change + if (x_h == out_h && x_w == out_w) { + out->CopyDataFrom(*x); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(x_h - 1) / (out_h - 1) + : static_cast(x_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(x_w - 1) / (out_w - 1) + : static_cast(x_w) / out_w; + } + + // naive bilinear interpolation + auto x_data = x->data(); + auto out_data = out->mutable_data(); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); + for (int k = 0; k < out_h; k++) { + int yn = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + yn = (yn > 0) ? yn : 0; + int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float dn = align_flag ? 
idx_src_y - yn : ratio_h * k - yn; + float ds = 1.f - dn; + { + vy_n[k] = yn; + vy_s[k] = ys; + vd_n[k] = dn; + vd_s[k] = ds; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); + for (int l = 0; l < out_w; l++) { + int xw = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + xw = (xw > 0) ? xw : 0; + int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw; + float de = 1.f - dw; + { + vx_w[l] = xw; + vx_e[l] = xe; + vd_w[l] = dw; + vd_e[l] = de; + } + } + + std::vector x_strides(x_dims.size(), 1); + for (int idx = x_strides.size() - 2; idx >= 0; idx--) { + x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1]; + } + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < channel_size; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + DType x0 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]]; + DType x1 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]]; + DType x2 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]]; + DType x3 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]]; + *out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] + + x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l]; + out_data++; + } + } + } + } +} + +class InterpComputeTester { + protected: + // common attributes for this op. + std::string x_var_name = "X"; + std::string outsize_var_name = "OutSize"; + std::string out_var_name = "Out"; + std::string out_ref_var_name = "out_ref"; + DDim dims_{{1, 2, 3, 4}}; + + Scope scope; + std::string interp_method_ = "nearest"; + float scale_ = -1.f; + int out_h_ = -1; + int out_w_ = -1; + bool align_corners_ = true; + int align_mode_ = 1; + bool use_outsize_ = false; + + public: + InterpComputeTester(const std::string& alias, + DDim dims, + std::string interp_method = "nearest", + float scale = -1.f, + int out_h = -1, + int out_w = -1, + bool align_corners = true, + int align_mode = 1, + bool use_outsize = false) + : dims_(dims), + interp_method_(interp_method), + scale_(scale), + out_h_(out_h), + out_w_(out_w), + align_corners_(align_corners), + align_mode_(align_mode), + use_outsize_(use_outsize) {} + + void Execute(float abs_error) { + cpp::OpDesc op_desc; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* outsize = scope.Var(outsize_var_name)->GetMutable(); + auto* outref = scope.Var(out_ref_var_name)->GetMutable(); + int out_h = out_h_; + int out_w = out_w_; + if (scale_ > 0) { + out_h = static_cast(dims_[2] * scale_); + out_w = static_cast(dims_[3] * scale_); + } + x->Resize(dims_); + /* printf("----output tensor dims: %ld, %d, %d, %ld\n", dims_[0], out_h, + * out_w, dims_[1]); */ + std::vector out_shape_nchw = {dims_[0], dims_[1], out_h, out_w}; + outref->Resize(out_shape_nchw); + outsize->Resize({2}); + + FillTensor(x, -1.f, 1.f); + + if (use_outsize_) { + outsize->mutable_data()[0] = out_h; + outsize->mutable_data()[1] = out_w; + outsize->set_persistable(true); + } + + if (interp_method_ == "nearest") { + op_desc.SetType("nearest_interp"); + } else if (interp_method_ == "bilinear") { + 
op_desc.SetType("bilinear_interp"); + } else { + LOG(FATAL) << "unsupport"; + } + op_desc.SetInput("X", {x_var_name}); + if (use_outsize_) { + op_desc.SetInput("OutSize", {outsize_var_name}); + } + op_desc.SetOutput("Out", {out_var_name}); + op_desc.SetAttr("scale", scale_); + op_desc.SetAttr("out_h", out_h_); + op_desc.SetAttr("out_w", out_w_); + op_desc.SetAttr("align_corners", align_corners_); + op_desc.SetAttr("align_mode", align_mode_); + op_desc.SetAttr("interp_method", interp_method_); + auto op = CreateOp(op_desc, &scope); + + if (interp_method_ == "nearest") { + ResizeNearestAlign(x, outref, align_corners_); + } else if (interp_method_ == "bilinear") { + BilinearInterpRef(x, outref, align_corners_, align_mode_); + } + + int in = dims_[0], ic = dims_[1], ih = dims_[2], iw = dims_[3]; + Tensor input_trans; + input_trans.Resize(dims_); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {in, ic, ih, iw}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + if (use_outsize_) { + LaunchOp(op, {x_var_name, outsize_var_name}, {out_var_name}); + } else { + LaunchOp(op, {x_var_name}, {out_var_name}); + } + + auto* out_ref_data = outref->mutable_data(); + + Tensor output_trans; + output_trans.Resize(out_shape_nchw); + transpose( + out->mutable_data(), + output_trans.mutable_data(), + {static_cast(dims_[0]), out_h, out_w, static_cast(dims_[1])}, + {0, 3, 1, 2}); + auto* out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); ++i) { + EXPECT_NEAR(out_data[i], out_ref_data[i], abs_error); + } + } +}; + +void TestInterpOuthw(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + for (int out_h : {6, 8, 12}) { + for (int out_w : {6, 9}) { + printf("testcase %s: out_w %d, out_h %d\n", + interp_method.c_str(), + out_w, + out_h); + InterpComputeTester tester( + "def", DDim(x_dims), interp_method, -1.f, out_h, out_w); + tester.Execute(abs_error); + } + } + } + } +} + +void TestInterpScale(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + for (float scale : {0.3f, 1.f, 1.7f}) { + printf("testcase %s: scale: %f\n", interp_method.c_str(), scale); + InterpComputeTester tester("def", DDim(x_dims), interp_method, scale); + tester.Execute(abs_error); + } + } + } +} + +void TestInterpOutsize(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + printf("testcase %s: outsize: %d %d\n", interp_method.c_str(), 4, 4); + InterpComputeTester tester( + "def", DDim(x_dims), interp_method, -1, 4, 4, true, 1, true); + tester.Execute(abs_error); + } + } +} + +void TestInterpAlignCorners(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + for (bool align_corners : {true, false}) { + printf( + "testcase nearest: scale: 0.4, out_w -1 out_h -1, align_corners %d\n", + align_corners); + InterpComputeTester tester( + "def", DDim(x_dims), "nearest", 0.4, -1, -1, align_corners); + tester.Execute(abs_error); + } + } +} + +void TestInterpAlignMode(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + for (bool align_corners : {true, false}) { + for (int align_mode : {0, 1}) { + 
printf( + "testcase bilinear: scale: 0.7, out_w -1 out_h -1, align_corners " + "%d, mode %d\n", + align_corners, + align_mode); + InterpComputeTester tester("def", + DDim(x_dims), + "bilinear", + 0.7, + -1, + -1, + align_corners, + align_mode); + tester.Execute(abs_error); + } + } + } +} + +TEST(MLUBridges, interpolate) { + float abs_error = 2e-5; + TestInterpOuthw(abs_error); + TestInterpScale(abs_error); + // bug, not usable + // TestInterpOutsize(abs_error); + TestInterpAlignCorners(abs_error); + // only for bilinear interp + // TestInterpAlignMode(abs_error); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h index 1b12970afadd4e3bdcd7568c05bc15583ccbaaae..d31ba0dd41111860a3b26d8ac3afb3273bef4557 100644 --- a/lite/kernels/mlu/bridges/paddle_use_bridges.h +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU); USE_SUBGRAPH_BRIDGE(softmax, kMLU); USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU); +USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU); +USE_SUBGRAPH_BRIDGE(concat, kMLU); +USE_SUBGRAPH_BRIDGE(scale, kMLU); diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc index 3119b6c77dca10641c7c7c32072969fedb1ecef6..f77c8084c76fc52c39938e723f02bde9b3cac41b 100644 --- a/lite/kernels/mlu/bridges/pool_op.cc +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input, and attributes auto x_var_name = op_info->Input("X").front(); auto x = scope->FindTensor(x_var_name); - auto input_dims_nhwc = x->dims(); - const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); auto output_var_name = op_info->Output("Out").front(); + auto output_shape = scope->FindTensor(output_var_name)->dims().Vectorize(); auto pooling_type = op_info->GetAttr("pooling_type"); auto ceil_mode = op_info->GetAttr("ceil_mode"); auto paddings = op_info->GetAttr>("paddings"); @@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides, ksize); - std::vector output_shape({input_dims[0], input_dims[1]}); - for (size_t i = 0; i < 2; i++) { - output_shape.push_back( - (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) / - strides[i] + - 1); - } + // std::vector output_shape({input_dims[0], input_dims[1]}); + // for (size_t i = 0; i < 2; i++) { + // output_shape.push_back( + // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - + // ksize[0]) / + // strides[i] + + // 1); + // } - auto output_shape_nhwc = DimNCHW2NHWC(output_shape); - auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, - CNML_TENSOR, - CNML_NHWC, - graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); + auto output_tensor = graph->AddNode( + output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlPoolOpParam_t pool_param; CNML_CALL( diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc index 29ef68781f4a99ebcc20901dabab6ee22a258424..8cee8dbe86109b14cff49f329d71074a9b3bfb61 100644 --- a/lite/kernels/mlu/bridges/pool_op_test.cc +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int 
PoolConverter(void* ctx, OpLite* op); - void pool_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -182,12 +180,7 @@ void test_pool(int bs, {0, 2, 3, 1}); auto os = out->dims(); - out->Resize({static_cast(os[0]), - static_cast(os[2]), - static_cast(os[3]), - static_cast(os[1])}); x->CopyDataFrom(input_trans); - x->Resize({bs, ih, iw, ic}); LaunchOp(op, {x_var_name}, {out_var_name}); @@ -275,6 +268,4 @@ TEST(MLUBridges, pool) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - pool2d, - paddle::lite::subgraph::mlu::PoolConverter); +USE_SUBGRAPH_BRIDGE(pool2d, kMLU) diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5557602bd7576ccd71c51f52a538a45fe27f7ada --- /dev/null +++ b/lite/kernels/mlu/bridges/scale_op.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + auto bias_after_scale = op_info->GetAttr("bias_after_scale"); + auto scale = op_info->GetAttr("scale"); + auto bias = op_info->GetAttr("bias"); + auto beta = bias_after_scale ? 
bias : bias * scale; + + std::vector shape = {1, 1, 1, 1}; + + std::string prefix = string_format("_%p", op); + auto alpha_tensor = graph->AddNode( + "Alpha" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + auto beta_tensor = graph->AddNode( + "Beta" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + + graph->BindConstRawData("Alpha" + prefix, &scale, 1); + graph->BindConstRawData("Beta" + prefix, &beta, 1); + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t scale_op; + CNML_CALL(cnmlCreateScaleOp(&scale_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor(), + beta_tensor->mlu_tensor())); + graph->FuseOp(scale_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(scale, + kMLU, + paddle::lite::subgraph::mlu::ScaleConverter); diff --git a/lite/kernels/mlu/bridges/scale_op_test.cc b/lite/kernels/mlu/bridges/scale_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0ed975a84a174d1a58c9ed23bb925fdcc82b46f --- /dev/null +++ b/lite/kernels/mlu/bridges/scale_op_test.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
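+// scale_ref below mirrors the converter's folding of the op's two forms:
+//   bias_after_scale == true  : out = x * scale + bias
+//   bias_after_scale == false : out = (x + bias) * scale = x * scale + bias * scale
+// e.g. x = 2, scale = 5, bias = -2 gives 8 in the first form and 0 in the
+// second, which is why the reference multiplies bias by scale up front.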
+ +#include "lite/operators/scale_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void scale_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + float scale = op_info->GetAttr("scale"); + float bias = op_info->GetAttr("bias"); + bool bias_after_scale = op_info->GetAttr("bias_after_scale"); + if (!bias_after_scale) { + bias *= scale; + } + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = x_data[i] * scale + bias; + } +} + +void test_scale(int bs, + int ic, + int ih, + int iw, + bool bias_after_scale, + float scale, + float bias) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("scale"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("bias_after_scale", bias_after_scale); + opdesc.SetAttr("scale", scale); + opdesc.SetAttr("bias", bias); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + scale_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor('out') + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(os); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, scale) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 4}) { + for (auto iw : {4, 3}) { + for (auto bias_after_scale : {false, true}) { + for (auto scale : {-1.0f, 5.0f}) { + for (auto bias : {-2.0f, 30.0f}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw + // << " bias_after_scale: " << bias_after_scale + << " scale: " << scale << " bias: " << bias; + test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias); + } + } + } + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + 
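+// Tests only declare USE_SUBGRAPH_BRIDGE(scale, kMLU); the matching
+// REGISTER_SUBGRAPH_BRIDGE in scale_op.cc performs the actual registration,
+// and this declaration keeps the static registrar from being dropped at
+// link time.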
+USE_SUBGRAPH_BRIDGE(scale, kMLU);
diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc
index b9e2b1116dc95ec276f8d85a5669cec45d98ea39..17c911675718a15c7ede4888b268ffcd62b4d8ed 100644
--- a/lite/kernels/mlu/bridges/softmax_op.cc
+++ b/lite/kernels/mlu/bridges/softmax_op.cc
@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
       axis = output_dims.size() + axis;
     }
   }
-  int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
 
   auto output_tensor = graph->AddNode(
-      out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   cnmlBaseOp_t softmax_op;
   CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op,
-                                  nhwc_axis,
+                                  axis,
diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc
index 7ceb050d8008f8186fdd737c394d8fe8dc0ffd7f..a5251ed43c9187fc2874f9b01853b45b8abf7f1c 100644
--- a/lite/kernels/mlu/bridges/softmax_op_test.cc
+++ b/lite/kernels/mlu/bridges/softmax_op_test.cc
@@ -23,8 +23,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
-int SoftmaxConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
   Scope* scope = op->scope();
@@ -112,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
             {bs, ic, ih, iw},
             {0, 2, 3, 1});
-  out->Resize({bs, ih, iw, ic});
   x->CopyDataFrom(input_trans);
-  x->Resize({bs, ih, iw, ic});
 
   LaunchOp(op, {x_var_name}, {out_var_name});
 
@@ -171,6 +167,4 @@
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         softmax,
-                         paddle::lite::subgraph::mlu::SoftmaxConverter);
+USE_SUBGRAPH_BRIDGE(softmax, kMLU)
diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h
index 7bb2e1b20334e359b2db0ecf1fe61e16175413dc..12dc97a772dabc529bf183f783a22a9f2dfa936d 100644
--- a/lite/kernels/mlu/bridges/tensor.h
+++ b/lite/kernels/mlu/bridges/tensor.h
@@ -47,6 +47,8 @@ class MLUTensor {
     return mlu_ptr_;
   }
 
+  void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
+
   ~MLUTensor();
 
  private:
diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc
index cf2d7bd6c1ec5634bb0d7556a16166ac0b0bcb45..377a00689ef3a27f78ae008072578ab3701cd337 100644
--- a/lite/kernels/mlu/bridges/test_helper.cc
+++ b/lite/kernels/mlu/bridges/test_helper.cc
@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
               const std::vector<std::string>& input_var_names,
               const std::vector<std::string>& output_var_names) {
   CNRT_CALL(cnrtInit(0));
-  SetMluDevice(0);
+  ::paddle::lite::SetMluDevice(0);
   cnrtQueue_t queue_;
   cnrtInvokeFuncParam_t forward_param;
   u32_t affinity = 1;
@@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
   const auto& bridges = subgraph::Registry::Instance();
   CHECK(bridges.Exists(op_type, TARGET(kMLU)));
 
-  // Convert all of input data vars and added into the MLU IR graph
+  // Convert each input data var and add it into the MLU IR graph
   for (auto& input_name : input_var_names) {
     auto input_tensor = scope->FindMutableTensor(input_name);
     CHECK(input_tensor);
@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
     graph.AddNode(input_name,
                   input_tensor->dims().Vectorize(),
                   CNML_TENSOR,
-                  CNML_NHWC,
+                  CNML_NCHW,
                   graph.FPType(),
                   reinterpret_cast<void*>(
                       input_tensor->mutable_data<float>(TARGET(kMLU))));
@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
                          sizeof(float) * input_tensor->dims().production(),
                          CNRT_MEM_TRANS_DIR_HOST2DEV));
   }
+  op->CheckShape();
+  op->InferShape();
   bridges.Select(op_type,
TARGET(kMLU))(
      reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h
index 2af8274e07713300277f7280f12e6d1fcb47c3c2..fa8fb1597c0fb068a855928dd20057d48ecd5eaf 100644
--- a/lite/kernels/mlu/bridges/utility.h
+++ b/lite/kernels/mlu/bridges/utility.h
@@ -84,7 +84,7 @@ struct FPTypeTraits {
 
 template <>
 struct FPTypeTraits<PRECISION(kFP16)> {
-  typedef ::paddle::lite::fluid::float16 T;
+  typedef paddle::lite::fluid::float16 T;
 };
 
 }  // namespace mlu
diff --git a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc
index bc6e1838d70383edb3dcc65d7a9b0f627719e963..02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9 100644
--- a/lite/kernels/mlu/io_copy_compute.cc
+++ b/lite/kernels/mlu/io_copy_compute.cc
@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
-
-// kMLU,
-// kFloat,
-// kNHWC,
-// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
-// host_to_device)
-// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
-// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
-// .Finalize();
-//
-//
-// kMLU,
-// kFloat,
-// kNHWC,
-// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
-// device_to_host)
-// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
-// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-// .Finalize();
diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4e16734d6d2dae6f5c119194008bce114a2e918
--- /dev/null
+++ b/lite/kernels/mlu/layout_compute.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
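+// Registration of the MLU layout-transform kernels defined in
+// layout_compute.h: NCHW <-> NHWC for kFloat and kFP16, plus NCHW -> NHWC
+// for kInt8. The transpose itself runs on the host via x86 math::Transpose.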
+ +#include "lite/kernels/mlu/layout_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu {} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFloat, + kNHWC, + paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, + def_layout_nhwc2nchw_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFP16, + kNHWC, + paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, + def_layout_nhwc2nchw_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFloat, + kNHWC, + paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, + def_layout_nchw2nhwc_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFP16, + kNHWC, + paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, + def_layout_nchw2nhwc_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kInt8, + kNHWC, + paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, + def_layout_nchw2nhwc_fp32_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..edacdf8a98a2ffde6e538f61d4dd8259e3211b22 --- /dev/null +++ b/lite/kernels/mlu/layout_compute.h @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
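+// Host-side layout converters. The axis vector handed to Transpose is the
+// usual permutation, e.g. for 4-D tensors:
+//   NCHW -> NHWC: axis = {0, 2, 3, 1}, (n, c, h, w) -> (n, h, w, c)
+//   NHWC -> NCHW: axis = {0, 3, 1, 2}, (n, h, w, c) -> (n, c, h, w)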
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/backends/x86/math/math_function.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/operators/layout_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+template <paddle::lite_api::PrecisionType>
+struct FPTypeTraits {};
+
+template <>
+struct FPTypeTraits<PRECISION(kFloat)> {
+  typedef float T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kFP16)> {
+  typedef paddle::lite::fluid::float16 T;
+};
+
+template <>
+struct FPTypeTraits<PRECISION(kInt8)> {
+  typedef int8_t T;
+};
+
+template <lite::TargetType Target, typename T>
+inline void LayoutTransCompute(const int dim,
+                               const lite::Context<Target>& context,
+                               const lite::Tensor& in,
+                               lite::Tensor* out,
+                               const std::vector<int>& axis) {
+  switch (dim) {
+    case 2:
+      paddle::lite::x86::math::Transpose<Target, T, 2> trans2;
+      trans2(context, in, out, axis);
+      break;
+    case 3:
+      paddle::lite::x86::math::Transpose<Target, T, 3> trans3;
+      trans3(context, in, out, axis);
+      break;
+    case 4:
+      paddle::lite::x86::math::Transpose<Target, T, 4> trans4;
+      trans4(context, in, out, axis);
+      break;
+    default:
+      CHECK(0) << "Unsupported dim in mlu layout";
+  }
+}
+
+template <paddle::lite_api::PrecisionType Precision>
+class LayoutNchwToNhwcCompute
+    : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::LayoutParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    auto* x = param.x;
+    auto* out = param.y;
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
+    auto x_dims = param.x->dims().size();
+    auto& context = this->ctx_->template As<X86Context>();
+
+    const auto origin_dims = out->dims().Vectorize();
+
+    std::vector<int> axis;
+    switch (x_dims) {
+      case 2:
+        axis = {0, 1};
+        break;
+      case 3:
+        axis = {0, 2, 1};
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[2], out->dims()[1]});
+        break;
+      case 4:
+        axis = {0, 2, 3, 1};
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
+        break;
+      default:
+        CHECK(0) << "Unsupported dim in mlu layout nchw to nhwc";
+    }
+
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
+        x_dims, context, *x, out, axis);
+
+    if (x_dims > 2) {
+      out->Resize(origin_dims);
+    }
+  }
+
+  std::string doc() const override {
+    return "Mlu layout transform nchw to nhwc";
+  }
+};
+
+template <paddle::lite_api::PrecisionType Precision>
+class LayoutNhwcToNchwCompute
+    : public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::LayoutParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    auto* x = param.x;
+    auto* out = param.y;
+    out->template mutable_data<typename FPTypeTraits<Precision>::T>();
+    auto x_dims = param.x->dims().size();
+    auto& context = this->ctx_->template As<X86Context>();
+
+    const auto origin_dims = out->dims().Vectorize();
+
+    std::vector<int> axis;
+    switch (x_dims) {
+      case 2:
+        axis = {0, 1};
+        break;
+      case 3:
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[2], out->dims()[1]});
+        axis = {0, 2, 1};
+        break;
+      case 4:
+        out->Resize(std::vector<int64_t>{
+            out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
+        axis = {0, 3, 1, 2};
+        break;
+      default:
+        CHECK(0) << "Unsupported dim in mlu layout nhwc to nchw";
+    }
+
+    LayoutTransCompute<lite::TargetType::kX86,
+                       typename FPTypeTraits<Precision>::T>(
+        x_dims, context, *x, out, axis);
+
+    if (x_dims > 2) {
+      out->Resize(origin_dims);
+    }
+  }
+
+  std::string doc() const override {
+    return "Mlu layout transform nhwc to nchw";
+  }
+};
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h
index 06fc791fe7d07ba759e2ed0f9c6187432e195186..3bfba33f4d7e8fd86f7aaf276da2ca4a8b0bd7cf 100644
--- a/lite/kernels/mlu/subgraph_compute.h
+++ b/lite/kernels/mlu/subgraph_compute.h
@@ -46,6
+46,32 @@ class SubgraphEngine : public subgraph::Engine { graph_.SetFPType(type); } + int Build() { + // In order to attach all of the ops of the block desc, we need to build + // the original program firstly. + BuildOriginProgram(); + // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph + build_device_program_status_ = BuildDeviceProgram(); + return build_device_program_status_; + } + + int Launch() { + // Rebuild device program when the shapes of input tensors have been + // changed. + if (subgraph::CHECK_SUCCESS(build_device_program_status_) && + subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED( + build_device_program_status_) && + InputShapeChanged()) { + Build(); + } + if (subgraph::CHECK_FAILED(build_device_program_status_)) { + LaunchOriginProgram(); + } else { + LaunchDeviceProgram(); + } + return 0; + } + protected: int BuildDeviceProgram() override { int status = 0; @@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine { graph_.AddNode(input_name, input_tensor->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph_.FPType(), const_cast(input_tensor->raw_data())); CHECK(input_node); @@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine { for (auto& inst : origin_program_) { auto op = inst.op(); CHECK(op); - op->CheckShape(); - op->InferShape(); std::string op_type = op->op_info()->Type(); + op->CheckShape(); + const_cast(op)->InferShape(); if (!bridges.Exists(op_type, TARGET(kMLU))) { LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; return subgraph::FAILED; @@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine { graph_.AddInput(graph_.GetNode(input_name)); } CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; - // auto& mlu_context = this->ctx_->template As(); - // auto core_version = mlu_context.MLUCoreVersion(); - // auto core_number = mlu_context.MLUCoreNumber(); - // graph_.Compile(core_version, core_number); + auto& mlu_context = this->ctx_->template As(); + auto core_version = mlu_context.MLUCoreVersion(); + auto core_number = mlu_context.MLUCoreNumber(); + graph_.Compile(core_version, core_number); return status; } int LaunchDeviceProgram() override { - // auto& mlu_context = this->ctx_->template As(); - // auto exec_queue = mlu_context.exec_queue(); - // u32_t affinity = mlu_context.affinity(); - // cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); - // int data_param = 1; - // forward_param.data_parallelism = &data_param; - // forward_param.affinity = &affinity; - // forward_param.end = CNRT_PARAM_END; - // graph_.Compute(forward_param, exec_queue); + auto& mlu_context = this->ctx_->template As(); + auto exec_queue = mlu_context.exec_queue(); + u32_t affinity = mlu_context.affinity(); + cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); + int data_param = 1; + forward_param.data_parallelism = &data_param; + forward_param.affinity = &affinity; + forward_param.end = CNRT_PARAM_END; + graph_.Compute(forward_param, exec_queue); return 0; } diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index bcf6ba63eb820ee187dd26b2722686a768f78c98..e53bd60c6bade98992524fe0959e2f80f535a6be 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) return() endif() diff --git a/lite/kernels/opencl/CMakeLists.txt 
b/lite/kernels/opencl/CMakeLists.txt index 8b1a1f8d3d950840ce8fadef70150c452b54c186..d9fae3d48efb1eab2681338b02afa2fee65750b6 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -33,7 +33,7 @@ add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kerne add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps}) - +add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps}) # extra # wait to add ... @@ -97,6 +97,10 @@ lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc DEPS pad2d_opencl layout_opencl op_registry program context) + +lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc + DEPS box_coder_opencl op_registry program context) + ###################### # buffer kernel # ###################### diff --git a/lite/kernels/opencl/box_coder_image_compute.cc b/lite/kernels/opencl/box_coder_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..81ad858df0834f58b84b55ef594d71442a27f186 --- /dev/null +++ b/lite/kernels/opencl/box_coder_image_compute.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
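+
+// What follows implements the "decode_center_size" box decode on the
+// OpenCL image path. Per box, with prior-box (anchor) size (pw, ph) and
+// center (px, py), variance v = (v0, v1, v2, v3), and predicted deltas
+// t = (t0, t1, t2, t3), the decoded box is
+//   ox = v0 * t0 * pw + px        oy = v1 * t1 * ph + py
+//   ow = pw * exp(v2 * t2)        oh = ph * exp(v3 * t3)
+// written out as the corner pair (ox - ow/2, oy - oh/2, ox + ow/2,
+// oy + oh/2). This is a sketch of the math only: the device code itself is
+// loaded from image/box_coder_kernel.cl via AddKernel() below, and the CPU
+// reference box_coder_ref in the accompanying test follows the same
+// formulas.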
+
+#include <memory>
+#include <string>
+#include "lite/backends/opencl/cl_half.h"
+#include "lite/backends/opencl/cl_image_converter.h"
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/logging.h"
+#include "lite/utils/replace_stl/stream.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
+                                               PRECISION(kFP16),
+                                               DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::BoxCoderParam;
+
+  void PrepareForRun() override {
+    auto& context = ctx_->As<OpenCLContext>();
+    boxcoder_param_ = param_.get_mutable<param_t>();
+    if (boxcoder_param_->code_type == "decode_center_size" &&
+        boxcoder_param_->box_normalized == true) {
+      kernel_func_name_ = "decode_center_size";
+    } else {
+      printf("The code_type %s is not supported\n",
+             boxcoder_param_->code_type.c_str());
+      return;
+    }
+    CHECK(context.cl_context() != nullptr);
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/box_coder_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    boxcoder_param_ = param_.get_mutable<param_t>();
+    const auto& out_dims = boxcoder_param_->proposals->dims();
+    auto image_shape = InitImageDimInfoWith(out_dims);
+
+    auto* out_buf =
+        boxcoder_param_->proposals->mutable_data<half_t, cl::Image2D>(
+            image_shape["width"], image_shape["height"]);
+
+#ifndef LITE_SHUTDOWN_LOG
+    VLOG(4) << "boxcoder input shape: ";
+#endif
+    const auto* input_priorbox = boxcoder_param_->prior_box;
+    const auto* input_priorboxvar = boxcoder_param_->prior_box_var;
+    const auto* input_targetbox = boxcoder_param_->target_box;
+    const auto& code_type = boxcoder_param_->code_type;
+    if (code_type == "decode_center_size") {
+      auto* prior_box_image = input_priorbox->data<half_t, cl::Image2D>();
+      auto* prior_box_var_image =
+          input_priorboxvar->data<half_t, cl::Image2D>();
+      auto* target_box_image = input_targetbox->data<half_t, cl::Image2D>();
+
+      int new_dims[4] = {1, 1, 1, 1};
+      for (int i = 0; i < out_dims.size(); i++) {
+        new_dims[4 - out_dims.size() + i] = out_dims[i];
+      }
+      auto& context = ctx_->As<OpenCLContext>();
+      CHECK(context.cl_context() != nullptr);
+      STL::stringstream kernel_key;
+      kernel_key << kernel_func_name_ << build_options_;
+      auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+
+      auto default_work_size = DefaultWorkSize(
+          out_dims,
+          DDim(std::vector<DDim::value_type>{
+              static_cast<DDim::value_type>(image_shape["width"]),
+              static_cast<DDim::value_type>(image_shape["height"])}));
+
+      int out_C = new_dims[1];
+      int out_H = new_dims[2];
+#ifndef LITE_SHUTDOWN_LOG
+      VLOG(4) << TargetToStr(boxcoder_param_->proposals->target());
+      VLOG(4) << "output shape: " << out_dims[0] << ", " << out_dims[1] << ", "
+              << out_dims[2] << ", " << out_dims[3];
+      VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
+              << image_shape["height"];
+      VLOG(4) << "out_C = " << out_C;
+      VLOG(4) << "out_H = " << out_H;
+      VLOG(4) << "default_work_size = " << default_work_size[0] << ", "
+              << default_work_size[1] << ", " << default_work_size[2];
+#endif
+      int arg_idx = 0;
+      cl_int status = kernel.setArg(arg_idx++, *prior_box_image);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(arg_idx++, *prior_box_var_image);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(arg_idx++, *target_box_image);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(arg_idx++, *out_buf);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(arg_idx++, out_C);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(arg_idx++, out_H);
+      CL_CHECK_FATAL(status);
+      auto
global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[2])}; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_buf, event_); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " + << global_work_size[1]; +#endif + } + } + std::string doc() { return "Boxcoder using cl::Image, kFP16"; } + + param_t* boxcoder_param_{nullptr}; + std::string kernel_func_name_{}; + std::string build_options_{" -DCL_DTYPE_half"}; + std::shared_ptr event_{new cl::Event}; +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle +typedef paddle::lite::kernels::opencl::BoxCoderComputeImage BoxCoder_image; + +REGISTER_LITE_KERNEL( + box_coder, kOpenCL, kFP16, kImageDefault, BoxCoder_image, ImageDefault) + .BindInput("PriorBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("PriorBoxVar", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("TargetBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("OutputBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); diff --git a/lite/kernels/opencl/box_coder_image_compute_test.cc b/lite/kernels/opencl/box_coder_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ab37a8b015a80c0389bd6f62bb07c70c0d14a74a --- /dev/null +++ b/lite/kernels/opencl/box_coder_image_compute_test.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" + +#define FP16_MAX_DIFF (5e-1) +namespace paddle { +namespace lite { +void box_coder_ref(float* proposals_data, + const float* anchors_data, + const float* bbox_deltas_data, + const float* variances_data, + int axis, + bool box_normalized, + std::string code_type, + int row, + int col) { + if (code_type == "decode_center_size") { + int anchor_len = 4; + int out_len = 4; + int var_len = 4; + int delta_len = 4; + float normalized = !box_normalized ? 1.f : 0; + + for (int64_t row_id = 0; row_id < row; ++row_id) { + for (int64_t col_id = 0; col_id < col; ++col_id) { + size_t delta_offset = row_id * col * delta_len + col_id * delta_len; + size_t out_offset = row_id * col * out_len + col_id * out_len; + int prior_box_offset = + axis == 0 ? col_id * anchor_len : row_id * anchor_len; + int var_offset = axis == 0 ? 
col_id * var_len : row_id * var_len; + auto anchor_data_tmp = anchors_data + prior_box_offset; + auto bbox_deltas_data_tmp = bbox_deltas_data + delta_offset; + auto proposals_data_tmp = proposals_data + out_offset; + auto anchor_width = + anchor_data_tmp[2] - anchor_data_tmp[0] + normalized; + auto anchor_height = + anchor_data_tmp[3] - anchor_data_tmp[1] + normalized; + auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; + auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; + float bbox_center_x = 0, bbox_center_y = 0; + float bbox_width = 0, bbox_height = 0; + + auto variances_data_tmp = variances_data + var_offset; + bbox_center_x = + variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = + variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) * + anchor_width; + bbox_height = + std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) * + anchor_height; + proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; + proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; + proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized; + proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized; + } + } + } else if (code_type == "encode_center_size") { + LOG(FATAL) << "not implemented type: " << code_type; + } else { + LOG(FATAL) << "not supported type: " << code_type; + } +} +// #define BOXCODER_FP16_LOOP_TEST +// #define BOXCODER_FP16_PRINT_RESULT +TEST(box_coder_image2d, compute) { +#ifdef BOXCODER_FP16_LOOP_TEST + for (auto n : {1, 2, 3, 4}) { + for (auto m : {1, 3, 4, 8}) { + for (auto norm : {true}) { + for (auto code_type : {"decode_center_size"}) { + for (auto axis : {0}) { +#else + const int n = 1; + const int m = 1; + const bool norm = true; + const std::string code_type = "decode_center_size"; + const int axis = 0; +#endif // BOXCODER_FP16_LOOP_TEST + + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << m + << " ========"; + LOG(INFO) << "======== parameters: norm = " << norm + << ", axis = " << axis << "code_type: " << code_type; + + auto kernels = + KernelRegistry::Global().Create("box_coder", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + LOG(INFO) << "get kernel:" << kernel->doc(); + + lite::Tensor prior_box, prior_box_var, target_box, output_box; + operators::BoxCoderParam param; + param.prior_box = &prior_box; + param.prior_box_var = &prior_box_var; + param.target_box = &target_box; + param.proposals = &output_box; + param.axis = axis; + param.box_normalized = norm; + param.code_type = code_type; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + std::unique_ptr boxcoder_context(new KernelContext); + context->As().CopySharedTo( + &(boxcoder_context->As())); + kernel->SetContext(std::move(boxcoder_context)); + + const DDim prior_box_dims = + DDim(std::vector{1, 1, m, 4}); + const DDim prior_box_var_dims = + DDim(std::vector{1, 1, m, 4}); + const DDim target_box_dims = + DDim(std::vector{1, n, m, 4}); + const DDim out_dim = + DDim(std::vector{1, n, m, 4}); + prior_box.Resize(prior_box_dims); + prior_box_var.Resize(prior_box_var_dims); + target_box.Resize(target_box_dims); + output_box.Resize(out_dim); + + std::vector prior_box_data(prior_box_dims.production()); + std::vector prior_box_var_data( + 
prior_box_var_dims.production()); + std::vector target_box_data(target_box_dims.production()); + for (int i = 0; i < prior_box_dims.production(); i++) { + prior_box_data[i] = i * 1.1 / prior_box_dims.production(); + } + for (int i = 0; i < prior_box_var_dims.production(); i++) { + prior_box_var_data[i] = i * 1.2 / prior_box_var_dims.production(); + } + for (int i = 0; i < target_box_dims.production(); i++) { + target_box_data[i] = i * 1.3 / target_box_dims.production(); + } + + LOG(INFO) << "prepare input"; + CLImageConverterDefault* default_converter = + new CLImageConverterDefault(); + DDim prior_box_image_shape = + default_converter->InitImageDimInfoWith(prior_box_dims); + LOG(INFO) << "prior_box_image_shape = " << prior_box_image_shape[0] + << " " << prior_box_image_shape[1]; + std::vector prior_box_image_data( + prior_box_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage(prior_box_data.data(), + prior_box_image_data.data(), + prior_box_dims); + auto* prior_box_image = prior_box.mutable_data( + prior_box_image_shape[0], + prior_box_image_shape[1], + prior_box_image_data.data()); + + DDim prior_box_var_image_shape = + default_converter->InitImageDimInfoWith(prior_box_var_dims); + LOG(INFO) << "prior_box_var_image_shape = " + << prior_box_var_image_shape[0] << " " + << prior_box_var_image_shape[1]; + std::vector prior_box_var_image_data( + prior_box_var_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage(prior_box_var_data.data(), + prior_box_var_image_data.data(), + prior_box_var_dims); + auto* prior_box_var_image = + prior_box_var.mutable_data( + prior_box_var_image_shape[0], + prior_box_var_image_shape[1], + prior_box_var_image_data.data()); + + DDim target_box_image_shape = + default_converter->InitImageDimInfoWith(target_box_dims); + LOG(INFO) << "target_box_image_shape = " + << target_box_image_shape[0] << " " + << target_box_image_shape[1]; + std::vector target_box_image_data( + target_box_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage(target_box_data.data(), + target_box_image_data.data(), + target_box_dims); + auto* target_box_image = + target_box.mutable_data( + target_box_image_shape[0], + target_box_image_shape[1], + target_box_image_data.data()); + + DDim out_image_shape = + default_converter->InitImageDimInfoWith(out_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = output_box.mutable_data( + out_image_shape[0], out_image_shape[1]); + kernel->Launch(); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.proposals->data(); + auto it = wait_list->find(out_ptr); + if (it != wait_list->end()) { + VLOG(4) << "--- Find the sync event for the target cl " + "tensor. 
---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the " + "target cl tensor."; + } + + lite::Tensor out_ref_tensor; + out_ref_tensor.Resize(out_dim); + box_coder_ref(out_ref_tensor.mutable_data(), + prior_box_data.data(), + target_box_data.data(), + prior_box_var_data.data(), + axis, + norm, + code_type, + target_box_dims[0], + target_box_dims[1]); + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + half_t* out_image_data = + new half_t[40000]; // [out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + out_image, + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter->ImageToNCHW( + out_image_data, out_data, out_image_shape, out_dim); +// result +#ifdef BOXCODER_FP16_PRINT_RESULT + LOG(INFO) << "---- print kernel result (input -> output) ----"; + for (int eidx = 0; eidx < out_dim.production(); ++eidx) { + std::cout << target_box_data[eidx] << " -> " << out_data[eidx] + << std::endl; + } +#endif // BOXCODER_FP16_PRINT_RESULT + const float* out_ref = out_ref_tensor.data(); + for (int i = 0; i < out_dim.production(); i++) { + auto abs_diff = abs(out_data[i] - out_ref[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]); + EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || + (abs_diff <= FP16_MAX_DIFF), + true); + if ((relative_diff > FP16_MAX_DIFF) && + (abs_diff > FP16_MAX_DIFF)) { + LOG(ERROR) << "error idx:" << i << ", in_data[" << i + << "]: " << target_box_data[i] << ", out_data[" << i + << "]: " << out_data[i] << ", out_ref[" << i + << "]: " << out_ref[i] << ", abs_diff: " << abs_diff + << ", relative_diff: " << relative_diff + << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF; + } + } +#ifdef BOXCODER_FP16_LOOP_TEST + } // axis + } // code_type + } // norm + } // m + } // n +#else +// nothing to do. 
+#endif +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(box_coder, kOpenCL, kFP16, kImageDefault, ImageDefault); diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt index 3d79dc3dfee80613c39f51323e7ba61adcf7cd8a..2036a343d722d5c01a4b9dcd0d4cdf682a92d218 100644 --- a/lite/kernels/x86/CMakeLists.txt +++ b/lite/kernels/x86/CMakeLists.txt @@ -2,7 +2,7 @@ if(NOT LITE_WITH_X86) return() endif() -add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops math_function) +add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_function) # lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps}) @@ -30,6 +30,8 @@ add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute) #add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(sequence_unpad_compute_x86 X86 basic SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps} sequence_padding) +add_kernel(sequence_conv_compute_x86 X86 basic SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_function blas context_project) # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) diff --git a/lite/kernels/x86/activation_compute.h b/lite/kernels/x86/activation_compute.h index 5d8110e67c17f3a0f8d3211179df831dad83cc9b..65d270e02fab902a1dfa92ddf27de040ef43a1b9 100644 --- a/lite/kernels/x86/activation_compute.h +++ b/lite/kernels/x86/activation_compute.h @@ -21,7 +21,7 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/fluid/eigen.h" -#include "lite/operators/activation_ops.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -231,8 +231,8 @@ class SoftsignCompute : public KernelLite { // auto& context = ctx_->As(); auto& param = *param_.get_mutable(); - const T* x_data = param.X->data(); - T* out_data = param.Out->mutable_data(); + const T* x_data = param.X->template data(); + T* out_data = param.Out->template mutable_data(); size_t x_size = param.X->numel(); for (size_t i = 0; i < x_size; i++) { out_data[i] = x_data[i] / (static_cast(1) + std::abs(x_data[i])); diff --git a/lite/kernels/x86/attention_padding_mask_compute.h b/lite/kernels/x86/attention_padding_mask_compute.h index b9124e5ad49a0d68c41a21fe55d28102f09d14b9..f6d3d5aa31df1f188c196ac283c734c879f40244 100644 --- a/lite/kernels/x86/attention_padding_mask_compute.h +++ b/lite/kernels/x86/attention_padding_mask_compute.h @@ -45,9 +45,9 @@ class AttentionPaddingMaskCompute auto src_len = static_cast(bottom1->lod()[0][1]); const int att_batch = bottom0->lod()[0].size() - 1; const int src_batch = bottom1->lod()[0].size() - 1; - int* pad_begin = _pad_begin->mutable_data(); + int* pad_begin = _pad_begin->template mutable_data(); for (int i = 0; i < src_batch; ++i) { - const auto* src_data = bottom1->data() + src_len * i; + const auto* src_data = bottom1->template data() + 
src_len * i; int index = src_len - 1; for (; index >= 0 && _pad_id == static_cast(src_data[index]); --index) { @@ -56,13 +56,14 @@ class AttentionPaddingMaskCompute } const auto att_len = static_cast(bottom0->lod()[0][1]); - auto* top_data = top->mutable_data(); + auto* top_data = top->template mutable_data(); memcpy(top_data, - bottom0->data(), + bottom0->template data(), bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); for (int i = 0; i < att_batch; ++i) { for (int j = 0; j < att_len; ++j) { - top_data = top->mutable_data() + src_len * (att_len * i + j); + top_data = + top->template mutable_data() + src_len * (att_len * i + j); int src_idx = i % src_batch; for (int k = pad_begin[src_idx]; k < src_len; ++k) { top_data[k] = _mask; diff --git a/lite/kernels/x86/batch_norm_compute.h b/lite/kernels/x86/batch_norm_compute.h index 092280752cb92e1784eefc09cb26fa3bea8eb939..0f206b8c32aaaf9b3a1b278a69f3a9aa77a11ba6 100644 --- a/lite/kernels/x86/batch_norm_compute.h +++ b/lite/kernels/x86/batch_norm_compute.h @@ -59,26 +59,26 @@ class BatchNormCompute : public KernelLite { const int sample_size = x->dims().production() / N / C; // alloc memory - param.y->mutable_data(); + param.y->template mutable_data(); if (!param.is_test) { - param.mean_out->mutable_data(); - param.variance_out->mutable_data(); - param.saved_mean->mutable_data(); - param.saved_variance->mutable_data(); + param.mean_out->template mutable_data(); + param.variance_out->template mutable_data(); + param.saved_mean->template mutable_data(); + param.saved_variance->template mutable_data(); } if (!global_stats) { // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e(param.saved_mean->mutable_data(), - C); + EigenVectorArrayMap saved_mean_e( + param.saved_mean->template mutable_data(), C); EigenVectorArrayMap saved_variance_e( - param.saved_variance->mutable_data(), C); + param.saved_variance->template mutable_data(), C); saved_mean_e.setZero(); saved_variance_e.setZero(); - EigenVectorArrayMap running_mean_arr(param.mean_out->mutable_data(), - C); + EigenVectorArrayMap running_mean_arr( + param.mean_out->template mutable_data(), C); EigenVectorArrayMap running_var_arr( - param.variance_out->mutable_data(), C); + param.variance_out->template mutable_data(), C); if ((N * sample_size) == 1) { LOG(WARNING) << "Only 1 element in normalization dimension, " @@ -89,7 +89,8 @@ class BatchNormCompute : public KernelLite { switch (param.data_layout) { case DATALAYOUT(kNCHW): { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + ConstEigenArrayMap x_arr( + x->template data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { saved_mean_e(nc % C) += x_arr.col(nc).sum(); } @@ -115,33 +116,37 @@ class BatchNormCompute : public KernelLite { // use SavedMean and SavedVariance to do normalize Eigen::Array inv_std(C); if (global_stats) { - ConstEigenVectorArrayMap var_arr(param.variance->data(), C); + ConstEigenVectorArrayMap var_arr(param.variance->template data(), + C); inv_std = (var_arr + param.epsilon).sqrt().inverse(); } else { EigenVectorArrayMap saved_inv_std( - param.saved_variance->mutable_data(), C); + param.saved_variance->template mutable_data(), C); // inverse SavedVariance first, gradient will use it too. saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt(); inv_std = saved_inv_std; } ConstEigenVectorArrayMap mean_arr( - global_stats ? param.mean->data() : param.saved_mean->data(), C); + global_stats ? 
param.mean->template data() + : param.saved_mean->template data(), + C); // ((x - est_mean) * (inv_var) * scale + bias // formula transform ====> // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - ConstEigenVectorArrayMap scale_arr(param.scale->data(), C); - ConstEigenVectorArrayMap bias_arr(param.bias->data(), C); + ConstEigenVectorArrayMap scale_arr(param.scale->template data(), C); + ConstEigenVectorArrayMap bias_arr(param.bias->template data(), C); Eigen::Array new_scale = inv_std * scale_arr; Eigen::Array new_bias = bias_arr - mean_arr * inv_std * scale_arr; switch (param.data_layout) { case DATALAYOUT(kNCHW): { - EigenArrayMap y_arr(param.y->mutable_data(), sample_size, N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + EigenArrayMap y_arr( + param.y->template mutable_data(), sample_size, N * C); + ConstEigenArrayMap x_arr(x->template data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); } diff --git a/lite/kernels/x86/cast_compute.cc b/lite/kernels/x86/cast_compute.cc index d342056c7f19e9eba0fe16196d772da6bd5fda3c..bbb63e595269667dedebeafd83cc962d1d0fb878 100644 --- a/lite/kernels/x86/cast_compute.cc +++ b/lite/kernels/x86/cast_compute.cc @@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +REGISTER_LITE_KERNEL( + cast, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>, + fp16_to_any) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/concat_compute.h b/lite/kernels/x86/concat_compute.h index 935f0811d4e7a7cbe2ce5fafa61b6d16a25d4a81..e423cd04f16917f200f45ac93d9a6a09f3fb1c54 100644 --- a/lite/kernels/x86/concat_compute.h +++ b/lite/kernels/x86/concat_compute.h @@ -47,7 +47,7 @@ class ConcatCompute : public KernelLite { int64_t axis = static_cast(param.axis); auto* axis_tensor = param.axis_tensor; if (axis_tensor != nullptr) { - auto* axis_tensor_data = axis_tensor->data(); + auto* axis_tensor_data = axis_tensor->template data(); axis = static_cast(axis_tensor_data[0]); } @@ -60,7 +60,7 @@ class ConcatCompute : public KernelLite { int concat_input_size = count(axis + 1, x_dims.size(), x_dims); const int top_concat_axis = out->dims()[axis]; for (size_t i = 0; i < param.x.size(); ++i) { - const T* bottom_data = param.x[i]->data(); + const T* bottom_data = param.x[i]->template data(); const int64_t bottom_concat_axis = param.x[i]->dims()[axis]; for (int n = 0; n < num_concat; ++n) { std::memcpy( diff --git a/lite/kernels/x86/conv_compute.h b/lite/kernels/x86/conv_compute.h index e9f403059f90cf6635bc22db3e6890b86cbe85f6..29442158c756418327dd3de31fd4dfdbec2cbc1d 100644 --- a/lite/kernels/x86/conv_compute.h +++ b/lite/kernels/x86/conv_compute.h @@ -52,7 +52,7 @@ class Conv2dCompute : public KernelLite { auto& context = ctx_->As(); auto& param = *param_.get_mutable(); lite::Tensor filter = *param.filter; - param.output->mutable_data(); + param.output->template mutable_data(); const int batch_size = static_cast(param.x->dims()[0]); std::vector filter_shape_vec(filter.dims().Vectorize()); @@ -95,9 +95,9 @@ class Conv2dCompute : public KernelLite { auto blas = paddle::lite::x86::math::GetBlas(context); for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch = param.x->Slice(i, i 
+ 1); + lite::Tensor in_batch = param.x->template Slice(i, i + 1); in_batch.Resize(input_shape); - lite::Tensor out_batch = param.output->Slice(i, i + 1); + lite::Tensor out_batch = param.output->template Slice(i, i + 1); out_batch.Resize(output_matrix_shape); for (int g = 0; g < param.groups; g++) { lite::Tensor in_slice = diff --git a/lite/kernels/x86/dropout_compute.h b/lite/kernels/x86/dropout_compute.h index 2ba383bdbdc99e7643f3bf09350f833665c8548e..4b5f3359501b8b4c801c395dfa7d5990d9d4d7a3 100644 --- a/lite/kernels/x86/dropout_compute.h +++ b/lite/kernels/x86/dropout_compute.h @@ -38,10 +38,10 @@ class DropoutCompute : public KernelLite { using param_t = operators::DropoutParam; void Run() override { auto& param = *param_.get_mutable(); - const auto* x_data = param.x->data(); - auto* out_data = param.output->mutable_data(); + const auto* x_data = param.x->template data(); + auto* out_data = param.output->template mutable_data(); if (!param.is_test) { - auto* mask_data = param.mask->mutable_data(); + auto* mask_data = param.mask->template mutable_data(); std::random_device rnd; std::minstd_rand engine; int seed = param.fix_seed ? param.seed : rnd(); diff --git a/lite/kernels/x86/elementwise_op_function.h b/lite/kernels/x86/elementwise_op_function.h index 40116479f6f4d6dc8658c2d781a48b7a07dd20c9..42ea38d979e39f97a8aef971370c83303c53c48f 100644 --- a/lite/kernels/x86/elementwise_op_function.h +++ b/lite/kernels/x86/elementwise_op_function.h @@ -248,8 +248,8 @@ class TransformFunctor { lite::Tensor *z, const lite::Context &ctx, Functor func) - : x_(x->data()), - y_(y->data()), + : x_(x->template data()), + y_(y->template data()), z_(z->mutable_data()), nx_(x->numel()), ctx_(ctx), @@ -483,9 +483,10 @@ void FusedElemwiseAndActComputeNoBroadcast(const lite::Context &ctx, x.data(), y.data(), compound_functor, - out->mutable_data(), - intermediate_out == nullptr ? nullptr - : intermediate_out->mutable_data()}); + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()}); } template &ctx, compound_functor, h, w, - out->mutable_data(), - intermediate_out == nullptr ? nullptr - : intermediate_out->mutable_data()); + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()); } else { FusedElemwiseAndActBroadcast2CPU &ctx, n, post, compound_functor, - out->mutable_data(), - intermediate_out == nullptr ? nullptr - : intermediate_out->mutable_data()); + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()); } } diff --git a/lite/kernels/x86/fc_compute.h b/lite/kernels/x86/fc_compute.h index e719b8d2216949746f612bca0689c22be0606031..9f25a2584fe8d2579939e144d6799ba79927ae63 100644 --- a/lite/kernels/x86/fc_compute.h +++ b/lite/kernels/x86/fc_compute.h @@ -140,9 +140,9 @@ class FcCompute : public KernelLite { int M = output->dims().production() / w_dims1; - const T* input_data = input->data(); - const T* w_data = w->data(); - T* output_data = output->mutable_data(); + const T* input_data = input->template data(); + const T* w_data = w->template data(); + T* output_data = output->template mutable_data(); auto& context = ctx_->As(); FCFunctor fc; @@ -153,7 +153,7 @@ class FcCompute : public KernelLite { input_data, w_data, output_data, - bias ? bias->data() : NULL, + bias ? 
bias->template data() : NULL, with_relu, padding_weights); } diff --git a/lite/kernels/x86/fill_constant_batch_size_like_compute.h b/lite/kernels/x86/fill_constant_batch_size_like_compute.h index 8d49b0816d85f30351a4ded81e0f6ef650b6c445..1c54912c21d1479b990c5a56064d9789e8619400 100644 --- a/lite/kernels/x86/fill_constant_batch_size_like_compute.h +++ b/lite/kernels/x86/fill_constant_batch_size_like_compute.h @@ -42,9 +42,9 @@ class FillConstantBatchSizeLikeCompute int output_dim_idx = param.output_dim_idx; odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; out->Resize(odims); - // out->mutable_data(); + // out->template mutable_data(); } - out->mutable_data(); + out->template mutable_data(); auto value = param.value; paddle::lite::x86::math::SetConstant setter; diff --git a/lite/kernels/x86/gather_compute.h b/lite/kernels/x86/gather_compute.h index 6ee270647f8fb7d7ec540047cd4d546a7eb89ce8..bd01d9da3af1640770838c262dcd848b557d40c3 100644 --- a/lite/kernels/x86/gather_compute.h +++ b/lite/kernels/x86/gather_compute.h @@ -50,9 +50,9 @@ void CPUGather(const lite::Tensor* src, auto src_dims = src->dims(); - const T* p_src = src->data(); + const T* p_src = src->template data(); const IndexT* p_index = index->data(); - T* p_output = output->mutable_data(); + T* p_output = output->template mutable_data(); // slice size int slice_size = 1; @@ -77,7 +77,7 @@ class GatherCompute : public KernelLite { auto index = param.Index; auto out = param.Out; - out->mutable_data(); + out->template mutable_data(); if (x->dims().production() == 0) return; /* * Since there's no type defined for lite::Tensor in Paddle-Lite, then diff --git a/lite/kernels/x86/gru_compute.h b/lite/kernels/x86/gru_compute.h index 89076b51dae1fed4b8f56b280f177caf1f142158..e701ba16a55e9695c6b70f07cc4e1443e6b75698 100644 --- a/lite/kernels/x86/gru_compute.h +++ b/lite/kernels/x86/gru_compute.h @@ -44,7 +44,7 @@ inline void ReorderInitState(const lite::Context& context, bool indexed_src) { lite::x86::math::CopyMatrixRowsFunctor row_shuffle; dst->Resize(src.dims()); - dst->mutable_data(); + dst->template mutable_data(); row_shuffle(context, src, index_lod, dst, indexed_src); } @@ -65,18 +65,19 @@ class GRUCompute : public KernelLite { auto* input = param.input; auto* h0 = param.h0; auto* weight = param.weight; - const T* weight_data = weight->data(); + const T* weight_data = weight->template data(); auto* bias = param.bias; auto* batch_gate = param.batch_gate; auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev; auto* batch_hidden = param.batch_hidden; - T* batch_gate_ptr = batch_gate->mutable_data(); - T* batch_reset_hidden_prev_ptr = batch_reset_hidden_prev->mutable_data(); - T* batch_hidden_ptr = batch_hidden->mutable_data(); + T* batch_gate_ptr = batch_gate->template mutable_data(); + T* batch_reset_hidden_prev_ptr = + batch_reset_hidden_prev->template mutable_data(); + T* batch_hidden_ptr = batch_hidden->template mutable_data(); auto* hidden = param.hidden; - hidden->mutable_data(); + hidden->template mutable_data(); const auto& hidden_dims = hidden->dims(); @@ -99,7 +100,7 @@ class GRUCompute : public KernelLite { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. 
- const std::vector& order(batch_gate->lod()[2]); + const std::vector& order(batch_gate->lod()[2]); ReorderInitState(context, *h0, order, &ordered_h0, true); gru_value.prev_out_value = ordered_h0.mutable_data(); } else { diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index ca2ddf60c5e150ba7d2712ccb2e67e444cd07010..46d151bbc406e19b498b87420029da7f9c1c2f12 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -47,9 +47,9 @@ class LayerNormCompute : public KernelLite { auto x_dims = x->dims(); - y->mutable_data(); - Mean->mutable_data(); - Var->mutable_data(); + y->template mutable_data(); + Mean->template mutable_data(); + Var->template mutable_data(); auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); int left = static_cast(matrix_dim[0]); @@ -73,10 +73,10 @@ class LayerNormCompute : public KernelLite { .At(right); ker(in.mutable_data(), out.mutable_data(), - Mean->mutable_data(), - Var->mutable_data(), - Scale->data(), - Bias->data(), + Mean->template mutable_data(), + Var->template mutable_data(), + Scale->template data(), + Bias->template data(), static_cast(left), epsilon, right); diff --git a/lite/kernels/x86/lookup_table_compute.h b/lite/kernels/x86/lookup_table_compute.h index 1801144f6eeb25a40fa052440b63913bc41a65a3..73cffe4ce8130b18612e42b0243205e74e011005 100644 --- a/lite/kernels/x86/lookup_table_compute.h +++ b/lite/kernels/x86/lookup_table_compute.h @@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite { auto *ids_t = param.Ids; auto *output_t = param.Out; int64_t padding_idx = param.padding_idx; - const int64_t *ids = ids_t->data(); + const int64_t *ids = ids_t->template data(); int64_t ids_numel = ids_t->dims().production(); auto *table_t = param.W; int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; - const T *table = table_t->data(); - T *output = output_t->mutable_data(); + const T *table = table_t->template data(); + T *output = output_t->template mutable_data(); memset(output, 0, output_t->dims().production() * sizeof(T)); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != -1 && ids[i] == padding_idx) { diff --git a/lite/kernels/x86/match_matrix_tensor_compute.cc b/lite/kernels/x86/match_matrix_tensor_compute.cc index feda180d22e59b2ca0e8f0f89f3c7a1ddb8acd4a..171308b1a8b0294241e77366390c4828172bc077 100644 --- a/lite/kernels/x86/match_matrix_tensor_compute.cc +++ b/lite/kernels/x86/match_matrix_tensor_compute.cc @@ -35,7 +35,7 @@ void MatchMatrixTensorCompute::Run() { const auto& offset_l = x->lod()[0]; const auto& offset_r = y->lod()[0]; - std::vector top_offset; + std::vector top_offset; int top_size = 0; top_offset.push_back(top_size); for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { @@ -97,9 +97,9 @@ void MatchMatrixTensorCompute::Run() { int batch_size = x->lod()[0].size() - 1; int lod_lv1_size = batch_size * dim_t; int lod_lv2_size = x->lod()[0].back() * dim_t; - std::vector out_lod0(batch_size + 1, 0); - std::vector out_lod1(lod_lv1_size + 1, 0); - std::vector out_lod2(lod_lv2_size + 1, 0); + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); for (int i = 0; i < batch_size; i++) { out_lod0[i + 1] = out_lod0[i] + dim_t; int len_l = offset_l[i + 1] - offset_l[i]; diff --git a/lite/kernels/x86/matmul_compute.h b/lite/kernels/x86/matmul_compute.h index 3d2b3c7482c266d0c8771c9be1dbac540a315528..e17f12b6b6471bfb587fc3866695b808e11122da 100644 --- 
a/lite/kernels/x86/matmul_compute.h +++ b/lite/kernels/x86/matmul_compute.h @@ -56,7 +56,7 @@ class MatMulCompute : public KernelLite { auto *x = param.X; auto *y = param.Y; auto *out = param.Out; - out->mutable_data(); + out->template mutable_data(); auto blas = lite::x86::math::GetBlas(context); auto mat_dim_a = lite::x86::math::CreateMatrixDescriptor( diff --git a/lite/kernels/x86/mul_compute.h b/lite/kernels/x86/mul_compute.h index be58f24ba2ed37db6661ecaaceb0d9d70fdd75d4..5c3dbe9342c8642470e8997fc2fec6428c2aa832 100644 --- a/lite/kernels/x86/mul_compute.h +++ b/lite/kernels/x86/mul_compute.h @@ -64,7 +64,7 @@ class MulCompute : public KernelLite { y_matrix = *y; } - z->mutable_data(); + z->template mutable_data(); auto z_dim = z->dims(); if (z_dim.size() != 2) { z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); diff --git a/lite/kernels/x86/reduce_compute.h b/lite/kernels/x86/reduce_compute.h index f93157c837995792772c86d969312bfa28341ce4..1b7c99eeef9dd80525eb9ed249bdf6ed1e493443 100644 --- a/lite/kernels/x86/reduce_compute.h +++ b/lite/kernels/x86/reduce_compute.h @@ -49,7 +49,7 @@ class ReduceSumCompute : public KernelLite { bool reduce_all = param.reduce_all; auto* input = param.x; auto* output = param.output; - param.output->mutable_data(); + param.output->template mutable_data(); const auto& dims = param.dim; bool keep_dim = param.keep_dim; diff --git a/lite/kernels/x86/scale_compute.h b/lite/kernels/x86/scale_compute.h index c78f385b96dd2bdbf83204f2a80739657350ae7e..978a81fb22f382f9f036e503e3f674d38f1467a6 100644 --- a/lite/kernels/x86/scale_compute.h +++ b/lite/kernels/x86/scale_compute.h @@ -41,8 +41,8 @@ class ScaleCompute : public KernelLite { void Run() override { auto& param = *param_.get_mutable(); - scale_compute(param.x->data(), - param.output->mutable_data(), + scale_compute(param.x->template data(), + param.output->template mutable_data(), param.x->dims().production(), param.scale, param.bias, diff --git a/lite/kernels/x86/search_grnn_compute.cc b/lite/kernels/x86/search_grnn_compute.cc index 95839ba71b9f63fad9d659fd65c0028005d29799..f25c960f19b60056bd9702a31774a378378f24d6 100644 --- a/lite/kernels/x86/search_grnn_compute.cc +++ b/lite/kernels/x86/search_grnn_compute.cc @@ -84,7 +84,7 @@ void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { int max_width = width_data[idx_sorted_by_width_data[0]]; // start of reorganizing the input - std::vector new_offset; + std::vector new_offset; new_offset.resize(max_width + 1); new_offset[0] = 0; diff --git a/lite/kernels/x86/search_group_padding_compute.h b/lite/kernels/x86/search_group_padding_compute.h index 17244d15d9124d9d61d1f4fdef4f12590958c0be..eee2a8ac8ef757d776580eac9dfc2c6e31694107 100644 --- a/lite/kernels/x86/search_group_padding_compute.h +++ b/lite/kernels/x86/search_group_padding_compute.h @@ -50,7 +50,7 @@ class SearchGroupPaddingCompute } } - std::vector new_offset; + std::vector new_offset; new_offset.resize(batch + 1); for (int i = 0; i < batch + 1; ++i) { new_offset[i] = i * max_seq; @@ -67,7 +67,7 @@ class SearchGroupPaddingCompute top1_lod.push_back(offset); top1->set_lod(top1_lod); top1->Resize({dim0, 1}); - memset(top1->mutable_data(), + memset(top1->template mutable_data(), 0, top1->dims()[0] * top1->dims()[1] * sizeof(T)); // for padding input id @@ -76,9 +76,9 @@ class SearchGroupPaddingCompute top2->set_lod(top2_lod); top2->Resize({batch * max_seq, 1}); // copy data - const auto* bottom_data = bottom0->data(); - auto* top_data = top0->mutable_data(); - auto* top_padding_input_data = 
top2->mutable_data(); + const auto* bottom_data = bottom0->template data(); + auto* top_data = top0->template mutable_data(); + auto* top_padding_input_data = top2->template mutable_data(); for (int i = 0; i < batch; i++) { const int copy_step = offset[i + 1] - offset[i]; const int start = i * max_seq; diff --git a/lite/kernels/x86/search_seq_fc_compute.h b/lite/kernels/x86/search_seq_fc_compute.h index 80ef54b30b762848eceb16940c9f60ef8ba96927..0f19466e0862e36e744fe74d985ab6136dee0e8d 100644 --- a/lite/kernels/x86/search_seq_fc_compute.h +++ b/lite/kernels/x86/search_seq_fc_compute.h @@ -58,8 +58,10 @@ class SearchSeqFcCompute : public KernelLite { int M = x_dims[0]; int N = w_dims[0]; for (int i = 0; i < M; i++) { - blas.AXPY( - N, static_cast(1), b->data(), out->mutable_data() + i * N); + blas.AXPY(N, + static_cast(1), + b->template data(), + out->template mutable_data() + i * N); } } } diff --git a/lite/kernels/x86/sequence_arithmetic_compute.h b/lite/kernels/x86/sequence_arithmetic_compute.h index 88510b8b1c7a04ab01da9af331f9d1f72765b215..080d0bcd0b42f6f59266e56d0f729eb2a28d4179 100644 --- a/lite/kernels/x86/sequence_arithmetic_compute.h +++ b/lite/kernels/x86/sequence_arithmetic_compute.h @@ -39,9 +39,9 @@ class SequenceArithmeticCompute out->Resize(x->dims()); out->set_lod(x->lod()); - auto x_data = x->data(); - auto y_data = y->data(); - auto out_data = out->mutable_data(); + auto x_data = x->template data(); + auto y_data = y->template data(); + auto out_data = out->template mutable_data(); auto x_seq_offset = x->lod()[0]; auto y_seq_offset = y->lod()[0]; int seq_num = x_seq_offset.size() - 1; diff --git a/lite/kernels/x86/sequence_concat_compute.h b/lite/kernels/x86/sequence_concat_compute.h index 8dd7077f7dbbb3e61f21d63e8c935157b3d2d579..cbf8a41b7e2228d3b2fab3fe5049281850961c1e 100644 --- a/lite/kernels/x86/sequence_concat_compute.h +++ b/lite/kernels/x86/sequence_concat_compute.h @@ -25,7 +25,7 @@ namespace x86 { template inline LoD ConcatLoD(const std::vector& xs, std::vector* xs_in_order) { - std::vector result; + std::vector result; result.resize(xs[0]->lod()[0].size()); for (size_t i = 1; i < result.size(); ++i) { @@ -75,7 +75,7 @@ class SequenceConcatCompute out_dims[0] = batch_size; param.Out->Resize(out_dims); - T* dout = param.Out->mutable_data(); + T* dout = param.Out->template mutable_data(); std::vector x_in_order; param.Out->set_lod(ConcatLoD(param.X, &x_in_order)); diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc index be1f86a5c848b5c03634ea2a1aed0d57f2283879..eb6678a655ed1eb5a7bcda1dc2a6b8afe4477d2d 100644 --- a/lite/kernels/x86/sequence_concat_compute_test.cc +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -26,7 +26,7 @@ namespace x86 { namespace { inline LoD ConcatLoD(const std::vector& xs, std::vector* xs_in_order) { - std::vector result; + std::vector result; result.resize(xs[0]->lod()[0].size()); for (size_t i = 1; i < result.size(); ++i) { diff --git a/lite/kernels/x86/sequence_conv_compute.cc b/lite/kernels/x86/sequence_conv_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..32bf8b315c7952a74846af5c4e5548767c80e63e --- /dev/null +++ b/lite/kernels/x86/sequence_conv_compute.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/x86/sequence_conv_compute.h"
+
+REGISTER_LITE_KERNEL(sequence_conv,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::SequenceConvCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
diff --git a/lite/kernels/x86/sequence_conv_compute.h b/lite/kernels/x86/sequence_conv_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1a47aa20f4886aa5dddbe6b398e5365abdc16f2
--- /dev/null
+++ b/lite/kernels/x86/sequence_conv_compute.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <algorithm>
+#include <vector>
+#include "lite/backends/x86/math/blas.h"
+#include "lite/backends/x86/math/context_project.h"
+#include "lite/backends/x86/math/math_function.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+namespace math = paddle::lite::x86::math;
+
+template <typename T>
+class SequenceConvCompute
+    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequenceConvParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    auto& ctx = this->ctx_->template As<X86Context>();
+
+    auto* in = param.X;
+    auto* filter = param.Filter;
+    auto* out = param.Out;
+    out->template mutable_data<T>();
+    CHECK(in->lod().size() == 1) << "Only support one-level sequence now";
+
+    int context_start = param.contextStart;
+    int context_stride = param.contextStride;
+    int context_length = param.contextLength;
+    bool padding_trainable = false;
+    const Tensor* padding_data = nullptr;
+
+    int up_pad = std::max(0, -context_start);
+    int down_pad = std::max(0, context_start + context_length - 1);
+    auto sequence_width = static_cast<int64_t>(in->dims()[1]);
+
+    std::vector<int64_t> col_shape{in->dims()[0],
+                                   context_length * sequence_width};
+    Tensor col;
+    col.Resize(col_shape);
+    col.mutable_data<T>();
+
+    // If padding_trainable is false, the padding data should be zeros.
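+    // (Data-flow sketch: set_zero below clears `col`; ContextProjectFunctor
+    // then performs an im2col-style expansion over the sequence, writing one
+    // row of context_length * sequence_width values per time step into
+    // `col`; finally blas.MatMul multiplies `col` by the filter, so the
+    // whole sequence convolution reduces to a single GEMM.)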
+    math::SetConstant<lite::TargetType::kX86, T> set_zero;
+    auto blas = math::GetBlas<lite::X86Context, T>(ctx);
+    set_zero(ctx, &col, static_cast<T>(0));
+    math::ContextProjectFunctor<lite::TargetType::kX86, T> seq_project_functor;
+
+    seq_project_functor(ctx,
+                        *in,
+                        padding_data,
+                        padding_trainable,
+                        context_start,
+                        context_length,
+                        context_stride,
+                        up_pad,
+                        down_pad,
+                        &col);
+
+    blas.MatMul(col, *filter, out);
+  }
+
+  virtual ~SequenceConvCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/x86/sequence_expand_as_compute.h b/lite/kernels/x86/sequence_expand_as_compute.h
index 16759c1b9f1d136d5aaf58d4531882ab6a2618a2..badbfac14cbeb120d23ea1174a9fc3a218b2224f 100644
--- a/lite/kernels/x86/sequence_expand_as_compute.h
+++ b/lite/kernels/x86/sequence_expand_as_compute.h
@@ -29,9 +29,10 @@ using Tensor = lite::Tensor;
 
 template <typename T>
 struct SequenceExpandFunctor {
-  void operator()(const Tensor &x,
-                  const std::vector<size_t> &ref_lod, /*expand referenced lod*/
-                  Tensor *out) {
+  void operator()(
+      const Tensor &x,
+      const std::vector<uint64_t> &ref_lod, /*expand referenced lod*/
+      Tensor *out) {
     int64_t hight = x.dims()[0];
     int64_t width = x.data_size() / hight;
 
@@ -39,13 +40,13 @@ struct SequenceExpandFunctor {
     T *out_data = out->mutable_data<T>();
 
     for (int h_id = 0; h_id < hight; ++h_id) {
-      size_t span = ref_lod[h_id + 1] - ref_lod[h_id];
+      uint64_t span = ref_lod[h_id + 1] - ref_lod[h_id];
       if (span == 0) continue;
       const T *src = in_data + h_id * width;
-      for (int64_t w_id = 0; w_id < width; ++w_id) {
+      for (uint64_t w_id = 0; w_id < width; ++w_id) {
         T ele = src[w_id];
         size_t offset = ref_lod[h_id] * width;
-        for (size_t k = 0; k < span; ++k) {
+        for (uint64_t k = 0; k < span; ++k) {
           out_data[offset + k * width + w_id] = ele;
         }
       }
@@ -68,7 +69,7 @@ class SequenceExpandAsCompute
     CHECK_EQ(y_lod.size(), 1);
     CHECK_GT(y_lod[0].size(), 1);
 
-    out->mutable_data<T>();
+    out->template mutable_data<T>();
 
     SequenceExpandFunctor<T> seq_espand_functor;
     seq_espand_functor(*x, y_lod[0], out);
diff --git a/lite/kernels/x86/sequence_pool_compute.h b/lite/kernels/x86/sequence_pool_compute.h
index 329a76658d342078ed5d708125d9ff01e0ecef02..20e0307cef2347ce68237f70c990362bbaa210e7 100644
--- a/lite/kernels/x86/sequence_pool_compute.h
+++ b/lite/kernels/x86/sequence_pool_compute.h
@@ -40,7 +40,7 @@ class SequencePoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     dims[0] = lod[0].size() - 1;
     out->Resize({dims});
-    out->mutable_data<T>();
+    out->template mutable_data<T>();
     lite::Tensor* index = nullptr;
     const bool is_test = true;
diff --git a/lite/kernels/x86/sequence_reshape_compute.h b/lite/kernels/x86/sequence_reshape_compute.h
index 99f84ebd06e1f5742bbaee9f98ec17aee44bd871..d166f8bc3d80d9f87efb0315462daee3296f393f 100644
--- a/lite/kernels/x86/sequence_reshape_compute.h
+++ b/lite/kernels/x86/sequence_reshape_compute.h
@@ -64,9 +64,9 @@ class SequenceReshapeCompute
     out->Resize(std::vector<int64_t>{
         static_cast<int64_t>(out->lod()[0].back()), out_width});
 
-    auto* dst_ptr = out->mutable_data<T>();
+    auto* dst_ptr = out->template mutable_data<T>();
     auto size = in->numel() * sizeof(T);
-    std::memcpy(dst_ptr, in->data<T>(), size);
+    std::memcpy(dst_ptr, in->template data<T>(), size);
   }
 
   virtual ~SequenceReshapeCompute() = default;
diff --git a/lite/kernels/x86/sequence_unpad_compute.cc b/lite/kernels/x86/sequence_unpad_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..430f3c47c60b8f5a506ff1191a118db754f1dffe
--- /dev/null
+++ b/lite/kernels/x86/sequence_unpad_compute.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/x86/sequence_unpad_compute.h"
+
+REGISTER_LITE_KERNEL(sequence_unpad,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::SequenceUnpadCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Length",
+               {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
diff --git a/lite/kernels/x86/sequence_unpad_compute.h b/lite/kernels/x86/sequence_unpad_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b4e3f6c1638975ec042598942363f516ddf3bb9
--- /dev/null
+++ b/lite/kernels/x86/sequence_unpad_compute.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
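+
+// sequence_unpad is the inverse of sequence_pad: X holds batch-padded
+// sequences of shape [batch, padded_length, ...] and Length holds each
+// sequence's real length; the kernel copies back only the valid prefix of
+// every row and emits a LoD tensor. Illustrative example (values not taken
+// from this patch): X = [[a1, a2, 0], [b1, 0, 0]] with Length = [2, 1]
+// gives Out = [a1, a2, b1] with LoD {{0, 2, 3}}.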
+#pragma once
+
+#include "lite/backends/x86/math/sequence_padding.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+namespace math = paddle::lite::x86::math;
+
+template <typename T>
+class SequenceUnpadCompute
+    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequenceUnpadParam;
+
+  void Run() override {
+    auto& param = this->template Param<param_t>();
+    auto& ctx = this->ctx_->template As<X86Context>();
+
+    param.Out->template mutable_data<T>();
+    int64_t padded_length = param.X->dims()[1];
+    math::UnpaddingLoDTensorFunctor<lite::TargetType::kX86, T>()(
+        ctx,
+        *param.X,
+        param.Out,
+        padded_length,
+        0,
+        false,
+        math::kBatchLengthWidth);
+  }
+
+  virtual ~SequenceUnpadCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/x86/shape_compute.h b/lite/kernels/x86/shape_compute.h
index ee3678a7f1c6651226c479aeedcacce91085b295..e78684e629727fc7023e6ae4c3385f9c58d48a6b 100644
--- a/lite/kernels/x86/shape_compute.h
+++ b/lite/kernels/x86/shape_compute.h
@@ -29,7 +29,7 @@ class ShapeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
   void Run() override {
     auto& param = *param_.get_mutable<operators::ShapeParam>();
     // auto& context = context_->As<X86Context>();
-    auto out_data = param.Out->mutable_data<int32_t>();
+    auto out_data = param.Out->template mutable_data<int32_t>();
     auto in_dims = param.X->dims();
     for (int i = 0; i < in_dims.size(); ++i) {
       out_data[i] = in_dims[i];
diff --git a/lite/kernels/x86/softmax_compute.h b/lite/kernels/x86/softmax_compute.h
index 5a18a8022773682c0853a3592a9925f3a6015e83..3abc15145bde35a2c442daa9feff7137bcb40fb4 100644
--- a/lite/kernels/x86/softmax_compute.h
+++ b/lite/kernels/x86/softmax_compute.h
@@ -58,7 +58,7 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto* x = param.x;
     auto* output = param.output;
-    output->mutable_data<T>();
+    output->template mutable_data<T>();
 
     const int rank = x->dims().size();
     const int axis = CanonicalAxis(param.axis, rank);
diff --git a/lite/kernels/x86/squeeze_compute.h b/lite/kernels/x86/squeeze_compute.h
index 67086f8c732d412064c6bb0bece7e8208f8a0799..3288421c14447a348efd63c8cc5ea4de9bd2b24e 100644
--- a/lite/kernels/x86/squeeze_compute.h
+++ b/lite/kernels/x86/squeeze_compute.h
@@ -35,8 +35,8 @@ class SqueezeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto x = param.X;
     auto output = param.Out;
     auto x_dims = x->dims();
-    auto* x_data = x->data<T>();
-    auto* out_data = output->mutable_data<T>();
+    auto* x_data = x->template data<T>();
+    auto* out_data = output->template mutable_data<T>();
     memcpy(out_data, x_data, x_dims.production() * sizeof(T));
   }
 
@@ -54,9 +54,9 @@ class Squeeze2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto output = param.Out;
     auto xshape = param.XShape;
     auto x_dims = x->dims();
-    auto* x_data = x->data<T>();
-    auto* out_data = output->mutable_data<T>();
-    auto* xshape_data = xshape->mutable_data<T>();
+    auto* x_data = x->template data<T>();
+    auto* out_data = output->template mutable_data<T>();
+    auto* xshape_data = xshape->template mutable_data<T>();
     memcpy(out_data, x_data, x_dims.production() * sizeof(T));
     memcpy(xshape_data, x_data, x_dims.production() * sizeof(T));
   }
diff --git a/lite/kernels/x86/stack_compute.h b/lite/kernels/x86/stack_compute.h
index 12a6c3490eff9d446de96366c8dd5fe6b2a4bd06..08b3515948750a5cb36627f0349c852e597619e6 100644
--- a/lite/kernels/x86/stack_compute.h
+++ b/lite/kernels/x86/stack_compute.h
@@ -40,9 +40,9 @@ class StackCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     if (axis < 0) axis += (x[0]->dims().size() + 1);
     int n = static_cast<int>(x.size());
-    auto y_data = y->mutable_data<T>();
+    auto y_data
diff --git a/lite/kernels/x86/stack_compute.h b/lite/kernels/x86/stack_compute.h
index 12a6c3490eff9d446de96366c8dd5fe6b2a4bd06..08b3515948750a5cb36627f0349c852e597619e6 100644
--- a/lite/kernels/x86/stack_compute.h
+++ b/lite/kernels/x86/stack_compute.h
@@ -40,9 +40,9 @@ class StackCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     if (axis < 0) axis += (x[0]->dims().size() + 1);
     int n = static_cast<int>(x.size());
-    auto y_data = y->mutable_data<T>();
+    auto y_data = y->template mutable_data<T>();
     std::vector<const T*> x_datas(n);
-    for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data<T>();
+    for (int i = 0; i < n; ++i) x_datas[i] = x[i]->template data<T>();
     int pre = 1, post = 1;
     auto dim = x[0]->dims();
diff --git a/lite/kernels/x86/transpose_compute.h b/lite/kernels/x86/transpose_compute.h
index 603b96015e267aa24d20bf20f2c3090a2daab74c..5f6faed2017b6bdef60e7505bf1f0088d86b3ec1 100644
--- a/lite/kernels/x86/transpose_compute.h
+++ b/lite/kernels/x86/transpose_compute.h
@@ -73,7 +73,7 @@ class TransposeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto& param = *param_.get_mutable<param_t>();
     auto* x = param.x;
     auto* out = param.output;
-    out->mutable_data<T>();
+    out->template mutable_data<T>();
     int ndims = param.axis.size();
     auto& context = ctx_->As<X86Context>();
     TransCompute<lite::TargetType::kX86, T>(
@@ -92,7 +92,7 @@ class Transpose2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto& param = *param_.get_mutable<param_t>();
     auto* x = param.x;
     auto* out = param.output;
-    out->mutable_data<T>();
+    out->template mutable_data<T>();
     int ndims = param.axis.size();
     auto& context = ctx_->As<X86Context>();
     TransCompute<lite::TargetType::kX86, T>(
diff --git a/lite/kernels/x86/uniform_random_compute.cc b/lite/kernels/x86/uniform_random_compute.cc
index 64a701d4c67a9bf908f7fc87e9923f22dde811e3..45c1c08d46e5a23857547aac15b952a1123e741f 100644
--- a/lite/kernels/x86/uniform_random_compute.cc
+++ b/lite/kernels/x86/uniform_random_compute.cc
@@ -34,8 +34,8 @@ class UniformRandomCompute
     auto *param_out = &param.Out->raw_tensor();
 
-    T *data =
-        param_out->mutable_data<T>(context.x86_device_context()->GetPlace());
+    T *data = param_out->template mutable_data<T>(
+        context.x86_device_context()->GetPlace());
 
     unsigned int seed = static_cast<unsigned int>(param.seed);
     std::minstd_rand engine;
diff --git a/lite/kernels/x86/var_conv_2d_compute.h b/lite/kernels/x86/var_conv_2d_compute.h
index 7a9ba16d2ea87adb40df23e1fbe149ab805afbc8..1bed39f479c87636ff217c8fd7234ea2c5bd5904 100644
--- a/lite/kernels/x86/var_conv_2d_compute.h
+++ b/lite/kernels/x86/var_conv_2d_compute.h
@@ -80,7 +80,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     std::vector<int64_t> col_dims_vec{top_size};
     col_dims_vec.push_back(1);
     col->Resize(col_dims_vec);
-    auto* top_data = col->mutable_data<T>();
+    auto* top_data = col->template mutable_data<T>();
     const auto* bottom_data = input.data<T>();
 
     int kernel_win_size = kernel_h * kernel_w;
@@ -149,7 +149,7 @@
     // const auto& offset_y = in_row->lod()[0];
     const auto& offset_y = param.X->lod()[1];
     const auto& offset_x = param.X->lod()[2];
-    std::vector<uint64_t> top_offset;
+    std::vector<uint64_t> top_offset;
     int top_size = 0;
     top_offset.push_back(top_size);
     for (int b = 0; b < batch; ++b) {
@@ -178,9 +178,9 @@
     std::vector<int64_t> top_dims_vec{top_size};
     top_dims_vec.push_back(1);
     top->Resize(top_dims_vec);
-    auto* top_data = top->mutable_data<T>();
-    const auto* w_data = w->data<T>();
-    const auto* col_data = col->data<T>();
+    auto* top_data = top->template mutable_data<T>();
+    const auto* w_data = w->template data<T>();
+    const auto* col_data = col->template data<T>();
 
     auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
     for (int b = 0; b < batch; ++b) {
diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc
index d6ae5a67bfc9deba1fb097fa5c0c0cf323b65e48..edef8cb2df75dfb45ad4964975365d4ddbbe9086 100644
--- a/lite/kernels/x86/var_conv_2d_compute_test.cc
+++ b/lite/kernels/x86/var_conv_2d_compute_test.cc
@@ -140,7 +140,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom,
   const auto& col_offset = col->lod()[0];
   const auto& offset_x = in_col->lod()[0];
   const auto& offset_y = in_row->lod()[0];
-  std::vector<uint64_t> top_offset;
+  std::vector<uint64_t> top_offset;
   int top_size = 0;
   top_offset.push_back(top_size);
   for (int b = 0; b < batch; ++b) {
diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt
index d9c6de358650d5bc84e12762198988c0e46e34bf..07dc127695e3906719b45020a585966877bec868 100644
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
@@ -1,4 +1,27 @@
+if(NOT LITE_WITH_XPU)
+  return()
+endif()
-add_subdirectory(bridges)
-
-add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})
+
+if(LITE_WITH_XTCL)
+  add_subdirectory(bridges)
+  add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})
+else()
+  add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu)
+  add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(activation_compute_xpu XPU basic SRCS activation_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(pool_compute_xpu XPU basic SRCS pool_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(elementwise_compute_xpu XPU basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps})
+endif()
diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0ba33110d2b3efd4a5e164da86ea949c95bbb63
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/xpu/__xpu__multi_encoder_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUMultiEncoderCompute::PrepareForRun() { + auto& param = this->Param(); + + for (auto* fc_weight : param.fc_weight) { + arg_fc_weight_.push_back( + reinterpret_cast(fc_weight->data())); + } + for (auto* fc_bias : param.fc_bias) { + arg_fc_bias_.push_back(fc_bias->data()); + } + for (auto* ln_scale : param.ln_scale) { + arg_ln_scale_.push_back(ln_scale->data()); + } + for (auto* ln_bias : param.ln_bias) { + arg_ln_bias_.push_back(ln_bias->data()); + } + if (param.act_type == "relu") { + act_type_ = xdnn::Activation_t::RELU; + } +} + +void XPUMultiEncoderCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int batch_size = param.input->dims()[0]; + int seq_len = param.input->dims()[1]; + int r = xdnn::bert_encoder_transformer_int16( + ctx.GetRawContext(), /* context */ + batch_size, /* batch_size */ + seq_len, /* from_seq_len */ + seq_len, /* to_seq_len */ + param.head_num, /* head_num */ + param.size_per_head, /* size_per_head */ + param.n_layers, /* n_layers */ + param.input->data(), /* from_tensor */ + param.input->data(), /* to_tensor */ + param.mask->data(), /* att_mask */ + &arg_fc_weight_[0], /* fc_weights */ + &arg_fc_bias_[0], /* fc_biass */ + &arg_ln_scale_[0], /* ln_scales */ + &arg_ln_bias_[0], /* ln_biass */ + param.output->mutable_data(TARGET(kXPU)), /* output */ + param.fc_weight_max->data(), /* fc_weights_max */ + true, /* pretrans_b */ + true, /* use_l3 */ + act_type_ /* act_type */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__multi_encoder, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMultiEncoderCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("FCWeight", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("FCWeightMax", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..71db4e6f44f9c36e4acdaf0a440463a61f4e3099 --- /dev/null +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..71db4e6f44f9c36e4acdaf0a440463a61f4e3099
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class XPUMultiEncoderCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::XPUMultiEncoderParam;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+ private:
+  std::vector<const int16_t*> arg_fc_weight_;
+  std::vector<const float*> arg_fc_bias_;
+  std::vector<const float*> arg_ln_scale_;
+  std::vector<const float*> arg_ln_bias_;
+  xdnn::Activation_t act_type_{xdnn::Activation_t::GELU};
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/__xpu__resnet50_compute.cc b/lite/kernels/xpu/__xpu__resnet50_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e63e03fc9c1d52be42a8ff9b1d6260b3396a2fe
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__resnet50_compute.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/__xpu__resnet50_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void XPUResNet50Compute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+
+  for (auto* filter : param.filter) {
+    arg_filter_.push_back(
+        reinterpret_cast<const int16_t*>(filter->data<float>()));
+  }
+  for (auto* bias : param.bias) {
+    arg_bias_.push_back(bias->data<float>());
+  }
+  for (auto* max_filter : param.max_filter) {
+    arg_max_filter_.push_back(max_filter->data<float>());
+  }
+}
+
+void XPUResNet50Compute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int batch_size = param.input->dims()[0];
+  int r = xdnn::conv2d_int16_resnet(
+      ctx.GetRawContext(),                              /* context */
+      batch_size,                                       /* num */
+      param.input->data<float>(),                       /* bottom */
+      &arg_filter_[0],                                  /* weight_list */
+      param.output->mutable_data<float>(TARGET(kXPU)),  /* top */
+      &arg_bias_[0],                                    /* bias_list */
+      &arg_max_filter_[0]                               /* max_filter_list */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(__xpu__resnet50,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::XPUResNet50Compute,
+                     def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/__xpu__resnet50_compute.h b/lite/kernels/xpu/__xpu__resnet50_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d42f8b6f26edf615dba165b553b633673a4ae66
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__resnet50_compute.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class XPUResNet50Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::XPUResNet50Param;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+ private:
+  std::vector<const int16_t*> arg_filter_;
+  std::vector<const float*> arg_max_filter_;
+  std::vector<const float*> arg_bias_;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a46b33252e40a56299ebc7d0f133520a04b7cb20
--- /dev/null
+++ b/lite/kernels/xpu/activation_compute.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/xpu/activation_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void ReluCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + ctx.GetRawContext(), /* context */ + xdnn::Activation_t::RELU, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +void TanhCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + ctx.GetRawContext(), /* context */ + xdnn::Activation_t::TANH, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +void SigmoidCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int r = xdnn::activation_forward( + ctx.GetRawContext(), /* context */ + xdnn::Activation_t::SIGMOID, /* type */ + param.X->numel(), /* len */ + param.X->data(), /* x */ + param.Out->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SigmoidCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e440bde4146a88929c52c20ff1038eb35be91d38 --- /dev/null +++ b/lite/kernels/xpu/activation_compute.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..e440bde4146a88929c52c20ff1038eb35be91d38
--- /dev/null
+++ b/lite/kernels/xpu/activation_compute.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class ReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~ReluCompute() = default;
+};
+
+class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~TanhCompute() = default;
+};
+
+class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual void Run();
+
+  virtual ~SigmoidCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/batch_norm_compute.cc b/lite/kernels/xpu/batch_norm_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b3139165a06fd0f42897e9ed6c98d80d27adeab
--- /dev/null
+++ b/lite/kernels/xpu/batch_norm_compute.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/batch_norm_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void BatchNormCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  float epsilon = param.epsilon;
+  auto& x_dims = param.x->dims();
+
+  int r = xdnn::batch_norm_infer_forward(
+      ctx.GetRawContext(),                         /* context */
+      epsilon,                                     /* epsilon */
+      x_dims[0],                                   /* img_n */
+      x_dims[1],                                   /* img_c */
+      x_dims[2],                                   /* img_h */
+      x_dims[3],                                   /* img_w */
+      param.x->data<float>(),                      /* img_gm */
+      param.y->mutable_data<float>(TARGET(kXPU)),  /* out_gm */
+      param.scale->data<float>(),                  /* scale_gm */
+      param.bias->data<float>(),                   /* bias_gm */
+      param.mean->data<float>(),                   /* mean_gm */
+      param.variance->data<float>()                /* var_gm */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(batch_norm,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::BatchNormCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/batch_norm_compute.h b/lite/kernels/xpu/batch_norm_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b428476b96ca3b2b60c66df28b7f82e8f57bebc
--- /dev/null
+++ b/lite/kernels/xpu/batch_norm_compute.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class BatchNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::BatchNormParam;
+
+  virtual void Run();
+
+  virtual ~BatchNormCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/bridges/CMakeLists.txt b/lite/kernels/xpu/bridges/CMakeLists.txt
index 93f3cdb445af7b75adc76294b287d9963f4e3cca..0d6d708952b0806da7b060bb76b3ce35df352c26 100644
--- a/lite/kernels/xpu/bridges/CMakeLists.txt
+++ b/lite/kernels/xpu/bridges/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT LITE_WITH_XPU)
+if(NOT LITE_WITH_XTCL)
   return()
 endif()
diff --git a/lite/kernels/xpu/bridges/graph.h b/lite/kernels/xpu/bridges/graph.h
index dafd8d853210278220b79fdf58895484cbd89ec0..562e5fea9eef92fae306fe4bb48a4e224b3c76ee 100644
--- a/lite/kernels/xpu/bridges/graph.h
+++ b/lite/kernels/xpu/bridges/graph.h
@@ -14,12 +14,12 @@
 
 #pragma once
 
-#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/tensor.h"
diff --git a/lite/kernels/xpu/bridges/utility.h b/lite/kernels/xpu/bridges/utility.h
index 776955854567b919234e7c79dcf6321e8e24b70a..0deb4fd7b4723d97a9159a88c6d8a054a047dc92 100644
--- a/lite/kernels/xpu/bridges/utility.h
+++ b/lite/kernels/xpu/bridges/utility.h
@@ -14,10 +14,10 @@
 
 #pragma once
 
-#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/tensor.h"
diff --git a/lite/kernels/xpu/cast_compute.cc b/lite/kernels/xpu/cast_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7eabd28a16073db218dcd03542bac0d1e3459be
--- /dev/null
+++ b/lite/kernels/xpu/cast_compute.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/xpu/cast_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void CastCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* x = param.X; + auto* out = param.Out; + int out_dtype = param.out_dtype; + auto* in_data = x->template data(); + int numel = x->numel(); + + int r = 0; + // BOOL = 0;INT16 = 1;INT32 = 2;INT64 = 3;FP16 = 4;FP32 = 5;FP64 = 6; + // SIZE_T = 19;UINT8 = 20;INT8 = 21; + if (out_dtype == 5) { + auto* out_data = out->template mutable_data(TARGET(kXPU)); + r = xdnn::cast( + ctx.GetRawContext(), in_data, out_data, numel); + } else if (out_dtype == 2) { + auto* out_data = out->template mutable_data(TARGET(kXPU)); + r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); + } else if (out_dtype == 3) { + auto* out_data = out->template mutable_data(TARGET(kXPU)); + r = xdnn::cast( + ctx.GetRawContext(), in_data, out_data, numel); + } else { + CHECK(false); + } + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(cast, + kXPU, + kAny, + kNCHW, + paddle::lite::kernels::xpu::CastCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/xpu/cast_compute.h b/lite/kernels/xpu/cast_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8992c29732630a5bf0d9c092461569234257e3a9 --- /dev/null +++ b/lite/kernels/xpu/cast_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class CastCompute : public KernelLite { + public: + using param_t = operators::CastParam; + + void Run() override; + + virtual ~CastCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/conv_compute.cc b/lite/kernels/xpu/conv_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed692fd0e2d474cbe5ce9f06633280bb09c3878c --- /dev/null +++ b/lite/kernels/xpu/conv_compute.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/lite/kernels/xpu/conv_compute.cc b/lite/kernels/xpu/conv_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ed692fd0e2d474cbe5ce9f06633280bb09c3878c
--- /dev/null
+++ b/lite/kernels/xpu/conv_compute.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/conv_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <>
+void Conv2dCompute<PRECISION(kFloat)>::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.x->dims();
+  auto& w_dims = param.filter->dims();
+  int groups = param.groups;
+  auto& strides = param.strides;
+  auto paddings = *param.paddings;
+  auto dilations = *param.dilations;
+
+  int r = xdnn::conv2d_forward_int16(
+      ctx.GetRawContext(),                              /* context */
+      x_dims[0],                                        /* num */
+      x_dims[1],                                        /* input_c */
+      x_dims[2],                                        /* input_h */
+      x_dims[3],                                        /* input_w */
+      w_dims[0],                                        /* num_filter */
+      w_dims[2],                                        /* kernel_h */
+      w_dims[3],                                        /* kernel_w */
+      strides[0],                                       /* stride_h */
+      strides[1],                                       /* stride_w */
+      paddings[0],                                      /* pad_h */
+      paddings[1],                                      /* pad_w */
+      dilations[0],                                     /* dilation_h */
+      dilations[1],                                     /* dilation_w */
+      groups,                                           /* group */
+      param.x->data<float>(),                           /* bottom */
+      param.filter->data<float>(),                      /* weight */
+      param.output->mutable_data<float>(TARGET(kXPU)),  /* top */
+      nullptr,                                          /* bias */
+      nullptr,                                          /* branch */
+      xdnn::Activation_t::LINEAR,                       /* type */
+      nullptr,                                          /* max_image_ptr */
+      nullptr,                                          /* max_filter_ptr */
+      nullptr                                           /* max_result_ptr */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+namespace xpu = paddle::lite::kernels::xpu;
+using Conv2dFp32 = xpu::Conv2dCompute<PRECISION(kFloat)>;
+
+REGISTER_LITE_KERNEL(conv2d, kXPU, kFloat, kNCHW, Conv2dFp32, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/conv_compute.h b/lite/kernels/xpu/conv_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7631ce4e5773afe7cdd797a245c806b51d25c56
--- /dev/null
+++ b/lite/kernels/xpu/conv_compute.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <PrecisionType Ptype>
+class Conv2dCompute : public KernelLite<TARGET(kXPU), Ptype> {
+ public:
+  using param_t = operators::ConvParam;
+
+  virtual void Run();
+
+  virtual ~Conv2dCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/dropout_compute.cc b/lite/kernels/xpu/dropout_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f42d3eeff5da40251c27476a53709aee1e65fbcf
--- /dev/null
+++ b/lite/kernels/xpu/dropout_compute.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/dropout_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void DropoutCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int size = param.x->numel() * sizeof(float);
+
+  int r = xdnn::memcpy_device(
+      ctx.GetRawContext(),                              /* context */
+      param.output->mutable_data<float>(TARGET(kXPU)),  /* dst */
+      param.x->data<float>(),                           /* src */
+      size                                              /* size */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(dropout,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::DropoutCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/dropout_compute.h b/lite/kernels/xpu/dropout_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..0eaafb4f5555a163623402fee82d50bfa095b0b3
--- /dev/null
+++ b/lite/kernels/xpu/dropout_compute.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class DropoutCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::DropoutParam;
+
+  virtual void Run();
+
+  virtual ~DropoutCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/elementwise_compute.cc b/lite/kernels/xpu/elementwise_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e37337948bf639832ea936de2b5b929d26f534cc
--- /dev/null
+++ b/lite/kernels/xpu/elementwise_compute.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/elementwise_compute.h"
+#include <functional>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void ElementwiseAddCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.X->dims().data();
+  auto& y_dims = param.Y->dims();
+  int axis = param.axis;
+  if (param.axis == -1) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  int iter = std::accumulate(
+      x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int64_t>());
+  int stride = param.Y->numel();
+
+  for (int i = 0; i < iter; ++i) {
+    const float* x_ptr = param.X->data<float>() + i * stride;
+    const float* y_ptr = param.Y->data<float>();
+    float* o_ptr =
+        param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
+    int r = xdnn::elementwise_add(ctx.GetRawContext(),  /* context */
+                                  x_ptr,                /* x */
+                                  y_ptr,                /* y */
+                                  o_ptr,                /* z */
+                                  stride                /* len */);
+    CHECK_EQ(r, 0);
+  }
+}
+
+void ElementwiseSubCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.X->dims().data();
+  auto& y_dims = param.Y->dims();
+  int axis = param.axis;
+  if (param.axis == -1) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  int iter = std::accumulate(
+      x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int64_t>());
+  int stride = param.Y->numel();
+
+  for (int i = 0; i < iter; ++i) {
+    const float* x_ptr = param.X->data<float>() + i * stride;
+    const float* y_ptr = param.Y->data<float>();
+    float* o_ptr =
+        param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
+    int r = xdnn::elementwise_sub(ctx.GetRawContext(),  /* context */
+                                  x_ptr,                /* x */
+                                  y_ptr,                /* y */
+                                  o_ptr,                /* z */
+                                  stride                /* len */);
+    CHECK_EQ(r, 0);
+  }
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(elementwise_add,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ElementwiseAddCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_sub,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::ElementwiseSubCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
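Note: both kernels implement broadcasting by viewing X as an [iter, stride] matrix, where stride = Y->numel() and iter is the product of X's leading dims up to `axis`, then issuing one XDNN call per slice; this assumes Y matches X's trailing dims starting at `axis`. Shapes assumed for illustration:

    // X: [4, 3, 2], Y: [3, 2], axis = -1  ->  axis = 3 - 2 = 1
    // iter   = product of x_dims[0..axis) = 4
    // stride = Y->numel()                 = 6
    // i.e. Y is added (or subtracted) once onto each of the four
    // contiguous 6-element slices of X.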
diff --git a/lite/kernels/xpu/elementwise_compute.h b/lite/kernels/xpu/elementwise_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..863ee3c643f9c431dacd057e251941914b1dd1c5
--- /dev/null
+++ b/lite/kernels/xpu/elementwise_compute.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class ElementwiseAddCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ElementwiseParam;
+
+  virtual void Run();
+
+  virtual ~ElementwiseAddCompute() = default;
+};
+
+class ElementwiseSubCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ElementwiseParam;
+
+  virtual void Run();
+
+  virtual ~ElementwiseSubCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/io_copy_compute.cc b/lite/kernels/xpu/io_copy_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ee809563475434cfa286cc3a535bf9acac5d923
--- /dev/null
+++ b/lite/kernels/xpu/io_copy_compute.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/xpu/target_wrapper.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+/*
+ * This kernel copies a tensor from host to XPU.
+ */
+class IoCopyHostToXPUCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
+ public:
+  void Run() override {
+    auto& param = Param<operators::IoCopyParam>();
+    CHECK(param.x->target() == TARGET(kHost) ||
+          param.x->target() == TARGET(kX86) ||
+          param.x->target() == TARGET(kARM));
+    auto mem_size = param.x->memory_size();
+    VLOG(4) << "host to xpu, copy size " << mem_size;
+    auto* data = param.y->mutable_data(TARGET(kXPU), mem_size);
+    TargetWrapperXPU::MemcpySync(
+        data, param.x->raw_data(), mem_size, IoDirection::HtoD);
+  }
+
+  std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
+    std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
+    *res = [](const std::map<std::string, const Type*>& inputs,
+              const std::string& out) -> const Type* {
+      CHECK(!inputs.empty());
+      auto* type = inputs.at("Input");
+      CHECK(type->target() == TARGET(kHost));
+
+      auto out_place = type->place();
+      out_place.target = TARGET(kXPU);
+      auto* out_type = Type::Get(type->id(),
+                                 out_place.target,
+                                 out_place.precision,
+                                 out_place.layout,
+                                 out_place.device);
+      return out_type;
+    };
+    return res;
+  }
+
+  std::string doc() const override { return "Copy IO from HOST to XPU"; }
+};
+
+/*
+ * This kernel copies a tensor from XPU to host.
+ */
+class IoCopyXPUToHostCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
+ public:
+  void Run() override {
+    auto& param = Param<operators::IoCopyParam>();
+    CHECK(param.x->target() == TARGET(kXPU));
+    auto mem_size = param.x->memory_size();
+    VLOG(4) << "xpu to host, copy size " << mem_size;
+    auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
+    TargetWrapperXPU::MemcpySync(
+        data, param.x->raw_data(), mem_size, IoDirection::DtoH);
+  }
+
+  std::string doc() const override { return "Copy IO from XPU to HOST"; }
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(io_copy,
+                     kXPU,
+                     kAny,
+                     kAny,
+                     paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
+                     host_to_device)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kXPU),
+                                       PRECISION(kAny),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(io_copy,
+                     kXPU,
+                     kAny,
+                     kAny,
+                     paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
+                     device_to_host)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kXPU),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kAny),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(io_copy_once,
+                     kXPU,
+                     kAny,
+                     kAny,
+                     paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
+                     host_to_device)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kXPU),
+                                       PRECISION(kAny),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(io_copy_once,
+                     kXPU,
+                     kAny,
+                     kAny,
+                     paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
+                     device_to_host)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kXPU),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kAny),
+                                       DATALAYOUT(kAny))})
+    .Finalize();
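Note: `io_copy` and `io_copy_once` bind the same two kernel classes, so the "once" variant differs only in how the framework schedules it; the copy itself is a single synchronous wrapper call in each direction. Sketch (dst, src, and mem_size stand for the fields used in the two Run() bodies above):

    // Host -> device when feeding inputs; device -> host when fetching.
    TargetWrapperXPU::MemcpySync(dst, src, mem_size, IoDirection::HtoD);
    TargetWrapperXPU::MemcpySync(dst, src, mem_size, IoDirection::DtoH);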
+ +#include "lite/kernels/xpu/layer_norm_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void LayerNormCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto x_dims = param.X->dims(); + auto axis = param.begin_norm_axis; + auto matrix_dim = x_dims.Flatten2D(axis); + float epsilon = param.epsilon; + + int r = xdnn::layer_norm(ctx.GetRawContext(), /* context */ + matrix_dim[0], /* m */ + matrix_dim[1], /* n */ + param.X->data(), /* in */ + param.Y->mutable_data(TARGET(kXPU)), /* out */ + param.Scale->data(), /* scale */ + param.Bias->data(), /* bias */ + epsilon /* epsilon */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(layer_norm, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::LayerNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/layer_norm_compute.h b/lite/kernels/xpu/layer_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5d2df37795811ef8027e12b25139f2b7091cceed --- /dev/null +++ b/lite/kernels/xpu/layer_norm_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class LayerNormCompute : public KernelLite { + public: + using param_t = operators::LayerNormParam; + + virtual void Run(); + + virtual ~LayerNormCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/lookup_table_compute.cc b/lite/kernels/xpu/lookup_table_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..568d303adefaa06bb8665b4cc92d4a949419d587 --- /dev/null +++ b/lite/kernels/xpu/lookup_table_compute.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+#include "lite/kernels/xpu/lookup_table_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void LookupTableCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int num = param.Ids->numel();
+  int embed_dim = param.W->dims()[1];
+
+  int r = xdnn::embedding(
+      ctx.GetRawContext(),                             /* context */
+      num,                                             /* num */
+      param.Ids->data<int64_t>(),                      /* indices */
+      embed_dim,                                       /* embed_dim */
+      param.W->data<float>(),                          /* table */
+      param.Out->mutable_data<float>(TARGET(kXPU))     /* top */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(lookup_table,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::LookupTableCompute,
+                     def)
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/lookup_table_compute.h b/lite/kernels/xpu/lookup_table_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ba1afc869cf9c3a49ab1ad29c66c6c89ba87d19
--- /dev/null
+++ b/lite/kernels/xpu/lookup_table_compute.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class LookupTableCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::LookupTableParam;
+
+  virtual void Run();
+
+  virtual ~LookupTableCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/matmul_compute.cc b/lite/kernels/xpu/matmul_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62e018889d415de8968444594804facc3292e799
--- /dev/null
+++ b/lite/kernels/xpu/matmul_compute.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/xpu/matmul_compute.h" +#include "lite/backends/xpu/math.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +namespace math = paddle::lite::xpu::math; + +void MatMulCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* x = param.X; + auto* y = param.Y; + auto* out = param.Out; + + auto mat_dim_a = math::CreateMatrixDescriptor( + math::RowMatrixFromVector(x->dims()), 0, param.transpose_X); + auto mat_dim_b = math::CreateMatrixDescriptor( + math::ColumnMatrixFromVector(y->dims()), 0, param.transpose_Y); + int lda = (mat_dim_a.trans_ ? mat_dim_a.height_ : mat_dim_a.width_); + int ldb = (mat_dim_b.trans_ ? mat_dim_b.height_ : mat_dim_b.width_); + int ldc = mat_dim_b.width_; + + int r = 0; + if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) { + r = xdnn::fc_int16(ctx.GetRawContext(), /* context */ + mat_dim_a.trans_, /* TransA */ + mat_dim_b.trans_, /* TransB */ + mat_dim_a.height_, /* m */ + mat_dim_b.width_, /* n */ + mat_dim_a.width_, /* k */ + param.alpha, /* alpha */ + x->data(), /* A */ + y->data(), /* B */ + 0.0f, /* beta */ + out->mutable_data(TARGET(kXPU)) /* C */); + } else { + // batch matmul + r = xdnn::gemm_strided_batched_int16( + ctx.GetRawContext(), /* context */ + mat_dim_a.trans_, /* TransA */ + mat_dim_b.trans_, /* TransB */ + mat_dim_a.batch_size_, /* batch_size */ + mat_dim_a.height_, /* M */ + mat_dim_b.width_, /* N */ + mat_dim_a.width_, /* K */ + param.alpha, /* alpha */ + x->data(), /* A */ + lda, /* lda */ + mat_dim_a.stride_, /* stride_a */ + y->data(), /* B */ + ldb, /* ldb */ + mat_dim_b.stride_, /* stride_b */ + 0.0f, /* beta */ + out->mutable_data(TARGET(kXPU)), /* C */ + ldc, /* ldc */ + mat_dim_a.height_ * mat_dim_b.width_ /* stride_c */); + } + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + matmul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MatMulCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/matmul_compute.h b/lite/kernels/xpu/matmul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..aca3cbc603eff490ae19fd2546352adca3c1a7cf --- /dev/null +++ b/lite/kernels/xpu/matmul_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/lite/kernels/xpu/matmul_compute.h b/lite/kernels/xpu/matmul_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..aca3cbc603eff490ae19fd2546352adca3c1a7cf
--- /dev/null
+++ b/lite/kernels/xpu/matmul_compute.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class MatMulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MatMulParam;
+
+  virtual void Run();
+
+  virtual ~MatMulCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/mul_compute.cc b/lite/kernels/xpu/mul_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8aa93a9c8b8d84874b95dae2c15bf985585c916c
--- /dev/null
+++ b/lite/kernels/xpu/mul_compute.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/mul_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void MulCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& origin_x = *param.x;
+  auto& origin_y = *param.y;
+  auto& x_dims = origin_x.dims();
+  auto& y_dims = origin_y.dims();
+  Tensor x_matrix, y_matrix;
+  if (x_dims.size() > 2) {
+    x_matrix = ReshapeToMatrix(origin_x, param.x_num_col_dims);
+  } else {
+    x_matrix = origin_x;
+  }
+  if (y_dims.size() > 2) {
+    y_matrix = ReshapeToMatrix(origin_y, param.y_num_col_dims);
+  } else {
+    y_matrix = origin_y;
+  }
+  int m = x_matrix.dims()[0];
+  int k = x_matrix.dims()[1];
+  int n = y_matrix.dims()[1];
+
+  int r =
+      xdnn::fc_int16(ctx.GetRawContext(),                       /* context */
+                     false,                                     /* TransA */
+                     false,                                     /* TransB */
+                     m,
+                     n,
+                     k,
+                     1.0f,                                      /* alpha */
+                     x_matrix.data<float>(),                    /* A */
+                     y_matrix.data<float>(),                    /* B */
+                     0.0f,                                      /* beta */
+                     param.output->mutable_data<float>(TARGET(kXPU)) /* C */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    mul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MulCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
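Note: ReshapeToMatrix (the helper defined just below in mul_compute.h) collapses a high-rank tensor into the 2-D view the FC call needs, splitting the dims at num_col_dims. Shapes assumed for illustration:

    // X: [2, 3, 4, 5], x_num_col_dims = 2  ->  x_matrix: [2*3, 4*5] = [6, 20]
    // Y: [20, 7]  (already rank-2)         ->  y_matrix: [20, 7]
    // fc: m = 6, k = 20, n = 7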
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+static inline lite::Tensor ReshapeToMatrix(const lite::Tensor& src,
+                                           int num_col_dims) {
+  int rank = src.dims().size();
+  if (rank == 2) {
+    return src;
+  }
+  lite::Tensor res;
+  res.ShareDataWith(src);
+  res.Resize(src.dims().Flatten2D(num_col_dims));
+  return res;
+}
+
+class MulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MulParam;
+
+  virtual void Run();
+
+  virtual ~MulCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/pool_compute.cc b/lite/kernels/xpu/pool_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4480e4875cb3317ddeeea7017f4aa825e2afe320
--- /dev/null
+++ b/lite/kernels/xpu/pool_compute.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/pool_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void Pool2DCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.x->dims();
+  CHECK_EQ(x_dims.size(), 4);
+  auto& o_dims = param.output->dims();
+  CHECK_EQ(param.ksize.size(), 2);
+  if (param.global_pooling) {
+    param.ksize[0] = x_dims[2];
+    param.ksize[1] = x_dims[3];
+  }
+  CHECK_EQ(param.strides.size(), 2);
+  CHECK_EQ(param.paddings->size(), 4);
+  auto& paddings = *param.paddings;
+  auto type = xdnn::MAX_WITHOUT_INDEX;
+  if (param.pooling_type == "avg") {
+    if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 &&
+        paddings[3] == 0) {
+      type = xdnn::AVG_WITHOUT_PAD;
+    } else {
+      type = xdnn::AVG_WITH_PAD;
+    }
+  }
+
+  int r = xdnn::pooling_forward<float, float>(
+      ctx.GetRawContext(),                             /* context */
+      param.x->data<float>(),                          /* x */
+      param.output->mutable_data<float>(TARGET(kXPU)), /* y */
+      nullptr,                                         /* y_index */
+      type,                                            /* type */
+      x_dims[0] * x_dims[1],                           /* c */
+      x_dims[2],                                       /* in_h */
+      x_dims[3],                                       /* in_w */
+      paddings[0],                                     /* pad_left */
+      paddings[1],                                     /* pad_right */
+      paddings[2],                                     /* pad_up */
+      paddings[3],                                     /* pad_down */
+      param.ksize[0],                                  /* win_h */
+      param.ksize[1],                                  /* win_w */
+      param.strides[0],                                /* stride_h */
+      param.strides[1],                                /* stride_w */
+      o_dims[2],                                       /* out_h */
+      o_dims[3]                                        /* out_w */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..5648554c41c76396184b7dc536f8c8628cbf23e4
--- /dev/null
+++ b/lite/kernels/xpu/pool_compute.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class Pool2DCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::PoolParam;
+
+  virtual void Run();
+
+  virtual ~Pool2DCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/scale_compute.cc b/lite/kernels/xpu/scale_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c8d3b0a238880402c09e014aeb91a898b252660
--- /dev/null
+++ b/lite/kernels/xpu/scale_compute.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/scale_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void ScaleCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.x->dims();
+
+  int r = xdnn::scale(ctx.GetRawContext(),    /* context */
+                      x_dims.production(),    /* len */
+                      param.scale,            /* alpha */
+                      param.bias,             /* beta */
+                      param.bias_after_scale, /* bias_after_scale */
+                      param.x->data<float>(), /* x */
+                      param.output->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    scale, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ScaleCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
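For reference, the bias_after_scale flag passed above selects between y = alpha * x + beta and y = alpha * (x + beta). A tiny host-side reference (illustrative only, not part of the patch):

#include <cstdio>

// Reference semantics of the scale op for a single element.
float scale_ref(float x, float alpha, float beta, bool bias_after_scale) {
  return bias_after_scale ? alpha * x + beta : alpha * (x + beta);
}

int main() {
  std::printf("%f %f\n",
              scale_ref(2.f, 3.f, 1.f, true),    // 7.0
              scale_ref(2.f, 3.f, 1.f, false));  // 9.0
  return 0;
}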
diff --git a/lite/kernels/xpu/scale_compute.h b/lite/kernels/xpu/scale_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..6989b0f0f31e54a63dac2f7c2090dc676e31acfb
--- /dev/null
+++ b/lite/kernels/xpu/scale_compute.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class ScaleCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ScaleParam;
+
+  virtual void Run();
+
+  virtual ~ScaleCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/slice_compute.cc b/lite/kernels/xpu/slice_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5919f84dbd3f0923cc44f2ad4bee13d1bb13f98d
--- /dev/null
+++ b/lite/kernels/xpu/slice_compute.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/slice_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void SliceCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+  auto x_dims = param.X->dims();
+  // resize(), not reserve(): Run() fills these via operator[].
+  x_shape_.resize(x_dims.size());
+  x_dim_begin_.resize(x_dims.size());
+  x_dim_end_.resize(x_dims.size());
+}
+
+void SliceCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto x_dims = param.X->dims();
+  for (size_t i = 0; i < x_dims.size(); ++i) {
+    x_shape_[i] = x_dims[i];
+    x_dim_begin_[i] = 0;
+    x_dim_end_[i] = x_dims[i];
+  }
+  for (size_t i = 0; i < param.axes.size(); ++i) {
+    int axis = param.axes[i];
+    x_dim_begin_[axis] = param.starts[i];
+    x_dim_end_[axis] = param.ends[i];
+  }
+
+  int ndim = param.X->dims().size();
+  int r = xdnn::slice_forward(
+      ctx.GetRawContext(),                         /* context */
+      &x_shape_[0],                                /* shape */
+      &x_dim_begin_[0],                            /* starts */
+      &x_dim_end_[0],                              /* ends */
+      ndim,                                        /* n */
+      param.X->data<float>(),                      /* in */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    slice, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SliceCompute, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
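A worked example of the index bookkeeping above (numbers are illustrative): slicing axis 1 of a {2, 10, 4} input with starts = {3} and ends = {7} produces begin/end arrays {0, 3, 0} / {2, 7, 4} and an output of shape {2, 4, 4}.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> shape = {2, 10, 4};  // hypothetical input dims
  std::vector<int> axes = {1}, starts = {3}, ends = {7};
  std::vector<int> begin(shape.size(), 0), end = shape;
  for (size_t i = 0; i < axes.size(); ++i) {
    begin[axes[i]] = starts[i];
    end[axes[i]] = ends[i];
  }
  for (size_t i = 0; i < shape.size(); ++i)
    std::printf("dim %zu: [%d, %d) -> %d\n", i, begin[i], end[i],
                end[i] - begin[i]);
  return 0;
}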
diff --git a/lite/kernels/xpu/slice_compute.h b/lite/kernels/xpu/slice_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fb34e30c143d0890dc76e9b0fd3b2d1bfcef8e9
--- /dev/null
+++ b/lite/kernels/xpu/slice_compute.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class SliceCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SliceParam;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+  virtual ~SliceCompute() = default;
+
+ private:
+  std::vector<int> x_shape_;
+  std::vector<int> x_dim_begin_;
+  std::vector<int> x_dim_end_;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/softmax_compute.cc b/lite/kernels/xpu/softmax_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e4a6c19f3bfc9ced852c5b6aa7f63e568bc7669
--- /dev/null
+++ b/lite/kernels/xpu/softmax_compute.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/softmax_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void SoftmaxCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  auto& x_dims = param.x->dims();
+  int axis = CanonicalAxis(param.axis, x_dims.size());
+  int rows = SizeToAxis(axis, x_dims);
+  int cols = SizeFromAxis(axis, x_dims);
+
+  int r = xdnn::softmax2d_forward(
+      ctx.GetRawContext(),                             /* context */
+      param.x->data<float>(),                          /* x */
+      param.output->mutable_data<float>(TARGET(kXPU)), /* y */
+      rows,                                            /* rows */
+      cols                                             /* cols */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(softmax,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::SoftmaxCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/softmax_compute.h b/lite/kernels/xpu/softmax_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..e807f38a2ea3c9645b78340ac4dc87d1984c40f7
--- /dev/null
+++ b/lite/kernels/xpu/softmax_compute.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+static inline int CanonicalAxis(const int axis, const int rank) {
+  if (axis < 0) {
+    return axis + rank;
+  }
+  return axis;
+}
+
+static inline int SizeToAxis(const int axis, lite::DDim dims) {
+  int size = 1;
+  for (int i = 0; i < axis; i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+static inline int SizeFromAxis(const int axis, lite::DDim dims) {
+  int size = 1;
+  for (size_t i = axis; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+class SoftmaxCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SoftmaxParam;
+
+  virtual void Run();
+
+  virtual ~SoftmaxCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
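Worked example for the three helpers above (dims are illustrative): with dims = {2, 3, 4} and axis = -1, CanonicalAxis yields 2, SizeToAxis yields 6 rows and SizeFromAxis yields 4 cols, so the 2-D softmax runs over the last dimension.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> dims = {2, 3, 4};
  int axis = -1, rank = static_cast<int>(dims.size());
  if (axis < 0) axis += rank;                         // CanonicalAxis
  int rows = 1, cols = 1;
  for (int i = 0; i < axis; ++i) rows *= dims[i];     // SizeToAxis
  for (int i = axis; i < rank; ++i) cols *= dims[i];  // SizeFromAxis
  std::printf("axis=%d rows=%d cols=%d\n", axis, rows, cols);  // 2 6 4
  return 0;
}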
diff --git a/lite/kernels/xpu/stack_compute.cc b/lite/kernels/xpu/stack_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9e5c19d25135ac5877e38eaf65829fefc500e07
--- /dev/null
+++ b/lite/kernels/xpu/stack_compute.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/stack_compute.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void StackCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+
+  int n = param.X.size();
+  void* x_ptr = nullptr;
+  xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */);
+  x_ptr_guard_.reset(x_ptr);
+  x_ptr_cpu_.resize(n);
+}
+
+void StackCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int n = param.X.size();
+  auto x_dims = param.X[0]->dims();
+  int axis = param.axis;
+  // stack inserts a new axis, hence rank + 1 when normalizing a negative axis
+  if (axis < 0) axis += (x_dims.size() + 1);
+  auto matrix = x_dims.Flatten2D(axis);
+  int height = matrix[0];
+  int width = matrix[1];
+
+  for (int i = 0; i < n; ++i) {
+    x_ptr_cpu_[i] = param.X[i]->data<float>();
+  }
+  xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE);
+
+  int r = xdnn::stack_forward(
+      ctx.GetRawContext(),                         /* context */
+      height,                                      /* height */
+      width,                                       /* width */
+      n,                                           /* n */
+      x_ptr_guard_.get(),                          /* x_ptr */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    stack, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::StackCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/stack_compute.h b/lite/kernels/xpu/stack_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f77cbb3a73bce2d5496f840b2a1f8e14313e776
--- /dev/null
+++ b/lite/kernels/xpu/stack_compute.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+struct XPUFreeDeleter {
+  void operator()(void* p) const { xpu_free(p); }
+};
+
+class StackCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::StackParam;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+  virtual ~StackCompute() = default;
+
+ private:
+  std::unique_ptr<void, XPUFreeDeleter> x_ptr_guard_;
+  std::vector<const float*> x_ptr_cpu_;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h
index 1faada3978a2ab33fbe0135d57f21a94c97d5c61..601c8821bc826e350c233573bf7eff89cdf5c1f5 100644
--- a/lite/kernels/xpu/subgraph_compute.h
+++ b/lite/kernels/xpu/subgraph_compute.h
@@ -14,10 +14,10 @@
 
 #pragma once
 
-#include <xtcl/xtcl.h>
 #include <memory>
 #include <string>
 #include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/kernel.h"
 #include "lite/kernels/npu/bridges/engine.h"
 #include "lite/kernels/npu/bridges/registry.h"
diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt
index ae9ec3ad47fbc00c91ba06c1597bd65e510b629b..c7fa674bff745df29b271e10c8c4d99687a889ed 100644
--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -14,7 +14,7 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} )
 add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS})
 add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS})
 add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS})
-add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS})
+add_operator(activation_basic_ops basic SRCS activation_ops.cc DEPS ${op_DEPS})
 add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS})
 add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS})
 add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS})
@@ -60,6 +60,7 @@ add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS})
 add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS})
 
 # 3.extra ops
+add_operator(activation_extra_ops extra SRCS activation_extra_ops.cc DEPS ${op_DEPS})
 add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS})
 add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS})
 add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS})
@@ -73,6 +74,7 @@ add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS})
 add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS})
 add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS})
 add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS})
+add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS})
 add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS})
 add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS})
 add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS})
@@ -105,6 +107,7 @@ add_operator(conditional_block_op_lite extra SRCS conditional_block_op.cc DEPS $
 add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.cc DEPS ${op_DEPS})
 add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS})
 add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS})
+add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS})
 
 # for OCR specific
 add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
@@ -148,6 +151,10 @@ add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DE
 add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS})
 add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS})
 
+# Only for XPU
+add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS})
+add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS})
+
 if (NOT LITE_WITH_X86)
   lite_cc_test(test_fc_op SRCS fc_op_test.cc
       DEPS fc_op memory
diff --git a/lite/operators/__xpu__multi_encoder_op.cc b/lite/operators/__xpu__multi_encoder_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d8aca942592668831b8d46d3e07ce83a57f1011
--- /dev/null
+++ b/lite/operators/__xpu__multi_encoder_op.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/operators/__xpu__multi_encoder_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMultiEncoderOp::CheckShape() const { return true; } + +bool XPUMultiEncoderOp::InferShapeImpl() const { + auto input_shape = param_.input->dims(); + param_.output->Resize(input_shape); + return true; +} + +bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input = const_cast( + &scope->FindVar(op_desc.Input("Input").front())->Get()); + param_.mask = const_cast( + &scope->FindVar(op_desc.Input("Mask").front())->Get()); + param_.fc_weight_max = const_cast( + &scope->FindVar(op_desc.Input("FCWeightMax").front()) + ->Get()); + param_.output = scope->FindVar(op_desc.Output("Output").front()) + ->GetMutable(); + + param_.fc_weight.clear(); + for (auto& name : op_desc.Input("FCWeight")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.fc_weight.push_back(t); + } + param_.fc_bias.clear(); + for (auto& name : op_desc.Input("FCBias")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.fc_bias.push_back(t); + } + param_.ln_scale.clear(); + for (auto& name : op_desc.Input("LNScale")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.ln_scale.push_back(t); + } + param_.ln_bias.clear(); + for (auto& name : op_desc.Input("LNBias")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.ln_bias.push_back(t); + } + + param_.n_layers = op_desc.GetAttr("n_layers"); + param_.head_num = op_desc.GetAttr("head_num"); + param_.size_per_head = op_desc.GetAttr("size_per_head"); + param_.act_type = op_desc.GetAttr("act_type"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__multi_encoder, + paddle::lite::operators::XPUMultiEncoderOp); diff --git a/lite/operators/__xpu__multi_encoder_op.h b/lite/operators/__xpu__multi_encoder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..6c20562151ad751f3a8c72ce9ce262cf1f0a286a --- /dev/null +++ b/lite/operators/__xpu__multi_encoder_op.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/lite/operators/__xpu__multi_encoder_op.h b/lite/operators/__xpu__multi_encoder_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c20562151ad751f3a8c72ce9ce262cf1f0a286a
--- /dev/null
+++ b/lite/operators/__xpu__multi_encoder_op.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class XPUMultiEncoderOp : public OpLite {
+ public:
+  XPUMultiEncoderOp() {}
+  explicit XPUMultiEncoderOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "MultiEncoder"; }
+
+ private:
+  mutable XPUMultiEncoderParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/__xpu__resnet50_op.cc b/lite/operators/__xpu__resnet50_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02ea6dc1799baaab486b839a4d3137020a9f7a5c
--- /dev/null
+++ b/lite/operators/__xpu__resnet50_op.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/__xpu__resnet50_op.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool XPUResNet50Op::CheckShape() const { return true; }
+
+bool XPUResNet50Op::InferShapeImpl() const {
+  auto input_shape = param_.input->dims();
+  input_shape[1] = 2048;
+  input_shape[2] = 1;
+  input_shape[3] = 1;
+  param_.output->Resize(input_shape);
+  return true;
+}
+
+bool XPUResNet50Op::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
+  param_.input = const_cast<lite::Tensor*>(
+      &scope->FindVar(op_desc.Input("Input").front())->Get<lite::Tensor>());
+  param_.output = scope->FindVar(op_desc.Output("Output").front())
+                      ->GetMutable<lite::Tensor>();
+
+  param_.filter.clear();
+  for (auto& name : op_desc.Input("Filter")) {
+    auto t =
+        const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
+    param_.filter.push_back(t);
+  }
+  param_.bias.clear();
+  for (auto& name : op_desc.Input("Bias")) {
+    auto t =
+        const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
+    param_.bias.push_back(t);
+  }
+  param_.max_filter.clear();
+  for (auto& name : op_desc.Input("MaxFilter")) {
+    auto t =
+        const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
+    param_.max_filter.push_back(t);
+  }
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(__xpu__resnet50, paddle::lite::operators::XPUResNet50Op);
diff --git a/lite/operators/__xpu__resnet50_op.h b/lite/operators/__xpu__resnet50_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..97f4d42006c64243818af21aa26f708d7889ba96
--- /dev/null
+++ b/lite/operators/__xpu__resnet50_op.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class XPUResNet50Op : public OpLite {
+ public:
+  XPUResNet50Op() {}
+  explicit XPUResNet50Op(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "ResNet50"; }
+
+ private:
+  mutable XPUResNet50Param param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/activation_extra_ops.cc b/lite/operators/activation_extra_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c773b4327abd48532a1bc9283963bd0dad19da6
--- /dev/null
+++ b/lite/operators/activation_extra_ops.cc
@@ -0,0 +1,32 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/op_registry.h"
+#include "lite/operators/activation_ops.h"
+
+// Extra activation ops
+REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(hard_swish, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(reciprocal, paddle::lite::operators::ActivationOp);
diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc
index 13abe0c53e95363e7f54c56819eaac26ef720072..a3d9895955d99b96609a8c35e2493b17a11b9181 100644
--- a/lite/operators/activation_ops.cc
+++ b/lite/operators/activation_ops.cc
@@ -74,6 +74,14 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
   } else if (opdesc.Type() == "abs") {
     // abs
     param_.active_type = lite_api::ActivationType::kAbs;
+  } else if (opdesc.Type() == "hard_swish") {
+    // hard_swish
+    param_.active_type = lite_api::ActivationType::kHardSwish;
+    param_.hard_swish_threshold = opdesc.GetAttr<float>("threshold");
+    param_.hard_swish_scale = opdesc.GetAttr<float>("scale");
+    param_.hard_swish_offset = opdesc.GetAttr<float>("offset");
+  } else if (opdesc.Type() == "reciprocal") {
+    param_.active_type = lite_api::ActivationType::kReciprocal;
   }
   VLOG(4) << "opdesc.Type():" << opdesc.Type();
 
@@ -84,21 +92,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
-REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp);
+
+// Basic activation ops
 REGISTER_LITE_OP(sigmoid, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(tanh, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp);
-REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp);
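For reference, with the attributes read above at their usual defaults (threshold 6, scale 6, offset 3), hard_swish computes y = x * min(max(0, x + offset), threshold) / scale; for x = 1 that is 1 * 4 / 6, about 0.6667. A host-side reference, illustrative only:

#include <algorithm>
#include <cstdio>

float hard_swish_ref(float x, float threshold, float scale, float offset) {
  return x * std::min(std::max(0.f, x + offset), threshold) / scale;
}

int main() {
  std::printf("%f\n", hard_swish_ref(1.f, 6.f, 6.f, 3.f));  // ~0.666667
  return 0;
}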
diff --git a/lite/operators/ctc_align_op.cc b/lite/operators/ctc_align_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea8e0c27059258a4e7c857c80ab64eb381446035
--- /dev/null
+++ b/lite/operators/ctc_align_op.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/ctc_align_op.h"
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool CtcAlignOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.input != nullptr);
+  CHECK_OR_FALSE(param_.output != nullptr);
+
+  auto* input = param_.input;
+  auto* input_length = param_.input_length;
+  auto input_lod = input->lod();
+  CHECK_OR_FALSE(!input_lod.empty() || input_length != nullptr);
+  return true;
+}
+
+bool CtcAlignOpLite::InferShapeImpl() const {
+  auto input_dims = param_.input->dims();
+  // The true output shape is only known after decoding at run time, so the
+  // input dims are used as a placeholder here.
+  param_.output->Resize(input_dims);
+  if (param_.input_length != nullptr && param_.output_length != nullptr) {
+    param_.output_length->Resize({input_dims[0], 1});
+  }
+  return true;
+}
+
+bool CtcAlignOpLite::AttachImpl(const cpp::OpDesc& op_desc,
+                                lite::Scope* scope) {
+  AttachInput(op_desc, scope, "Input", false, &param_.input);
+  AttachInput(op_desc, scope, "InputLength", true, &param_.input_length);
+  AttachOutput(op_desc, scope, "Output", false, &param_.output);
+  AttachOutput(op_desc, scope, "OutputLength", true, &param_.output_length);
+  param_.blank = op_desc.GetAttr<int>("blank");
+  param_.merge_repeated = op_desc.GetAttr<bool>("merge_repeated");
+  param_.padding_value = op_desc.GetAttr<int>("padding_value");
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(ctc_align, paddle::lite::operators::CtcAlignOpLite);
diff --git a/lite/operators/ctc_align_op.h b/lite/operators/ctc_align_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7593860e06c3d0104ca1f7ea7281d23149408923
--- /dev/null
+++ b/lite/operators/ctc_align_op.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class CtcAlignOpLite : public OpLite {
+ public:
+  CtcAlignOpLite() {}
+
+  explicit CtcAlignOpLite(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "ctc_align"; }
+
+ private:
+  mutable CtcAlignParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h
index 3fdca389bca1ba09ebfe008365b6992b717270d8..466de112fb2983e325b2bec17e90018984d7e233 100644
--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -336,17 +336,22 @@ struct ConcatParam : ParamBase {
 /// ----------------------- activation operators ----------------------
 struct ActivationParam : ParamBase {
   const lite::Tensor* X{};
+  lite::Tensor* Out{};
+  lite_api::ActivationType active_type;
+  bool has_active{false};
   float Leaky_relu_alpha{0};   // leaky_relu param
   float Relu_clipped_coef{6};  // relu_clipped param
   std::string Prelu_mode{
       "channel"};  // prelu param, can be "all", "channel" or "element"
   lite::Tensor* Prelu_alpha{};  // prelu param
   float Swish_beta;             // swish param
+  // hard_sigmoid param
   float hard_sigmoid_slope{0.2f};
   float hard_sigmoid_offset{0.5f};
-  lite::Tensor* Out{};
-  bool has_active{false};
-  lite_api::ActivationType active_type;
+  // hard_swish param
+  float hard_swish_threshold{6.0};
+  float hard_swish_scale{6.0};
+  float hard_swish_offset{3.0};
 };
 
 struct ActivationGradParam : ParamBase {
@@ -1019,6 +1024,12 @@ struct SequenceExpandParam : ParamBase {
   int ref_level{-1};
 };
 
+struct SequenceUnpadParam : ParamBase {
+  const lite::Tensor* X{};
+  const lite::Tensor* Length{};
+  lite::Tensor* Out{};
+};
+
 struct SequenceExpandAsParam : ParamBase {
   const lite::Tensor* x{nullptr};
   const lite::Tensor* y{nullptr};
@@ -1438,6 +1449,40 @@ struct CrfDecodingParam : ParamBase {
   lite::Tensor* viterbi_path{};
 };
 
+struct CtcAlignParam : ParamBase {
+  lite::Tensor* input{};
+  lite::Tensor* input_length{};
+  lite::Tensor* output{};
+  lite::Tensor* output_length{};
+  int blank{0};
+  bool merge_repeated{true};
+  int padding_value{0};
+};
+
+struct XPUResNet50Param : ParamBase {
+  lite::Tensor* input{};
+  std::vector<lite::Tensor*> filter;
+  std::vector<lite::Tensor*> bias;
+  std::vector<lite::Tensor*> max_filter;
+  lite::Tensor* output{};
+};
+
+struct XPUMultiEncoderParam : ParamBase {
+  lite::Tensor* input{};
+  std::vector<lite::Tensor*> fc_weight;
+  std::vector<lite::Tensor*> fc_bias;
+  std::vector<lite::Tensor*> ln_scale;
+  std::vector<lite::Tensor*> ln_bias;
+  lite::Tensor* fc_weight_max{};
+  lite::Tensor* mask{};
+  lite::Tensor* output{};
+
+  int n_layers{};
+  int head_num{};
+  int size_per_head{};
+  std::string act_type{};
+};
+
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
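A quick reminder of the semantics behind CtcAlignParam above (standard CTC post-processing; the example values are illustrative): with blank = 0 and merge_repeated = true, the path {0, 1, 1, 0, 2, 2} decodes to {1, 2}.

#include <cstdio>
#include <vector>

// Reference CTC alignment: drop blanks, optionally merge adjacent repeats.
std::vector<int> ctc_align_ref(const std::vector<int>& path, int blank,
                               bool merge_repeated) {
  std::vector<int> out;
  int prev = blank;
  for (int t : path) {
    if (t != blank && !(merge_repeated && t == prev)) out.push_back(t);
    prev = t;
  }
  return out;
}

int main() {
  for (int t : ctc_align_ref({0, 1, 1, 0, 2, 2}, 0, true)) {
    std::printf("%d ", t);  // prints: 1 2
  }
  std::printf("\n");
  return 0;
}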
diff --git a/lite/operators/sequence_unpad_op.cc b/lite/operators/sequence_unpad_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b91d43c741f002b2bdb30e161688cd40b462faee
--- /dev/null
+++ b/lite/operators/sequence_unpad_op.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/sequence_unpad_op.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SequenceUnpadOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Length);
+  CHECK_OR_FALSE(param_.Out);
+  auto x_dims = param_.X->dims();
+  auto len_dims = param_.Length->dims();
+  CHECK(x_dims.size() >= 2) << "Rank of X can't be less than 2";
+  CHECK(len_dims.size() == 1) << "Rank of Length should be 1";
+  CHECK(x_dims[0] == len_dims[0])
+      << "X and Length should have the same 1st dim";
+  return true;
+}
+
+bool SequenceUnpadOp::InferShapeImpl() const {
+  auto x_dims = param_.X->dims();
+  auto len_dims = param_.Length->dims();
+
+  auto *seq_len_ptr = param_.Length->data<int64_t>();
+  int64_t batch_size = len_dims[0];
+  std::vector<uint64_t> out_lod0(batch_size + 1, 0);
+  for (int64_t i = 0; i < batch_size; ++i) {
+    out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
+  }
+  paddle::lite::LoD out_lod;
+  out_lod.push_back(out_lod0);
+
+  int64_t out_dim0 = out_lod0.back();
+  std::vector<int64_t> out_dims{out_dim0};
+  if (x_dims.size() == 2) {
+    out_dims.push_back(1);
+  } else {
+    for (size_t i = 2; i < x_dims.size(); ++i) {
+      out_dims.push_back(x_dims[i]);
+    }
+  }
+  param_.Out->Resize(out_dims);
+  param_.Out->set_lod(out_lod);
+  return true;
+}
+
+bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc,
+                                 lite::Scope *scope) {
+  param_.X = const_cast<lite::Tensor *>(
+      &scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
+  param_.Length = const_cast<lite::Tensor *>(
+      &scope->FindVar(opdesc.Input("Length").front())->Get<lite::Tensor>());
+  param_.Out =
+      scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(sequence_unpad, paddle::lite::operators::SequenceUnpadOp);
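Worked example for the LoD computation above (values are illustrative): a padded X of shape {2, 5, 6} with Length = {3, 2} yields an Out of shape {5, 6} and LoD {{0, 3, 5}}.

#include <cstdio>
#include <vector>

int main() {
  std::vector<long> x_dims = {2, 5, 6};  // batch, padded_len, feature
  std::vector<long> length = {3, 2};     // real sequence lengths
  std::vector<unsigned long> lod0(length.size() + 1, 0);
  for (size_t i = 0; i < length.size(); ++i) lod0[i + 1] = lod0[i] + length[i];
  std::printf("out dims: {%lu, %ld}, lod: {0, %lu, %lu}\n",
              lod0.back(), x_dims[2], lod0[1], lod0[2]);
  return 0;
}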
diff --git a/lite/operators/sequence_unpad_op.h b/lite/operators/sequence_unpad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..508f0437fe32f9b65716f78124df377b99b1ef49
--- /dev/null
+++ b/lite/operators/sequence_unpad_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class SequenceUnpadOp : public OpLite {
+ public:
+  SequenceUnpadOp() {}
+  explicit SequenceUnpadOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "sequence_unpad"; }
+
+ private:
+  mutable SequenceUnpadParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/stack_op.cc b/lite/operators/stack_op.cc
index 0f9ba6662b16ce20acad497a4915cfc848b319cd..d4fb71c4b5cb429d1b3961d5c65f739af56ff39d 100644
--- a/lite/operators/stack_op.cc
+++ b/lite/operators/stack_op.cc
@@ -47,6 +47,7 @@ bool StackOp::InferShapeImpl() const {
 bool StackOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
   auto X = op_desc.Input("X");
   auto Out = op_desc.Output("Y").front();
+  param_.X.clear();
   for (auto var : X) {
     param_.X.emplace_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
   }
diff --git a/lite/tests/CMakeLists.txt b/lite/tests/CMakeLists.txt
index 0416c33a81b524b4dba1c1b406d91204cca6946d..a94a46897a8ae8415efd8edf19e216ede69f8888 100644
--- a/lite/tests/CMakeLists.txt
+++ b/lite/tests/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_subdirectory(kernels)
 add_subdirectory(math)
 add_subdirectory(cv)
+add_subdirectory(api)
diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c31e3ba58fc793aa92a5b37a59ad612e03c61a53
--- /dev/null
+++ b/lite/tests/api/CMakeLists.txt
@@ -0,0 +1,14 @@
+if(LITE_WITH_XPU)
+  lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc
+    DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+    ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
+    ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+  lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc
+    DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+    ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
+    ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+  lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc
+    DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+    ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
+    ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+endif()
diff --git a/lite/tests/api/test_bert_lite_xpu.cc b/lite/tests/api/test_bert_lite_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3ee9febb3f0eabd36118680beca66ace9470de4
--- /dev/null
+++ b/lite/tests/api/test_bert_lite_xpu.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/lite_api_test_helper.h"
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+
+template <typename T>
+lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
+  lite::Tensor ret;
+  ret.Resize(shape);
+  T* ptr = ret.mutable_data<T>();
+  for (int i = 0; i < ret.numel(); ++i) {
+    ptr[i] = (T)1;
+  }
+  return ret;
+}
+
+TEST(Bert, test_bert_lite_xpu) {
+  lite_api::CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+  config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
+  config.set_xpu_workspace_l3_size_per_thread();
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  int64_t batch_size = 1;
+  int64_t seq_len = 64;
+  Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
+  std::vector<int64_t> input_shape{batch_size, seq_len, 1};
+  predictor->GetInput(0)->Resize(input_shape);
+  predictor->GetInput(1)->Resize(input_shape);
+  predictor->GetInput(2)->Resize(input_shape);
+  predictor->GetInput(3)->Resize(input_shape);
+
+  memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+  memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+  memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+  memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor->Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor->Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  std::vector<std::vector<float>> results;
+  results.emplace_back(std::vector<float>({0.278893, 0.330888, 0.39022}));
+  auto out = predictor->GetOutput(0);
+  ASSERT_EQ(out->shape().size(), 2);
+  ASSERT_EQ(out->shape()[0], 1);
+  ASSERT_EQ(out->shape()[1], 3);
+
+  for (size_t i = 0; i < results.size(); ++i) {
+    for (size_t j = 0; j < results[i].size(); ++j) {
+      EXPECT_NEAR(
+          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
+    }
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/api/test_ernie_lite_xpu.cc b/lite/tests/api/test_ernie_lite_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b614fec96cbcc5d9c96653681d0e8794cf4ab8f
--- /dev/null
+++ b/lite/tests/api/test_ernie_lite_xpu.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/lite_api_test_helper.h"
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+
+template <typename T>
+lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
+  lite::Tensor ret;
+  ret.Resize(shape);
+  T* ptr = ret.mutable_data<T>();
+  for (int i = 0; i < ret.numel(); ++i) {
+    ptr[i] = (T)1;
+  }
+  return ret;
+}
+
+TEST(Ernie, test_ernie_lite_xpu) {
+  lite_api::CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+  config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
+  config.set_xpu_workspace_l3_size_per_thread();
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  int64_t batch_size = 1;
+  int64_t seq_len = 64;
+  Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
+  std::vector<int64_t> input_shape{batch_size, seq_len, 1};
+  predictor->GetInput(0)->Resize(input_shape);
+  predictor->GetInput(1)->Resize(input_shape);
+  predictor->GetInput(2)->Resize(input_shape);
+  predictor->GetInput(3)->Resize(input_shape);
+
+  memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+  memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+  memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+  memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
+         sample_input.raw_data(),
+         sizeof(int64_t) * batch_size * seq_len);
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor->Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor->Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  std::vector<std::vector<float>> results;
+  results.emplace_back(std::vector<float>({0.108398}));
+  auto out = predictor->GetOutput(0);
+  ASSERT_EQ(out->shape().size(), 2);
+  ASSERT_EQ(out->shape()[0], 1);
+  ASSERT_EQ(out->shape()[1], 1);
+
+  for (size_t i = 0; i < results.size(); ++i) {
+    for (size_t j = 0; j < results[i].size(); ++j) {
+      EXPECT_NEAR(
+          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
+    }
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
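Both NLP tests above feed four identical all-ones int64 inputs of shape {1, 64, 1}; in typical exported ERNIE/BERT models these slots correspond to token ids, position ids, sentence ids and an input mask, but the exact slot order is a property of the model, so treat that naming as an assumption. A standalone sketch of such a feed buffer:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  int64_t batch = 1, seq_len = 64;
  std::vector<int64_t> feed(batch * seq_len, 1);  // all-ones ids
  std::printf("feed elems=%zu first=%lld\n", feed.size(),
              static_cast<long long>(feed[0]));
  return 0;
}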
diff --git a/lite/tests/api/test_resnet50_lite_xpu.cc b/lite/tests/api/test_resnet50_lite_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be30369b9e187dd5d82527cb87eed405bc463ae3
--- /dev/null
+++ b/lite/tests/api/test_resnet50_lite_xpu.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/lite_api_test_helper.h"
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+
+TEST(Resnet50, test_resnet50_lite_xpu) {
+  lite_api::CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+  config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
+  config.set_xpu_workspace_l3_size_per_thread();
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  auto input_tensor = predictor->GetInput(0);
+  std::vector<int64_t> input_shape{1, 3, 224, 224};
+  input_tensor->Resize(input_shape);
+  auto* data = input_tensor->mutable_data<float>();
+  int input_num = 1;
+  for (size_t i = 0; i < input_shape.size(); ++i) {
+    input_num *= input_shape[i];
+  }
+  for (int i = 0; i < input_num; i++) {
+    data[i] = 1;
+  }
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor->Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor->Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  std::vector<std::vector<float>> results;
+  results.emplace_back(std::vector<float>(
+      {0.000268651, 0.000174053, 0.000213181, 0.000396771, 0.000591516,
+       0.00018169,  0.000289721, 0.000855934, 0.000732185, 9.2055e-05,
+       0.000220664, 0.00235289,  0.00571265,  0.00357688,  0.00129667,
+       0.000465392, 0.000143775, 0.000211628, 0.000617144, 0.000265033}));
+  auto out = predictor->GetOutput(0);
+  ASSERT_EQ(out->shape().size(), 2);
+  ASSERT_EQ(out->shape()[0], 1);
+  ASSERT_EQ(out->shape()[1], 1000);
+
+  int step = 50;
+  for (size_t i = 0; i < results.size(); ++i) {
+    for (size_t j = 0; j < results[i].size(); ++j) {
+      EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
+                  results[i][j],
+                  1e-5);
+    }
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt
index 697c9874ef2072eedf6b654863e25e981fb6834a..1ab73792e7fa3a46fd4c4b4479e4f231d55608f6 100644
--- a/lite/tests/cv/CMakeLists.txt
+++ b/lite/tests/cv/CMakeLists.txt
@@ -1,3 +1,3 @@
-if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
+if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
   lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
 endif()
diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -1,4 +1,4 @@
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
     lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -61,6 +61,7 @@ if(LITE_BUILD_EXTRA)
     lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 
     # for training kernel
     if (LITE_WITH_TRAIN)
diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc
index 5a0b033b1b8c4d8f28aa05c3f2fcac40f2569bf4..c71eac8d4532eefd5569421807c85128746c6c8b 100644
--- a/lite/tests/kernels/activation_compute_test.cc
+++ b/lite/tests/kernels/activation_compute_test.cc
@@ -36,7 +36,9 @@ enum activation_type_test {
   FLOOR,
   RSQRT,
   GELU,
-  SQUARE
+  SQUARE,
+  HARD_SWISH,
+  RECIPROCAL
 };
 
 class ActivationComputeTester : public arena::TestCase {
@@ -49,6 +51,9 @@ class ActivationComputeTester : public arena::TestCase {
   float relu_clipped_coef_ = 6.;
   std::string prelu_mode_ = "";
   float swish_beta_ = 0.;
+  float hard_swish_threshold = 6.0;
+  float hard_swish_scale = 6.0;
+  float hard_swish_offset = 3.0;
   DDim dims_{{1}};
   std::string type_ = "";
   activation_type_test act_type_ = RELU;
@@ -199,6 +204,20 @@ class ActivationComputeTester : public arena::TestCase {
       }
       break;
     }
+    case HARD_SWISH: {
+      for (int i = 0; i < dims_.production(); i++) {
+        float max_value = std::max(0.f, x_data[i] + hard_swish_offset);
+        float min_value = std::min(max_value, hard_swish_threshold);
+        output_data[i] = min_value * x_data[i] / hard_swish_scale;
+      }
+      break;
+    }
+    case RECIPROCAL: {
+      for (int i = 0; i < dims_.production(); i++) {
+        output_data[i] = 1.0 / x_data[i];
+      }
+      break;
+    }
     default:
       LOG(INFO) << "the type of activation is unknow.";
   }
@@ -221,6 +240,11 @@ class ActivationComputeTester : public arena::TestCase {
     if (act_type_ == SWISH) {
       op_desc->SetAttr("beta", swish_beta_);
     }
+    if (act_type_ == HARD_SWISH) {
+      op_desc->SetAttr("threshold", hard_swish_threshold);
+      op_desc->SetAttr("scale", hard_swish_scale);
+      op_desc->SetAttr("offset", hard_swish_offset);
+    }
   }
 
   void PrepareData() override {
@@ -552,5 +576,61 @@ TEST(Activation_gelu, precision) {
   }
 }
 
+TEST(activation_hard_swish, precision) {
+  LOG(INFO) << "test hard_swish op";
+  Place place;
+  float abs_error = 2e-5;
+
+#if defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
+
+  for (auto dims : std::vector<std::vector<int64_t>>{
+           {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
+    std::unique_ptr<arena::TestCase> tester(
+        new ActivationComputeTester(place,
+                                    "def",
+                                    0.01,
+                                    6.,
+                                    "all",
+                                    0.,
+                                    DDim(dims),
+                                    "hard_swish",
+                                    HARD_SWISH));
+    arena::Arena arena(std::move(tester), place, abs_error);
+    arena.TestPrecision();
+  }
+}
+
+TEST(activation_reciprocal, precision) {
+  LOG(INFO) << "test reciprocal op";
+  Place place;
+  float abs_error = 2e-5;
+
+#if defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
+
+  for (auto dims : std::vector<std::vector<int64_t>>{
+           {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
+    std::unique_ptr<arena::TestCase> tester(
+        new ActivationComputeTester(place,
+                                    "def",
+                                    0.01,
+                                    6.,
+                                    "all",
+                                    0.,
+                                    DDim(dims),
+                                    "reciprocal",
+                                    RECIPROCAL));
+    arena::Arena arena(std::move(tester), place, abs_error);
+    arena.TestPrecision();
+  }
+}
+
 }  // namespace lite
 }  // namespace paddle
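For reference, the math the new activation cases implement: hard_swish(x) = x * min(max(0, x + offset), threshold) / scale, so with the defaults threshold = 6, scale = 6 and offset = 3, an input of 1.0 yields 1.0 * min(max(0, 4), 6) / 6 ≈ 0.667; reciprocal(x) = 1 / x. The new test file below exercises ctc_align, the CTC greedy-decoding post-processing op: blank labels are dropped and, when merge_repeated is set, consecutive duplicate labels are collapsed first, with padding_value filling the tail in the padded (LoD-free) layout.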
diff --git a/lite/tests/kernels/ctc_align_compute_test.cc b/lite/tests/kernels/ctc_align_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e32012549cab42858938388857c65e14f65be099
--- /dev/null
+++ b/lite/tests/kernels/ctc_align_compute_test.cc
@@ -0,0 +1,254 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+
+namespace paddle {
+namespace lite {
+
+class CtcAlignComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string input_ = "input";
+  std::string input_length_ = "input_length";
+  std::string output_ = "output";
+  std::string output_length_ = "output_length";
+  std::vector<int> input_data_;
+  std::vector<int64_t> input_shape_;
+  std::vector<std::vector<uint64_t>> input_lod_;
+  std::vector<int> input_length_data_;
+  std::vector<int64_t> input_length_shape_;
+  std::vector<int> output_data_;
+  std::vector<int64_t> output_shape_;
+  std::vector<std::vector<uint64_t>> output_lod_;
+  std::vector<int> output_length_data_;
+  std::vector<int64_t> output_length_shape_;
+  int blank_;
+  bool merge_repeated_;
+  int padding_value_;
+
+ public:
+  CtcAlignComputeTester(const Place& place,
+                        const std::string& alias,
+                        const std::vector<int>& input_data,
+                        const std::vector<int64_t> input_shape,
+                        const std::vector<std::vector<uint64_t>>& input_lod,
+                        const std::vector<int>& input_length_data,
+                        const std::vector<int64_t> input_length_shape,
+                        const int blank,
+                        const bool merge_repeated,
+                        const int padding_value,
+                        const std::vector<int>& output_data,
+                        const std::vector<int64_t>& output_shape,
+                        const std::vector<std::vector<uint64_t>>& output_lod,
+                        const std::vector<int>& output_length_data,
+                        const std::vector<int64_t>& output_length_shape)
+      : TestCase(place, alias) {
+    input_data_ = input_data;
+    input_shape_ = input_shape;
+    input_lod_ = input_lod;
+    input_length_data_ = input_length_data;
+    input_length_shape_ = input_length_shape;
+    blank_ = blank;
+    merge_repeated_ = merge_repeated;
+    padding_value_ = padding_value;
+    output_data_ = output_data;
+    output_shape_ = output_shape;
+    output_lod_ = output_lod;
+    output_length_data_ = output_length_data;
+    output_length_shape_ = output_length_shape;
+  }
+
+  void RunBaseline(Scope* scope) override {
+    auto* output_tensor = scope->NewTensor(output_);
+    output_tensor->Resize(output_shape_);
+    if (!output_lod_.empty()) {
+      output_tensor->set_lod(output_lod_);
+    }
+    auto* output_data = output_tensor->mutable_data<int>();
+    int64_t output_num = 1;
+    for (auto e : output_shape_) {
+      output_num *= e;
+    }
+    for (int i = 0; i < output_num; i++) {
+      output_data[i] = output_data_[i];
+    }
+
+    if (!input_length_data_.empty() && !output_length_data_.empty()) {
+      auto* output_length_tensor = scope->NewTensor(output_length_);
+      output_length_tensor->Resize(output_length_shape_);
+      auto* output_length_data = output_length_tensor->mutable_data<int>();
+      int64_t num = 1;
+      for (auto e : output_length_shape_) {
+        num *= e;
+      }
+      for (int i = 0; i < num; i++) {
+        output_length_data[i] = output_length_data_[i];
+      }
+    }
+  }
+
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("ctc_align");
+    op_desc->SetInput("Input", {input_});
+    op_desc->SetOutput("Output", {output_});
+    if (!input_length_data_.empty()) {
+      op_desc->SetInput("InputLength", {input_length_});
+      op_desc->SetOutput("OutputLength", {output_length_});
+    }
+    op_desc->SetAttr("blank", blank_);
+    op_desc->SetAttr("merge_repeated", merge_repeated_);
+    op_desc->SetAttr("padding_value", padding_value_);
+  }
+
+  void PrepareData() override {
+    SetCommonTensor(input_, DDim(input_shape_), input_data_.data(), input_lod_);
+    if (!input_length_data_.empty()) {
+      SetCommonTensor(
+          input_length_, DDim(input_length_shape_), input_length_data_.data());
+    }
+  }
+};
+TEST(CtcAlign1, precision) {
+  LOG(INFO) << "test ctc_align op";
+#ifdef LITE_WITH_ARM
+  // Define variables
+  const std::vector<int>& input_data = {
+      0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0};
+  const std::vector<int64_t> input_shape = {18, 1};
+  const std::vector<std::vector<uint64_t>> input_lod = {{11, 7}};
+  const std::vector<int> input_length_data = {};
+  const std::vector<int64_t> input_length_shape = {};
+  const int blank = 0;
+  const bool merge_repeated = false;
+  const int padding_value = 0;
+  const std::vector<int> output_data = {1, 2, 2, 4, 4, 5, 6, 6, 7, 7, 7};
+  const std::vector<int64_t> output_shape = {11, 1};
+  const std::vector<std::vector<uint64_t>> output_lod = {{7, 4}};
+  const std::vector<int> output_length_data = {};
+  const std::vector<int64_t> output_length_shape = {};
+
+  // Test
+  Place place(TARGET(kHost), PRECISION(kInt32));
+  std::unique_ptr<arena::TestCase> tester(
+      new CtcAlignComputeTester(place,
+                                "def",
+                                input_data,
+                                input_shape,
+                                input_lod,
+                                input_length_data,
+                                input_length_shape,
+                                blank,
+                                merge_repeated,
+                                padding_value,
+                                output_data,
+                                output_shape,
+                                output_lod,
+                                output_length_data,
+                                output_length_shape));
+  arena::Arena arena(std::move(tester), place, 2e-5);
+  arena.TestPrecision();
+#endif
+}
+
+TEST(CtcAlign2, precision) {
+  LOG(INFO) << "test ctc_align op";
+#ifdef LITE_WITH_ARM
+  // Define variables
+  const std::vector<int>& input_data = {
+      0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0};
+  const std::vector<int64_t> input_shape = {3, 6};
+  const std::vector<std::vector<uint64_t>> input_lod = {};
+  const std::vector<int> input_length_data = {6, 5, 4};
+  const std::vector<int64_t> input_length_shape = {3, 1};
+  const int blank = 0;
+  const bool merge_repeated = true;
+  const int padding_value = 0;
+  const std::vector<int> output_data = {
+      1, 2, 4, 0, 0, 0, 4, 5, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0};
+  const std::vector<int64_t> output_shape = {3, 6};
+  const std::vector<std::vector<uint64_t>> output_lod = {};
+  const std::vector<int> output_length_data = {3, 3, 1};
+  const std::vector<int64_t> output_length_shape = {3, 1};
+
+  // Test
+  Place place(TARGET(kHost), PRECISION(kInt32));
+  std::unique_ptr<arena::TestCase> tester(
+      new CtcAlignComputeTester(place,
+                                "def",
+                                input_data,
+                                input_shape,
+                                input_lod,
+                                input_length_data,
+                                input_length_shape,
+                                blank,
+                                merge_repeated,
+                                padding_value,
+                                output_data,
+                                output_shape,
+                                output_lod,
+                                output_length_data,
+                                output_length_shape));
+  arena::Arena arena(std::move(tester), place, 2e-5);
+  arena.TestPrecision();
+#endif
+}
+
+TEST(CtcAlign3, precision) {
+  LOG(INFO) << "test ctc_align op";
+#ifdef LITE_WITH_ARM
+  // Define variables
+  const std::vector<int>& input_data = {
+      0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0};
+  const std::vector<int64_t> input_shape = {3, 6};
+  const std::vector<std::vector<uint64_t>> input_lod = {};
+  const std::vector<int> input_length_data = {6, 5, 4};
+  const std::vector<int64_t> input_length_shape = {3, 1};
+  const int blank = 0;
+  const bool merge_repeated = false;
+  const int padding_value = 0;
+  const std::vector<int> output_data = {
+      1, 2, 2, 4, 0, 0, 4, 5, 6, 0, 0, 0, 7, 7, 7, 0, 0, 0};
+  const std::vector<int64_t> output_shape = {3, 6};
+  const std::vector<std::vector<uint64_t>> output_lod = {};
+  const std::vector<int> output_length_data = {4, 3, 3};
+  const std::vector<int64_t> output_length_shape = {3, 1};
+
+  // Test
+  Place place(TARGET(kHost), PRECISION(kInt32));
+  std::unique_ptr<arena::TestCase> tester(
+      new CtcAlignComputeTester(place,
+                                "def",
+                                input_data,
+                                input_shape,
+                                input_lod,
+                                input_length_data,
+                                input_length_shape,
+                                blank,
+                                merge_repeated,
+                                padding_value,
+                                output_data,
+                                output_shape,
+                                output_lod,
+                                output_length_data,
+                                output_length_shape));
+  arena::Arena arena(std::move(tester), place, 2e-5);
+  arena.TestPrecision();
+#endif
+}
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/math/CMakeLists.txt b/lite/tests/math/CMakeLists.txt
index 7dd4f522dbc0f10e8cfb7d19e95da4354ac4b779..e02307aa73cccdacd38bfd2bc9b4ca422a56d06c 100644
--- a/lite/tests/math/CMakeLists.txt
+++ b/lite/tests/math/CMakeLists.txt
@@ -1,4 +1,4 @@
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
     lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
diff --git a/lite/tools/build.sh b/lite/tools/build.sh
index a888d8ef25cbe7c816693fa45d954672a8ad5b1f..e7394fcb6edbd7a2f4b564b7a0e7d5aa43506843 100755
--- a/lite/tools/build.sh
+++ b/lite/tools/build.sh
@@ -25,6 +25,7 @@ SHUTDOWN_LOG=ON
 BUILD_NPU=OFF
 NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/
 BUILD_XPU=OFF
+BUILD_XTCL=OFF
 XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/"
 LITE_WITH_ARM_LANG=OFF
 
@@ -138,6 +139,7 @@ function make_tiny_publish_so {
             -DLITE_WITH_NPU=$BUILD_NPU \
             -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
             -DLITE_WITH_XPU=$BUILD_XPU \
+            -DLITE_WITH_XTCL=$BUILD_XTCL \
             -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
             -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
 
@@ -226,6 +228,7 @@ function make_full_publish_so {
             -DLITE_WITH_NPU=$BUILD_NPU \
             -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
             -DLITE_WITH_XPU=$BUILD_XPU \
+            -DLITE_WITH_XTCL=$BUILD_XTCL \
             -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
             -DLITE_WITH_TRAIN=$BUILD_TRAIN \
             -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
 
@@ -260,6 +263,7 @@ function make_all_tests {
             -DLITE_WITH_NPU=$BUILD_NPU \
             -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
             -DLITE_WITH_XPU=$BUILD_XPU \
+            -DLITE_WITH_XTCL=$BUILD_XTCL \
            -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
            -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
 
@@ -330,7 +334,10 @@ function make_cuda {
             -DWITH_TESTING=OFF \
             -DLITE_WITH_ARM=OFF \
             -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-            -DLITE_BUILD_EXTRA=ON
+            -DLITE_BUILD_EXTRA=ON \
+            -DLITE_WITH_XPU=$BUILD_XPU \
+            -DLITE_WITH_XTCL=$BUILD_XTCL \
+            -DXPU_SDK_ROOT=$XPU_SDK_ROOT
 
     make publish_inference -j$NUM_PROC
     cd -
@@ -362,9 +369,10 @@ function make_x86 {
             -DWITH_GPU=OFF \
             -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
             -DLITE_BUILD_EXTRA=ON \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DLITE_WITH_XPU=$BUID_XPU \
-            -DXPU_SDK_ROOT=$XPU_SDK_ROOT
+            -DLITE_WITH_XPU=$BUILD_XPU \
+            -DLITE_WITH_XTCL=$BUILD_XTCL \
+            -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
+            -DCMAKE_BUILD_TYPE=Release
 
     make publish_inference -j$NUM_PROC
     cd -
@@ -483,6 +491,10 @@ function main {
                 BUILD_XPU="${i#*=}"
                 shift
                 ;;
+            --build_xtcl=*)
+                BUILD_XTCL="${i#*=}"
+                shift
+                ;;
             --xpu_sdk_root=*)
                 XPU_SDK_ROOT="${i#*=}"
                 shift
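After this change an XPU build enables the XTCL subgraph bridge only on request. A usage sketch, assuming the x86 sub-command that routes to make_x86 and a placeholder SDK path:

    # x86 host build with XPU enabled and the optional XTCL bridge switched on
    ./lite/tools/build.sh \
        --build_xpu=ON \
        --build_xtcl=ON \
        --xpu_sdk_root=/path/to/XPU_SDK \
        x86

Leaving --build_xtcl at its OFF default gives the plain XPU runtime path, which now links only the XTDK api/runtime libraries per the cmake/device/xpu.cmake change.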
diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh
index 1912efda5edc6e436cc84dbdf9919a99e1ed3279..01d71aaf213abb99633112664af580b897ce7454 100755
--- a/lite/tools/build_mlu.sh
+++ b/lite/tools/build_mlu.sh
@@ -2,10 +2,10 @@
 set -ex
 
 # global variables with default value
-NEUWARE_HOME="${NEUWARE_HOME}" # XPU SDK
+NEUWARE_HOME="${NEUWARE_HOME}"
 TARGET_NAME="all" # default target
 BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
-WITH_TESTING=OFF # ON/OFF
+WITH_TESTING=ON # ON/OFF
 
 function print_usage {
     echo -e "\nUSAGE:"
@@ -20,10 +20,9 @@ function print_usage {
 # readonly variables with default value
 readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
                                -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-                               -DWITH_PYTHON=OFF \
                                -DLITE_WITH_ARM=OFF"
 
-readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
+readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-8}
 
 readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
 readonly workspace=$(pwd)
@@ -37,8 +36,7 @@ function prepare_thirdparty {
         fi
         tar xzf third-party-05b862.tar.gz
     else
-        # git submodule update --init --recursive
-        echo "third-party is in ready"
+        git submodule update --init --recursive
     fi
 }
@@ -62,12 +60,12 @@ function prepare_workspace {
 }
 
 function build_mlu {
+    prepare_workspace
     build_dir=${workspace}/build.lite.mlu
     mkdir -p $build_dir
     cd $build_dir
 
     export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
-    prepare_workspace
     cmake .. \
         ${CMAKE_COMMON_OPTIONS} \
         -DWITH_GPU=OFF \
@@ -75,9 +73,10 @@ function build_mlu {
         -DLITE_WITH_X86=ON \
         -DWITH_MKL=ON \
         -DLITE_WITH_MLU=ON \
+        -DLITE_WITH_PYTHON=OFF \
         -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
        -DWITH_TESTING=${WITH_TESTING} \
-        -DMLU_SDK_ROOT=${XPU_SDK_ROOT}
+        -DNEUWARE_HOME=${NEUWARE_HOME}
 
     make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
diff --git a/lite/tools/build_xpu.sh b/lite/tools/build_xpu.sh
deleted file mode 100755
index fdf287501e8f4411f51e73c55b789753f2e85674..0000000000000000000000000000000000000000
--- a/lite/tools/build_xpu.sh
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/bin/bash
-set -ex
-
-# global variables with default value
-XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK
-TARGET_NAME="test_subgraph_pass" # default target
-BUILD_EXTRA=ON # ON(with sequence ops)/OFF
-WITH_TESTING=ON # ON/OFF
-
-function print_usage {
-    echo -e "\nUSAGE:"
-    echo
-    echo "----------------------------------------"
-    echo -e "--xpu_sdk_root=<XPU SDK root directory>"
-    echo -e "--target_name=<build target name>"
-    echo "----------------------------------------"
-    echo
-}
-
-# readonly variables with default value
-readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-                               -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-                               -DWITH_PYTHON=OFF \
-                               -DLITE_WITH_ARM=OFF"
-
-readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
-
-readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
-readonly workspace=$(pwd)
-
-function prepare_thirdparty {
-    if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
-        rm -rf $workspace/third-party
-
-        if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
-            wget $THIRDPARTY_TAR
-        fi
-        tar xzf third-party-05b862.tar.gz
-    else
-        git submodule update --init --recursive
-    fi
-}
-
-# for code gen, a source file is generated after a test, but is dependended by some targets in cmake.
-# here we fake an empty file to make cmake works.
-function prepare_workspace {
-    # in build directory
-    # 1. Prepare gen_code file
-    GEN_CODE_PATH_PREFIX=lite/gen_code
-    mkdir -p ./${GEN_CODE_PATH_PREFIX}
-    touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
-
-    # 2.Prepare debug tool
-    DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
-    mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
-    cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
-
-    # clone submodule
-    # git submodule update --init --recursive
-    prepare_thirdparty
-}
-
-function build_xpu {
-    build_dir=${workspace}/build.lite.xpu
-    mkdir -p $build_dir
-    cd $build_dir
-
-    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
-    prepare_workspace
-    cmake .. \
-        ${CMAKE_COMMON_OPTIONS} \
-        -DWITH_GPU=OFF \
-        -DWITH_MKLDNN=OFF \
-        -DLITE_WITH_X86=ON \
-        -DWITH_MKL=ON \
-        -DLITE_WITH_XPU=ON \
-        -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-        -DWITH_TESTING=${WITH_TESTING} \
-        -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
-
-    make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
-
-    cd -
-    echo "Done"
-}
-
-function main {
-    # Parse command line.
-    for i in "$@"; do
-        case $i in
-            --target_name=*)
-                TARGET_NAME="${i#*=}"
-                shift
-                ;;
-            --build_extra=*)
-                BUILD_EXTRA="${i#*=}"
-                shift
-                ;;
-            --xpu_sdk_root=*)
-                XPU_SDK_ROOT="${i#*=}"
-                shift
-                ;;
-            build)
-                build_xpu
-                shift
-                ;;
-            full_publish)
-                TARGET_NAME=publish_inference
-                build_xpu
-                shift
-                ;;
-            *)
-                # unknown option
-                print_usage
-                exit 1
-                ;;
-        esac
-    done
-}
-
-main $@
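With build_xpu.sh removed, XPU builds go through build.sh above, while MLU keeps its own driver. A sketch of the updated build_mlu.sh flow, assuming it keeps the build/full_publish sub-commands of the script it mirrors and that NEUWARE_HOME points at a local Neuware install:

    # MLU build: NEUWARE_HOME is now passed straight through to CMake,
    # replacing the old (mismatched) MLU_SDK_ROOT=${XPU_SDK_ROOT} wiring
    export NEUWARE_HOME=/usr/local/neuware
    export LITE_BUILD_THREADS=8
    ./lite/tools/build_mlu.sh --build_extra=OFF --target_name=all build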