Unverified commit d64d4b6f, authored by myq406450149 and committed by GitHub

Merge branch 'develop' into pass

@@ -59,7 +59,9 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
@@ -177,6 +179,10 @@ if(LITE_WITH_XPU)
include(device/xpu)
endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
...
@@ -136,6 +136,9 @@ endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
add_definitions("-DLITE_WITH_XTCL")
endif()
endif()
if (LITE_WITH_OPENCL)
@@ -150,6 +153,10 @@ if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
...
@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
endif()
message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
include_directories("${XPU_SDK_ROOT}/XTDK/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
@@ -82,23 +50,55 @@ else()
set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
endif()
find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")
if(LITE_WITH_XTCL)
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
endif()
@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
@@ -100,6 +100,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_MLU)
foreach(var ${lite_deps_MLU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
if (args_SHARED OR ARGS_shared)
@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
@@ -285,12 +295,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -369,6 +379,12 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
return()
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL") if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL) if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -409,6 +425,7 @@ function(add_kernel TARGET device level) ...@@ -409,6 +425,7 @@ function(add_kernel TARGET device level)
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
...@@ -427,7 +444,7 @@ endif() ...@@ -427,7 +444,7 @@ endif()
function(add_operator TARGET level) function(add_operator TARGET level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -462,6 +479,7 @@ function(add_operator TARGET level) ...@@ -462,6 +479,7 @@ function(add_operator TARGET level)
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
......
@@ -8,7 +8,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
...
@@ -10,6 +10,7 @@ if (LITE_ON_TINY_PUBLISH)
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
@@ -66,7 +67,8 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
@@ -88,6 +90,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
@@ -125,7 +128,8 @@ lite_cc_library(light_api SRCS light_api.cc
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -144,6 +148,7 @@ if(WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -264,8 +269,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
NPU_DEPS ${npu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
BM_DEPS ${bm_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
@@ -292,6 +295,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -329,6 +333,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -342,6 +347,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
@@ -354,6 +360,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
@@ -366,6 +373,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
@@ -378,6 +386,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
@@ -389,6 +398,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
...
@@ -43,6 +43,16 @@ class LITE_API Predictor {
public:
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
~Predictor() {
#ifdef LITE_WITH_OPENCL
CLRuntime::Global()->ReleaseResources();
#endif
scope_.reset();
exec_scope_ = nullptr;
program_.reset();
input_names_.clear();
output_names_.clear();
}
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
...
@@ -42,6 +42,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
}
}
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
...
@@ -107,6 +107,8 @@ class LightPredictorImpl : public lite_api::PaddlePredictor {
public:
LightPredictorImpl() = default;
~LightPredictorImpl();
std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
...
@@ -21,6 +21,13 @@
namespace paddle {
namespace lite {
LightPredictorImpl::~LightPredictorImpl() {
raw_predictor_.reset();
#ifdef LITE_WITH_OPENCL
CLRuntime::Global()->ReleaseResources();
#endif
}
void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
// LightPredictor Only support NaiveBuffer backend in publish lib
if (config.lite_model_file().empty()) {
...
@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
@@ -203,6 +204,58 @@ void ConfigBase::set_threads(int threads) {
#endif
}
#ifdef LITE_WITH_MLU
void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
mlu_core_version_ = core_version;
}
void CxxConfig::set_mlu_core_number(int core_number) {
mlu_core_number_ = core_number;
}
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
return mlu_core_version_;
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please "
"rebuild it with LITE_WITH_XPU=ON.";
#endif
}
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no);
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
#endif
}
// set model data in combined format, `set_model_from_file` refers to loading
// model from file, set_model_from_buffer refers to loading model from memory
// buffer
...
@@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
#ifdef LITE_WITH_MLU
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
#endif
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -163,6 +171,37 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_;
}
#endif
#ifdef LITE_WITH_MLU
// set MLU core version, which is used when compiling MLU kernels
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimentional
// vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimentional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimentional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread.
void set_xpu_dev_per_thread(int dev_no = 0);
};
/// MobileConfig is the config for the light weight predictor, it will skip
...
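For orientation, a minimal usage sketch of the MLU options that this patch adds to CxxConfig (not part of the patch itself; the model path, mean/std values and the MLU_270/NHWC choices are illustrative assumptions):

#include <vector>
#include "lite/api/paddle_api.h"

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  config.set_valid_places({paddle::lite_api::Place{TARGET(kMLU), PRECISION(kFloat)},
                           paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}});
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(1);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // First-conv mode takes uint8 input plus per-channel mean/std (3 values each);
  // the numbers below are placeholders, not recommended values.
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.0f, 117.0f, 104.0f});
  config.set_mlu_first_conv_std({59.0f, 57.0f, 57.0f});
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);
  return predictor ? 0 : 1;
}

When the library is built with LITE_WITH_MLU=ON, CxxPaddleApiImpl::Init (shown earlier in this diff) forwards these settings to DeviceInfo::Global().SetMLURunMode().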
@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
"fpga",
"npu",
"xpu",
"bm"};
"bm",
"mlu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA",
"kNPU",
"kXPU",
"kMLU",
"kBM"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kMLU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
...
@@ -53,8 +53,8 @@ enum class TargetType : int {
kNPU = 8,
kXPU = 9,
kBM = 10,
kAny = 6, // any target
kMLU = 11,
kAny = 6, // any target
NUM = 12, // number of fields.
};
enum class PrecisionType : int {
@@ -89,6 +89,8 @@ typedef enum {
LITE_POWER_RAND_LOW = 5
} PowerMode;
typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
enum class ActivationType : int {
kIndentity = 0,
kRelu = 1,
@@ -100,7 +102,9 @@ enum class ActivationType : int {
kSwish = 7,
kExp = 8,
kAbs = 9,
NUM = 10,
kHardSwish = 10,
kReciprocal = 11,
NUM = 12,
};
static size_t PrecisionTypeLength(PrecisionType type) {
...
@@ -45,6 +45,10 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(assign_value_eliminate_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
@@ -47,6 +47,7 @@ using lite_api::TargetType;
using lite_api::PrecisionType;
using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m);
static void BindLitePowerMode(py::module *m);
static void BindLitePlace(py::module *m);
static void BindLiteTensor(py::module *m);
static void BindLiteMLUCoreVersion(py::module *m);
void BindLiteApi(py::module *m) {
BindLiteCxxConfig(m);
@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) {
BindLitePowerMode(m);
BindLitePlace(m);
BindLiteTensor(m);
BindLiteMLUCoreVersion(m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteCxxPredictor(m);
#endif
@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) {
.def("set_power_mode", &CxxConfig::set_power_mode)
.def("power_mode", &CxxConfig::power_mode);
#endif
#ifdef LITE_WITH_MLU
cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
.def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
.def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
.def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
.def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
.def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
#endif
}
// TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) {
.value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
}
void BindLiteMLUCoreVersion(py::module *m) {
py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion")
.value("LITE_MLU_220", MLUCoreVersion::MLU_220)
.value("LITE_MLU_270", MLUCoreVersion::MLU_270);
}
void BindLitePlace(py::module *m) {
// TargetType
py::enum_<TargetType>(*m, "TargetType")
@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) {
.value("OpenCL", TargetType::kOpenCL)
.value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU)
.value("Any", TargetType::kAny);
// PrecisionType
@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) {
DO_GETTER_ONCE(data_type__, name__##_data)
DATA_GETTER_SETTER_ONCE(int8_t, int8);
#ifdef LITE_WITH_MLU
tensor.def("set_uint8_data",
[](Tensor &self,
const std::vector<uint8_t> &data,
TargetType type = TargetType::kHost) {
if (type == TargetType::kHost) {
self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
}
},
py::arg("data"),
py::arg("type") = TargetType::kHost);
DO_GETTER_ONCE(uint8_t, "uint8_data");
#endif
DATA_GETTER_SETTER_ONCE(int32_t, int32);
DATA_GETTER_SETTER_ONCE(float, float);
#undef DO_GETTER_ONCE
...
@@ -6,4 +6,5 @@ add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/backends/arm/math/activation.h"
#include <algorithm>
#include <string>
#include "lite/backends/arm/math/funcs.h"
@@ -711,6 +712,38 @@ void act_square<float>(const float* din, float* dout, int size, int threads) {
}
}
template <>
void act_hard_swish<float>(const float* din,
float* dout,
int size,
float threshold,
float scale,
float offset,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) *
ptr_in[0] / scale;
ptr_in++;
ptr_out++;
}
}
template <>
void act_reciprocal<float>(const float* din,
float* dout,
int size,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = 1.0 / ptr_in[0];
ptr_in++;
ptr_out++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
...
@@ -72,6 +72,17 @@ void act_rsqrt(const T* din, T* dout, int size, int threads);
template <typename T>
void act_square(const T* din, T* dout, int size, int threads);
template <typename T>
void act_hard_swish(const T* din,
T* dout,
int size,
float threshold,
float scale,
float offset,
int threads);
template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
...
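The two new activations follow the scalar loops shown in activation.cc above. A standalone sketch of the same math (hypothetical helper functions, not the ARM math entry points; the threshold/scale/offset values are the commonly used hard-swish defaults, assumed here for illustration):

#include <algorithm>
#include <cstdio>

// Mirrors act_hard_swish<float>: min(max(0, x + offset), threshold) * x / scale.
float hard_swish(float x, float threshold, float scale, float offset) {
  return std::min(std::max(0.f, x + offset), threshold) * x / scale;
}

// Mirrors act_reciprocal<float>: 1 / x.
float reciprocal(float x) { return 1.0f / x; }

int main() {
  // With threshold = 6, scale = 6, offset = 3 this is x * relu6(x + 3) / 6.
  std::printf("%f\n", hard_swish(1.0f, 6.f, 6.f, 3.f));  // prints 0.666667
  std::printf("%f\n", reciprocal(4.0f));                  // prints 0.250000
  return 0;
}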
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void decode_center_size(__read_only image2d_t prior_box_image,
__read_only image2d_t prior_box_var_image,
__read_only image2d_t target_box_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_h = out_nh % out_H;
const int out_n = 1;
const int prior_box_n = 1;
const int prior_box_c = 0;
const int prior_box_h = out_h;
const int prior_box_var_n = 1;
const int prior_box_var_c = 0;
const int prior_box_var_h = out_h;
const int target_box_n = 1;
const int target_box_c = out_c;
const int target_box_h = out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 prior_box_pos;
int2 prior_box_var_pos;
int2 target_box_pos;
int2 output_pos;
prior_box_pos.x = prior_box_c * 4;
prior_box_pos.y = prior_box_n * prior_box_h;
prior_box_var_pos.x = prior_box_var_c * 4;
prior_box_var_pos.y = prior_box_var_n * prior_box_var_h;
target_box_pos.x = target_box_c * 4;
target_box_pos.y = target_box_n * target_box_h;
output_pos.x = out_c * 4;
output_pos.y = out_n * out_h;
CL_DTYPE4 prior_box_input[4];
CL_DTYPE4 prior_box_var_input[4];
CL_DTYPE4 target_box_input[4];
prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 0, prior_box_pos.y));
prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 1, prior_box_pos.y));
prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 2, prior_box_pos.y));
prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 3, prior_box_pos.y));
prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y));
prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y));
prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y));
prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y));
target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 0,target_box_pos.y));
target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 1, target_box_pos.y));
target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 2, target_box_pos.y));
target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 3, target_box_pos.y));
CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x;
CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x;
CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2;
CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2;
CL_DTYPE4 target_box_center_x;
CL_DTYPE4 target_box_center_y;
CL_DTYPE4 target_box_width;
CL_DTYPE4 target_box_height;
CL_DTYPE4 output[4];
output[0] = 0.0f;
output[1] = 0.0f;
output[2] = 0.0f;
output[3] = 0.0f;
target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x;
target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y;
target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width;
target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height;
output[0].x = target_box_center_x.x - target_box_width.x/(half)2;
output[1].x = target_box_center_y.x - target_box_height.x/(half)2;
output[2].x = target_box_center_x.x + target_box_width.x/(half)2;
output[3].x = target_box_center_y.x + target_box_height.x/(half)2;
if(out_C - out_c * 4 >= 2){
target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x;
target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y;
target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width;
target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height;
output[0].y = target_box_center_x.y - target_box_width.y/(half)2;
output[1].y = target_box_center_y.y - target_box_height.y/(half)2;
output[2].y = target_box_center_x.y + target_box_width.y/(half)2;
output[3].y = target_box_center_y.y + target_box_height.y/(half)2;
}
if(out_C - out_c * 4 >= 3){
target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x;
target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y;
target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width;
target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height;
output[0].z = target_box_center_x.z - target_box_width.z/(half)2;
output[1].z = target_box_center_y.z - target_box_height.z/(half)2;
output[2].z = target_box_center_x.z + target_box_width.z/(half)2;
output[3].z = target_box_center_y.z + target_box_height.z/(half)2;
}
if(out_C - out_c * 4 >= 4){
target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x;
target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y;
target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width;
target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height;
output[0].w = target_box_center_x.w - target_box_width.w/(half)2;
output[1].w = target_box_center_y.w - target_box_height.w/(half)2;
output[2].w = target_box_center_x.w + target_box_width.w/(half)2;
output[3].w = target_box_center_y.w + target_box_height.w/(half)2;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
}
@@ -29,30 +29,38 @@ CLRuntime* CLRuntime::Global() {
}
CLRuntime::~CLRuntime() {
LOG(INFO) << "CLRuntime::~CLRuntime()";
// Note: do ReleaseResources() in predictor
command_queue_&& clReleaseCommandQueue(command_queue_->get());
command_queue_.reset();
context_&& clReleaseContext(context_->get());
context_.reset();
device_.reset();
platform_.reset();
initialized_ = false;
}
void CLRuntime::ReleaseResources() {
// if (is_resources_released_) {
// return;
// }
if (command_queue_ != nullptr) {
command_queue_->flush();
command_queue_->finish();
}
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
clReleaseKernel(kernels_[kidx]->get());
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto& p : programs_) {
clReleaseProgram(p.second->get());
}
programs_.clear();
LOG(INFO) << "release resources finished.";
// For controlling the destruction order
is_resources_released_ = true;
command_queue_&& clReleaseCommandQueue(command_queue_->get());
command_queue_.reset();
context_&& clReleaseContext(context_->get());
context_.reset();
device_.reset();
platform_.reset();
}
bool CLRuntime::Init() {
...
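The intent of the split above is that kernels, programs and the queue flush are now released through ReleaseResources() while a predictor still holds a live context, and the CLRuntime destructor that runs at process exit only drops the remaining queue/context/device handles. A generic, self-contained analogue of that pattern (illustrative only; these are not the Paddle-Lite classes):

#include <cstdio>

class Runtime {  // stand-in for a process-lifetime singleton such as CLRuntime
 public:
  static Runtime* Global() {
    static Runtime instance;
    return &instance;
  }
  void ReleaseResources() {
    if (released_) return;  // safe to call more than once
    std::printf("release kernels/programs while the device context is alive\n");
    released_ = true;
  }
  ~Runtime() {
    // Runs at exit, after every client is gone: only drop the raw handles here.
    std::printf("destructor: release remaining queue/context/device handles\n");
  }

 private:
  bool released_ = false;
};

int main() {
  // A predictor's destructor would call this explicitly, as ~Predictor() does above.
  Runtime::Global()->ReleaseResources();
  return 0;
}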
@@ -33,6 +33,8 @@ class CLRuntime {
public:
static CLRuntime* Global();
void ReleaseResources();
bool Init();
cl::Platform& platform();
@@ -116,6 +118,8 @@ class CLRuntime {
bool initialized_{false};
bool is_init_success_{false};
bool is_resources_released_{false};
};
} // namespace lite
...
@@ -96,8 +96,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// : nullptr;
// fill in data
std::vector<size_t> low_level;
std::vector<uint64_t> low_level;
size_t low_offset = 0;
uint64_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
...
@@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* pre_scores) {
// lod
paddle::framework::LoD lod;
std::vector<size_t> level0({0, 2, 4});
std::vector<uint64_t> level0({0, 2, 4});
std::vector<size_t> level1({0, 1, 2, 3, 4});
std::vector<uint64_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
...
@@ -483,7 +483,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>());
mat_out->template mutable_data<T>());
}
template <>
@@ -759,7 +759,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>());
mat_out->template mutable_data<T>());
} else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
@@ -773,7 +773,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>(),
mat_out->template mutable_data<T>(),
dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
dim_a.stride_,
dim_b.stride_);
...
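The `->template mutable_data<T>()` edits here and in the following files add the disambiguator that C++ requires when a member template is named through a dependent expression inside a template; presumably some of the newly supported build configurations tripped over it. A minimal standalone illustration of the rule (toy tensor type, not the Paddle-Lite Tensor):

// General dependent-name rule illustrated with a hypothetical tensor type.
struct ToyTensor {
  template <typename T>
  T* mutable_data() { return nullptr; }
};

template <typename TensorT, typename T>
T* RawBuffer(TensorT* t) {
  // 't' has a dependent type, so 'template' tells the compiler that
  // mutable_data names a member template; without it, '<' parses as less-than.
  return t->template mutable_data<T>();
}

int main() {
  ToyTensor tensor;
  return RawBuffer<ToyTensor, float>(&tensor) == nullptr ? 0 : 1;
}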
@@ -51,7 +51,7 @@ class ConcatFunctor<lite::TargetType::kX86, T> {
// auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
auto output_data = output->mutable_data<T>();
auto output_data = output->template mutable_data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
@@ -108,7 +108,7 @@ class SplitFunctor<lite::TargetType::kX86, T> {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->mutable_data<T>() + k * col_len;
T* dst_ptr = out_tensor->template mutable_data<T>() + k * col_len;
std::copy_n(src_ptr + col_idx, col_len, dst_ptr);
// memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
// sizeof(T) * col_len);
...
@@ -50,8 +50,8 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
.reshape(batch_axis_remain)
.sum(Eigen::DSizes<int, 1>(1)));
} else {
const T* prob_data = prob->data<T>();
const T* prob_data = prob->template data<T>();
T* loss_data = out->mutable_data<T>();
T* loss_data = out->template mutable_data<T>();
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) {
...
@@ -99,7 +99,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
int channels_col = im_channels * filter_height * filter_width;
T* im_data = im->mutable_data<T>();
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
@@ -161,7 +161,7 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
int col_width = col->dims()[1];
const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>();
T* col_data = col->template mutable_data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
@@ -235,7 +235,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
T* im_data = im->mutable_data<T>();
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
...
...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, ...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im,
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height; int h_offset = (c / filter_width) % filter_height;
...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, ...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im,
int output_width = col->dims()[4]; int output_width = col->dims()[4];
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width; int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width; size_t copy_size = sizeof(T) * output_width;
...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, ...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im,
constexpr int prw = 1; constexpr int prw = 1;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int im_size = im_height * im_width; int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
......
...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU { ...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU {
: tensor_(tensor), value_(value) {} : tensor_(tensor), value_(value) {}
template <typename T> template <typename T>
void apply() const { void apply() const {
auto* begin = tensor_->mutable_data<T>(lite::TargetType::kX86); auto* begin = tensor_->template mutable_data<T>(lite::TargetType::kX86);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_)); std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
} }
lite::Tensor* tensor_; lite::Tensor* tensor_;
...@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>(); const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] = output_data[i * in_dims[0] + j] =
......
...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> { ...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
auto inv_size = 1.0 / size; auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> { ...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
......
...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> { ...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> {
// c_size means the output size of each sample // c_size means the output size of each sample
int c_size = fea_size * output_channels; int c_size = fea_size * output_channels;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int new_bindex = c_size * i; int new_bindex = c_size * i;
...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> { ...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int blen = fea_size * output_channels * i; int blen = fea_size * output_channels * i;
......
...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int input_stride = input_height * input_width; const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width; const int output_stride = output_height * output_width;
const T* input_data = input->data<T>(); const T* input_data = input->template data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> { ...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int output_stride = output_depth * output_height * output_width; const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> { ...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -58,11 +58,11 @@ class SampleWithProb { ...@@ -58,11 +58,11 @@ class SampleWithProb {
const int64_t* label_data = L->data<int64_t>(); const int64_t* label_data = L->data<int64_t>();
// int64_t* samples_data = // int64_t* samples_data =
// S->mutable_data<int64_t>(ret_dim, Target); // S->mutable_data<int64_t>(ret_dim, Target);
// T* probabilities_data = P->mutable_data<T>(ret_dim, Target); // T* probabilities_data = P->template mutable_data<T>(ret_dim, Target);
S->Resize({batch_size, num_sampled_classes}); S->Resize({batch_size, num_sampled_classes});
auto* samples_data = S->mutable_data<int64_t>(Target); auto* samples_data = S->mutable_data<int64_t>(Target);
P->Resize({batch_size, num_sampled_classes}); P->Resize({batch_size, num_sampled_classes});
auto* probabilities_data = P->mutable_data<T>(Target); auto* probabilities_data = P->template mutable_data<T>(Target);
// temp sets for unique sampling // temp sets for unique sampling
std::unordered_set<int64_t> tmp_samples; std::unordered_set<int64_t> tmp_samples;
......
...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> { ...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size})); lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
const auto bottom_data = bottom.data<T>(); const auto bottom_data = bottom.data<T>();
auto top_data = top->mutable_data<T>(lite::TargetType::kX86); auto top_data = top->template mutable_data<T>(lite::TargetType::kX86);
const auto weights = w.data<T>(); const auto weights = w.data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
call_gemm<lite::X86Context, T>(blas, call_gemm<lite::X86Context, T>(blas,
......
...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> { ...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->mutable_data<T>(); auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
std::copy_n(in1_data, in1_value.numel(), out_data); std::copy_n(in1_data, in1_value.numel(), out_data);
...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> { ...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
functor(context, output, 0.0); functor(context, output, 0.0);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> { ...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->mutable_data<T>(); auto* in2_data = in2_value->template mutable_data<T>();
std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
} }
}; };
...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> { ...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
input2->set_rows(in2_rows); input2->set_rows(in2_rows);
auto* in2_value = input2->mutable_value(); auto* in2_value = input2->mutable_value();
T* in2_data = in2_value->mutable_data<T>(); T* in2_data = in2_value->template mutable_data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
size_t offset = 0u; size_t offset = 0u;
for (size_t i = 0u; i != input1.size(); ++i) { for (size_t i = 0u; i != input1.size(); ++i) {
...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> { ...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->mutable_data<T>(); auto* input2_data = input2->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>( lite::DDim dims(std::vector<int64_t>(
{static_cast<int64_t>(merged_row_set.size()), input_width})); {static_cast<int64_t>(merged_row_set.size()), input_width}));
out.mutable_value()->Resize(dims); out.mutable_value()->Resize(dims);
auto* out_data = out.mutable_value()->mutable_data<T>(); auto* out_data = out.mutable_value()->template mutable_data<T>();
if (merged_row_set.size() == row_num && !sorted_result) { if (merged_row_set.size() == row_num && !sorted_result) {
// no duplicated ids, just concat the result together // no duplicated ids, just concat the result together
...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> { ...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->template data<T>();
// FIXME(typhoonzero): use macro fix the below messy code. // FIXME(typhoonzero): use macro fix the below messy code.
switch (op) { switch (op) {
......
...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
public: public:
void operator()(const lite::Context<lite::TargetType::kX86>& context, void operator()(const lite::Context<lite::TargetType::kX86>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index) { bool is_src_index) {
const size_t* index = index_lod.data(); const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims(); const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims(); const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
auto height = dst_dims[0]; auto height = dst_dims[0];
auto width = dst_dims[1]; auto width = dst_dims[1];
auto* src_data = src.data<T>(); auto* src_data = src.data<T>();
auto* dst_data = dst->mutable_data<T>(); auto* dst_data = dst->template mutable_data<T>();
const int sz = width * sizeof(T); const int sz = width * sizeof(T);
if (is_src_index) { if (is_src_index) {
for (int i = 0; i < height; ++i) { for (int i = 0; i < height; ++i) {
......
...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { ...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor {
// The indexed rows are based on the input index. // The indexed rows are based on the input index.
void operator()(const lite::Context<Target>& context, void operator()(const lite::Context<Target>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index); bool is_src_index);
}; };
...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { ...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor {
// batch_lods[2] is the sort order for the input LoDTensor. // batch_lods[2] is the sort order for the input LoDTensor.
batch_lods->at(2).resize(seq_info.size()); batch_lods->at(2).resize(seq_info.size());
size_t* batch_starts = batch_lods->at(0).data(); auto* batch_starts = batch_lods->at(0).data();
size_t* seq2batch_idx = batch_lods->at(1).data(); auto* seq2batch_idx = batch_lods->at(1).data();
batch_starts[0] = 0; batch_starts[0] = 0;
for (int n = 0; n < max_seqlen; n++) { for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]); auto batch_id = static_cast<int>(batch_starts[n]);
...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { ...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor {
} }
batch_starts[n + 1] = static_cast<size_t>(batch_id); batch_starts[n + 1] = static_cast<size_t>(batch_id);
} }
size_t* seq_order = batch_lods->at(2).data(); auto* seq_order = batch_lods->at(2).data();
for (size_t i = 0; i < seq_info.size(); ++i) { for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx; seq_order[i] = seq_info[i].seq_idx;
} }
......
...@@ -22,15 +22,15 @@ namespace math { ...@@ -22,15 +22,15 @@ namespace math {
template <typename T> template <typename T>
void CopyValidData(lite::Tensor* dst_tensor, void CopyValidData(lite::Tensor* dst_tensor,
const lite::Tensor* src_tensor, const lite::Tensor* src_tensor,
const std::vector<size_t>& seq_offsets, const std::vector<uint64_t>& seq_offsets,
int pad_seq_len, int pad_seq_len,
int step_width, int step_width,
bool norm_by_len, bool norm_by_len,
CopyType type, CopyType type,
PadLayout layout) { PadLayout layout) {
int seq_num = seq_offsets.size() - 1; int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>(); const T* src_data = src_tensor->template data<T>();
T* dst_data = dst_tensor->mutable_data<T>(); T* dst_data = dst_tensor->template mutable_data<T>();
int seq_cpy_gap = step_width; int seq_cpy_gap = step_width;
int pad_cpy_gap = int pad_cpy_gap =
...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
"'step_width'."); "'step_width'.");
// fill padding value // fill padding value
T* pad_data = pad_tensor->mutable_data<T>(); T* pad_data = pad_tensor->template mutable_data<T>();
const T* pad_value_data = pad_value.data<T>(); const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) { if (pad_value.numel() == 1) {
fast_mem_init<T>( fast_mem_init<T>(
......
...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; ...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
enum CopyType { kSeqToPad, kPadToSeq }; enum CopyType { kSeqToPad, kPadToSeq };
inline static size_t MaximumSequenceLength( inline static uint64_t MaximumSequenceLength(
const std::vector<size_t>& seq_offset) { const std::vector<uint64_t>& seq_offset) {
size_t seq_num = seq_offset.size() - 1; uint64_t seq_num = seq_offset.size() - 1;
size_t max_seq_len = 0; uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) { for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
} }
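// Illustrative example (not part of the patch): seq_offset = {0, 2, 7, 10}
// gives sequence lengths {2, 5, 3}, so MaximumSequenceLength returns 5.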
...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( ...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength(
inline static void CheckDims(const lite::DDim& seq_tensor_dims, inline static void CheckDims(const lite::DDim& seq_tensor_dims,
const lite::DDim& pad_tensor_dims, const lite::DDim& pad_tensor_dims,
const std::vector<size_t>& seq_offset, const std::vector<uint64_t>& seq_offset,
int64_t padded_seq_len, int64_t padded_seq_len,
int64_t step_width, int64_t step_width,
const PadLayout& layout) { const PadLayout& layout) {
......
...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { ...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int* max_index = index->mutable_data<int>(); int* max_index = index->mutable_data<int>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq; int64_t dim = output->numel() / num_seq;
...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { ...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor {
const T* og_data = out_grad.data<T>(); const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>(); const int* max_index = index.data<int>();
T* ig_data = in_grad->mutable_data<T>(); T* ig_data = in_grad->template mutable_data<T>();
SetConstant<TARGET(kX86), T> set_zero; SetConstant<TARGET(kX86), T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0)); set_zero(context, in_grad, static_cast<T>(0.0));
...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor { ...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { ...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { ...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor {
int64_t in_w = in_grad->numel() / in_grad->dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w); PADDLE_ENFORCE(in_w == out_w);
const T* out_g_data = out_grad.data<T>(); const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->mutable_data<T>(TARGET(kX86)); T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context); auto blas = math::GetBlas<TARGET(kX86), T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]); int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> { ...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
auto lod = input.lod()[0]; auto lod = input.lod()[0];
if (pooltype == "SUM") { if (pooltype == "SUM") {
const T* src = input.data<T>(); const T* src = input.data<T>();
T* dst = output->mutable_data<T>(TARGET(kX86)); T* dst = output->template mutable_data<T>(TARGET(kX86));
jit::seq_pool_attr_t attr( jit::seq_pool_attr_t attr(
static_cast<int>(input.numel() / input.dims()[0]), static_cast<int>(input.numel() / input.dims()[0]),
jit::SeqPoolType::kSum); jit::SeqPoolType::kSum);
......
...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
TEST(SequencePoolingGrad, CPU_SUM) { TEST(SequencePoolingGrad, CPU_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod2); float>(lod2);
...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { ...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(SequencePoolingGrad, CUDA_SUM) { TEST(SequencePoolingGrad, CUDA_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod2); float>(lod2);
......
...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> {
size_t seq_width = seq->dims()[1]; size_t seq_width = seq->dims()[1];
lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(lite::TargetType::kX86); T* seq_data = seq->template mutable_data<T>(lite::TargetType::kX86);
for (size_t i = 0; i < num_seq; ++i) { for (size_t i = 0; i < num_seq; ++i) {
for (size_t j = lod[level][i] * seq_width; for (size_t j = lod[level][i] * seq_width;
j < lod[level][i + 1] * seq_width; j < lod[level][i + 1] * seq_width;
......
...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86); auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86);
int offset = 0; int offset = 0;
std::vector<size_t> vec_out_lod; std::vector<uint64_t> vec_out_lod;
vec_out_lod.reserve(batch_size + 1); vec_out_lod.reserve(batch_size + 1);
for (int i = 0; i <= batch_size; ++i) { for (int i = 0; i <= batch_size; ++i) {
offset = row_lod[i]; offset = row_lod[i];
...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
out->set_lod(lod_temp); out->set_lod(lod_temp);
auto in_data = in.data<T>(); auto in_data = in.data<T>();
auto out_data = out->mutable_data<T>(lite::TargetType::kX86); auto out_data = out->template mutable_data<T>(lite::TargetType::kX86);
T* sum_data = new T[max_k]; T* sum_data = new T[max_k];
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
......
...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> { ...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* in_data = X->data<T>(); const T* in_data = X->template data<T>();
auto* out_data = Y->mutable_data<T>(); auto* out_data = Y->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes); T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1); max_val *= static_cast<T>(-1);
...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> { ...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* out_data = y->data<T>(); const T* out_data = y->template data<T>();
const T* out_grad = y_grad->data<T>(); const T* out_grad = y_grad->template data<T>();
T* in_grad = x_grad->mutable_data<T>(); T* in_grad = x_grad->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T scalar; T scalar;
vec_mul_reduce<T, lite::x86::avx>( vec_mul_reduce<T, lite::x86::avx>(
......
...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> { ...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> {
patch_size = processing_list.size(); patch_size = processing_list.size();
// T *patch_data = // T *patch_data =
// patch->mutable_data<T>({static_cast<int64_t>(patch_size), // patch->template mutable_data<T>({static_cast<int64_t>(patch_size),
// static_cast<int64_t>(patch_elem_size)}, // static_cast<int64_t>(patch_elem_size)},
// cpu_place); // cpu_place);
patch->Resize({static_cast<int64_t>(patch_size), patch->Resize({static_cast<int64_t>(patch_size),
static_cast<int64_t>(patch_elem_size)}); static_cast<int64_t>(patch_elem_size)});
auto *patch_data = patch->mutable_data<T>(lite::TargetType::kX86); auto *patch_data = patch->template mutable_data<T>(lite::TargetType::kX86);
constant(context, patch, 0); constant(context, patch, 0);
const T *features = node_features.data<T>(); const T *features = node_features.data<T>();
...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> { ...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> {
} }
} }
// T *grad_data = // T *grad_data =
// in_grad->mutable_data<T>({static_cast<int64_t>(node_count), // in_grad->template mutable_data<T>({static_cast<int64_t>(node_count),
// static_cast<int64_t>(grad_elem_size)}, // static_cast<int64_t>(grad_elem_size)},
// cpu_place); // cpu_place);
in_grad->Resize({static_cast<int64_t>(node_count), in_grad->Resize({static_cast<int64_t>(node_count),
static_cast<int64_t>(grad_elem_size)}); static_cast<int64_t>(grad_elem_size)});
auto *grad_data = in_grad->mutable_data<T>(lite::TargetType::kX86); auto *grad_data = in_grad->template mutable_data<T>(lite::TargetType::kX86);
constant(context, in_grad, 0); constant(context, in_grad, 0);
const T *out_g = out_grad.data<T>(); const T *out_g = out_grad.data<T>();
......
...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> { ...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) { for (int i = 0; i < input_feasize; ++i) {
...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> { ...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> { ...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> {
"mismatching."); "mismatching.");
const T* vol_data = vol.data<T>(); const T* vol_data = vol.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> { ...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> {
output_width, output_width,
"input_width and output_width are " "input_width and output_width are "
"mismatching."); "mismatching.");
T* vol_data = vol->mutable_data<T>(); T* vol_data = vol->template mutable_data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
......
...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) ...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
return() return()
endif() endif()
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstdlib>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
namespace math {
static inline long round_half_to_even(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
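// Illustrative values (not part of this file): round_half_to_even(2.5f) == 2
// and round_half_to_even(3.5f) == 4, while round_half_to_even(2.3f) == 2;
// ties are broken toward the nearest even integer (banker's rounding).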
static float ieee_compliance_0(float f) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
}
}
template <typename T, int RMAX>
static inline T fp32_to_intx(const float f, float max) {
max = ieee_compliance_0(max);
float input = ieee_compliance_0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = ieee_compliance_0(tmp);
  // early check to avoid INF or big values getting into the converter function.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)round_half_to_even(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
static inline int16_t fp32_to_int16(const float f, float max) {
int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
return v1;
}
static inline int ConvertFP32ToInt16(const void *input,
void *output,
float max_val,
int len) {
for (int i = 0; i < len; i++) {
static_cast<int16_t *>(output)[i] =
fp32_to_int16(static_cast<const float *>(input)[i], max_val);
}
return 0;
}
static inline float FindMaxAbs(const float *data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
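// Illustrative sketch only (not part of this file): a typical int16
// quantization path chains the two helpers above; `weights` and `len` are
// hypothetical caller-owned values.
//
//   float max_abs = FindMaxAbs(weights, len);
//   std::vector<int16_t> quantized(len);
//   ConvertFP32ToInt16(weights, quantized.data(), max_abs, len);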
template <typename T>
static inline void Transpose(const T *in, T *out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
/**
* Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
* original x_dim is returned.
*/
static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return lite::DDim({1, x_dim[0]});
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return lite::DDim({y_dim[0], 1});
}
/**
* Matrix Descriptor of a memory buffer.
*
 * It is used for Blas::MatMul. The MatMul operator can be batched:
 * if Mat A is [BatchSize, H, W] and Mat B is [BatchSize, H, W], it runs
 * `batch_size` GEMMs. The batched GEMM could be faster based on the
 * implementation of the BLAS library. The batch size could be zero. If any
 * matrix of `matmul` has a batch size, there will be a batched GEMM, too,
 * e.g., if Mat A is [BatchSize, H1, W1] and Mat B is [H2, W2], the result
 * matrix will be [BatchSize, H1, W2].
 *
 * The boolean flag, `trans`, describes whether the memory is the transpose of
 * the matrix or not. If `trans` is true, the last two dims of the matrix are
 * transposed, and the memory layout of the matrix is [Width, Height] or
 * [BatchSize, Width, Height].
 *
 * The MatDescriptor is not only the dimension or shape of a matrix, it also
 * contains the layout and stride of the matrix. It is clearer to have a
 * structure than to reuse `DDim`.
*/
struct MatDescriptor {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (tensor_dim.size() == 2) {
retv.height_ = tensor_dim[0];
retv.width_ = tensor_dim[1];
} else {
auto dim_vec = tensor_dim.Vectorize();
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
}
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
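// Illustrative example (not part of this file): for a tensor of shape
// [4, 8, 16] with num_flatten_cols == 1 and trans == false, the descriptor
// comes out as batch_size_ = 4, height_ = 8, width_ = 16, stride_ = 128.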
} // namespace math
} // namespace xpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
} // namespace lite
} // namespace paddle
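// Illustrative sketch only (not part of this file): round-tripping a host
// buffer through XPU device memory with the wrapper declared above;
// `host_src`, `host_dst` and `size` are hypothetical caller-owned values.
//
//   void* dev = paddle::lite::TargetWrapperXPU::Malloc(size);
//   paddle::lite::TargetWrapperXPU::MemcpySync(dev, host_src, size,
//                                              paddle::lite::IoDirection::HtoD);
//   paddle::lite::TargetWrapperXPU::MemcpySync(host_dst, dev, size,
//                                              paddle::lite::IoDirection::DtoH);
//   paddle::lite::TargetWrapperXPU::Free(dev);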
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma GCC system_header
#include <xpu/api.h>
#include <xpu/golden.h>
#include <xpu/runtime.h>
#if defined(LITE_WITH_XTCL)
#include <xtcl/xtcl.h>
#endif
namespace paddle {
namespace lite {
namespace xdnn = baidu::xpu::api;
} // namespace lite
} // namespace paddle
...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc ...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place DEPS target_wrapper_host place
X86_DEPS target_wrapper_x86 X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
CL_DEPS cl_target_wrapper CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm) BM_DEPS target_wrapper_bm
MLU_DEPS target_wrapper_mlu)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif() endif()
...@@ -15,5 +15,11 @@ ...@@ -15,5 +15,11 @@
#include "lite/core/context.h" #include "lite/core/context.h"
namespace paddle { namespace paddle {
namespace lite {} // namespace lite namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
#endif
} // namespace lite
} // namespace paddle } // namespace paddle
...@@ -24,6 +24,14 @@ ...@@ -24,6 +24,14 @@
#include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#endif #endif
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/xpu_header_sitter.h"
#endif
#include <map> #include <map>
#include <memory> #include <memory>
...@@ -103,11 +111,38 @@ class Context<TargetType::kXPU> { ...@@ -103,11 +111,38 @@ class Context<TargetType::kXPU> {
public: public:
Context() {} Context() {}
explicit Context(const XPUContext& ctx); explicit Context(const XPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {} void InitOnce() {}
void CopySharedTo(XPUContext* ctx) {} void CopySharedTo(XPUContext* ctx) {}
static xdnn::Context* GetRawContext() {
if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx);
}
return _tls_raw_ctx;
}
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
}
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
}
xpu_set_device(dev_no);
}
std::string name() const { return "XPUContext"; } std::string name() const { return "XPUContext"; }
private:
static thread_local xdnn::Context* _tls_raw_ctx;
}; };
#endif #endif
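// Illustrative sketch only (assumption, not part of this patch): kernels can
// share the lazily-created, thread-local xdnn context introduced above.
//
//   xdnn::Context* raw = paddle::lite::XPUContext::GetRawContext();
//   paddle::lite::XPUContext::SetWorkspaceL3Size();  // defaults to 0xfffc00
//   paddle::lite::XPUContext::SetDev(0);             // or set LITE_XPU_DEV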
...@@ -172,6 +207,85 @@ class Context<TargetType::kFPGA> { ...@@ -172,6 +207,85 @@ class Context<TargetType::kFPGA> {
}; };
#endif #endif
#ifdef LITE_WITH_MLU
template <>
class Context<TargetType::kMLU> {
public:
typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
void InitOnce() {}
MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
return *this;
}
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
SetMluDevice(device_id_);
if (io_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "data queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
io_queue_id = 0;
}
if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
exec_queue_id = 0;
}
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_queue_id_ = exec_queue_id;
io_queue_id_ = io_queue_id;
}
void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
const cnrtQueue_t& exec_queue() const { return exec_queue_; }
void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
const cnrtQueue_t& io_queue() const { return io_queue_; }
void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cnmlCoreVersion_t MLUCoreVersion() {
return DeviceInfo::Global().MLUCoreVersion();
}
int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
u32_t affinity() { return affinity_; }
cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
int device_id() { return device_id_; }
std::string name() const { return "MLUContext"; }
private:
int device_id_;
// overall information
int exec_queue_id_;
int io_queue_id_;
cnrtQueue_t io_queue_;
cnrtQueue_t exec_queue_;
std::vector<cnrtNotifier_t> input_notifiers_;
std::vector<cnrtNotifier_t> output_notifiers_;
cnrtInvokeFuncParam_t forward_param_;
u32_t affinity_ = 0x01;
};
#endif // LITE_WITH_MLU
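// Illustrative sketch only (assumption, not part of this patch): binding the
// new MLU context to device 0 and fetching its execution queue.
//
//   paddle::lite::MLUContext mlu_ctx;
//   mlu_ctx.Init(/*dev_id=*/0);               // default exec/io queue ids
//   cnrtQueue_t queue = mlu_ctx.exec_queue();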
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
// Only works with CUDA kernels. // Only works with CUDA kernels.
template <> template <>
...@@ -398,6 +512,16 @@ class ContextScheduler { ...@@ -398,6 +512,16 @@ class ContextScheduler {
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo( kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>()); &ctx->As<BMContext>());
break; break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
} break;
#endif #endif
default: default:
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
...@@ -439,6 +563,9 @@ class ContextScheduler { ...@@ -439,6 +563,9 @@ class ContextScheduler {
#endif #endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>(); InitContext<TargetType::kBM, BMContext>();
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif #endif
} }
......
...@@ -58,7 +58,7 @@ ...@@ -58,7 +58,7 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
thread_local lite_api::PowerMode DeviceInfo::mode_; thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_; thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_; thread_local int DeviceInfo::mem_size_;
...@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_; ...@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_; thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0; thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
thread_local bool DeviceInfo::use_first_conv_{false};
thread_local std::vector<float> DeviceInfo::mean_vec_;
thread_local std::vector<float> DeviceInfo::std_vec_;
thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
#endif
#ifdef TARGET_IOS #ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
...@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { ...@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() {
return 0; return 0;
} }
#ifdef LITE_WITH_MLU
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
use_first_conv_ = use_first_conv;
mean_vec_ = mean_vec;
std_vec_ = std_vec;
input_layout_ = input_layout;
}
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
#endif // LITE_WITH_MLU
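// Illustrative sketch only (assumption, not part of this patch): configuring
// the per-thread MLU run mode; the mean/std values below are placeholders.
//
//   paddle::lite::DeviceInfo::Global().SetMLURunMode(
//       lite_api::MLUCoreVersion::MLU_270, /*core_number=*/4,
//       /*use_first_conv=*/true,
//       /*mean_vec=*/{0.485f, 0.456f, 0.406f},
//       /*std_vec=*/{0.229f, 0.224f, 0.225f},
//       /*input_layout=*/DATALAYOUT(kNCHW));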
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP #ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_); thread_num = std::min(thread_num, core_num_);
...@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) { ...@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id) {
LOG(INFO) << "Set mlu device " << device_id;
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
}
void Device<TARGET(kMLU)>::Init() {
SetMluDevice(idx_);
GetInfo();
CreateQueue();
}
void Device<TARGET(kMLU)>::GetInfo() {}
void Device<TARGET(kMLU)>::CreateQueue() {
exec_queue_.clear();
io_queue_.clear();
for (size_t i = 0; i < max_queue_; ++i) {
cnrtQueue_t exec_queue;
cnrtQueue_t io_queue;
cnrtCreateQueue(&exec_queue);
cnrtCreateQueue(&io_queue);
exec_queue_.push_back(exec_queue);
io_queue_.push_back(io_queue);
cnrtCreateQueue(&exec_queue);
exec_queue_.push_back(exec_queue);
}
}
#endif // LITE_WITH_MLU
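For reference, a standalone sketch assembled only from the CNRT calls used above (device selection plus one queue); error handling beyond CNRT_CALL is omitted:

// Hypothetical sketch: bind the calling thread to MLU device 0 and create
// one execution queue, mirroring SetMluDevice() and CreateQueue() above.
cnrtInit(0);
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
cnrtQueue_t queue;
cnrtCreateQueue(&queue);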
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() { void Device<TARGET(kCUDA)>::Init() {
......
...@@ -19,11 +19,14 @@ ...@@ -19,11 +19,14 @@
#include <vector> #include <vector>
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/mlu_utils.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum { typedef enum {
kAPPLE = 0, kAPPLE = 0,
...@@ -52,6 +55,20 @@ class DeviceInfo { ...@@ -52,6 +55,20 @@ class DeviceInfo {
int Setup(); int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num); void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
bool UseFirstConv();
const std::vector<float>& MeanVec() const;
const std::vector<float>& StdVec() const;
DataLayoutType InputLayout() const;
#endif
void SetCache(int l1size, int l2size, int l3size); void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; } void SetArch(ARMArch arch) { arch_ = arch; }
...@@ -103,6 +120,15 @@ class DeviceInfo { ...@@ -103,6 +120,15 @@ class DeviceInfo {
static thread_local TensorLite workspace_; static thread_local TensorLite workspace_;
static thread_local int64_t count_; static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
#endif
void SetDotInfo(int argc, ...); void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...); void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...); void SetFP32Info(int argc, ...);
...@@ -134,6 +160,9 @@ class Env { ...@@ -134,6 +160,9 @@ class Env {
return *devs; return *devs;
} }
static void Init(int max_stream = 4) { static void Init(int max_stream = 4) {
#ifdef LITE_WITH_MLU
CNRT_CALL(cnrtInit(0));
#endif
Devs& devs = Global(); Devs& devs = Global();
if (devs.size() > 0) { if (devs.size() > 0) {
return; return;
...@@ -156,6 +185,41 @@ class Env { ...@@ -156,6 +185,41 @@ class Env {
} }
}; };
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id);
template <>
class Device<TARGET(kMLU)> {
public:
Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
void Init();
int id() { return idx_; }
int max_queue() { return max_queue_; }
void SetId(int idx) { idx_ = idx; }
std::string name() { return "MLU"; }
int core_num() { return 16; }
float max_memory() { return 16 * 1024; }
std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
private:
void CreateQueue();
void GetInfo();
private:
int idx_{0};
int max_queue_;
std::string device_name_;
float max_memory_;
std::vector<cnrtQueue_t> io_queue_;
std::vector<cnrtQueue_t> exec_queue_;
};
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
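A hedged usage sketch of the new specialization: Env<kMLU>::Init() calls cnrtInit(0) once and then fills the global device table, after which each Device<kMLU> exposes its own exec/io queue vectors. The Global() accessor and the element type of Devs are assumptions based on the template above:

// Hypothetical sketch, assuming Env<Target>::Global() returns the device
// list populated by Init().
paddle::lite::Env<TARGET(kMLU)>::Init(/*max_stream=*/1);
auto& mlu_devs = paddle::lite::Env<TARGET(kMLU)>::Global();
if (!mlu_devs.empty()) {
  auto queues = mlu_devs[0].exec_queues();  // one cnrtQueue_t per stream
}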
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
template <> template <>
class Device<TARGET(kCUDA)> { class Device<TARGET(kCUDA)> {
......
...@@ -83,6 +83,9 @@ class KernelBase { ...@@ -83,6 +83,9 @@ class KernelBase {
#if defined(LITE_WITH_CUDA) #if defined(LITE_WITH_CUDA)
WorkSpace::Global_CUDA().AllocReset(); WorkSpace::Global_CUDA().AllocReset();
#endif #endif
#if defined(LITE_WITH_MLU)
WorkSpace::Global_MLU().AllocReset();
#endif
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
......
...@@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) { ...@@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kBM)>::Malloc(size); data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
data = TargetWrapperXPU::Malloc(size);
break;
#endif // LITE_WITH_XPU
default: default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target); LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
} }
...@@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) { ...@@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) {
TargetWrapper<TARGET(kBM)>::Free(data); TargetWrapper<TARGET(kBM)>::Free(data);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::Free(data);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
TargetWrapperXPU::Free(data);
break;
#endif // LITE_WITH_XPU
default: default:
LOG(FATAL) << "Unknown type"; LOG(FATAL) << "Unknown type";
} }
...@@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { ...@@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD); TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::MemcpySync(
dst, src, size, IoDirection::HtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL: case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
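A hypothetical round trip through the helpers patched above; note that the MLU branch of TargetCopy() issues a host-to-device copy:

// Hypothetical sketch: allocate on the MLU, copy a host buffer in, then free.
const size_t bytes = 1024;
void* dev_buf = paddle::lite::TargetMalloc(TARGET(kMLU), bytes);
std::vector<char> host(bytes, 0);
paddle::lite::TargetCopy(TARGET(kMLU), dev_buf, host.data(), bytes);
paddle::lite::TargetFree(TARGET(kMLU), dev_buf);  // free_flag left at its assumed default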
...@@ -31,6 +31,14 @@ ...@@ -31,6 +31,14 @@
#include "lite/backends/bm/target_wrapper.h" #include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM #endif // LITE_WITH_BM
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif // LITE_WITH_XPU
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { ...@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
TargetWrapperCL::MemcpySync(dst, src, size, dir); TargetWrapperCL::MemcpySync(dst, src, size, dir);
break; break;
#endif // LITE_WITH_OPENCL #endif // LITE_WITH_OPENCL
#ifdef LITE_WITH_MLU
case TARGET(kMLU):
TargetWrapperMlu::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_FPGA #ifdef LITE_WITH_FPGA
case TARGET(kFPGA): case TARGET(kFPGA):
TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir); TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
...@@ -126,7 +139,7 @@ class Buffer { ...@@ -126,7 +139,7 @@ class Buffer {
const size_t img_h, const size_t img_h,
void* host_ptr = nullptr) { void* host_ptr = nullptr) {
if (target != target_ || cl_image2d_width_ < img_w || if (target != target_ || cl_image2d_width_ < img_w ||
cl_image2d_height_ < img_h) { cl_image2d_height_ < img_h || host_ptr != nullptr) {
CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
Free(); Free();
data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr); data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
......
...@@ -21,6 +21,8 @@ lite_cc_library(mir_passes ...@@ -21,6 +21,8 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/assign_value_eliminate_pass.cc elimination/assign_value_eliminate_pass.cc
...@@ -36,6 +38,7 @@ lite_cc_library(mir_passes ...@@ -36,6 +38,7 @@ lite_cc_library(mir_passes
demo_pass.cc demo_pass.cc
runtime_context_assign_pass.cc runtime_context_assign_pass.cc
memory_optimize_pass.cc memory_optimize_pass.cc
mlu_postprocess_pass.cc
weight_quantization_preprocess_pass.cc weight_quantization_preprocess_pass.cc
quantized_op_attributes_inference_pass.cc quantized_op_attributes_inference_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
...@@ -70,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op) ...@@ -70,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op)
if (WITH_TESTING) if (WITH_TESTING)
list(APPEND pattern_deps gtest) list(APPEND pattern_deps gtest)
endif() endif()
lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps})
lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher)
lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher)
# for mobile, unnecessary to compile the following testings. # for mobile, unnecessary to compile the following testings.
......
...@@ -27,8 +27,8 @@ ...@@ -27,8 +27,8 @@
#include "lite/utils/string.h" #include "lite/utils/string.h"
namespace paddle { namespace paddle {
namespace inference { namespace lite {
namespace analysis { namespace mir {
static size_t dot_node_counter{0}; static size_t dot_node_counter{0};
...@@ -162,6 +162,6 @@ class Dot { ...@@ -162,6 +162,6 @@ class Dot {
std::vector<Attr> attrs_; std::vector<Attr> attrs_;
}; };
} // namespace analysis } // namespace mir
} // namespace inference } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUSingleEncoderFuser : public FuseBase {
public:
explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu")
: act_type_(act_type) {}
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("mul", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* q_mul_y =
VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* q_mul = OpNode("q_mul", "mul");
auto* q_mul_out = VarNode("q_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* q_add_y = VarNode("q_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate();
auto* q_add_out = VarNode("q_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate();
auto* q_reshape2_out = VarNode("q_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* q_reshape2_xshape = VarNode("q_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate();
auto* q_transpose2_out = VarNode("q_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("scale", "X")
->AsIntermediate();
auto* q_transpose2_xshape =
VarNode("q_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* q_scale = OpNode("q_scale", "scale")->AsIntermediate();
auto* q_scale_out = VarNode("q_scale_out")
->assert_is_op_output("scale", "Out")
->assert_is_op_input("matmul", "X")
->AsIntermediate();
auto* k_mul_y =
VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate();
auto* k_mul_out = VarNode("k_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* k_add_y = VarNode("k_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate();
auto* k_add_out = VarNode("k_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate();
auto* k_reshape2_out = VarNode("k_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* k_reshape2_xshape = VarNode("k_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate();
auto* k_transpose2_out = VarNode("k_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("matmul", "Y")
->AsIntermediate();
auto* k_transpose2_xshape =
VarNode("k_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate();
auto* qk_matmul_out = VarNode("qk_matmul_out")
->assert_is_op_output("matmul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qk_mask = VarNode("qk_mask")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate();
auto* qk_add_out = VarNode("qk_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("softmax", "X")
->AsIntermediate();
auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate();
auto* qk_softmax_out = VarNode("qk_softmax_out")
->assert_is_op_output("softmax", "Out")
->AsIntermediate();
auto* qk_dropout = OpNode("qk_dropout", "dropout")->AsIntermediate();
auto* qk_dropout_out = VarNode("qk_dropout_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("matmul", "X")
->AsIntermediate();
auto* qk_dropout_mask = VarNode("qk_dropout_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* v_mul_y =
VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate();
auto* v_mul_out = VarNode("v_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* v_add_y = VarNode("v_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate();
auto* v_add_out = VarNode("v_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate();
auto* v_reshape2_out = VarNode("v_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* v_reshape2_xshape = VarNode("v_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate();
auto* v_transpose2_out = VarNode("v_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("matmul", "Y")
->AsIntermediate();
auto* v_transpose2_xshape =
VarNode("v_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate();
auto* qkv_matmul_out = VarNode("qkv_matmul_out")
->assert_is_op_output("matmul", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* qkv_transpose2 =
OpNode("qkv_transpose2", "transpose2")->AsIntermediate();
auto* qkv_transpose2_out = VarNode("qkv_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* qkv_transpose2_xshape =
VarNode("qkv_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qkv_reshape2 = OpNode("qkv_reshape2", "reshape2")->AsIntermediate();
auto* qkv_reshape2_out = VarNode("qkv_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("mul", "X")
->AsIntermediate();
auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* qkv_mul_y =
VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate();
auto* qkv_mul_out = VarNode("qkv_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_y = VarNode("qkv_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate();
auto* qkv_add_out = VarNode("qkv_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("dropout", "X")
->AsIntermediate();
auto* qkv_dropout = OpNode("qkv_dropout", "dropout")->AsIntermediate();
auto* qkv_dropout_out = VarNode("qkv_dropout_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_dropout_mask = VarNode("qkv_dropout_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate();
auto* qkv_add_2_out = VarNode("qkv_add_2_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("layer_norm", "X")
->AsIntermediate();
auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale")
->assert_is_op_input("layer_norm", "Scale")
->AsInput();
auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias")
->assert_is_op_input("layer_norm", "Bias")
->AsInput();
auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate();
auto* qkv_ln_2_out = VarNode("qkv_ln_2_out")
->assert_is_op_output("layer_norm", "Y")
->assert_is_op_input("mul", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean")
->assert_is_op_output("layer_norm", "Mean")
->AsIntermediate();
auto* qkv_ln_2_var = VarNode("qkv_ln_2_var")
->assert_is_op_output("layer_norm", "Variance")
->AsIntermediate();
auto* qkv_mul_3_y =
VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate();
auto* qkv_mul_3_out = VarNode("qkv_mul_3_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_3_y = VarNode("qkv_add_3_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate();
auto* qkv_add_3_out = VarNode("qkv_add_3_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input(act_type_, "X")
->AsIntermediate();
auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate();
auto* qkv_act_out = VarNode("qkv_act_out")
->assert_is_op_output(act_type_, "Out")
->assert_is_op_input("mul", "X")
->AsIntermediate();
auto* qkv_mul_4_y =
VarNode("qkv_mul_4_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate();
auto* qkv_mul_4_out = VarNode("qkv_mul_4_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_4_y = VarNode("qkv_add_4_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate();
auto* qkv_add_4_out = VarNode("qkv_add_4_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("dropout", "X")
->AsIntermediate();
auto* qkv_dropout_4 = OpNode("qkv_dropout_4", "dropout")->AsIntermediate();
auto* qkv_dropout_4_out = VarNode("qkv_dropout_4_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_dropout_4_mask = VarNode("qkv_dropout_4_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate();
auto* qkv_add_5_out = VarNode("qkv_add_5_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("layer_norm", "X")
->AsIntermediate();
auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale")
->assert_is_op_input("layer_norm", "Scale")
->AsInput();
auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias")
->assert_is_op_input("layer_norm", "Bias")
->AsInput();
auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate();
auto* qkv_ln_5_out = VarNode("qkv_ln_5_out")
->assert_is_op_output("layer_norm", "Y")
->AsOutput();
auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean")
->assert_is_op_output("layer_norm", "Mean")
->AsIntermediate();
auto* qkv_ln_5_var = VarNode("qkv_ln_5_var")
->assert_is_op_output("layer_norm", "Variance")
->AsIntermediate();
// TODO(miaotianxiang): use LinksFrom/LinksTo() instead
*input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >>
*q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >>
*q_scale_out >> *qk_matmul;
*q_mul_y >> *q_mul;
*q_add_y >> *q_add;
*q_reshape2 >> *q_reshape2_xshape;
*q_transpose2 >> *q_transpose2_xshape;
*input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >>
*k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul;
*k_mul_y >> *k_mul;
*k_add_y >> *k_add;
*k_reshape2 >> *k_reshape2_xshape;
*k_transpose2 >> *k_transpose2_xshape;
*qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >>
*qk_softmax_out >> *qk_dropout >> *qk_dropout_out >> *qkv_matmul;
*qk_mask >> *qk_add;
*qk_dropout >> *qk_dropout_mask;
*input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >>
*v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul;
*v_mul_y >> *v_mul;
*v_add_y >> *v_add;
*v_reshape2 >> *v_reshape2_xshape;
*v_transpose2 >> *v_transpose2_xshape;
*qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >>
*qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >>
*qkv_add >> *qkv_add_out >> *qkv_dropout >> *qkv_dropout_out >>
*qkv_add_2;
*qkv_transpose2 >> *qkv_transpose2_xshape;
*qkv_reshape2 >> *qkv_reshape2_xshape;
*qkv_mul_y >> *qkv_mul;
*qkv_add_y >> *qkv_add;
*qkv_dropout >> *qkv_dropout_mask;
*input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out;
*qkv_ln_2_scale >> *qkv_ln_2;
*qkv_ln_2_bias >> *qkv_ln_2;
*qkv_ln_2 >> *qkv_ln_2_mean;
*qkv_ln_2 >> *qkv_ln_2_var;
*qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >>
*qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >>
*qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_dropout_4 >>
*qkv_dropout_4_out >> *qkv_add_5;
*qkv_mul_3_y >> *qkv_mul_3;
*qkv_add_3_y >> *qkv_add_3;
*qkv_mul_4_y >> *qkv_mul_4;
*qkv_add_4_y >> *qkv_add_4;
*qkv_dropout_4 >> *qkv_dropout_4_mask;
*qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out >> *qkv_ln_5 >> *qkv_ln_5_out;
*qkv_ln_5_scale >> *qkv_ln_5;
*qkv_ln_5_bias >> *qkv_ln_5;
*qkv_ln_5 >> *qkv_ln_5_mean;
*qkv_ln_5 >> *qkv_ln_5_var;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("single_encoder");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name});
op_desc.SetInput("FCWeight",
{
matched.at("q_mul_y")->arg()->name,
matched.at("k_mul_y")->arg()->name,
matched.at("v_mul_y")->arg()->name,
matched.at("qkv_mul_y")->arg()->name,
matched.at("qkv_mul_3_y")->arg()->name,
matched.at("qkv_mul_4_y")->arg()->name,
});
op_desc.SetInput("FCBias",
{
matched.at("q_add_y")->arg()->name,
matched.at("k_add_y")->arg()->name,
matched.at("v_add_y")->arg()->name,
matched.at("qkv_add_y")->arg()->name,
matched.at("qkv_add_3_y")->arg()->name,
matched.at("qkv_add_4_y")->arg()->name,
});
op_desc.SetInput("LNScale",
{
matched.at("qkv_ln_2_scale")->arg()->name,
matched.at("qkv_ln_5_scale")->arg()->name,
});
op_desc.SetInput("LNBias",
{
matched.at("qkv_ln_2_bias")->arg()->name,
matched.at("qkv_ln_5_bias")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
// extra traits to distill
auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info();
auto reshape_dim = reshape_op_info->GetAttr<std::vector<int>>("shape");
op_desc.SetAttr<int>("head_num", reshape_dim[2]);
op_desc.SetAttr<int>("size_per_head", reshape_dim[3]);
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
single_encoder_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"qk_mask",
"k_mul_y",
"v_mul_y",
"qkv_mul_y",
"qkv_mul_3_y",
"qkv_mul_4_y",
"q_add_y",
"k_add_y",
"v_add_y",
"qkv_add_y",
"qkv_add_3_y",
"qkv_add_4_y",
"qkv_ln_2_scale",
"qkv_ln_2_bias",
"qkv_ln_5_scale",
"qkv_ln_5_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul"));
}
IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out"));
}
private:
std::string act_type_;
};
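How the fuser is driven (this mirrors the pass body further below): construct it for one activation type and apply it to the graph, which runs BuildPattern() and then InsertNewNode() for every match:

// Usage sketch, taken from XPUMultiEncoderFusePass::Apply() below.
fusion::XPUSingleEncoderFuser gelu_fuser("gelu");
gelu_fuser(graph.get());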
class XPUMultiEncoderFuser {
public:
bool IsDirectPredecessorOf(Node* op1, Node* op2) {
for (auto* out : op1->outlinks) {
for (auto* in : op2->inlinks) {
if (out == in) return true;
}
}
return false;
}
void operator()(SSAGraph* graph) {
std::vector<Node*> all_encoders;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
if (node->stmt()->op_info()->Type() == "single_encoder") {
all_encoders.push_back(node);
}
}
VLOG(3) << "Found " << all_encoders.size() << " single_encoder";
if (all_encoders.size() == 0) {
return;
}
// TODO(miaotianxiang): more verification
for (size_t i = 0; i < all_encoders.size() - 1; ++i) {
CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1]));
}
std::string mask_name;
for (auto* encoder : all_encoders) {
auto* op_info = encoder->stmt()->op_info();
if (mask_name.empty()) {
mask_name = op_info->Input("Mask").front();
} else {
// CHECK(mask_name == op_info->Input("Mask").front());
}
}
std::unordered_set<const Node*> to_remove;
Node* first_encoder = all_encoders[0];
std::string in_name, out_name;
std::vector<std::string> arg_names{
"FCWeight", "FCBias", "LNScale", "LNBias"};
std::unordered_map<std::string, std::vector<std::string>> arg_map;
for (size_t i = 0; i < all_encoders.size(); ++i) {
Node* cur_encoder = all_encoders[i];
auto* op_info = cur_encoder->stmt()->op_info();
for (auto arg_name : arg_names) {
auto real_names = op_info->Input(arg_name);
for (auto name : real_names) {
auto* arg_node = graph->RetrieveArgument(name);
DirectedLink(arg_node, first_encoder);
arg_map[arg_name].push_back(name);
}
}
auto* cur_out =
graph->RetrieveArgument(op_info->Output("Outputs").front());
if (i == 0) {
// first encoder
to_remove.insert(cur_out);
in_name = op_info->Input("Inputs").front();
mask_name = op_info->Input("Mask").front();
} else if (i == all_encoders.size() - 1) {
// last encoder
to_remove.insert(cur_encoder);
DirectedLink(first_encoder, cur_out);
out_name = op_info->Output("Outputs").front();
} else {
to_remove.insert(cur_encoder);
to_remove.insert(cur_out);
}
}
GraphSafeRemoveNodes(graph, to_remove);
auto* multi_encoder_stmt = first_encoder->stmt();
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__multi_encoder");
op_desc.SetInput("Input", {in_name});
for (auto kv : arg_map) {
op_desc.SetInput(kv.first, kv.second);
}
op_desc.SetInput("Mask", {mask_name});
op_desc.SetOutput("Output", {out_name});
op_desc.SetAttr<int>("xpu", 1);
auto* first_encoder_op_info = multi_encoder_stmt->op_info();
op_desc.SetAttr<int>("head_num",
first_encoder_op_info->GetAttr<int>("head_num"));
op_desc.SetAttr<int>("size_per_head",
first_encoder_op_info->GetAttr<int>("size_per_head"));
op_desc.SetAttr<int>("n_layers", all_encoders.size());
op_desc.SetAttr<std::string>(
"act_type", first_encoder_op_info->GetAttr<std::string>("act_type"));
auto* scope = multi_encoder_stmt->op()->scope();
std::vector<float> fc_weight_max(arg_map["FCWeight"].size());
auto& fc_weight_names = arg_map["FCWeight"];
for (size_t i = 0; i < fc_weight_names.size(); ++i) {
auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]);
auto weight_dims = weight_t->dims();
int weight_len = weight_t->numel();
float* weight_on_host = weight_t->mutable_data<float>();
float max_f =
paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
weight_on_host, weight_int16.get(), max_f, weight_len);
paddle::lite::xpu::math::Transpose(weight_int16.get(),
weight_trans_int16.get(),
weight_dims[0],
weight_dims[1]);
memcpy(weight_on_host,
weight_trans_int16.get(),
weight_len * sizeof(int16_t));
fc_weight_max[i] = max_f;
}
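The loop above relies on the xpu::math helpers. As a hedged, self-contained sketch of what they presumably compute (symmetric fp32-to-int16 quantization against the per-tensor max, followed by a 2-D transpose); the names ending in "Sketch" are hypothetical:

// Hypothetical standalone sketch of FindMaxAbs / ConvertFP32ToInt16 / Transpose.
#include <algorithm>
#include <cmath>
#include <cstdint>

static float FindMaxAbsSketch(const float* data, int len) {
  float m = 0.f;
  for (int i = 0; i < len; ++i) m = std::max(m, std::fabs(data[i]));
  return m;
}

static void QuantAndTransposeSketch(const float* src, int16_t* dst,
                                    float max_abs, int rows, int cols) {
  const float scale = max_abs > 0.f ? 32767.f / max_abs : 0.f;
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      dst[c * rows + r] =
          static_cast<int16_t>(std::round(src[r * cols + c] * scale));
}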
std::string max_name = "encoder_max";
auto* max_filter_node = graph->NewArgumentNode(max_name);
max_filter_node->arg()->is_weight = true;
max_filter_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
DirectedLink(max_filter_node, first_encoder);
auto* max_filter_tensor = scope->NewTensor(max_name);
max_filter_tensor->Resize({static_cast<int>(fc_weight_max.size())});
memcpy(max_filter_tensor->mutable_data<float>(),
&fc_weight_max[0],
sizeof(float) * fc_weight_max.size());
op_desc.SetInput("FCWeightMax", {max_name});
auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type());
multi_encoder_op->Attach(op_desc, scope);
multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places());
auto kernels =
multi_encoder_op->CreateKernels(multi_encoder_op->valid_places());
multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels));
// temporarily remove useless cast ops: casts fed by the stack output whose results have no consumers
std::unordered_set<const Node*> to_remove2;
Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
if (node->stmt()->op_info()->Type() == "stack") {
stack = node;
}
}
if (stack == nullptr) return;  // guard: the graph may contain no stack op
Node* stack_out = stack->outlinks.front();
for (Node* cast : stack_out->outlinks) {
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// remove
to_remove2.insert(cast_out);
to_remove2.insert(cast);
}
}
GraphSafeRemoveNodes(graph, to_remove2);
}
};
} // namespace fusion
class XPUMultiEncoderFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
// TODO(miaotianxiang): backup graph, recover from failed match
std::vector<std::string> act_types{"gelu", "relu"};
for (auto& act_type : act_types) {
fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type);
single_encoder_fuser(graph.get());
fusion::XPUMultiEncoderFuser multi_encoder_fuser;
multi_encoder_fuser(graph.get());
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass,
paddle::lite::mir::XPUMultiEncoderFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("matmul");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUResNetBlock0Fuser : public FuseBase {
public:
XPUResNetBlock0Fuser() {}
void BuildPattern() override {
auto* input =
VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
auto* left_conv1_weight = VarNode("left_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv1 = OpNode("left_conv1", "conv2d");
auto* left_conv1_out = VarNode("left_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn1_scale = VarNode("left_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn1_bias = VarNode("left_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn1_mean = VarNode("left_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn1_var = VarNode("left_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate();
auto* left_bn1_out = VarNode("left_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* left_bn1_mean_out = VarNode("left_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn1_var_out =
VarNode("left_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn1_saved_mean =
VarNode("left_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn1_saved_var =
VarNode("left_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate();
auto* left_relu1_out = VarNode("left_relu1_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* left_conv2_weight = VarNode("left_conv2_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate();
auto* left_conv2_out = VarNode("left_conv2_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn2_scale = VarNode("left_bn2_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn2_bias = VarNode("left_bn2_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn2_mean = VarNode("left_bn2_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn2_var = VarNode("left_bn2_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate();
auto* left_bn2_out = VarNode("left_bn2_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* left_bn2_mean_out = VarNode("left_bn2_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn2_var_out =
VarNode("left_bn2_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn2_saved_mean =
VarNode("left_bn2_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn2_saved_var =
VarNode("left_bn2_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate();
auto* left_relu2_out = VarNode("left_relu2_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* left_conv3_weight = VarNode("left_conv3_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate();
auto* left_conv3_out = VarNode("left_conv3_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn3_scale = VarNode("left_bn3_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn3_bias = VarNode("left_bn3_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn3_mean = VarNode("left_bn3_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn3_var = VarNode("left_bn3_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate();
auto* left_bn3_out = VarNode("left_bn3_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* left_bn3_mean_out = VarNode("left_bn3_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn3_var_out =
VarNode("left_bn3_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn3_saved_mean =
VarNode("left_bn3_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn3_saved_var =
VarNode("left_bn3_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_conv1_weight = VarNode("right_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate();
auto* right_conv1_out = VarNode("right_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn1_scale = VarNode("right_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn1_bias = VarNode("right_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn1_mean = VarNode("right_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn1_var = VarNode("right_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate();
auto* right_bn1_out = VarNode("right_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* right_bn1_mean_out =
VarNode("right_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn1_var_out =
VarNode("right_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn1_saved_mean =
VarNode("right_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn1_saved_var =
VarNode("right_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
auto* add_out = VarNode("add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* relu = OpNode("relu", "relu")->AsIntermediate();
auto* relu_out =
VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
*input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >>
*left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >>
*left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >>
*left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add;
*left_conv1_weight >> *left_conv1;
*left_bn1_scale >> *left_bn1;
*left_bn1_bias >> *left_bn1;
*left_bn1_mean >> *left_bn1;
*left_bn1_var >> *left_bn1;
*left_bn1 >> *left_bn1_mean_out;
*left_bn1 >> *left_bn1_var_out;
*left_bn1 >> *left_bn1_saved_mean;
*left_bn1 >> *left_bn1_saved_var;
*left_conv2_weight >> *left_conv2;
*left_bn2_scale >> *left_bn2;
*left_bn2_bias >> *left_bn2;
*left_bn2_mean >> *left_bn2;
*left_bn2_var >> *left_bn2;
*left_bn2 >> *left_bn2_mean_out;
*left_bn2 >> *left_bn2_var_out;
*left_bn2 >> *left_bn2_saved_mean;
*left_bn2 >> *left_bn2_saved_var;
*left_conv3_weight >> *left_conv3;
*left_bn3_scale >> *left_bn3;
*left_bn3_bias >> *left_bn3;
*left_bn3_mean >> *left_bn3;
*left_bn3_var >> *left_bn3;
*left_bn3 >> *left_bn3_mean_out;
*left_bn3 >> *left_bn3_var_out;
*left_bn3 >> *left_bn3_saved_mean;
*left_bn3 >> *left_bn3_saved_var;
*input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >>
*right_bn1_out >> *add;
*right_conv1_weight >> *right_conv1;
*right_bn1_scale >> *right_bn1;
*right_bn1_bias >> *right_bn1;
*right_bn1_mean >> *right_bn1;
*right_bn1_var >> *right_bn1;
*right_bn1 >> *right_bn1_mean_out;
*right_bn1 >> *right_bn1_var_out;
*right_bn1 >> *right_bn1_saved_mean;
*right_bn1 >> *right_bn1_saved_var;
*add >> *add_out >> *relu >> *relu_out;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("resnet_block0");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Filter",
{
matched.at("left_conv1_weight")->arg()->name,
matched.at("left_conv2_weight")->arg()->name,
matched.at("left_conv3_weight")->arg()->name,
matched.at("right_conv1_weight")->arg()->name,
});
op_desc.SetInput("Scale",
{
matched.at("left_bn1_scale")->arg()->name,
matched.at("left_bn2_scale")->arg()->name,
matched.at("left_bn3_scale")->arg()->name,
matched.at("right_bn1_scale")->arg()->name,
});
op_desc.SetInput("Bias",
{
matched.at("left_bn1_bias")->arg()->name,
matched.at("left_bn2_bias")->arg()->name,
matched.at("left_bn3_bias")->arg()->name,
matched.at("right_bn1_bias")->arg()->name,
});
op_desc.SetInput("Mean",
{
matched.at("left_bn1_mean")->arg()->name,
matched.at("left_bn2_mean")->arg()->name,
matched.at("left_bn3_mean")->arg()->name,
matched.at("right_bn1_mean")->arg()->name,
});
op_desc.SetInput("Var",
{
matched.at("left_bn1_variance")->arg()->name,
matched.at("left_bn2_variance")->arg()->name,
matched.at("left_bn3_variance")->arg()->name,
matched.at("right_bn1_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"left_conv2_weight",
"left_conv3_weight",
"right_conv1_weight",
"left_bn1_bias",
"left_bn2_bias",
"left_bn3_bias",
"right_bn1_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1"));
}
IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out"));
}
};
class XPUResNetBlock1Fuser : public FuseBase {
public:
XPUResNetBlock1Fuser() {}
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("conv2d", "Input")
->assert_is_op_input("elementwise_add", "X")
->AsInput();
auto* right_conv1_weight = VarNode("right_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv1 = OpNode("right_conv1", "conv2d");
auto* right_conv1_out = VarNode("right_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn1_scale = VarNode("right_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn1_bias = VarNode("right_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn1_mean = VarNode("right_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn1_var = VarNode("right_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate();
auto* right_bn1_out = VarNode("right_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* right_bn1_mean_out =
VarNode("right_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn1_var_out =
VarNode("right_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn1_saved_mean =
VarNode("right_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn1_saved_var =
VarNode("right_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate();
auto* right_relu1_out = VarNode("right_relu1_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* right_conv2_weight = VarNode("right_conv2_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate();
auto* right_conv2_out = VarNode("right_conv2_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn2_scale = VarNode("right_bn2_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn2_bias = VarNode("right_bn2_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn2_mean = VarNode("right_bn2_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn2_var = VarNode("right_bn2_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate();
auto* right_bn2_out = VarNode("right_bn2_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* right_bn2_mean_out =
VarNode("right_bn2_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn2_var_out =
VarNode("right_bn2_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn2_saved_mean =
VarNode("right_bn2_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn2_saved_var =
VarNode("right_bn2_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate();
auto* right_relu2_out = VarNode("right_relu2_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* right_conv3_weight = VarNode("right_conv3_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate();
auto* right_conv3_out = VarNode("right_conv3_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn3_scale = VarNode("right_bn3_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn3_bias = VarNode("right_bn3_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn3_mean = VarNode("right_bn3_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn3_var = VarNode("right_bn3_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate();
auto* right_bn3_out = VarNode("right_bn3_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* right_bn3_mean_out =
VarNode("right_bn3_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn3_var_out =
VarNode("right_bn3_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn3_saved_mean =
VarNode("right_bn3_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn3_saved_var =
VarNode("right_bn3_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
auto* add_out = VarNode("add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* relu = OpNode("relu", "relu")->AsIntermediate();
auto* relu_out =
VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
*input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >>
*right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >>
*right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >>
*right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >>
*right_bn3_out >> *add;
*right_conv1_weight >> *right_conv1;
*right_bn1_scale >> *right_bn1;
*right_bn1_bias >> *right_bn1;
*right_bn1_mean >> *right_bn1;
*right_bn1_var >> *right_bn1;
*right_bn1 >> *right_bn1_mean_out;
*right_bn1 >> *right_bn1_var_out;
*right_bn1 >> *right_bn1_saved_mean;
*right_bn1 >> *right_bn1_saved_var;
*right_conv2_weight >> *right_conv2;
*right_bn2_scale >> *right_bn2;
*right_bn2_bias >> *right_bn2;
*right_bn2_mean >> *right_bn2;
*right_bn2_var >> *right_bn2;
*right_bn2 >> *right_bn2_mean_out;
*right_bn2 >> *right_bn2_var_out;
*right_bn2 >> *right_bn2_saved_mean;
*right_bn2 >> *right_bn2_saved_var;
*right_conv3_weight >> *right_conv3;
*right_bn3_scale >> *right_bn3;
*right_bn3_bias >> *right_bn3;
*right_bn3_mean >> *right_bn3;
*right_bn3_var >> *right_bn3;
*right_bn3 >> *right_bn3_mean_out;
*right_bn3 >> *right_bn3_var_out;
*right_bn3 >> *right_bn3_saved_mean;
*right_bn3 >> *right_bn3_saved_var;
*input >> *add;
*add >> *add_out >> *relu >> *relu_out;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("resnet_block1");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Filter",
{
matched.at("right_conv1_weight")->arg()->name,
matched.at("right_conv2_weight")->arg()->name,
matched.at("right_conv3_weight")->arg()->name,
});
op_desc.SetInput("Scale",
{
matched.at("right_bn1_scale")->arg()->name,
matched.at("right_bn2_scale")->arg()->name,
matched.at("right_bn3_scale")->arg()->name,
});
op_desc.SetInput("Bias",
{
matched.at("right_bn1_bias")->arg()->name,
matched.at("right_bn2_bias")->arg()->name,
matched.at("right_bn3_bias")->arg()->name,
});
op_desc.SetInput("Mean",
{
matched.at("right_bn1_mean")->arg()->name,
matched.at("right_bn2_mean")->arg()->name,
matched.at("right_bn3_mean")->arg()->name,
});
op_desc.SetInput("Var",
{
matched.at("right_bn1_variance")->arg()->name,
matched.at("right_bn2_variance")->arg()->name,
matched.at("right_bn3_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"right_conv2_weight",
"right_conv3_weight",
"right_bn1_bias",
"right_bn2_bias",
"right_bn3_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1"));
}
IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out"));
}
};
class XPUResNet50Fuser : public xpu::XPUFuseBase {
public:
XPUResNet50Fuser() {}
void BuildPattern() override {
auto* input =
VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
auto* top_conv_weight = VarNode("top_conv_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* top_conv = OpNode("top_conv", "conv2d");
auto* top_conv_out = VarNode("top_conv_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* top_bn_scale = VarNode("top_bn_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* top_bn_bias = VarNode("top_bn_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* top_bn_mean = VarNode("top_bn_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* top_bn_var = VarNode("top_bn_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate();
auto* top_bn_out = VarNode("top_bn_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* top_bn_mean_out = VarNode("top_bn_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* top_bn_var_out =
VarNode("top_bn_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* top_bn_saved_mean =
VarNode("top_bn_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* top_bn_saved_var =
VarNode("top_bn_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate();
auto* top_relu_out = VarNode("top_relu_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("pool2d", "X")
->AsIntermediate();
auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate();
auto* top_pool_out = VarNode("top_pool_out")
->assert_is_op_output("pool2d", "Out")
->assert_is_op_input("resnet_block0", "Inputs")
->AsIntermediate();
// args are left out
auto* resnet_block0_1 =
OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate();
auto* resnet_block0_1_out =
VarNode("resnet_block0_1_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_1_1 =
OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_1_1_out =
VarNode("resnet_block1_1_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_1_2 =
OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_1_2_out =
VarNode("resnet_block1_1_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_2 =
OpNode("resnet_block0_2", "resnet_block0")->AsIntermediate();
auto* resnet_block0_2_out =
VarNode("resnet_block0_2_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_1 =
OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_1_out =
VarNode("resnet_block1_2_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_2 =
OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_2_out =
VarNode("resnet_block1_2_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_3 =
OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_3_out =
VarNode("resnet_block1_2_3_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_3 =
OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate();
auto* resnet_block0_3_out =
VarNode("resnet_block0_3_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_1 =
OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_1_out =
VarNode("resnet_block1_3_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_2 =
OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_2_out =
VarNode("resnet_block1_3_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_3 =
OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_3_out =
VarNode("resnet_block1_3_3_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_4 =
OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_4_out =
VarNode("resnet_block1_3_4_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_5 =
OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_5_out =
VarNode("resnet_block1_3_5_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_4 =
OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate();
auto* resnet_block0_4_out =
VarNode("resnet_block0_4_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_4_1 =
OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_4_1_out =
VarNode("resnet_block1_4_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_4_2 =
OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_4_2_out =
VarNode("resnet_block1_4_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate();
auto* bottom_pool_out = VarNode("bottom_pool_out")
->assert_is_op_output("pool2d", "Out")
->AsOutput();
*input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >>
*top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >>
*resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >>
*resnet_block1_1_1_out >> *resnet_block1_1_2 >>
*resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >>
*resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >>
*resnet_block1_2_2_out >> *resnet_block1_2_3 >>
*resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >>
*resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >>
*resnet_block1_3_2_out >> *resnet_block1_3_3 >>
*resnet_block1_3_3_out >> *resnet_block1_3_4 >>
*resnet_block1_3_4_out >> *resnet_block1_3_5 >>
*resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >>
*resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >>
*resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out;
*top_conv_weight >> *top_conv;
*top_bn_scale >> *top_bn;
*top_bn_bias >> *top_bn;
*top_bn_mean >> *top_bn;
*top_bn_var >> *top_bn;
*top_bn >> *top_bn_mean_out;
*top_bn >> *top_bn_var_out;
*top_bn >> *top_bn_saved_mean;
*top_bn >> *top_bn_saved_var;
}
void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
const std::vector<Node*>& extra_input_vars) override {
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__resnet50");
op_desc.SetInput("Input", {matched.at("input")->arg()->name});
std::vector<std::string> filter_name = {
matched.at("top_conv_weight")->arg()->name};
std::vector<std::string> scale_name = {
matched.at("top_bn_scale")->arg()->name};
std::vector<std::string> bias_name = {
matched.at("top_bn_bias")->arg()->name};
std::vector<std::string> mean_name = {
matched.at("top_bn_mean")->arg()->name};
std::vector<std::string> var_name = {
matched.at("top_bn_variance")->arg()->name};
std::vector<std::string> max_filter_name;
std::vector<std::string> resnet_block_vec = {
"resnet_block0_1",
"resnet_block1_1_1",
"resnet_block1_1_2",
"resnet_block0_2",
"resnet_block1_2_1",
"resnet_block1_2_2",
"resnet_block1_2_3",
"resnet_block0_3",
"resnet_block1_3_1",
"resnet_block1_3_2",
"resnet_block1_3_3",
"resnet_block1_3_4",
"resnet_block1_3_5",
"resnet_block0_4",
"resnet_block1_4_1",
"resnet_block1_4_2",
};
for (auto& block : resnet_block_vec) {
auto* block_op_info = matched.at(block)->stmt()->op_info();
auto block_filter_name = block_op_info->Input("Filter");
std::copy(block_filter_name.begin(),
block_filter_name.end(),
std::back_inserter(filter_name));
auto block_scale_name = block_op_info->Input("Scale");
std::copy(block_scale_name.begin(),
block_scale_name.end(),
std::back_inserter(scale_name));
auto block_bias_name = block_op_info->Input("Bias");
std::copy(block_bias_name.begin(),
block_bias_name.end(),
std::back_inserter(bias_name));
auto block_mean_name = block_op_info->Input("Mean");
std::copy(block_mean_name.begin(),
block_mean_name.end(),
std::back_inserter(mean_name));
auto block_var_name = block_op_info->Input("Var");
std::copy(block_var_name.begin(),
block_var_name.end(),
std::back_inserter(var_name));
}
op_desc.SetInput("Filter", filter_name);
op_desc.SetInput("Bias", bias_name);
op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name});
op_desc.SetAttr<int>("xpu", 1);
auto* resnet50_stmt = matched.at("top_conv")->stmt();
auto* scope = resnet50_stmt->op()->scope();
for (size_t i = 0; i < filter_name.size(); ++i) {
auto* filter_t = scope->FindMutableTensor(filter_name[i]);
auto* scale_t = scope->FindMutableTensor(scale_name[i]);
auto* bias_t = scope->FindMutableTensor(bias_name[i]);
auto* mean_t = scope->FindMutableTensor(mean_name[i]);
auto* var_t = scope->FindMutableTensor(var_name[i]);
int mean_len = mean_t->numel();
int filter_len = filter_t->numel();
int filter_stride = filter_len / mean_len;
float* filter_on_host = filter_t->mutable_data<float>();
float* scale_on_host = scale_t->mutable_data<float>();
float* bias_on_host = bias_t->mutable_data<float>();
float* mean_on_host = mean_t->mutable_data<float>();
float* var_on_host = var_t->mutable_data<float>();
// Perform preprocess
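// Fold the batch_norm statistics into the conv filter and bias. A sketch of
// the math (note the epsilon is hard-coded to 1e-5 below instead of being
// read from the batch_norm op):
//   alpha   = scale / sqrt(variance + epsilon)
//   filter' = filter * alpha   (broadcast over each output channel)
//   bias'   = bias - mean * alpha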
for (int i = 0; i < mean_len; ++i) {
scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f);
}
for (int i = 0; i < mean_len; ++i) {
for (int j = 0; j < filter_stride; ++j) {
filter_on_host[i * filter_stride + j] *= scale_on_host[i];
}
}
for (int i = 0; i < mean_len; ++i) {
bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
}
float max_f =
paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
filter_on_host, filter_int16.get(), max_f, filter_len);
memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
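// The filter is now quantized to int16 with a per-tensor symmetric scale
// (max_f) and stored back into the original buffer; the scale is exposed to
// the kernel through the "<filter_name>_max" tensor created below.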
// create new arg in graph and scope
std::string max_name = filter_name[i] + "_max";
max_filter_name.push_back(max_name);
auto* max_filter_node = graph->NewArgumentNode(max_name);
max_filter_node->arg()->is_weight = true;
max_filter_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
DirectedLink(max_filter_node, matched.at("top_conv"));
auto* max_filter_t = scope->NewTensor(max_name);
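// The max tensor is sized 4 and filled with the same per-tensor max value;
// the XPU conv kernel is assumed to read a small fixed-size max buffer.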
max_filter_t->Resize({4});
float* max_ptr = max_filter_t->mutable_data<float>();
max_ptr[0] = max_f;
max_ptr[1] = max_f;
max_ptr[2] = max_f;
max_ptr[3] = max_f;
}
op_desc.SetInput("MaxFilter", max_filter_name);
auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type());
resnet50_op->Attach(op_desc, scope);
resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places());
auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places());
resnet50_stmt->SetOp(resnet50_op);
resnet50_stmt->SetKernels(std::move(kernels));
IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv"));
for (auto* node : extra_input_vars) {
IR_NODE_LINK_TO(node, matched.at("top_conv"));
}
IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out"));
}
};
} // namespace fusion
class XPUResNet50FusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUResNetBlock0Fuser block0_fuser;
block0_fuser(graph.get());
fusion::XPUResNetBlock1Fuser block1_fuser;
block1_fuser(graph.get());
fusion::XPUResNet50Fuser resnet50_fuser;
resnet50_fuser(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__resnet_fuse_pass,
paddle::lite::mir::XPUResNet50FusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("conv2d");
...@@ -26,15 +26,13 @@ namespace paddle { ...@@ -26,15 +26,13 @@ namespace paddle {
namespace lite { namespace lite {
namespace mir { namespace mir {
using inference::analysis::Dot;
void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(5) << "\n" << Visualize(graph.get()); VLOG(5) << "\n" << Visualize(graph.get());
} }
std::string Visualize(mir::SSAGraph* graph) { std::string Visualize(mir::SSAGraph* graph) {
std::ostringstream os; std::ostringstream os;
inference::analysis::Dot dot; Dot dot;
auto string_trunc = [](const std::string& str) -> std::string { auto string_trunc = [](const std::string& str) -> std::string {
const int max_disp_size = 100; const int max_disp_size = 100;
if (str.length() > max_disp_size) if (str.length() > max_disp_size)
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include "lite/core/mir/mlu_postprocess_pass.h" #include "lite/core/mir/mlu_postprocess_pass.h"
#include <list> #include <list>
#include <memory> #include <memory>
#include <set>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 4); // FP16 op_desc.SetAttr<int>("out_dtype", 4); // FP16
op_desc.SetInput("X", {cur_node->AsArg().name}); op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name}); op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "transpose") { } else if (op_type == "layout") {
// NCHW -> NHWC // NCHW -> NHWC
op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1}); op_desc.SetInput("Input", {cur_node->AsArg().name});
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name}); op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cur_node->AsArg().name}); op_desc.SetInput("Input", {cur_node->AsArg().name});
...@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true; is_found = true;
} }
} else if (op_type == "transpose") { } else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
DataLayoutCompatible(*out_arg_ty, *cast_type) &&
// for first conv
PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true; is_found = true;
}
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt(); auto& stmt = cast_inst->AsStmt();
if (op_type == "layout") {
stmt.picked_kernel().SetContext( stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break; break;
} }
} }
...@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
cast_arg->AsArg().type = cast_type; cast_arg->AsArg().type = cast_type;
auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
// for CastAfter manually set the tensor's type // for CastAfter manually set the tensor's type
var->GetMutable<::paddle::lite::Tensor>(); var->GetMutable<paddle::lite::Tensor>();
// create the stmt node // create the stmt node
auto* cast_inst = graph->NewInstructNode(); auto* cast_inst = graph->NewInstructNode();
...@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 5); // FP16 op_desc.SetAttr<int>("out_dtype", 5); // FP16
op_desc.SetInput("X", {cast_arg_name}); op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "transpose") { } else if (op_type == "layout") {
// NHWC -> NCHW // NHWC -> NCHW
op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2}); op_desc.SetInput("Input", {cast_arg_name});
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cast_arg_name}); op_desc.SetInput("Input", {cast_arg_name});
...@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
is_found = true; is_found = true;
} }
} else if (op_type == "transpose") { } else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cast_type) &&
DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) {
is_found = true; is_found = true;
}
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt(); auto& stmt = cast_inst->AsStmt();
if (op_type == "layout") {
stmt.picked_kernel().SetContext( stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break; break;
} }
} }
...@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, ...@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
auto* cur_node = head_node; auto* cur_node = head_node;
const auto name_prefix = const auto name_prefix =
head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
bool is_first_conv_head =
std::find(first_conv_nodes_.begin(),
first_conv_nodes_.end(),
head_node->AsArg().name) != first_conv_nodes_.end();
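// head vars recorded in first_conv_nodes_ keep their original (int8) type
// (see GatherAndModifyFirstConvNodes), so no precision cast is inserted for
// the first conv's input below.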
// layout cast node // precision cast node
if (head_type->layout() != inst_type->layout()) { if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
cur_node = InsertCastBefore( cur_node = InsertCastBefore(
"transpose", "cast",
name_prefix + "transpose", name_prefix + "cast",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
LiteType::GetTensorTy( LiteType::GetTensorTy(
head_type->target(), head_type->precision(), inst_type->layout())); head_type->target(), inst_type->precision(), head_type->layout()));
} }
// precision cast node // layout cast node
if (head_type->precision() != inst_type->precision()) { if (head_type->layout() != inst_type->layout()) {
cur_node = InsertCastBefore( cur_node = InsertCastBefore(
"cast", "layout",
name_prefix + "cast", name_prefix + "layout",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
...@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, ...@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph's valid precision // get subgraph's valid precision
const auto& places = graph->valid_places(); const auto& places = graph->valid_places();
std::set<::paddle::lite_api::PrecisionType> prec_set; std::set<paddle::lite_api::PrecisionType> prec_set;
for (const auto& place : places) { for (const auto& place : places) {
if (place.target == TARGET(kMLU)) { if (place.target == TARGET(kMLU)) {
prec_set.insert(place.precision); prec_set.insert(place.precision);
...@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, ...@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
const auto name_prefix = const auto name_prefix =
tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
// layout cast node // precision cast node
if (tail_type->layout() != inst_type->layout()) { if (tail_type->precision() != inst_type->precision()) {
cur_node = InsertCastAfter( cur_node = InsertCastAfter(
"transpose", "cast",
name_prefix + "transpose", name_prefix + "cast",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
LiteType::GetTensorTy( LiteType::GetTensorTy(
tail_type->target(), tail_type->precision(), inst_type->layout())); tail_type->target(), inst_type->precision(), tail_type->layout()));
} }
// precision cast node // layout cast node
if (tail_type->precision() != inst_type->precision()) { if (tail_type->layout() != inst_type->layout()) {
cur_node = InsertCastAfter( cur_node = InsertCastAfter(
"cast", "layout",
name_prefix + "cast", name_prefix + "layout",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
...@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, ...@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i); auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateOutputTo( UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
/* The graph may look like this:
 *          subgraph_op_0
 *           /         \
 *          /           \
 *   subgraph_op_1    host_op
 * The tail var may also be referenced as an input by ops inside the
 * sub-block, so rename those input references as well.
 */
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
} }
// recreate the op // recreate the op
...@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { ...@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
} }
} }
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
return true;
}
}
}
}
return false;
}
bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
CHECK(arg_node->IsArg());
for (auto& inst : arg_node->outlinks) {
if (inst->AsStmt().op_type() == "subgraph") {
return IsFirstConvInSubgraph(arg_node, inst);
}
}
return false;
}
void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
if (node.AsStmt().op_type() == "feed") {
for (auto& out : node.outlinks) {
if (IsFirstConvNode(out)) {
first_conv_nodes_.insert(out->AsArg().name);
// modify first conv nodes' type
const auto* old_type = out->AsArg().type;
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
paddle::lite_api::PrecisionType::kInt8,
old_type->layout(),
old_type->device());
}
}
}
}
}
void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) { for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue; if (!node.IsStmt()) continue;
...@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
out->AsArg().type = out->AsArg().type =
LiteType::GetTensorTy(old_type->target(), LiteType::GetTensorTy(old_type->target(),
old_type->precision(), old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC, paddle::lite_api::DataLayoutType::kNHWC,
old_type->device()); old_type->device());
} }
} }
...@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
inp->AsArg().type = inp->AsArg().type =
LiteType::GetTensorTy(old_type->target(), LiteType::GetTensorTy(old_type->target(),
old_type->precision(), old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC, paddle::lite_api::DataLayoutType::kNHWC,
old_type->device()); old_type->device());
} }
} }
...@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
} }
void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// currently for non-persistent input and output args, mlu subgraph op // currently for non-persistent input and output args, mlu subgraph op
// only supports float16/float32 data types // only supports float16/float32 data types
// in two situations as follows: // in two situations as follows:
// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
// arg_in and arg_out are assumed to be NHWC which the user should be aware of. // arg_in and arg_out are assumed to be NHWC which the user should be aware of.
// Thus here we change these args' layout to NHWC // Thus here we change these args' layout to NHWC
#ifdef LITE_WITH_MLU
if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
ModifyLayout(graph.get()); ModifyLayout(graph.get());
}
if (lite::DeviceInfo::Global().UseFirstConv()) {
GatherAndModifyFirstConvNodes(graph.get());
}
#endif
// insert io_copy, layout and precision cast of subgraph's inputs and outputs // insert io_copy, layout and precision cast of subgraph's inputs and outputs
for (auto& node : graph->mutable_nodes()) { for (auto& node : graph->mutable_nodes()) {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <memory> #include <memory>
#include <set>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/core/mir/pass.h" #include "lite/core/mir/pass.h"
...@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass { ...@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass {
const Type* cast_type); const Type* cast_type);
void RecreateOp(Node* inst_node, SSAGraph* graph); void RecreateOp(Node* inst_node, SSAGraph* graph);
void GatherAndModifyFirstConvNodes(SSAGraph* graph);
bool IsFirstConvNode(Node* arg_node);
bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
private:
std::set<std::string> first_conv_nodes_;
}; };
} // namespace mir } // namespace mir
......
...@@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector<subgraph_t> *subgraphs) { ...@@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector<subgraph_t> *subgraphs) {
} }
std::string PMPattern::DotString() const { std::string PMPattern::DotString() const {
using inference::analysis::Dot;
Dot dot; Dot dot;
int id = 0; int id = 0;
// Create Nodes // Create Nodes
......
...@@ -64,7 +64,6 @@ class FuseBase { ...@@ -64,7 +64,6 @@ class FuseBase {
protected: protected:
virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0; virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0;
private:
void PerformPatternMatcher(SSAGraph* graph); void PerformPatternMatcher(SSAGraph* graph);
// Delete nodes that are marked as Intermediate // Delete nodes that are marked as Intermediate
......
...@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() { ...@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
return adj_list; return adj_list;
} }
std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
std::map<mir::Node *, std::set<mir::Node *>> adj_list;
for (auto &n : mutable_nodes()) {
if (adj_list.find(&n) == adj_list.end()) {
adj_list[&n] = std::set<mir::Node *>();
}
std::vector<mir::Node *> nodes;
for (auto &var : n.inlinks) {
nodes.push_back(var);
}
std::sort(nodes.begin(),
nodes.end(),
[](mir::Node *node1, mir::Node *node2) { return node1 > node2; });
adj_list[&n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
void SSAGraph::SortHelper( void SSAGraph::SortHelper(
const std::map<mir::Node *, std::set<mir::Node *>> &adj_list, const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node, mir::Node *node,
...@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() { ...@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
return res; return res;
} }
std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
CheckBidirectionalConnection();
std::stack<mir::Node *> stack;
std::set<mir::Node *> visited;
std::vector<mir::Node *> res;
auto adj_list = BuildNodeAdjList();
for (auto adj : adj_list) {
if (visited.find(adj.first) == visited.end()) {
SortHelper(adj_list, adj.first, &visited, &res);
}
}
return res;
}
Node *SSAGraph::GraphCreateInstructNode( Node *SSAGraph::GraphCreateInstructNode(
const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) { const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
node_storage_.emplace_back(); node_storage_.emplace_back();
...@@ -213,9 +251,10 @@ std::vector<mir::Node *> SSAGraph::outputs() { ...@@ -213,9 +251,10 @@ std::vector<mir::Node *> SSAGraph::outputs() {
} }
mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) {
auto it = arguments_.find(arg); for (auto &node : node_storage_) {
if (it != arguments_.end()) { if (node.IsArg() && node.arg()->name == arg) {
return it->second; return &node;
}
} }
return nullptr; return nullptr;
} }
......
...@@ -42,6 +42,8 @@ class SSAGraph : GraphBase { ...@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {
std::vector<mir::Node *> StmtTopologicalOrder(); std::vector<mir::Node *> StmtTopologicalOrder();
std::vector<mir::Node *> NodeTopologicalOrder();
// The inputs of the graph. // The inputs of the graph.
std::vector<mir::Node *> inputs(); std::vector<mir::Node *> inputs();
...@@ -86,6 +88,9 @@ class SSAGraph : GraphBase { ...@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
// Build operator inlink edge table. // Build operator inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList(); std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
// Build node inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list, void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node, mir::Node *node,
std::set<mir::Node *> *visited, std::set<mir::Node *> *visited,
......
...@@ -30,10 +30,8 @@ namespace paddle { ...@@ -30,10 +30,8 @@ namespace paddle {
namespace lite { namespace lite {
namespace mir { namespace mir {
using inference::analysis::Dot;
std::string SubgraphVisualizer::operator()() { std::string SubgraphVisualizer::operator()() {
inference::analysis::Dot dot; Dot dot;
const std::vector<std::string> subgraph_colors{ const std::vector<std::string> subgraph_colors{
"red", "green", "cyan", "bisque3", "red", "green", "cyan", "bisque3",
"coral", "darkseagreen1", "goldenrod1", "darkorchid", "coral", "darkseagreen1", "goldenrod1", "darkorchid",
...@@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { ...@@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs( std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
node_map_t *nodes) { node_map_t *nodes) {
for (auto &it : *nodes) { for (auto &ordered_node : graph_->NodeTopologicalOrder()) {
node_dat_t *node = it.second; // different orders when traversing nodes in the graph may lead to
// different subgraph divisions, which may generate different results
// on devices such as MLU. These different results are all "right"
// but a little confusing. Thus the topological order is used instead
// of the address of the node in the graph.
CHECK(nodes->find(ordered_node) != nodes->end());
node_dat_t *node = (*nodes)[ordered_node];
if (!node->marked) { if (!node->marked) {
continue; continue;
} }
...@@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes, ...@@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
unused_var_nodes->insert(var_node); unused_var_nodes->insert(var_node);
continue; continue;
} }
// Var can have more than one next op node, So, if any one in the // A var can have more than one next op node, so only if all next op nodes are in
// op_nodes then continue // op_nodes should it be put into local_var_nodes
bool next_op_in_nodes = false; bool next_op_in_nodes = true;
for (auto &next_op_node : var_node->outlinks) { for (auto &next_op_node : var_node->outlinks) {
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) != if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) ==
op_nodes.end()) { op_nodes.end()) {
next_op_in_nodes = true; next_op_in_nodes = false;
break;
} }
} }
if (next_op_in_nodes) { if (next_op_in_nodes) {
......
...@@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) { ...@@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)}, Place{TARGET(kNPU), PRECISION(kFloat)},
#endif #endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XTCL
Place{TARGET(kXPU), PRECISION(kFloat)}, Place{TARGET(kXPU), PRECISION(kFloat)},
#endif #endif
}); });
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <vector> #include <vector>
#include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/subgraph/subgraph_detector.h" #include "lite/core/mir/subgraph/subgraph_detector.h"
#include "lite/utils/env.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} }
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h" #include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
...@@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
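// min_subgraph_size is 1, so even a single supported op is fused into an
// MLU subgraph.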
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
} // namespace mir } // namespace mir
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) ...@@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)}); .BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)}); .BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
...@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { ...@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
class MLUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir } // namespace mir
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) { ...@@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
#endif #endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XTCL
valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif #endif
auto tar_predictor = TestModel(FLAGS_model_dir, auto tar_predictor = TestModel(FLAGS_model_dir,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
class SubgraphCastDisplayPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
VLOG(3) << "== Argument types ==";
for (auto& node : graph->mutable_nodes()) {
if (!node.IsArg()) continue;
auto* type = node.AsArg().type;
if (type) {
VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
} else {
VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
}
}
VLOG(3) << "---------------------";
//
VLOG(0) << "== SubgraphOp Debug Info ==";
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
VLOG(0) << "FOUND SUBGRAPH OP";
display_debug_info(node, "subgraph");
break;
}
}
VLOG(0) << "---------------------";
}
void display_debug_info(const Node& node,
std::string op_type,
bool display_in_nodes = true,
bool display_out_nodes = true) {
CHECK(node.IsStmt());
VLOG(0) << node.AsStmt();
if (display_in_nodes) {
for (auto p_in_arg_node : node.inlinks) {
CHECK(p_in_arg_node->IsArg());
VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
<< " type: " << *p_in_arg_node->AsArg().type
<< " is_weight: " << p_in_arg_node->AsArg().is_weight
<< " is_persist: " << p_in_arg_node->AsArg().is_persist
<< " input_count: " << p_in_arg_node->inlinks.size();
if (p_in_arg_node->inlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
CHECK(p_in_stmt_node->IsStmt());
std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
if (display_out_nodes) {
for (auto p_out_arg_node : node.outlinks) {
CHECK(p_out_arg_node->IsArg());
VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
<< " type: " << *p_out_arg_node->AsArg().type
<< " is_weight: " << p_out_arg_node->AsArg().is_weight
<< " is_persist: " << p_out_arg_node->AsArg().is_persist
<< " output_count: " << p_out_arg_node->outlinks.size();
if (p_out_arg_node->outlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
CHECK(p_out_stmt_node->IsStmt());
std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_cast_display_pass,
paddle::lite::mir::SubgraphCastDisplayPass)
.BindTargets({TARGET(kAny)});
...@@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
VLOG(4) << "picked, opencl found"; VLOG(4) << "picked, opencl found";
is_found = true; is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) && } else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) { TargetCompatibleTo(*out_arg_ty, to)) {
VLOG(4) << "picked"; VLOG(4) << "picked";
is_found = true; is_found = true;
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <array>
#include <string>
#include <vector>
#include "lite/core/mir/dot.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
#include "lite/core/op_lite.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUPatternMatcher::operator()(SSAGraph *graph,
XPUPatternMatcher::handle_t handler) {
if (!MarkPMNodesInGraph(graph)) {
return;
}
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return;
LOG(INFO) << "detected " << subgraphs.size() << " subgraph";
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) {
VLOG(3) << "mark pmnodes in graph";
if (graph->nodes().empty()) return false;
for (auto &node : graph->mutable_nodes()) {
for (const auto &pmnode : pattern_.nodes()) {
if (pmnode->Tell(&node)) {
pmnodes2nodes_[pmnode.get()].insert(&node);
}
}
}
// Check to early stop if some PMNode can't find matched Node.
for (auto &pmnode : pattern_.nodes()) {
if (!pmnodes2nodes_.count(pmnode.get())) {
VLOG(4) << pmnode->name() << " can't find matched Node, early stop";
// return false;
}
}
VLOG(3) << pmnodes2nodes_.size() << " nodes marked";
return !pmnodes2nodes_.empty();
}
// The intermediate Nodes can only link to the nodes inside the pattern, or this
// subgraph will be dropped.
void XPUPatternMatcher::ValidateByNodeRole(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
subgraphs->erase(
std::remove_if(subgraphs->begin(),
subgraphs->end(),
[](const XPUPatternMatcher::subgraph_t &subgraph) -> bool {
// Collect the inlinks and outlinks.
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
for (auto &item : subgraph) {
if (item.first->IsIntermediate()) {
for (auto *x : item.second->outlinks) {
if (!ios.count(x)) {
return true;
}
}
}
}
return false;
}),
subgraphs->end());
for (auto &subgraph : *subgraphs) {
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
extra_input_vars_.emplace_back();
for (auto &item : subgraph) {
for (auto *x : item.second->inlinks) {
if (x->IsArg() && ios.count(x) == 0) {
// extra weight var
extra_input_vars_.back().push_back(x);
}
}
}
}
}
struct HitGroup {
std::unordered_map<PMNode *, Node *> roles;
bool Match(Node *node, PMNode *pat) {
if (nodes_.count(node)) {
if (roles.count(pat) && roles[pat] == node) return true;
return false;
} else {
if (roles.count(pat) && roles[pat] != node) return false;
return true;
}
}
void Register(Node *node, PMNode *pat) {
roles[pat] = node;
nodes_.insert(node);
}
private:
std::unordered_set<Node *> nodes_;
};
// Tell whether Node a links to b.
bool IsNodesLink(Node *a, Node *b) {
for (auto *node : a->outlinks) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<PatternMatcher::subgraph_t> XPUPatternMatcher::DetectPatterns() {
// Init empty subgraphs.
std::vector<PatternMatcher::subgraph_t> result;
std::vector<HitGroup> init_groups;
std::array<std::vector<HitGroup>, 2> bi_records;
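// bi_records works as a ping-pong buffer: one slot holds the partial matches
// from the previous edge, the other collects the groups extended by the
// current edge.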
auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
: pattern_.edges().front().first;
if (!pmnodes2nodes_.count(first_pnode)) return result;
for (auto *node : pmnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
bi_records[0] = std::move(init_groups);
// Extend a PMNode to subgraphs by deducing the connection relations defined
// in edges of PMNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicated here.
// Each role has two PMNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto &pre_groups = bi_records[step % 2];
auto &cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
if (pre_groups.empty()) break;
// source -> target
for (Node *source : pmnodes2nodes_[edge.first]) {
for (Node *target : pmnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
if (IsNodesLink(source, target)) {
HitGroup new_group = group;
bool flag = new_group.Match(source, edge.first) &&
new_group.Match(target, edge.second);
if (flag) {
new_group.Register(source, edge.first);
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
}
for (auto &group : bi_records[step % 2]) {
XPUPatternMatcher::subgraph_t subgraph;
for (auto &role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
struct GraphItemLessThan {
bool operator()(const std::pair<PMNode *, Node *> &a,
const std::pair<PMNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
};
// TODO(Superjomn) enhance the function as it marks unique subgraphs as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void XPUPatternMatcher::UniquePatterns(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
if (subgraphs->empty()) return;
std::vector<PatternMatcher::subgraph_t> result;
std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) {
// Sort the items in the sub-graph, and transform to a string key.
std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
STL::stringstream ss;
for (auto &item : sorted_keys) {
ss << reinterpret_cast<size_t>(item.first) << ":"
<< reinterpret_cast<size_t>(item.second);
}
auto key = hasher(ss.str());
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void XPUPatternMatcher::RemoveOverlappedMatch(
std::vector<subgraph_t> *subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node *> node_set;
for (const auto &subgraph : *subgraphs) {
bool valid = true;
for (auto &item : subgraph) {
if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto &item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
/*
* XPUPatternMatcher helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes.
* This helper can be used to support fusion (e.g. conv + batch_norm => one
* fused op).
*
* The algorithm has three phases:
* 1. Mark the nodes that match the defined PMNodes in a PMPattern,
* 2. Extend a PMNode to subgraphs by deducing the connection relation defined
* in PMPattern (the edges),
* 3. Get the filtered subgraphs and treat them with a pre-defined handler.
*
* Usage:
* // Create a matcher
* PatternMatcher matcher;
* // Define the matcher's pattern, by adding PMNode and define the edges.
* auto* node0 = matcher.mutable_pattern().AddNode(...)
* auto* node1 = matcher.mutable_pattern().AddNode(...)
* node0->teller = some lambda.
* node1->teller = some lambda.
* matcher.mutable_pattern().AddEdge(node0, node1);
* // Create a handler to define the behavior of treating the filtered
* // subgraphs that comply with the patterns.
* PatternMatcher::handle_t handler = some lambda
* // Execute the matcher.
* matcher(&graph, handler);
*/
struct XPUPatternMatcher {
using subgraph_t = std::unordered_map<PMNode*, Node*>;
// Operate on the detected pattern.
using handle_t =
std::function<void(const subgraph_t& /*matched pattern*/, SSAGraph*)>;
void operator()(SSAGraph* graph, handle_t handler);
const PMPattern& pattern() const { return pattern_; }
PMPattern* mutable_pattern() { return &pattern_; }
// Mark the nodes that fits the pattern.
bool MarkPMNodesInGraph(SSAGraph* graph);
// Detect all the patterns and output the hit records.
std::vector<subgraph_t> DetectPatterns();
// Remove duplicate patterns.
void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped matched subgraphs; when overlapped, keep the previous one.
// The intermediate PMNodes will be removed, so they can't be shared by
// multiple patterns.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
// Validate whether the intermediate nodes are linked by external nodes.
void ValidateByNodeRole(std::vector<subgraph_t>* subgraphs);
using hit_rcd_t =
std::pair<Node* /*node in graph*/, PMNode* /*node in pattern*/>;
PMPattern pattern_;
std::unordered_map<const PMNode*, std::unordered_set<Node*>> pmnodes2nodes_;
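// Extra vars (typically weights) that feed a matched subgraph from outside
// the pattern; one vector per matched subgraph, filled by ValidateByNodeRole.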
std::vector<std::vector<Node*>> extra_input_vars_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include <set>
#include <unordered_set>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) {
VLOG(4) << "\n" << matcher_.pattern().DotString();
// Get subgraphs and record the mir::Node pointers for each PMNode.
auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) {
// get all the registered nodes.
key2nodes_.emplace_back();
for (auto &item : nodes_) {
key2nodes_.back()[item.first] = subgraph.at(item.second);
}
};
matcher_(graph, handler);
}
void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) {
std::set<std::string> keys;
for (auto &node : nodes_) {
if (node.second->IsIntermediate()) {
keys.insert(node.first);
}
}
VLOG(4) << "keys: " << key2nodes_.size();
std::unordered_set<const Node *> nodes2rm;
for (auto &matched : key2nodes_) {
for (const auto &key : keys) {
nodes2rm.insert(matched.at(key));
}
}
VLOG(3) << "clean nodes " << nodes2rm.size();
GraphSafeRemoveNodes(graph, nodes2rm);
}
PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) {
auto it = nodes_.find(key);
if (it != nodes_.end()) {
return it->second;
}
nodes_.emplace(key,
matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key)));
it = nodes_.find(key);
return it->second;
}
PMNode *XPUFuseBase::OpNode(const std::string &key,
const std::string &op_type) {
GetOrCreateNode(key)->set_op_type(op_type);
GetOrCreateNode(key)->AsOp(op_type);
return GetOrCreateNode(key);
}
PMNode *XPUFuseBase::VarNode(const std::string &key) {
GetOrCreateNode(key)->AsVar();
return GetOrCreateNode(key);
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "lite/core/mir/pattern_matcher_high_api.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
class XPUFuseBase {
public:
using key2nodes_t = std::map<std::string, Node*>;
virtual ~XPUFuseBase() = default;
void operator()(SSAGraph* graph) {
BuildPattern();
PerformPatternMatcher(graph);
for (size_t i = 0; i < key2nodes_.size(); ++i) {
InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]);
}
DeleteInterNodes(graph);
}
// Build a PMPattern using PMNode.
virtual void BuildPattern() = 0;
// Generate an operator desc with a matched subgraph.
virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
return cpp::OpDesc();
}
PMNode* OpNode(const std::string& key) {
return GetOrCreateNode(key)->assert_is_op();
}
PMNode* OpNode(const std::string& key, const std::string& op_type);
PMNode* VarNode(const std::string& key);
protected:
virtual void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
const std::vector<Node*>& extra_input_vars) = 0;
void PerformPatternMatcher(SSAGraph* graph);
// Delete nodes that are marked as Intermediate
void DeleteInterNodes(SSAGraph* graph);
PMNode* GetOrCreateNode(const std::string& key);
protected:
XPUPatternMatcher matcher_;
std::map<std::string, PMNode*> nodes_;
std::vector<key2nodes_t> key2nodes_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -157,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope, ...@@ -157,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope,
return var->GetMutable<lite::Tensor>(); return var->GetMutable<lite::Tensor>();
} }
void OpLite::AttachInput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &input_name,
bool is_dispensable,
lite::Tensor **input_var) {
bool is_have_input =
op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0;
CHECK(is_dispensable || is_have_input);
if (is_have_input) {
std::string input_var_name = op_desc.Input(input_name).front();
*input_var = scope->FindVar(input_var_name)->GetMutable<lite::Tensor>();
}
}
void OpLite::AttachOutput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &output_name,
bool is_dispensable,
lite::Tensor **output_var) {
bool is_have_output =
op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0;
CHECK(is_dispensable || is_have_output);
if (is_have_output) {
std::string output_var_name = op_desc.Output(output_name).front();
*output_var = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
}
}
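// A minimal usage sketch (hypothetical op, assuming its param struct holds
// lite::Tensor* members):
//   AttachInput(op_desc, scope, "X", /*is_dispensable=*/true, &param_.x);
//   AttachOutput(op_desc, scope, "Out", /*is_dispensable=*/false, &param_.out);
// Dispensable inputs/outputs are simply left untouched when absent from
// op_desc.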
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -105,6 +105,20 @@ class OpLite : public Registry { ...@@ -105,6 +105,20 @@ class OpLite : public Registry {
return kernel_.get(); return kernel_.get();
} }
// Attach input variable from scope by op_desc and input name
void AttachInput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &input_name,
bool is_dispensable,
lite::Tensor **input_var);
// Attach output variable from scope by op_desc and output name
void AttachOutput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &output_name,
bool is_dispensable,
lite::Tensor **output_var);
virtual ~OpLite() = default; virtual ~OpLite() = default;
protected: protected:
......
...@@ -152,6 +152,8 @@ KernelRegistry::KernelRegistry() ...@@ -152,6 +152,8 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kMLU, kInt16, kNCHW); INIT_FOR(kMLU, kInt16, kNCHW);
INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kInt32, kNCHW);
INIT_FOR(kHost, kInt64, kNCHW);
INIT_FOR(kHost, kAny, kNCHW); INIT_FOR(kHost, kAny, kNCHW);
INIT_FOR(kHost, kFloat, kNHWC); INIT_FOR(kHost, kFloat, kNHWC);
INIT_FOR(kHost, kFloat, kAny); INIT_FOR(kHost, kFloat, kAny);
......
...@@ -135,6 +135,12 @@ class KernelRegistry final { ...@@ -135,6 +135,12 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
......
...@@ -76,6 +76,8 @@ class Optimizer { ...@@ -76,6 +76,8 @@ class Optimizer {
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
#endif #endif
"__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully "quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer // quantized model, infer
// the output scale and // the output scale and
...@@ -116,9 +118,15 @@ class Optimizer { ...@@ -116,9 +118,15 @@ class Optimizer {
"variable_place_inference_pass", // "variable_place_inference_pass", //
"argument_type_display_pass", "argument_type_display_pass",
"mlu_subgraph_pass",
"runtime_context_assign_pass", "runtime_context_assign_pass",
"argument_type_display_pass", "argument_type_display_pass",
"mlu_postprocess_pass",
"memory_optimize_pass"}}; "memory_optimize_pass"}};
if (passes.size() == 1) { if (passes.size() == 1) {
passes_local.push_back(passes[0]); passes_local.push_back(passes[0]);
} }
......
...@@ -69,6 +69,13 @@ class WorkSpace { ...@@ -69,6 +69,13 @@ class WorkSpace {
} }
#endif #endif
#if defined(LITE_WITH_MLU)
static WorkSpace& Global_MLU() {
thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kMLU)));
return *x;
}
#endif
private: private:
explicit WorkSpace(TargetType x) : target_(x) {} explicit WorkSpace(TargetType x) : target_(x) {}
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace fluid { namespace fluid {
using LoD = std::vector<std::vector<size_t>>; using LoD = std::vector<std::vector<uint64_t>>;
static LoD ToAbsOffset(const LoD &in) { static LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets // the lowest level stores relative offsets
......
...@@ -10,4 +10,5 @@ add_subdirectory(opencl) ...@@ -10,4 +10,5 @@ add_subdirectory(opencl)
add_subdirectory(fpga) add_subdirectory(fpga)
add_subdirectory(npu) add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
...@@ -179,6 +179,34 @@ void SquareCompute::Run() { ...@@ -179,6 +179,34 @@ void SquareCompute::Run() {
x_data, output_data, x_dims.production(), ctx.threads()); x_data, output_data, x_dims.production(), ctx.threads());
} }
void HardSwishCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
float threshold = param.hard_swish_threshold;
float scale = param.hard_swish_scale;
float offset = param.hard_swish_offset;
lite::arm::math::act_hard_swish<float>(x_data,
output_data,
x_dims.production(),
threshold,
scale,
offset,
ctx.threads());
}
void ReciprocalCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
lite::arm::math::act_reciprocal<float>(
x_data, output_data, x_dims.production(), ctx.threads());
}
} // namespace arm } // namespace arm
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
...@@ -275,3 +303,21 @@ REGISTER_LITE_KERNEL( ...@@ -275,3 +303,21 @@ REGISTER_LITE_KERNEL(
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(hard_swish,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::HardSwishCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(reciprocal,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ReciprocalCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
...@@ -148,6 +148,24 @@ class SquareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { ...@@ -148,6 +148,24 @@ class SquareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~SquareCompute() = default; virtual ~SquareCompute() = default;
}; };
class HardSwishCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~HardSwishCompute() = default;
};
class ReciprocalCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~ReciprocalCompute() = default;
};
} // namespace arm } // namespace arm
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -5,3 +5,4 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne ...@@ -5,3 +5,4 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${lite_kernel_deps})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/ctc_align_compute.h"
#include <algorithm>
#include <cstring>
#include <map>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
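// Convert a length-based LoD (each entry stores a sequence length) into
// cumulative offsets, e.g. {2, 3, 1} -> {0, 2, 5, 6}.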
LoD ToAbs(const LoD& in) {
if (in.empty()) return in;
LoD result;
for (auto& src : in) {
std::vector<uint64_t> dest(src.size() + 1, 0);
for (int i = 0; i < src.size(); i++) {
dest[i + 1] = dest[i] + src[i];
}
result.emplace_back(dest);
}
return result;
}
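// Inverse of ToAbs: turn cumulative offsets back into per-sequence lengths.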
LoD ToNorm(const LoD& in) {
if (in.empty()) return in;
LoD result;
for (auto& src : in) {
std::vector<uint64_t> dest(src.size() - 1, 0);
for (int i = 0; i < dest.size(); i++) {
dest[i] = src[i + 1] - src[i];
}
result.emplace_back(dest);
}
return result;
}
LoD ToAbsOffset(const LoD& in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
for (size_t i = 0; i < in[level].size(); ++i) {
size_t index = in[level][i];
result[level][i] = result[level + 1][index];
}
}
return result;
}
template <typename T, PrecisionType PT>
void CtcAlignCompute<T, PT>::Run() {
auto& param = this->template Param<operators::CtcAlignParam>();
auto* input = param.input;
auto* output = param.output;
size_t blank = static_cast<size_t>(param.blank);
bool merge_repeated = param.merge_repeated;
size_t padding_value = static_cast<size_t>(param.padding_value);
const auto* input_data = input->template data<T>();
auto input_dims = input->dims();
auto* output_data = output->template mutable_data<T>();
if (input->lod().empty()) {
auto* input_length = param.input_length;
auto* output_length = param.output_length;
CHECK(input_length != nullptr);
CHECK(output_length != nullptr);
const auto* input_length_data = input_length->template data<T>();
auto* output_length_data = output_length->template mutable_data<T>();
for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0]; batch_id++) {
T prev_token = -1;
size_t output_idx = 0;
for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) {
size_t input_ind = batch_id * input_dims[1] + i;
if ((unsigned)input_data[input_ind] != blank &&
!(merge_repeated && input_data[input_ind] == prev_token)) {
output_data[batch_id * input_dims[1] + output_idx] =
input_data[input_ind];
++output_idx;
}
prev_token = input_data[input_ind];
}
output_length_data[batch_id] = output_idx;
for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
output_data[batch_id * input_dims[1] + j] = padding_value;
}
} else {
const size_t level = 0;
auto input_lod = input->lod();
input_lod = ToAbs(input->lod());
input_lod = ToAbsOffset(input_lod);
CHECK_EQ(input_dims[0], static_cast<int64_t>(input_lod[level].back()));
const size_t num_sequences = input_lod[level].size() - 1;
// merge repeated tokens and remove blank tokens
size_t output_idx = 0;
std::vector<uint64_t> output_lod0(1, 0);
for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
T prev_token = -1;
for (size_t i = input_lod[level][seq_idx];
i < input_lod[level][seq_idx + 1];
++i) {
if ((unsigned)input_data[i] != blank &&
!(merge_repeated && input_data[i] == prev_token)) {
output_data[output_idx] = input_data[i];
++output_idx;
}
prev_token = input_data[i];
}
output_lod0.push_back(static_cast<uint64_t>(output_idx));
}
LoD output_lod;
output_lod.push_back(output_lod0);
output_lod = ToNorm(output_lod);
output->set_lod(output_lod);
output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
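// If every token was removed (all blanks), emit a single -1 placeholder.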
if (output_lod0.back() == 0) {
output->Resize({1, 1});
output_data = output->template mutable_data<T>();
output_data[0] = -1;
}
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
using ctc_align_int64 =
paddle::lite::kernels::host::CtcAlignCompute<int64_t, PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(ctc_align, kHost, kInt64, kNCHW, ctc_align_int64, def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.BindInput("InputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.BindOutput("OutputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.Finalize();
using ctc_align_int32 =
paddle::lite::kernels::host::CtcAlignCompute<int32_t, PRECISION(kInt32)>;
REGISTER_LITE_KERNEL(ctc_align, kHost, kInt32, kNCHW, ctc_align_int32, def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("InputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("OutputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
template <typename T, PrecisionType PT>
class CtcAlignCompute : public KernelLite<TARGET(kHost), PT> {
public:
void Run() override;
virtual ~CtcAlignCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -6,3 +6,4 @@ add_subdirectory(bridges) ...@@ -6,3 +6,4 @@ add_subdirectory(bridges)
add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
...@@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS ...@@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS
lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
set(mlu_subgraph_bridges set(mlu_subgraph_bridges
subgraph_bridge_registry subgraph_bridge_registry
subgraph_bridge_utility_mlu subgraph_bridge_utility_mlu
...@@ -26,16 +29,20 @@ set(mlu_subgraph_bridges ...@@ -26,16 +29,20 @@ set(mlu_subgraph_bridges
subgraph_bridge_softmax_op_mlu subgraph_bridge_softmax_op_mlu
subgraph_bridge_fc_op_mlu subgraph_bridge_fc_op_mlu
subgraph_bridge_batch_norm_op_mlu subgraph_bridge_batch_norm_op_mlu
subgraph_bridge_scale_op_mlu
subgraph_bridge_interp_op_mlu
subgraph_bridge_concat_op_mlu
CACHE INTERNAL "mlu_subgraph_bridges") CACHE INTERNAL "mlu_subgraph_bridges")
lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
...@@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
VLOG(3) << "[MLU] Converting " + op_type + "..."; VLOG(3) << "[MLU] Converting " + op_type + "...";
// Create act node and set params from op // Create act node and set params from op
auto fp_type = graph->FPType();
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front(); auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>(); auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize(); auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode( auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type);
CHECK(graph->HasNode(x_var_name)); CHECK(graph->HasNode(x_var_name));
auto input_tensor = graph->GetNode(x_var_name); auto input_tensor = graph->GetNode(x_var_name);
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
cnmlBaseOp_t activation_op; cnmlBaseOp_t activation_op;
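// leaky_relu has no direct CNML activation type; it is lowered to a PReLU
// op whose slope is a 1x1x1x1 constant tensor holding alpha.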
if (op_type == "leaky_relu") {
auto alpha = op_info->GetAttr<float>("alpha");
std::vector<int64_t> shape = {1, 1, 1, 1};
std::string alpha_var_name = string_format("leaky_relu_alpha_%p", op);
auto alpha_tensor =
graph->AddNode(alpha_var_name, shape, CNML_CONST, CNML_NHWC, fp_type);
graph->BindConstRawData(alpha_var_name, &alpha, 1, true);
CNML_CALL(cnmlCreatePreluOp(&activation_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
alpha_tensor->mlu_tensor()));
} else {
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
CNML_CALL(cnmlCreateActiveOp(&activation_op, CNML_CALL(cnmlCreateActiveOp(&activation_op,
act_type, act_type,
input_tensor->mlu_tensor(), input_tensor->mlu_tensor(),
output_tensor->mlu_tensor())); output_tensor->mlu_tensor()));
}
graph->FuseOp(activation_op); graph->FuseOp(activation_op);
return SUCCESS; return SUCCESS;
} }
...@@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(sigmoid,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
...@@ -25,8 +25,6 @@ namespace lite { ...@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int ActConverter(void* ctx, OpLite* op);
template void FillTensor<float, int>(Tensor* x, template void FillTensor<float, int>(Tensor* x,
float lower = -2, float lower = -2,
float upper = -2); float upper = -2);
...@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) { ...@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
TEST(MLUBridges, activation) { TEST(MLUBridges, activation) {
std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
std::vector<std::string> types{"sigmoid", "relu", "tanh"}; std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
for (auto x_shape : shapes) { for (auto x_shape : shapes) {
for (auto op_type : types) { for (auto op_type : types) {
test_act(x_shape, op_type); test_act(x_shape, op_type);
...@@ -149,8 +147,7 @@ TEST(MLUBridges, activation) { ...@@ -149,8 +147,7 @@ TEST(MLUBridges, activation) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter); USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(relu, kMLU)
sigmoid, USE_SUBGRAPH_BRIDGE(tanh, kMLU)
paddle::lite::subgraph::mlu::ActConverter); USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
...@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>(); auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize(); auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode( auto output_tensor = graph->AddNode(
y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); y_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
CHECK(graph->HasNode(x_var_name)); CHECK(graph->HasNode(x_var_name));
......
...@@ -23,8 +23,6 @@ namespace lite { ...@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int BatchNormConverter(void* ctx, OpLite* op);
template <typename dtype> template <typename dtype>
void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) { void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
...@@ -139,9 +137,7 @@ void test_batch_norm( ...@@ -139,9 +137,7 @@ void test_batch_norm(
{bs, ic, ih, iw}, {bs, ic, ih, iw},
{0, 2, 3, 1}); {0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans); x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name}); LaunchOp(op, {x_var_name}, {out_var_name});
...@@ -181,6 +177,4 @@ TEST(MLUBridges, batch_norm) { ...@@ -181,6 +177,4 @@ TEST(MLUBridges, batch_norm) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
batch_norm,
paddle::lite::subgraph::mlu::BatchNormConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("X");
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto param_axis = op_info->GetAttr<int>("axis");
std::vector<cnmlTensor_t> input_tensor;
for (auto x_name : x_var_name) {
CHECK(graph->HasNode(x_name));
input_tensor.push_back(graph->GetNode(x_name)->mlu_tensor());
}
auto dims = output_dims.size();
int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
CHECK_LT(axis, 4) << "Unsupported axis for MLU concat";
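// MLU tensors are laid out as NHWC, so remap the NCHW concat axis to its
// NHWC position before creating the concat op.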
int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
int nhwc_axis = nchw_to_nhwc_axis_map[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlBaseOp_t concat_op;
cnmlTensor_t outputs = output_tensor->mlu_tensor();
CNML_CALL(cnmlCreateNdConcatOp(&concat_op,
nhwc_axis,
input_tensor.data(),
x_var_name.size(),
&outputs,
1));
graph->FuseOp(concat_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kMLU,
paddle::lite::subgraph::mlu::ConcatConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/concat_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void concat_ref(const std::shared_ptr<operators::ConcatOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = op_info->Input("X");
std::vector<lite::Tensor*> inputs;
for (auto var : x) {
inputs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
}
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
int axis = op_info->GetAttr<int>("axis");
std::vector<lite::Tensor*> inputs_concat(inputs.size());
for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = inputs[j];
}
size_t num = inputs.size();
int rows = 1;
auto dim_0 = inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> inputs_cols(inputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = inputs[i]->numel() / rows;
out_cols += t_cols;
inputs_cols[i] = t_cols;
}
for (int k = 0; k < out_rows; ++k) {
float* dst_ptr = out->mutable_data<float>() + k * out_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = inputs_cols[j];
const float* src_ptr = inputs[j]->data<float>() + k * col_len;
std::memcpy(dst_ptr + col_idx, src_ptr, sizeof(float) * col_len);
col_idx += col_len;
}
}
}
void test_concat(std::vector<std::vector<int64_t>> input, int axis) {
std::string x_var_name = "x";
std::string y_var_name = "y";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
// prepare input&output variables
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
x->Resize(DDim(input[0]));
y->Resize(DDim(input[1]));
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
CHECK_EQ(out->dims(), out_ref->dims());
// initialize input&output data
FillTensor<float>(x);
FillTensor<float>(y);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("concat");
opdesc.SetInput("X", {x_var_name, y_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
auto op = CreateOp<operators::ConcatOpLite>(opdesc, &scope);
concat_ref(op);
out_ref->CopyDataFrom(*out);
Tensor input_x, input_y;
input_x.Resize(DDim(input[0]));
input_y.Resize(DDim(input[1]));
transpose(x->mutable_data<float>(),
input_x.mutable_data<float>(),
{static_cast<int>(input[0][0]),
static_cast<int>(input[0][1]),
static_cast<int>(input[0][2]),
static_cast<int>(input[0][3])},
{0, 2, 3, 1});
transpose(y->mutable_data<float>(),
input_y.mutable_data<float>(),
{static_cast<int>(input[1][0]),
static_cast<int>(input[1][1]),
static_cast<int>(input[1][2]),
static_cast<int>(input[1][3])},
{0, 2, 3, 1});
x->CopyDataFrom(input_x);
y->CopyDataFrom(input_y);
LaunchOp(op, {x_var_name, y_var_name}, {out_var_name});
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(out->dims());
auto os = out->dims();
transpose(out_data,
output_trans.mutable_data<float>(),
{static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4);
}
}
TEST(MLUBridges, concat) {
test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0);
test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1);
test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2);
test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(concat, kMLU);
...@@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const auto* scope = op->scope(); const auto* scope = op->scope();
VLOG(3) << "[MLU] Converting " << op_info->Type() << "... "; VLOG(3) << "[MLU] Converting " << op_info->Type() << "... ";
// Get input, filter and op attributes // get input, filter and op attributes
const auto input_var_name = op_info->Input("Input").front(); const auto input_var_name = op_info->Input("Input").front();
const auto& input_dims_nhwc = const auto& input_dims =
scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims(); scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
const auto filter_var_name = op_info->Input("Filter").front(); const auto filter_var_name = op_info->Input("Filter").front();
auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>(); auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
const auto& filter_dims = filter->dims(); const auto& filter_dims = filter->dims();
const auto output_var_name = op_info->Output("Output").front(); const auto output_var_name = op_info->Output("Output").front();
auto* output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
const auto output_shape = output->dims().Vectorize();
const auto bs = input_dims[0]; const auto bs = input_dims[0];
const auto oc = filter_dims[0]; const auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4); CHECK_EQ(input_dims.size(), 4);
...@@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
input_dims, input_dims,
filter_dims); filter_dims);
std::vector<int64_t> output_shape({bs, oc}); const auto output_tensor = graph->AddNode(
for (size_t i = 0; i < 2; i++) { output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1;
output_shape.push_back(
(input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) /
strides[i] +
1);
}
const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
const auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
// Create filter node // Create filter node
const auto filter_tensor = graph->AddNode(filter_var_name, const auto filter_tensor = graph->AddNode(filter_var_name,
...@@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
LOG(FATAL) << "UnSupported weight precision!"; LOG(FATAL) << "UnSupported weight precision!";
} }
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
std::string bias_var_name; std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor; std::shared_ptr<MLUTensor> bias_tensor;
if (HasInputArg(op_info, scope, "Bias")) { if (HasInputArg(op_info, scope, "Bias")) {
...@@ -160,8 +137,66 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -160,8 +137,66 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
graph->FPType()); graph->FPType());
graph->BindConstData(bias_var_name, bias); graph->BindConstData(bias_var_name, bias);
} }
cnmlBaseOp_t conv_op;
const auto input_scale = op_info->GetAttr<float>("input_scale"); const auto input_scale = op_info->GetAttr<float>("input_scale");
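// The "first conv" path fuses the image mean/std normalization into the
// convolution and consumes uint8 input; it is only taken for 3-channel
// inputs when the device enables it.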
bool use_first_conv = false;
if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) {
use_first_conv = true;
}
cnmlBaseOp_t conv_op;
if (use_first_conv) {
cnmlConvFirstOpParam_t conv_param;
CNML_CALL(cnmlCreateConvFirstOpParam_V2(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[2],
paddings[2],
paddings[0],
paddings[0]));
const auto mean_tensor = graph->AddNode("first_conv_mean_tensor",
std::vector<int64_t>{3},
CNML_CONST,
CNML_CNHW,
graph->FPType());
const auto std_tensor = graph->AddNode("first_conv_std_tensor",
std::vector<int64_t>{3},
CNML_CONST,
CNML_CNHW,
graph->FPType());
graph->BindConstRawData("first_conv_mean_tensor",
lite::DeviceInfo::Global().MeanVec().data(),
3,
false);
graph->BindConstRawData("first_conv_std_tensor",
lite::DeviceInfo::Global().StdVec().data(),
3,
false);
graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8);
CNML_CALL(cnmlCreateConvFirstOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
mean_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
std_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param));
} else {
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
CNML_CALL(cnmlCreateConvOpForward( CNML_CALL(cnmlCreateConvOpForward(
&conv_op, &conv_op,
conv_param, conv_param,
...@@ -169,6 +204,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -169,6 +204,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
output_tensor->mlu_tensor(), output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(), filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
}
graph->SetComputingDataType( graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
...@@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} }
graph->BindConstData(filter_var_name, filter); graph->BindConstData(filter_var_name, filter);
graph->FuseOp(conv_op); graph->FuseOp(conv_op);
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
return REBUILD_WHEN_SHAPE_CHANGED; return REBUILD_WHEN_SHAPE_CHANGED;
} }
......
...@@ -25,8 +25,6 @@ namespace lite { ...@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int ConvConverter(void* ctx, OpLite* op);
void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) { void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
const OpInfo* op_info = op->op_info(); const OpInfo* op_info = op->op_info();
...@@ -246,10 +244,6 @@ void test_conv(int bs, ...@@ -246,10 +244,6 @@ void test_conv(int bs,
} }
} }
input->Resize({bs, ih, iw, ic});
output->Resize(
{output_shape[0], output_shape[2], output_shape[3], output_shape[1]});
// create and convert op to MLU model, then run it on MLU // create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ConvOpLite>(opdesc_mlu, &scope); auto op = CreateOp<operators::ConvOpLite>(opdesc_mlu, &scope);
LaunchOp(op, {input_var_name}, {output_var_name}); LaunchOp(op, {input_var_name}, {output_var_name});
...@@ -342,9 +336,5 @@ TEST(MLUBridges, conv) { ...@@ -342,9 +336,5 @@ TEST(MLUBridges, conv) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(conv2d, kMLU)
conv2d, USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU)
paddle::lite::subgraph::mlu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
depthwise_conv2d,
paddle::lite::subgraph::mlu::ConvConverter);
...@@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_tensor = graph->AddNode(out_var_name, auto output_tensor = graph->AddNode(out_var_name,
x->dims().Vectorize(), x->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph->FPType()); graph->FPType());
cnmlBaseOp_t elementwise_op; cnmlBaseOp_t elementwise_op;
...@@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto mid_tensor = graph->AddNode(out_var_name + "_mid", auto mid_tensor = graph->AddNode(out_var_name + "_mid",
x->dims().Vectorize(), x->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph->FPType()); graph->FPType());
CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op, CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
x_tensor->mlu_tensor(), x_tensor->mlu_tensor(),
......
...@@ -24,8 +24,6 @@ namespace lite { ...@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int ElementwiseConverter(void* ctx, OpLite* op);
template <typename dtype> template <typename dtype>
void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) { void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
...@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) { ...@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU)
elementwise_add, USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU)
paddle::lite::subgraph::mlu::ElementwiseConverter); USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU)
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU)
elementwise_sub,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_mul,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_div,
paddle::lite::subgraph::mlu::ElementwiseConverter);
...@@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims"); // int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>(); auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>(); auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
auto x_dims = x->dims(); auto x_dims = x->dims();
auto w_dims = w->dims(); auto w_dims = w->dims();
...@@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto input_scale = op_info->GetAttr<float>("input_scale"); auto input_scale = op_info->GetAttr<float>("input_scale");
std::vector<int64_t> output_shape_nhwc({1, 1, 1, w_dims[1]});
auto output_tensor = graph->AddNode(output_var_name, auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc, output->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph->FPType()); graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
std::string bias_var_name; std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor; std::shared_ptr<MLUTensor> bias_tensor;
......
...@@ -24,8 +24,6 @@ namespace lite { ...@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int FCConverter(void* ctx, OpLite* op);
void fc_ref(const std::shared_ptr<operators::FcOpLite> op) { void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
const OpInfo* op_info = op->op_info(); const OpInfo* op_info = op->op_info();
...@@ -141,15 +139,34 @@ void test_fc(const std::vector<int64_t>& input_shape, ...@@ -141,15 +139,34 @@ void test_fc(const std::vector<int64_t>& input_shape,
} }
auto fc_op_mlu = CreateOp<operators::FcOpLite>(fc_op_desc_mlu, &scope); auto fc_op_mlu = CreateOp<operators::FcOpLite>(fc_op_desc_mlu, &scope);
input->Resize({static_cast<int>(input_shape[0]),
Tensor input_tmp, out_tmp;
input_tmp.Resize(input_shape);
transpose(input->mutable_data<float>(),
input_tmp.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2]), static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]), static_cast<int>(input_shape[3])},
static_cast<int>(input_shape[1])}); {0, 2, 3, 1});
out->Resize({static_cast<int>(input_shape[0]), static_cast<int>(w_shape[1])}); input->CopyDataFrom(input_tmp);
LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name}); LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name});
// compare results auto os = out->dims();
out_tmp.Resize(os);
auto* out_data = out->mutable_data<float>(); auto* out_data = out->mutable_data<float>();
// transpose(out_data,
// out_tmp.mutable_data<float>(),
// {static_cast<int>(os[0]),
// static_cast<int>(os[2]),
// static_cast<int>(os[3]),
// static_cast<int>(os[1])},
// {0, 3, 1, 2});
//
// out_data = out_tmp.mutable_data<float>();
// compare results
auto* out_ref_data = out_ref->mutable_data<float>(); auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) { for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
...@@ -170,4 +187,4 @@ TEST(MLUBridges, fc) { ...@@ -170,4 +187,4 @@ TEST(MLUBridges, fc) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter); USE_SUBGRAPH_BRIDGE(fc, kMLU);
...@@ -25,12 +25,12 @@ namespace mlu { ...@@ -25,12 +25,12 @@ namespace mlu {
std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name, std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape, std::vector<int64_t> shape,
cnmlTensorType_t tensor_type, cnmlTensorType_t tensor_type,
cnmlDataOrder_t data_order, cnmlDataOrder_t shape_order,
cnmlDataType_t mlu_dtype, cnmlDataType_t mlu_dtype,
void* raw_ptr) { void* raw_ptr) {
CHECK(!HasNode(name)); CHECK(!HasNode(name));
auto node = std::shared_ptr<MLUTensor>( auto node = std::shared_ptr<MLUTensor>(
new MLUTensor(shape, tensor_type, data_order, mlu_dtype)); new MLUTensor(shape, tensor_type, shape_order, mlu_dtype));
node->set_mlu_ptr(raw_ptr); node->set_mlu_ptr(raw_ptr);
nodes_.insert(std::make_pair(name, node)); nodes_.insert(std::make_pair(name, node));
return node; return node;
......
...@@ -23,6 +23,12 @@ ...@@ -23,6 +23,12 @@
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/mlu/bridges/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h"
#define PRINT_HW_TIME false
#if PRINT_HW_TIME
#include <mutex> //NOLINT
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace subgraph { namespace subgraph {
...@@ -32,13 +38,30 @@ namespace mlu { ...@@ -32,13 +38,30 @@ namespace mlu {
// to the MLU IR graph // to the MLU IR graph
class Graph { class Graph {
public: public:
Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); } Graph() {
CNML_CALL(cnmlCreateFusionOp(&fusion_op_));
#if PRINT_HW_TIME
CNRT_CALL(cnrtCreateNotifier(&notifier_start_));
CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
#endif
}
~Graph() { ~Graph() {
FreeConstData();
CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
for (auto op : ops_) { for (auto op : ops_) {
CNML_CALL(cnmlDestroyBaseOp(&op)); CNML_CALL(cnmlDestroyBaseOp(&op));
} }
#if PRINT_HW_TIME
CNRT_CALL(cnrtDestroyNotifier(&notifier_start_));
CNRT_CALL(cnrtDestroyNotifier(&notifier_end_));
double total_time = 0;
for (auto& f : time_log_) {
total_time += f;
}
std::cout << "cnml hardware time for " << time_log_.size()
<< " process:" << total_time / time_log_.size() << std::endl;
#endif
} }
// Data node // Data node
...@@ -89,6 +112,10 @@ class Graph { ...@@ -89,6 +112,10 @@ class Graph {
} }
void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
#if PRINT_HW_TIME
thread_local float hw_time;
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
#endif
CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
input_addrs_.data(), input_addrs_.data(),
input_addrs_.size(), input_addrs_.size(),
...@@ -96,7 +123,61 @@ class Graph { ...@@ -96,7 +123,61 @@ class Graph {
output_addrs_.size(), output_addrs_.size(),
&forward_param, &forward_param,
que)); que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
#endif
CNRT_CALL(cnrtSyncQueue(que)); CNRT_CALL(cnrtSyncQueue(que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
hw_time /= 1000.0f;
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
std::lock_guard<std::mutex> lk(time_mut_);
time_log_.push_back(hw_time);
#endif
}
template <typename T>
void* RegisterConstData(size_t len) {
void* addr = malloc(len * sizeof(T));
const_data_storage_.push_back(addr);
return addr;
}
void FreeConstData() {
for (auto& addr : const_data_storage_) {
free(addr);
}
}
void BindConstRawData(std::string tensor_name,
const float* data,
size_t len,
bool alloc = true) {
void* alloc_data;
if (fp_type_ == CNML_DATA_FLOAT32) {
if (alloc) {
alloc_data = RegisterConstData<float>(len);
memcpy(alloc_data, data, len * sizeof(float));
} else {
alloc_data = const_cast<void*>(static_cast<const void*>(data));
}
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
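// For fp16 graphs, cast the fp32 host data to fp16 before binding it as
// const data.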
void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
CNRT_CALL(
cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
CNRT_FLOAT32,
data_fp16,
CNRT_FLOAT16,
len,
nullptr));
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), data_fp16, false));
} else {
CHECK(0);
}
} }
void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
...@@ -158,6 +239,12 @@ class Graph { ...@@ -158,6 +239,12 @@ class Graph {
std::vector<std::shared_ptr<MLUTensor>> output_tensors_; std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
std::vector<cnmlBaseOp_t> ops_; std::vector<cnmlBaseOp_t> ops_;
cnmlFusionOp_t fusion_op_; cnmlFusionOp_t fusion_op_;
std::vector<void*> const_data_storage_;
#if PRINT_HW_TIME
cnrtNotifier_t notifier_start_{}, notifier_end_{};
std::mutex time_mut_;
std::vector<float> time_log_;
#endif
}; };
} // namespace mlu } // namespace mlu
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto out = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto x_dims = x->dims();
CHECK_EQ(x_dims.size(), 4);
auto scale = op_info->GetAttr<float>("scale");
auto out_w = op_info->GetAttr<int>("out_w");
auto out_h = op_info->GetAttr<int>("out_h");
auto align_corners = op_info->GetAttr<bool>("align_corners");
CHECK(graph->HasNode(x_var_name));
auto input_tensor = graph->GetNode(x_var_name);
auto in_h = x_dims[2];
auto in_w = x_dims[3];
// Priority: SizeTensor > OutSize > Scale > scale > out_h/out_w
if (HasInputArg(op_info, scope, "SizeTensor")) {
LOG(ERROR) << "Not support SizeTensor input now";
CHECK(0);
} else {
if (HasInputArg(op_info, scope, "Scale")) {
LOG(ERROR) << "Not support Scale input now";
CHECK(0);
}
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
out_h = out_h > 0 ? out_h : -1;
out_w = out_w > 0 ? out_w : -1;
}
if (HasInputArg(op_info, scope, "OutSize")) {
LOG(ERROR) << "Not support OutSize input now";
CHECK(0);
}
}
auto output_tensor = graph->AddNode(out_var_name,
out->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph->FPType());
cnmlBaseOp_t interp_op;
cnmlNearestNeighborOpParam_t nn_param;
CNML_CALL(cnmlCreateNearestNeighborOpParam(&nn_param, out_w, out_h));
CNML_CALL(cnmlSetNearestNeighborAlignCorner(&nn_param, align_corners));
CNML_CALL(cnmlCreateNearestNeighborOp(&interp_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
nn_param));
CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param));
graph->FuseOp(interp_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(nearest_interp,
kMLU,
paddle::lite::subgraph::mlu::InterpolateConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/interpolate_op.h"
#include <gtest/gtest.h>
#include <string>
#include "lite/core/device_info.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
template <typename dtype>
void ResizeNearestAlign(const lite::Tensor* x,
lite::Tensor* out,
bool with_align) {
auto x_dims = x->dims();
int num = x_dims[0];
int channels = x_dims[1];
int hin = x_dims[2];
int win = x_dims[3];
int hout = out->dims()[2];
int wout = out->dims()[3];
dtype scale_w = (with_align) ? (static_cast<float>(win - 1) / (wout - 1))
: (static_cast<float>(win) / (wout));
dtype scale_h = (with_align) ? (static_cast<float>(hin - 1) / (hout - 1))
: (static_cast<float>(hin) / (hout));
const dtype* src = x->data<dtype>();
dtype* dst = out->mutable_data<dtype>();
int dst_stride_w = 1;
int dst_stride_h = wout;
int dst_stride_c = wout * hout;
int dst_stride_batch = wout * hout * channels;
int src_stride_w = 1;
int src_stride_h = win;
int src_stride_c = win * hin;
int src_stride_batch = win * hin * channels;
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
int src_index = n * src_stride_batch + c * src_stride_c;
for (int h = 0; h < hout; ++h) {
for (int w = 0; w < wout; ++w) {
int fw = (with_align) ? static_cast<int>(scale_w * w + 0.5)
: static_cast<int>(scale_w * w);
fw = (fw < 0) ? 0 : fw;
int fh = (with_align) ? static_cast<int>(scale_h * h + 0.5)
: static_cast<int>(scale_h * h);
fh = (fh < 0) ? 0 : fh;
int w_start = static_cast<int>(fw);
int h_start = static_cast<int>(fh);
int dst_index = n * dst_stride_batch + c * dst_stride_c +
h * dst_stride_h + w * dst_stride_w;
dst[dst_index] =
src[src_index + w_start * src_stride_w + h_start * src_stride_h];
}
}
}
}
}
template <typename DType>
void BilinearInterpRef(const lite::Tensor* x,
lite::Tensor* out,
bool align_corners,
int align_mode) {
auto x_dims = x->dims();
int batch_size = x_dims[0];
int channel_size = x_dims[1];
auto x_h = x_dims[2];
auto x_w = x_dims[3];
CHECK_EQ(x_dims.size(), 4);
auto out_dims = out->dims();
int out_h = out_dims[2];
int out_w = out_dims[3];
// copy from x if no change
if (x_h == out_h && x_w == out_w) {
out->CopyDataFrom(*x);
return;
}
float ratio_h = 0.f;
float ratio_w = 0.f;
if (out_h > 1) {
ratio_h = (align_corners) ? static_cast<float>(x_h - 1) / (out_h - 1)
: static_cast<float>(x_h) / out_h;
}
if (out_w > 1) {
ratio_w = (align_corners) ? static_cast<float>(x_w - 1) / (out_w - 1)
: static_cast<float>(x_w) / out_w;
}
// naive bilinear interpolation
auto x_data = x->data<DType>();
auto out_data = out->mutable_data<DType>();
bool align_flag = (align_mode == 0 && !align_corners);
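  // align_mode == 0 without align_corners follows the half-pixel convention:
  // src_idx = ratio * (dst_idx + 0.5) - 0.5.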
std::vector<int> vy_n, vy_s;
std::vector<float> vd_n, vd_s;
  vy_n.resize(out_h);
  vy_s.resize(out_h);
  vd_n.resize(out_h);
  vd_s.resize(out_h);
for (int k = 0; k < out_h; k++) {
int yn = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
: static_cast<int>(ratio_h * k);
yn = (yn > 0) ? yn : 0;
int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1);
float idx_src_y = ratio_h * (k + 0.5) - 0.5;
idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
float dn = align_flag ? idx_src_y - yn : ratio_h * k - yn;
float ds = 1.f - dn;
{
vy_n[k] = yn;
vy_s[k] = ys;
vd_n[k] = dn;
vd_s[k] = ds;
}
}
std::vector<int> vx_w, vx_e;
std::vector<float> vd_w, vd_e;
  vx_w.resize(out_w);
  vx_e.resize(out_w);
  vd_w.resize(out_w);
  vd_e.resize(out_w);
for (int l = 0; l < out_w; l++) {
int xw = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
: static_cast<int>(ratio_w * l);
xw = (xw > 0) ? xw : 0;
int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1);
float idx_src_x = ratio_w * (l + 0.5) - 0.5;
idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw;
float de = 1.f - dw;
{
vx_w[l] = xw;
vx_e[l] = xe;
vd_w[l] = dw;
vd_e[l] = de;
}
}
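  // Row-major strides of the NCHW input, used to index the four neighbors.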
std::vector<int64_t> x_strides(x_dims.size(), 1);
for (int idx = x_strides.size() - 2; idx >= 0; idx--) {
x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1];
}
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < channel_size; j++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
DType x0 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]];
DType x1 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]];
DType x2 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]];
DType x3 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]];
*out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] +
x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l];
out_data++;
}
}
}
}
}
class InterpComputeTester {
protected:
// common attributes for this op.
std::string x_var_name = "X";
std::string outsize_var_name = "OutSize";
std::string out_var_name = "Out";
std::string out_ref_var_name = "out_ref";
DDim dims_{{1, 2, 3, 4}};
Scope scope;
std::string interp_method_ = "nearest";
float scale_ = -1.f;
int out_h_ = -1;
int out_w_ = -1;
bool align_corners_ = true;
int align_mode_ = 1;
bool use_outsize_ = false;
public:
InterpComputeTester(const std::string& alias,
DDim dims,
std::string interp_method = "nearest",
float scale = -1.f,
int out_h = -1,
int out_w = -1,
bool align_corners = true,
int align_mode = 1,
bool use_outsize = false)
: dims_(dims),
interp_method_(interp_method),
scale_(scale),
out_h_(out_h),
out_w_(out_w),
align_corners_(align_corners),
align_mode_(align_mode),
use_outsize_(use_outsize) {}
void Execute(float abs_error) {
cpp::OpDesc op_desc;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* outsize = scope.Var(outsize_var_name)->GetMutable<Tensor>();
auto* outref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
int out_h = out_h_;
int out_w = out_w_;
if (scale_ > 0) {
out_h = static_cast<int>(dims_[2] * scale_);
out_w = static_cast<int>(dims_[3] * scale_);
}
x->Resize(dims_);
/* printf("----output tensor dims: %ld, %d, %d, %ld\n", dims_[0], out_h,
* out_w, dims_[1]); */
std::vector<int64_t> out_shape_nchw = {dims_[0], dims_[1], out_h, out_w};
outref->Resize(out_shape_nchw);
outsize->Resize({2});
FillTensor<float, float>(x, -1.f, 1.f);
if (use_outsize_) {
outsize->mutable_data<int>()[0] = out_h;
outsize->mutable_data<int>()[1] = out_w;
outsize->set_persistable(true);
}
if (interp_method_ == "nearest") {
op_desc.SetType("nearest_interp");
} else if (interp_method_ == "bilinear") {
op_desc.SetType("bilinear_interp");
} else {
LOG(FATAL) << "unsupport";
}
op_desc.SetInput("X", {x_var_name});
if (use_outsize_) {
op_desc.SetInput("OutSize", {outsize_var_name});
}
op_desc.SetOutput("Out", {out_var_name});
op_desc.SetAttr("scale", scale_);
op_desc.SetAttr("out_h", out_h_);
op_desc.SetAttr("out_w", out_w_);
op_desc.SetAttr("align_corners", align_corners_);
op_desc.SetAttr("align_mode", align_mode_);
op_desc.SetAttr("interp_method", interp_method_);
auto op = CreateOp<operators::InterpolateOp>(op_desc, &scope);
if (interp_method_ == "nearest") {
ResizeNearestAlign<float>(x, outref, align_corners_);
} else if (interp_method_ == "bilinear") {
BilinearInterpRef<float>(x, outref, align_corners_, align_mode_);
}
int in = dims_[0], ic = dims_[1], ih = dims_[2], iw = dims_[3];
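    // The MLU bridge consumes NHWC data, so transpose the NCHW input here and
    // transpose the output back to NCHW after LaunchOp before comparing.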
Tensor input_trans;
input_trans.Resize(dims_);
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{in, ic, ih, iw},
{0, 2, 3, 1});
x->CopyDataFrom(input_trans);
if (use_outsize_) {
LaunchOp(op, {x_var_name, outsize_var_name}, {out_var_name});
} else {
LaunchOp(op, {x_var_name}, {out_var_name});
}
auto* out_ref_data = outref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(out_shape_nchw);
transpose(
out->mutable_data<float>(),
output_trans.mutable_data<float>(),
{static_cast<int>(dims_[0]), out_h, out_w, static_cast<int>(dims_[1])},
{0, 3, 1, 2});
auto* out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); ++i) {
EXPECT_NEAR(out_data[i], out_ref_data[i], abs_error);
}
}
};
void TestInterpOuthw(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
for (int out_h : {6, 8, 12}) {
for (int out_w : {6, 9}) {
printf("testcase %s: out_w %d, out_h %d\n",
interp_method.c_str(),
out_w,
out_h);
InterpComputeTester tester(
"def", DDim(x_dims), interp_method, -1.f, out_h, out_w);
tester.Execute(abs_error);
}
}
}
}
}
void TestInterpScale(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
for (float scale : {0.3f, 1.f, 1.7f}) {
printf("testcase %s: scale: %f\n", interp_method.c_str(), scale);
InterpComputeTester tester("def", DDim(x_dims), interp_method, scale);
tester.Execute(abs_error);
}
}
}
}
void TestInterpOutsize(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
printf("testcase %s: outsize: %d %d\n", interp_method.c_str(), 4, 4);
InterpComputeTester tester(
"def", DDim(x_dims), interp_method, -1, 4, 4, true, 1, true);
tester.Execute(abs_error);
}
}
}
void TestInterpAlignCorners(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
for (bool align_corners : {true, false}) {
printf(
"testcase nearest: scale: 0.4, out_w -1 out_h -1, align_corners %d\n",
align_corners);
InterpComputeTester tester(
"def", DDim(x_dims), "nearest", 0.4, -1, -1, align_corners);
tester.Execute(abs_error);
}
}
}
void TestInterpAlignMode(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
for (bool align_corners : {true, false}) {
for (int align_mode : {0, 1}) {
printf(
"testcase bilinear: scale: 0.7, out_w -1 out_h -1, align_corners "
"%d, mode %d\n",
align_corners,
align_mode);
InterpComputeTester tester("def",
DDim(x_dims),
"bilinear",
0.7,
-1,
-1,
align_corners,
align_mode);
tester.Execute(abs_error);
}
}
}
}
TEST(MLUBridges, interpolate) {
float abs_error = 2e-5;
TestInterpOuthw(abs_error);
TestInterpScale(abs_error);
// bug, not usable
// TestInterpOutsize(abs_error);
TestInterpAlignCorners(abs_error);
// only for bilinear interp
// TestInterpAlignMode(abs_error);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU); ...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU);
USE_SUBGRAPH_BRIDGE(softmax, kMLU); USE_SUBGRAPH_BRIDGE(softmax, kMLU);
USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(batch_norm, kMLU);
USE_SUBGRAPH_BRIDGE(fc, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU);
USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU);
USE_SUBGRAPH_BRIDGE(concat, kMLU);
USE_SUBGRAPH_BRIDGE(scale, kMLU);
...@@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Get input, and attributes // Get input, and attributes
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_var_name); auto x = scope->FindTensor(x_var_name);
auto input_dims_nhwc = x->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
auto output_var_name = op_info->Output("Out").front(); auto output_var_name = op_info->Output("Out").front();
auto output_shape = scope->FindTensor(output_var_name)->dims().Vectorize();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type"); auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto ceil_mode = op_info->GetAttr<bool>("ceil_mode"); auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings"); auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
...@@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
strides, strides,
ksize); ksize);
std::vector<int64_t> output_shape({input_dims[0], input_dims[1]}); // std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
for (size_t i = 0; i < 2; i++) { // for (size_t i = 0; i < 2; i++) {
output_shape.push_back( // output_shape.push_back(
(input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) / // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] -
strides[i] + // ksize[0]) /
1); // strides[i] +
} // 1);
// }
auto output_shape_nhwc = DimNCHW2NHWC(output_shape); auto output_tensor = graph->AddNode(
auto output_tensor = graph->AddNode(output_var_name, output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
cnmlPoolOpParam_t pool_param; cnmlPoolOpParam_t pool_param;
CNML_CALL( CNML_CALL(
......
...@@ -24,8 +24,6 @@ namespace lite { ...@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int PoolConverter(void* ctx, OpLite* op);
void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) { void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
const OpInfo* op_info = op->op_info(); const OpInfo* op_info = op->op_info();
...@@ -182,12 +180,7 @@ void test_pool(int bs, ...@@ -182,12 +180,7 @@ void test_pool(int bs,
{0, 2, 3, 1}); {0, 2, 3, 1});
auto os = out->dims(); auto os = out->dims();
out->Resize({static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])});
x->CopyDataFrom(input_trans); x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name}); LaunchOp(op, {x_var_name}, {out_var_name});
...@@ -275,6 +268,4 @@ TEST(MLUBridges, pool) { ...@@ -275,6 +268,4 @@ TEST(MLUBridges, pool) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(pool2d, kMLU)
pool2d,
paddle::lite::subgraph::mlu::PoolConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Create act node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
auto bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
auto scale = op_info->GetAttr<float>("scale");
auto bias = op_info->GetAttr<float>("bias");
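  // The scale op computes scale * x + bias when bias_after_scale is true and
  // scale * (x + bias) otherwise; fold the latter into beta so that a single
  // affine transform (alpha * x + beta) covers both cases.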
auto beta = bias_after_scale ? bias : bias * scale;
std::vector<int64_t> shape = {1, 1, 1, 1};
std::string prefix = string_format("_%p", op);
auto alpha_tensor = graph->AddNode(
"Alpha" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType());
auto beta_tensor = graph->AddNode(
"Beta" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType());
graph->BindConstRawData("Alpha" + prefix, &scale, 1);
graph->BindConstRawData("Beta" + prefix, &beta, 1);
auto input_tensor = graph->GetNode(x_var_name);
cnmlBaseOp_t scale_op;
CNML_CALL(cnmlCreateScaleOp(&scale_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
alpha_tensor->mlu_tensor(),
beta_tensor->mlu_tensor()));
graph->FuseOp(scale_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(scale,
kMLU,
paddle::lite::subgraph::mlu::ScaleConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/scale_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void scale_ref(const std::shared_ptr<operators::ScaleOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
float scale = op_info->GetAttr<float>("scale");
float bias = op_info->GetAttr<float>("bias");
bool bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
if (!bias_after_scale) {
bias *= scale;
}
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
DDim x_dims = x->dims();
DDim out_dims = out->dims();
CHECK_EQ(x_dims.production(), out_dims.production());
for (int i = 0; i < out_dims.production(); i++) {
out_data[i] = x_data[i] * scale + bias;
}
}
void test_scale(int bs,
int ic,
int ih,
int iw,
bool bias_after_scale,
float scale,
float bias) {
// prepare input&output variables
Scope scope;
std::string x_var_name("x");
std::string out_var_name("out");
std::string out_ref_var_name("out_ref");
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize({bs, ic, ih, iw});
// initialize input&output data
FillTensor<float, int>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("scale");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("bias_after_scale", bias_after_scale);
opdesc.SetAttr("scale", scale);
opdesc.SetAttr("bias", bias);
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ScaleOp>(opdesc, &scope);
scale_ref(op);
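  // scale_ref computed the reference result into 'out'; stash it in out_ref
  // before the MLU run overwrites 'out'.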
out_ref->CopyDataFrom(*out);
Tensor input_trans;
input_trans.Resize({bs, ic, ih, iw});
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{bs, ic, ih, iw},
{0, 2, 3, 1});
auto os = out->dims();
out->Resize({static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
// execute reference implementation and save to output tensor('out')
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(os);
transpose(out_data,
output_trans.mutable_data<float>(),
{static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(MLUBridges, scale) {
for (auto bs : {1, 3}) {
for (auto ic : {1, 3}) {
for (auto ih : {3, 4}) {
for (auto iw : {4, 3}) {
for (auto bias_after_scale : {false, true}) {
for (auto scale : {-1.0f, 5.0f}) {
for (auto bias : {-2.0f, 30.0f}) {
VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih
<< " iw: " << iw
// << " bias_after_scale: " << bias_after_scale
<< " scale: " << scale << " bias: " << bias;
test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias);
}
}
}
}
}
}
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(scale, kMLU);
...@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis = output_dims.size() + axis; axis = output_dims.size() + axis;
} }
} }
int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
auto output_tensor = graph->AddNode( auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlBaseOp_t softmax_op; cnmlBaseOp_t softmax_op;
CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op, CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op,
nhwc_axis, nhwc_axis,
......
...@@ -23,8 +23,6 @@ namespace lite { ...@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int SoftmaxConverter(void* ctx, OpLite* op);
template <typename dtype> template <typename dtype>
void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) { void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
...@@ -112,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) { ...@@ -112,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
{bs, ic, ih, iw}, {bs, ic, ih, iw},
{0, 2, 3, 1}); {0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans); x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name}); LaunchOp(op, {x_var_name}, {out_var_name});
...@@ -171,6 +167,4 @@ TEST(MLUBridges, softmax) { ...@@ -171,6 +167,4 @@ TEST(MLUBridges, softmax) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(softmax, kMLU)
softmax,
paddle::lite::subgraph::mlu::SoftmaxConverter);
...@@ -47,6 +47,8 @@ class MLUTensor { ...@@ -47,6 +47,8 @@ class MLUTensor {
return mlu_ptr_; return mlu_ptr_;
} }
void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
~MLUTensor(); ~MLUTensor();
private: private:
......
...@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names, const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names) { const std::vector<std::string>& output_var_names) {
CNRT_CALL(cnrtInit(0)); CNRT_CALL(cnrtInit(0));
SetMluDevice(0); ::paddle::lite::SetMluDevice(0);
cnrtQueue_t queue_; cnrtQueue_t queue_;
cnrtInvokeFuncParam_t forward_param; cnrtInvokeFuncParam_t forward_param;
u32_t affinity = 1; u32_t affinity = 1;
...@@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const auto& bridges = subgraph::Registry::Instance(); const auto& bridges = subgraph::Registry::Instance();
CHECK(bridges.Exists(op_type, TARGET(kMLU))); CHECK(bridges.Exists(op_type, TARGET(kMLU)));
// Convert all of input data vars and added into the MLU IR graph // Convert input data var and add it into the MLU IR graph
for (auto& input_name : input_var_names) { for (auto& input_name : input_var_names) {
auto input_tensor = scope->FindMutableTensor(input_name); auto input_tensor = scope->FindMutableTensor(input_name);
CHECK(input_tensor); CHECK(input_tensor);
...@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
graph.AddNode(input_name, graph.AddNode(input_name,
input_tensor->dims().Vectorize(), input_tensor->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph.FPType(), graph.FPType(),
reinterpret_cast<void*>( reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU)))); input_tensor->mutable_data<float>(TARGET(kMLU))));
...@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
sizeof(float) * input_tensor->dims().production(), sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV)); CNRT_MEM_TRANS_DIR_HOST2DEV));
} }
op->CheckShape();
op->InferShape();
bridges.Select(op_type, TARGET(kMLU))( bridges.Select(op_type, TARGET(kMLU))(
reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr); reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
......
...@@ -84,7 +84,7 @@ struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> { ...@@ -84,7 +84,7 @@ struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
template <> template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> { struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef ::paddle::lite::fluid::float16 T; typedef paddle::lite::fluid::float16 T;
}; };
} // namespace mlu } // namespace mlu
......
...@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL( ...@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize(); .Finalize();
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
// host_to_device)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
// .Finalize();
//
//
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
// device_to_host)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/layout_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFloat)>,
def_layout_nhwc2nchw_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFP16)>,
def_layout_nhwc2nchw_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFloat)>,
def_layout_nchw2nhwc_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFP16)>,
def_layout_nchw2nhwc_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kInt8,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
def_layout_nchw2nhwc_fp32_int8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include <string>
#include <vector>
#include "lite/backends/x86/math/math_function.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/operators/layout_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
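// Map PrecisionType enum values to the C++ scalar types used by the x86
// transpose helpers below.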
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef paddle::lite::fluid::float16 T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
typedef int8_t T;
};
template <lite::TargetType Target, typename T>
inline void LayoutTransCompute(const int dim,
const lite::Context<Target>& context,
const lite::Tensor& in,
lite::Tensor* out,
const std::vector<int>& axis) {
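  // The x86 Transpose functor is templated on rank, so dispatch on the
  // dimensionality of the tensor.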
switch (dim) {
case 2:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 2> trans2;
trans2(context, in, out, axis);
break;
case 3:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 3> trans3;
trans3(context, in, out, axis);
break;
case 4:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 4> trans4;
trans4(context, in, out, axis);
break;
default:
CHECK(0) << ("Unsupport dim in mlu layout");
}
}
template <PrecisionType Precision>
class LayoutNchwToNhwcCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using param_t = operators::LayoutParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
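    // Pick the permutation that moves the channel axis to the last position;
    // rank-2 tensors need no reordering.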
switch (x_dims) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
break;
case 4:
axis = {0, 2, 3, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
}
std::string doc() const override {
return "Mlu layout transform nchw to nhwc";
}
};
template <PrecisionType Precision>
class LayoutNhwcToNchwCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using param_t = operators::LayoutParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
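    // Inverse permutation: move the trailing channel axis back to position 1;
    // rank-2 tensors need no reordering.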
switch (x_dims) {
case 2:
axis = {0, 1};
break;
case 3:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
axis = {0, 2, 1};
break;
case 4:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
axis = {0, 3, 1, 2};
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
}
std::string doc() const override {
return "Mlu layout transform nhwc to nchw";
}
};
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine {
graph_.SetFPType(type); graph_.SetFPType(type);
} }
int Build() {
// In order to attach all of the ops of the block desc, we need to build
// the original program firstly.
BuildOriginProgram();
// Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
}
int Launch() {
// Rebuild device program when the shapes of input tensors have been
// changed.
if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
if (subgraph::CHECK_FAILED(build_device_program_status_)) {
LaunchOriginProgram();
} else {
LaunchDeviceProgram();
}
return 0;
}
protected: protected:
int BuildDeviceProgram() override { int BuildDeviceProgram() override {
int status = 0; int status = 0;
...@@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine {
graph_.AddNode(input_name, graph_.AddNode(input_name,
input_tensor->dims().Vectorize(), input_tensor->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph_.FPType(), graph_.FPType(),
const_cast<void*>(input_tensor->raw_data())); const_cast<void*>(input_tensor->raw_data()));
CHECK(input_node); CHECK(input_node);
...@@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine {
for (auto& inst : origin_program_) { for (auto& inst : origin_program_) {
auto op = inst.op(); auto op = inst.op();
CHECK(op); CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type(); std::string op_type = op->op_info()->Type();
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
if (!bridges.Exists(op_type, TARGET(kMLU))) { if (!bridges.Exists(op_type, TARGET(kMLU))) {
LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
return subgraph::FAILED; return subgraph::FAILED;
...@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine {
graph_.AddInput(graph_.GetNode(input_name)); graph_.AddInput(graph_.GetNode(input_name));
} }
CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
// auto& mlu_context = this->ctx_->template As<MLUContext>(); auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto core_version = mlu_context.MLUCoreVersion(); auto core_version = mlu_context.MLUCoreVersion();
// auto core_number = mlu_context.MLUCoreNumber(); auto core_number = mlu_context.MLUCoreNumber();
// graph_.Compile(core_version, core_number); graph_.Compile(core_version, core_number);
return status; return status;
} }
int LaunchDeviceProgram() override { int LaunchDeviceProgram() override {
// auto& mlu_context = this->ctx_->template As<MLUContext>(); auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto exec_queue = mlu_context.exec_queue(); auto exec_queue = mlu_context.exec_queue();
// u32_t affinity = mlu_context.affinity(); u32_t affinity = mlu_context.affinity();
// cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
// int data_param = 1; int data_param = 1;
// forward_param.data_parallelism = &data_param; forward_param.data_parallelism = &data_param;
// forward_param.affinity = &affinity; forward_param.affinity = &affinity;
// forward_param.end = CNRT_PARAM_END; forward_param.end = CNRT_PARAM_END;
// graph_.Compute(forward_param, exec_queue); graph_.Compute(forward_param, exec_queue);
return 0; return 0;
} }
......
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM) if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
return() return()
endif() endif()
......
...@@ -33,7 +33,7 @@ add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kerne ...@@ -33,7 +33,7 @@ add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kerne
add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
# extra # extra
# wait to add ... # wait to add ...
...@@ -97,6 +97,10 @@ lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc ...@@ -97,6 +97,10 @@ lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc
lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc
DEPS pad2d_opencl layout_opencl op_registry program context) DEPS pad2d_opencl layout_opencl op_registry program context)
lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
DEPS box_coder_opencl op_registry program context)
###################### ######################
# buffer kernel # # buffer kernel #
###################### ######################
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::BoxCoderParam;
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
boxcoder_param_ = param_.get_mutable<param_t>();
if (boxcoder_param_->code_type == "decode_center_size" &&
boxcoder_param_->box_normalized == true) {
kernel_func_name_ = "decode_center_size";
} else {
printf("This code_type %s doesn't support \n",
boxcoder_param_->code_type.c_str());
return;
}
CHECK(context.cl_context() != nullptr);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/box_coder_kernel.cl", build_options_);
}
void Run() override {
boxcoder_param_ = param_.get_mutable<param_t>();
const auto& out_dims = boxcoder_param_->proposals->dims();
auto image_shape = InitImageDimInfoWith(out_dims);
auto* out_buf =
boxcoder_param_->proposals->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "boxcoder input shape: ";
#endif
const auto* input_priorbox = boxcoder_param_->prior_box;
const auto* input_priorboxvar = boxcoder_param_->prior_box_var;
const auto* input_targetbox = boxcoder_param_->target_box;
const auto& code_type = boxcoder_param_->code_type;
if (code_type == "decode_center_size") {
auto* prior_box_image = input_priorbox->data<half_t, cl::Image2D>();
auto* prior_box_var_image =
input_priorboxvar->data<half_t, cl::Image2D>();
auto* target_box_image = input_targetbox->data<half_t, cl::Image2D>();
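      // Pad the output dims up to 4-D so that C and H can be read from fixed
      // positions when setting the kernel arguments below.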
int new_dims[4] = {1, 1, 1, 1};
for (int i = 0; i < out_dims.size(); i++) {
new_dims[4 - out_dims.size() + i] = out_dims[i];
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(image_shape["width"]),
static_cast<int64_t>(image_shape["height"])}));
int out_C = new_dims[1];
int out_H = new_dims[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(boxcoder_param_->proposals->target());
VLOG(4) << "output shape: " << out_dims[0] << ", " << out_dims[1] << ", "
<< out_dims[2] << ", " << out_dims[3];
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "out_C = " << out_C;
VLOG(4) << "out_H = " << out_H;
VLOG(4) << "default_work_size = " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx++, *prior_box_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *prior_box_var_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *target_box_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_C);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_H);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1];
#endif
}
}
  std::string doc() const override {
    return "Boxcoder using cl::Image, kFP16";
  }
param_t* boxcoder_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{" -DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
typedef paddle::lite::kernels::opencl::BoxCoderComputeImage BoxCoder_image;
REGISTER_LITE_KERNEL(
box_coder, kOpenCL, kFP16, kImageDefault, BoxCoder_image, ImageDefault)
.BindInput("PriorBox",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("PriorBoxVar",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("TargetBox",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("OutputBox",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
void box_coder_ref(float* proposals_data,
const float* anchors_data,
const float* bbox_deltas_data,
const float* variances_data,
int axis,
bool box_normalized,
std::string code_type,
int row,
int col) {
if (code_type == "decode_center_size") {
int anchor_len = 4;
int out_len = 4;
int var_len = 4;
int delta_len = 4;
float normalized = !box_normalized ? 1.f : 0;
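    // Non-normalized boxes are in pixel coordinates, so widths and heights
    // carry a +1 correction.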
for (int64_t row_id = 0; row_id < row; ++row_id) {
for (int64_t col_id = 0; col_id < col; ++col_id) {
size_t delta_offset = row_id * col * delta_len + col_id * delta_len;
size_t out_offset = row_id * col * out_len + col_id * out_len;
int prior_box_offset =
axis == 0 ? col_id * anchor_len : row_id * anchor_len;
int var_offset = axis == 0 ? col_id * var_len : row_id * var_len;
auto anchor_data_tmp = anchors_data + prior_box_offset;
auto bbox_deltas_data_tmp = bbox_deltas_data + delta_offset;
auto proposals_data_tmp = proposals_data + out_offset;
auto anchor_width =
anchor_data_tmp[2] - anchor_data_tmp[0] + normalized;
auto anchor_height =
anchor_data_tmp[3] - anchor_data_tmp[1] + normalized;
auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width;
auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height;
float bbox_center_x = 0, bbox_center_y = 0;
float bbox_width = 0, bbox_height = 0;
auto variances_data_tmp = variances_data + var_offset;
bbox_center_x =
variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width +
anchor_center_x;
bbox_center_y =
variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) *
anchor_width;
bbox_height =
std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) *
anchor_height;
proposals_data_tmp[0] = bbox_center_x - bbox_width / 2;
proposals_data_tmp[1] = bbox_center_y - bbox_height / 2;
proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized;
proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized;
}
}
} else if (code_type == "encode_center_size") {
LOG(FATAL) << "not implemented type: " << code_type;
} else {
LOG(FATAL) << "not supported type: " << code_type;
}
}
// #define BOXCODER_FP16_LOOP_TEST
// #define BOXCODER_FP16_PRINT_RESULT
TEST(box_coder_image2d, compute) {
#ifdef BOXCODER_FP16_LOOP_TEST
for (auto n : {1, 2, 3, 4}) {
for (auto m : {1, 3, 4, 8}) {
for (auto norm : {true}) {
for (auto code_type : {"decode_center_size"}) {
for (auto axis : {0}) {
#else
const int n = 1;
const int m = 1;
const bool norm = true;
const std::string code_type = "decode_center_size";
const int axis = 0;
#endif // BOXCODER_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << m
<< " ========";
LOG(INFO) << "======== parameters: norm = " << norm
<< ", axis = " << axis << "code_type: " << code_type;
auto kernels =
KernelRegistry::Global().Create("box_coder",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor prior_box, prior_box_var, target_box, output_box;
operators::BoxCoderParam param;
param.prior_box = &prior_box;
param.prior_box_var = &prior_box_var;
param.target_box = &target_box;
param.proposals = &output_box;
param.axis = axis;
param.box_normalized = norm;
param.code_type = code_type;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> boxcoder_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(boxcoder_context->As<OpenCLContext>()));
kernel->SetContext(std::move(boxcoder_context));
const DDim prior_box_dims =
DDim(std::vector<DDim::value_type>{1, 1, m, 4});
const DDim prior_box_var_dims =
DDim(std::vector<DDim::value_type>{1, 1, m, 4});
const DDim target_box_dims =
DDim(std::vector<DDim::value_type>{1, n, m, 4});
const DDim out_dim =
DDim(std::vector<DDim::value_type>{1, n, m, 4});
prior_box.Resize(prior_box_dims);
prior_box_var.Resize(prior_box_var_dims);
target_box.Resize(target_box_dims);
output_box.Resize(out_dim);
std::vector<float> prior_box_data(prior_box_dims.production());
std::vector<float> prior_box_var_data(
prior_box_var_dims.production());
std::vector<float> target_box_data(target_box_dims.production());
for (int i = 0; i < prior_box_dims.production(); i++) {
prior_box_data[i] = i * 1.1 / prior_box_dims.production();
}
for (int i = 0; i < prior_box_var_dims.production(); i++) {
prior_box_var_data[i] = i * 1.2 / prior_box_var_dims.production();
}
for (int i = 0; i < target_box_dims.production(); i++) {
target_box_data[i] = i * 1.3 / target_box_dims.production();
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter =
new CLImageConverterDefault();
DDim prior_box_image_shape =
default_converter->InitImageDimInfoWith(prior_box_dims);
LOG(INFO) << "prior_box_image_shape = " << prior_box_image_shape[0]
<< " " << prior_box_image_shape[1];
std::vector<half_t> prior_box_image_data(
prior_box_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(prior_box_data.data(),
prior_box_image_data.data(),
prior_box_dims);
auto* prior_box_image = prior_box.mutable_data<half_t, cl::Image2D>(
prior_box_image_shape[0],
prior_box_image_shape[1],
prior_box_image_data.data());
DDim prior_box_var_image_shape =
default_converter->InitImageDimInfoWith(prior_box_var_dims);
LOG(INFO) << "prior_box_var_image_shape = "
<< prior_box_var_image_shape[0] << " "
<< prior_box_var_image_shape[1];
std::vector<half_t> prior_box_var_image_data(
prior_box_var_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(prior_box_var_data.data(),
prior_box_var_image_data.data(),
prior_box_var_dims);
auto* prior_box_var_image =
prior_box_var.mutable_data<half_t, cl::Image2D>(
prior_box_var_image_shape[0],
prior_box_var_image_shape[1],
prior_box_var_image_data.data());
DDim target_box_image_shape =
default_converter->InitImageDimInfoWith(target_box_dims);
LOG(INFO) << "target_box_image_shape = "
<< target_box_image_shape[0] << " "
<< target_box_image_shape[1];
std::vector<half_t> target_box_image_data(
target_box_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(target_box_data.data(),
target_box_image_data.data(),
target_box_dims);
auto* target_box_image =
target_box.mutable_data<half_t, cl::Image2D>(
target_box_image_shape[0],
target_box_image_shape[1],
target_box_image_data.data());
DDim out_image_shape =
default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1];
auto* out_image = output_box.mutable_data<half_t, cl::Image2D>(
out_image_shape[0], out_image_shape[1]);
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.proposals->data<half_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the "
"target cl tensor.";
}
lite::Tensor out_ref_tensor;
out_ref_tensor.Resize(out_dim);
box_coder_ref(out_ref_tensor.mutable_data<float>(),
prior_box_data.data(),
target_box_data.data(),
prior_box_var_data.data(),
axis,
norm,
code_type,
target_box_dims[0],
target_box_dims[1]);
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
            half_t* out_image_data =
                new half_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_shape[0],
out_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* out_data = new float[out_image_shape.production() * 4];
default_converter->ImageToNCHW(
out_image_data, out_data, out_image_shape, out_dim);
// result
#ifdef BOXCODER_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << target_box_data[eidx] << " -> " << out_data[eidx]
<< std::endl;
}
#endif // BOXCODER_FP16_PRINT_RESULT
const float* out_ref = out_ref_tensor.data<float>();
for (int i = 0; i < out_dim.production(); i++) {
              auto abs_diff = std::abs(out_data[i] - out_ref[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
(abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << ", in_data[" << i
<< "]: " << target_box_data[i] << ", out_data[" << i
<< "]: " << out_data[i] << ", out_ref[" << i
<< "]: " << out_ref[i] << ", abs_diff: " << abs_diff
<< ", relative_diff: " << relative_diff
<< ", FP16_MAX_DIFF: " << FP16_MAX_DIFF;
}
}
#ifdef BOXCODER_FP16_LOOP_TEST
} // axis
} // code_type
} // norm
} // m
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(box_coder, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -2,7 +2,7 @@ if(NOT LITE_WITH_X86) ...@@ -2,7 +2,7 @@ if(NOT LITE_WITH_X86)
return() return()
endif() endif()
add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops math_function) add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_function)
# lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps})
...@@ -30,6 +30,8 @@ add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} ...@@ -30,6 +30,8 @@ add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps}
add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute) add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute)
#add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps}) #add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_unpad_compute_x86 X86 basic SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps} sequence_padding)
add_kernel(sequence_conv_compute_x86 X86 basic SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_function blas context_project)
# lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/fluid/eigen.h" #include "lite/fluid/eigen.h"
#include "lite/operators/activation_ops.h" #include "lite/operators/op_params.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -231,8 +231,8 @@ class SoftsignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -231,8 +231,8 @@ class SoftsignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// auto& context = ctx_->As<X86Context>(); // auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::ActivationParam>(); auto& param = *param_.get_mutable<operators::ActivationParam>();
const T* x_data = param.X->data<T>(); const T* x_data = param.X->template data<T>();
T* out_data = param.Out->mutable_data<T>(); T* out_data = param.Out->template mutable_data<T>();
size_t x_size = param.X->numel(); size_t x_size = param.X->numel();
for (size_t i = 0; i < x_size; i++) { for (size_t i = 0; i < x_size; i++) {
out_data[i] = x_data[i] / (static_cast<T>(1) + std::abs(x_data[i])); out_data[i] = x_data[i] / (static_cast<T>(1) + std::abs(x_data[i]));
......
...@@ -45,9 +45,9 @@ class AttentionPaddingMaskCompute ...@@ -45,9 +45,9 @@ class AttentionPaddingMaskCompute
auto src_len = static_cast<int64_t>(bottom1->lod()[0][1]); auto src_len = static_cast<int64_t>(bottom1->lod()[0][1]);
const int att_batch = bottom0->lod()[0].size() - 1; const int att_batch = bottom0->lod()[0].size() - 1;
const int src_batch = bottom1->lod()[0].size() - 1; const int src_batch = bottom1->lod()[0].size() - 1;
int* pad_begin = _pad_begin->mutable_data<int>(); int* pad_begin = _pad_begin->template mutable_data<int>();
for (int i = 0; i < src_batch; ++i) { for (int i = 0; i < src_batch; ++i) {
const auto* src_data = bottom1->data<T>() + src_len * i; const auto* src_data = bottom1->template data<T>() + src_len * i;
int index = src_len - 1; int index = src_len - 1;
for (; index >= 0 && _pad_id == static_cast<int>(src_data[index]); for (; index >= 0 && _pad_id == static_cast<int>(src_data[index]);
--index) { --index) {
...@@ -56,13 +56,14 @@ class AttentionPaddingMaskCompute ...@@ -56,13 +56,14 @@ class AttentionPaddingMaskCompute
} }
const auto att_len = static_cast<int64_t>(bottom0->lod()[0][1]); const auto att_len = static_cast<int64_t>(bottom0->lod()[0][1]);
auto* top_data = top->mutable_data<T>(); auto* top_data = top->template mutable_data<T>();
memcpy(top_data, memcpy(top_data,
bottom0->data<T>(), bottom0->template data<T>(),
bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T));
for (int i = 0; i < att_batch; ++i) { for (int i = 0; i < att_batch; ++i) {
for (int j = 0; j < att_len; ++j) { for (int j = 0; j < att_len; ++j) {
top_data = top->mutable_data<T>() + src_len * (att_len * i + j); top_data =
top->template mutable_data<T>() + src_len * (att_len * i + j);
int src_idx = i % src_batch; int src_idx = i % src_batch;
for (int k = pad_begin[src_idx]; k < src_len; ++k) { for (int k = pad_begin[src_idx]; k < src_len; ++k) {
top_data[k] = _mask; top_data[k] = _mask;
......
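The first attention_padding_mask hunk above scans each source sequence from its last position backwards to find where the trailing pad tokens begin; a rough standalone sketch of that scan is below. The actual assignment of pad_begin sits outside the visible hunk, so treating the result as index + 1 is an assumption.
#include <vector>

// Rough sketch of the trailing-pad scan in attention_padding_mask: walk a
// source sequence from its last position backwards while it still holds the
// pad id. Returning index + 1 as "where padding starts" is an assumption.
int find_pad_begin(const std::vector<float>& src_data, int pad_id) {
  int index = static_cast<int>(src_data.size()) - 1;
  for (; index >= 0 && pad_id == static_cast<int>(src_data[index]); --index) {
  }
  return index + 1;  // first padded position (assumed convention)
}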
...@@ -59,26 +59,26 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -59,26 +59,26 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
const int sample_size = x->dims().production() / N / C; const int sample_size = x->dims().production() / N / C;
// alloc memory // alloc memory
param.y->mutable_data<T>(); param.y->template mutable_data<T>();
if (!param.is_test) { if (!param.is_test) {
param.mean_out->mutable_data<T>(); param.mean_out->template mutable_data<T>();
param.variance_out->mutable_data<T>(); param.variance_out->template mutable_data<T>();
param.saved_mean->mutable_data<T>(); param.saved_mean->template mutable_data<T>();
param.saved_variance->mutable_data<T>(); param.saved_variance->template mutable_data<T>();
} }
if (!global_stats) { if (!global_stats) {
      // saved_xx is used just in this batch of data      // saved_xx is used just in this batch of data
EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(), EigenVectorArrayMap<T> saved_mean_e(
C); param.saved_mean->template mutable_data<T>(), C);
EigenVectorArrayMap<T> saved_variance_e( EigenVectorArrayMap<T> saved_variance_e(
param.saved_variance->mutable_data<T>(), C); param.saved_variance->template mutable_data<T>(), C);
saved_mean_e.setZero(); saved_mean_e.setZero();
saved_variance_e.setZero(); saved_variance_e.setZero();
EigenVectorArrayMap<T> running_mean_arr(param.mean_out->mutable_data<T>(), EigenVectorArrayMap<T> running_mean_arr(
C); param.mean_out->template mutable_data<T>(), C);
EigenVectorArrayMap<T> running_var_arr( EigenVectorArrayMap<T> running_var_arr(
param.variance_out->mutable_data<T>(), C); param.variance_out->template mutable_data<T>(), C);
if ((N * sample_size) == 1) { if ((N * sample_size) == 1) {
LOG(WARNING) << "Only 1 element in normalization dimension, " LOG(WARNING) << "Only 1 element in normalization dimension, "
...@@ -89,7 +89,8 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -89,7 +89,8 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
switch (param.data_layout) { switch (param.data_layout) {
case DATALAYOUT(kNCHW): { case DATALAYOUT(kNCHW): {
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C); ConstEigenArrayMap<T> x_arr(
x->template data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) { for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum(); saved_mean_e(nc % C) += x_arr.col(nc).sum();
} }
...@@ -115,33 +116,37 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -115,33 +116,37 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// use SavedMean and SavedVariance to do normalize // use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C); Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (global_stats) { if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C); ConstEigenVectorArrayMap<T> var_arr(param.variance->template data<T>(),
C);
inv_std = (var_arr + param.epsilon).sqrt().inverse(); inv_std = (var_arr + param.epsilon).sqrt().inverse();
} else { } else {
EigenVectorArrayMap<T> saved_inv_std( EigenVectorArrayMap<T> saved_inv_std(
param.saved_variance->mutable_data<T>(), C); param.saved_variance->template mutable_data<T>(), C);
// inverse SavedVariance first, gradient will use it too. // inverse SavedVariance first, gradient will use it too.
saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt(); saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
inv_std = saved_inv_std; inv_std = saved_inv_std;
} }
ConstEigenVectorArrayMap<T> mean_arr( ConstEigenVectorArrayMap<T> mean_arr(
global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C); global_stats ? param.mean->template data<T>()
: param.saved_mean->template data<T>(),
C);
// ((x - est_mean) * (inv_var) * scale + bias // ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====> // formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) // (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C); ConstEigenVectorArrayMap<T> scale_arr(param.scale->template data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C); ConstEigenVectorArrayMap<T> bias_arr(param.bias->template data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr; Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
Eigen::Array<T, Eigen::Dynamic, 1> new_bias = Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr; bias_arr - mean_arr * inv_std * scale_arr;
switch (param.data_layout) { switch (param.data_layout) {
case DATALAYOUT(kNCHW): { case DATALAYOUT(kNCHW): {
EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C); EigenArrayMap<T> y_arr(
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C); param.y->template mutable_data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> x_arr(x->template data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) { for (int nc = 0; nc < N * C; ++nc) {
y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
} }
......
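As a sanity check on the algebra in the batch_norm comment above, the direct and folded forms of the normalization agree; this is a standalone sketch with arbitrary illustrative values, not part of the kernel.
#include <cassert>
#include <cmath>

// Verifies the rewrite used by the batch_norm kernel comment:
//   (x - est_mean) * inv_var * scale + bias
//     == x * (inv_var * scale) + (bias - est_mean * inv_var * scale)
int main() {
  float x = 2.5f, est_mean = 1.0f, var = 4.0f, epsilon = 1e-5f;
  float scale = 0.5f, bias = 0.1f;

  float inv_std = 1.0f / std::sqrt(var + epsilon);
  float y_direct = (x - est_mean) * inv_std * scale + bias;

  float new_scale = inv_std * scale;
  float new_bias = bias - est_mean * inv_std * scale;
  float y_folded = x * new_scale + new_bias;

  assert(std::fabs(y_direct - y_folded) < 1e-6f);
  return 0;
}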
...@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast, ...@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
cast,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>,
fp16_to_any)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
...@@ -47,7 +47,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -47,7 +47,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int64_t axis = static_cast<int64_t>(param.axis); int64_t axis = static_cast<int64_t>(param.axis);
auto* axis_tensor = param.axis_tensor; auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) { if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>(); auto* axis_tensor_data = axis_tensor->template data<int>();
axis = static_cast<int64_t>(axis_tensor_data[0]); axis = static_cast<int64_t>(axis_tensor_data[0]);
} }
...@@ -60,7 +60,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -60,7 +60,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int concat_input_size = count(axis + 1, x_dims.size(), x_dims); int concat_input_size = count(axis + 1, x_dims.size(), x_dims);
const int top_concat_axis = out->dims()[axis]; const int top_concat_axis = out->dims()[axis];
for (size_t i = 0; i < param.x.size(); ++i) { for (size_t i = 0; i < param.x.size(); ++i) {
const T* bottom_data = param.x[i]->data<T>(); const T* bottom_data = param.x[i]->template data<T>();
const int64_t bottom_concat_axis = param.x[i]->dims()[axis]; const int64_t bottom_concat_axis = param.x[i]->dims()[axis];
for (int n = 0; n < num_concat; ++n) { for (int n = 0; n < num_concat; ++n) {
std::memcpy( std::memcpy(
......
...@@ -52,7 +52,7 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -52,7 +52,7 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::ConvParam>(); auto& param = *param_.get_mutable<operators::ConvParam>();
lite::Tensor filter = *param.filter; lite::Tensor filter = *param.filter;
param.output->mutable_data<T>(); param.output->template mutable_data<T>();
const int batch_size = static_cast<int>(param.x->dims()[0]); const int batch_size = static_cast<int>(param.x->dims()[0]);
std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize()); std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize());
...@@ -95,9 +95,9 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -95,9 +95,9 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto blas = auto blas =
paddle::lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context); paddle::lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
lite::Tensor in_batch = param.x->Slice<T>(i, i + 1); lite::Tensor in_batch = param.x->template Slice<T>(i, i + 1);
in_batch.Resize(input_shape); in_batch.Resize(input_shape);
lite::Tensor out_batch = param.output->Slice<T>(i, i + 1); lite::Tensor out_batch = param.output->template Slice<T>(i, i + 1);
out_batch.Resize(output_matrix_shape); out_batch.Resize(output_matrix_shape);
for (int g = 0; g < param.groups; g++) { for (int g = 0; g < param.groups; g++) {
lite::Tensor in_slice = lite::Tensor in_slice =
......
...@@ -38,10 +38,10 @@ class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -38,10 +38,10 @@ class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
using param_t = operators::DropoutParam; using param_t = operators::DropoutParam;
void Run() override { void Run() override {
auto& param = *param_.get_mutable<operators::DropoutParam>(); auto& param = *param_.get_mutable<operators::DropoutParam>();
const auto* x_data = param.x->data<T>(); const auto* x_data = param.x->template data<T>();
auto* out_data = param.output->mutable_data<T>(); auto* out_data = param.output->template mutable_data<T>();
if (!param.is_test) { if (!param.is_test) {
auto* mask_data = param.mask->mutable_data<T>(); auto* mask_data = param.mask->template mutable_data<T>();
std::random_device rnd; std::random_device rnd;
std::minstd_rand engine; std::minstd_rand engine;
int seed = param.fix_seed ? param.seed : rnd(); int seed = param.fix_seed ? param.seed : rnd();
......
...@@ -248,8 +248,8 @@ class TransformFunctor { ...@@ -248,8 +248,8 @@ class TransformFunctor {
lite::Tensor *z, lite::Tensor *z,
const lite::Context<Target> &ctx, const lite::Context<Target> &ctx,
Functor func) Functor func)
: x_(x->data<T>()), : x_(x->template data<T>()),
y_(y->data<T>()), y_(y->template data<T>()),
z_(z->mutable_data<OutType>()), z_(z->mutable_data<OutType>()),
nx_(x->numel()), nx_(x->numel()),
ctx_(ctx), ctx_(ctx),
...@@ -483,9 +483,10 @@ void FusedElemwiseAndActComputeNoBroadcast(const lite::Context<Target> &ctx, ...@@ -483,9 +483,10 @@ void FusedElemwiseAndActComputeNoBroadcast(const lite::Context<Target> &ctx,
x.data<T>(), x.data<T>(),
y.data<T>(), y.data<T>(),
compound_functor, compound_functor,
out->mutable_data<T>(), out->template mutable_data<T>(),
intermediate_out == nullptr ? nullptr intermediate_out == nullptr
: intermediate_out->mutable_data<T>()}); ? nullptr
: intermediate_out->template mutable_data<T>()});
} }
template <lite::TargetType Target, template <lite::TargetType Target,
...@@ -523,9 +524,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx, ...@@ -523,9 +524,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx,
compound_functor, compound_functor,
h, h,
w, w,
out->mutable_data<T>(), out->template mutable_data<T>(),
intermediate_out == nullptr ? nullptr intermediate_out == nullptr
: intermediate_out->mutable_data<T>()); ? nullptr
: intermediate_out->template mutable_data<T>());
} else { } else {
FusedElemwiseAndActBroadcast2CPU<T, FusedElemwiseAndActBroadcast2CPU<T,
...@@ -539,9 +541,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx, ...@@ -539,9 +541,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx,
n, n,
post, post,
compound_functor, compound_functor,
out->mutable_data<T>(), out->template mutable_data<T>(),
intermediate_out == nullptr ? nullptr intermediate_out == nullptr
: intermediate_out->mutable_data<T>()); ? nullptr
: intermediate_out->template mutable_data<T>());
} }
} }
......
...@@ -140,9 +140,9 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -140,9 +140,9 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int M = output->dims().production() / w_dims1; int M = output->dims().production() / w_dims1;
const T* input_data = input->data<T>(); const T* input_data = input->template data<T>();
const T* w_data = w->data<T>(); const T* w_data = w->template data<T>();
T* output_data = output->mutable_data<T>(); T* output_data = output->template mutable_data<T>();
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
FCFunctor<lite::TargetType::kX86, T> fc; FCFunctor<lite::TargetType::kX86, T> fc;
...@@ -153,7 +153,7 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -153,7 +153,7 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
input_data, input_data,
w_data, w_data,
output_data, output_data,
bias ? bias->data<T>() : NULL, bias ? bias->template data<T>() : NULL,
with_relu, with_relu,
padding_weights); padding_weights);
} }
......
...@@ -42,9 +42,9 @@ class FillConstantBatchSizeLikeCompute ...@@ -42,9 +42,9 @@ class FillConstantBatchSizeLikeCompute
int output_dim_idx = param.output_dim_idx; int output_dim_idx = param.output_dim_idx;
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1; odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->Resize(odims); out->Resize(odims);
// out->mutable_data<T>(); // out->template mutable_data<T>();
} }
out->mutable_data<T>(); out->template mutable_data<T>();
auto value = param.value; auto value = param.value;
paddle::lite::x86::math::SetConstant<lite::TargetType::kX86, T> setter; paddle::lite::x86::math::SetConstant<lite::TargetType::kX86, T> setter;
......
...@@ -50,9 +50,9 @@ void CPUGather(const lite::Tensor* src, ...@@ -50,9 +50,9 @@ void CPUGather(const lite::Tensor* src,
auto src_dims = src->dims(); auto src_dims = src->dims();
const T* p_src = src->data<T>(); const T* p_src = src->template data<T>();
const IndexT* p_index = index->data<IndexT>(); const IndexT* p_index = index->data<IndexT>();
T* p_output = output->mutable_data<T>(); T* p_output = output->template mutable_data<T>();
// slice size // slice size
int slice_size = 1; int slice_size = 1;
...@@ -77,7 +77,7 @@ class GatherCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -77,7 +77,7 @@ class GatherCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto index = param.Index; auto index = param.Index;
auto out = param.Out; auto out = param.Out;
out->mutable_data<T>(); out->template mutable_data<T>();
if (x->dims().production() == 0) return; if (x->dims().production() == 0) return;
/* /*
* Since there's no type defined for lite::Tensor in Paddle-Lite, then * Since there's no type defined for lite::Tensor in Paddle-Lite, then
......
...@@ -44,7 +44,7 @@ inline void ReorderInitState(const lite::Context<TARGET(kX86)>& context, ...@@ -44,7 +44,7 @@ inline void ReorderInitState(const lite::Context<TARGET(kX86)>& context,
bool indexed_src) { bool indexed_src) {
lite::x86::math::CopyMatrixRowsFunctor<TARGET(kX86), T> row_shuffle; lite::x86::math::CopyMatrixRowsFunctor<TARGET(kX86), T> row_shuffle;
dst->Resize(src.dims()); dst->Resize(src.dims());
dst->mutable_data<T>(); dst->template mutable_data<T>();
row_shuffle(context, src, index_lod, dst, indexed_src); row_shuffle(context, src, index_lod, dst, indexed_src);
} }
...@@ -65,18 +65,19 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -65,18 +65,19 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto* input = param.input; auto* input = param.input;
auto* h0 = param.h0; auto* h0 = param.h0;
auto* weight = param.weight; auto* weight = param.weight;
const T* weight_data = weight->data<T>(); const T* weight_data = weight->template data<T>();
auto* bias = param.bias; auto* bias = param.bias;
auto* batch_gate = param.batch_gate; auto* batch_gate = param.batch_gate;
auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev; auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev;
auto* batch_hidden = param.batch_hidden; auto* batch_hidden = param.batch_hidden;
T* batch_gate_ptr = batch_gate->mutable_data<T>(); T* batch_gate_ptr = batch_gate->template mutable_data<T>();
T* batch_reset_hidden_prev_ptr = batch_reset_hidden_prev->mutable_data<T>(); T* batch_reset_hidden_prev_ptr =
T* batch_hidden_ptr = batch_hidden->mutable_data<T>(); batch_reset_hidden_prev->template mutable_data<T>();
T* batch_hidden_ptr = batch_hidden->template mutable_data<T>();
auto* hidden = param.hidden; auto* hidden = param.hidden;
hidden->mutable_data<T>(); hidden->template mutable_data<T>();
const auto& hidden_dims = hidden->dims(); const auto& hidden_dims = hidden->dims();
...@@ -99,7 +100,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -99,7 +100,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// Since the batch computing for GRU reorders the input sequences // Since the batch computing for GRU reorders the input sequences
      // according to their length, the initialized cell state also needs      // according to their length, the initialized cell state also needs
      // to be reordered.      // to be reordered.
const std::vector<size_t>& order(batch_gate->lod()[2]); const std::vector<uint64_t>& order(batch_gate->lod()[2]);
ReorderInitState<T>(context, *h0, order, &ordered_h0, true); ReorderInitState<T>(context, *h0, order, &ordered_h0, true);
gru_value.prev_out_value = ordered_h0.mutable_data<T>(); gru_value.prev_out_value = ordered_h0.mutable_data<T>();
} else { } else {
......
...@@ -47,9 +47,9 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -47,9 +47,9 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto x_dims = x->dims(); auto x_dims = x->dims();
y->mutable_data<T>(); y->template mutable_data<T>();
Mean->mutable_data<T>(); Mean->template mutable_data<T>();
Var->mutable_data<T>(); Var->template mutable_data<T>();
auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); auto matrix_dim = x_dims.Flatten2D(begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]); int left = static_cast<int>(matrix_dim[0]);
...@@ -73,10 +73,10 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -73,10 +73,10 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
.At(right); .At(right);
ker(in.mutable_data<T>(), ker(in.mutable_data<T>(),
out.mutable_data<T>(), out.mutable_data<T>(),
Mean->mutable_data<T>(), Mean->template mutable_data<T>(),
Var->mutable_data<T>(), Var->template mutable_data<T>(),
Scale->data<T>(), Scale->template data<T>(),
Bias->data<T>(), Bias->template data<T>(),
static_cast<int>(left), static_cast<int>(left),
epsilon, epsilon,
right); right);
......
...@@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto *ids_t = param.Ids; auto *ids_t = param.Ids;
auto *output_t = param.Out; auto *output_t = param.Out;
int64_t padding_idx = param.padding_idx; int64_t padding_idx = param.padding_idx;
const int64_t *ids = ids_t->data<int64_t>(); const int64_t *ids = ids_t->template data<int64_t>();
int64_t ids_numel = ids_t->dims().production(); int64_t ids_numel = ids_t->dims().production();
auto *table_t = param.W; auto *table_t = param.W;
int64_t row_number = table_t->dims()[0]; int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1]; int64_t row_width = table_t->dims()[1];
const T *table = table_t->data<T>(); const T *table = table_t->template data<T>();
T *output = output_t->mutable_data<T>(); T *output = output_t->template mutable_data<T>();
memset(output, 0, output_t->dims().production() * sizeof(T)); memset(output, 0, output_t->dims().production() * sizeof(T));
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != -1 && ids[i] == padding_idx) { if (padding_idx != -1 && ids[i] == padding_idx) {
......
...@@ -35,7 +35,7 @@ void MatchMatrixTensorCompute<T>::Run() { ...@@ -35,7 +35,7 @@ void MatchMatrixTensorCompute<T>::Run() {
const auto& offset_l = x->lod()[0]; const auto& offset_l = x->lod()[0];
const auto& offset_r = y->lod()[0]; const auto& offset_r = y->lod()[0];
std::vector<size_t> top_offset; std::vector<uint64_t> top_offset;
int top_size = 0; int top_size = 0;
top_offset.push_back(top_size); top_offset.push_back(top_size);
for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
...@@ -97,9 +97,9 @@ void MatchMatrixTensorCompute<T>::Run() { ...@@ -97,9 +97,9 @@ void MatchMatrixTensorCompute<T>::Run() {
int batch_size = x->lod()[0].size() - 1; int batch_size = x->lod()[0].size() - 1;
int lod_lv1_size = batch_size * dim_t; int lod_lv1_size = batch_size * dim_t;
int lod_lv2_size = x->lod()[0].back() * dim_t; int lod_lv2_size = x->lod()[0].back() * dim_t;
std::vector<size_t> out_lod0(batch_size + 1, 0); std::vector<uint64_t> out_lod0(batch_size + 1, 0);
std::vector<size_t> out_lod1(lod_lv1_size + 1, 0); std::vector<uint64_t> out_lod1(lod_lv1_size + 1, 0);
std::vector<size_t> out_lod2(lod_lv2_size + 1, 0); std::vector<uint64_t> out_lod2(lod_lv2_size + 1, 0);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
out_lod0[i + 1] = out_lod0[i] + dim_t; out_lod0[i + 1] = out_lod0[i] + dim_t;
int len_l = offset_l[i + 1] - offset_l[i]; int len_l = offset_l[i + 1] - offset_l[i];
......
...@@ -56,7 +56,7 @@ class MatMulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -56,7 +56,7 @@ class MatMulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto *x = param.X; auto *x = param.X;
auto *y = param.Y; auto *y = param.Y;
auto *out = param.Out; auto *out = param.Out;
out->mutable_data<T>(); out->template mutable_data<T>();
auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
auto mat_dim_a = lite::x86::math::CreateMatrixDescriptor( auto mat_dim_a = lite::x86::math::CreateMatrixDescriptor(
......
...@@ -64,7 +64,7 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -64,7 +64,7 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
y_matrix = *y; y_matrix = *y;
} }
z->mutable_data<T>(); z->template mutable_data<T>();
auto z_dim = z->dims(); auto z_dim = z->dims();
if (z_dim.size() != 2) { if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
......
...@@ -49,7 +49,7 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -49,7 +49,7 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
bool reduce_all = param.reduce_all; bool reduce_all = param.reduce_all;
auto* input = param.x; auto* input = param.x;
auto* output = param.output; auto* output = param.output;
param.output->mutable_data<T>(); param.output->template mutable_data<T>();
const auto& dims = param.dim; const auto& dims = param.dim;
bool keep_dim = param.keep_dim; bool keep_dim = param.keep_dim;
......
...@@ -41,8 +41,8 @@ class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -41,8 +41,8 @@ class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
scale_compute(param.x->data<T>(), scale_compute(param.x->template data<T>(),
param.output->mutable_data<T>(), param.output->template mutable_data<T>(),
param.x->dims().production(), param.x->dims().production(),
param.scale, param.scale,
param.bias, param.bias,
......
...@@ -84,7 +84,7 @@ void SearchGrnnCompute<T>::PrepareLayout(const Tensor* input_blob) { ...@@ -84,7 +84,7 @@ void SearchGrnnCompute<T>::PrepareLayout(const Tensor* input_blob) {
int max_width = width_data[idx_sorted_by_width_data[0]]; int max_width = width_data[idx_sorted_by_width_data[0]];
// start of reorganizing the input // start of reorganizing the input
std::vector<size_t> new_offset; std::vector<uint64_t> new_offset;
new_offset.resize(max_width + 1); new_offset.resize(max_width + 1);
new_offset[0] = 0; new_offset[0] = 0;
......
...@@ -50,7 +50,7 @@ class SearchGroupPaddingCompute ...@@ -50,7 +50,7 @@ class SearchGroupPaddingCompute
} }
} }
std::vector<size_t> new_offset; std::vector<uint64_t> new_offset;
new_offset.resize(batch + 1); new_offset.resize(batch + 1);
for (int i = 0; i < batch + 1; ++i) { for (int i = 0; i < batch + 1; ++i) {
new_offset[i] = i * max_seq; new_offset[i] = i * max_seq;
...@@ -67,7 +67,7 @@ class SearchGroupPaddingCompute ...@@ -67,7 +67,7 @@ class SearchGroupPaddingCompute
top1_lod.push_back(offset); top1_lod.push_back(offset);
top1->set_lod(top1_lod); top1->set_lod(top1_lod);
top1->Resize({dim0, 1}); top1->Resize({dim0, 1});
memset(top1->mutable_data<T>(), memset(top1->template mutable_data<T>(),
0, 0,
top1->dims()[0] * top1->dims()[1] * sizeof(T)); top1->dims()[0] * top1->dims()[1] * sizeof(T));
// for padding input id // for padding input id
...@@ -76,9 +76,9 @@ class SearchGroupPaddingCompute ...@@ -76,9 +76,9 @@ class SearchGroupPaddingCompute
top2->set_lod(top2_lod); top2->set_lod(top2_lod);
top2->Resize({batch * max_seq, 1}); top2->Resize({batch * max_seq, 1});
// copy data // copy data
const auto* bottom_data = bottom0->data<T>(); const auto* bottom_data = bottom0->template data<T>();
auto* top_data = top0->mutable_data<T>(); auto* top_data = top0->template mutable_data<T>();
auto* top_padding_input_data = top2->mutable_data<T>(); auto* top_padding_input_data = top2->template mutable_data<T>();
for (int i = 0; i < batch; i++) { for (int i = 0; i < batch; i++) {
const int copy_step = offset[i + 1] - offset[i]; const int copy_step = offset[i + 1] - offset[i];
const int start = i * max_seq; const int start = i * max_seq;
......
...@@ -58,8 +58,10 @@ class SearchSeqFcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -58,8 +58,10 @@ class SearchSeqFcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int M = x_dims[0]; int M = x_dims[0];
int N = w_dims[0]; int N = w_dims[0];
for (int i = 0; i < M; i++) { for (int i = 0; i < M; i++) {
blas.AXPY( blas.AXPY(N,
N, static_cast<T>(1), b->data<T>(), out->mutable_data<T>() + i * N); static_cast<T>(1),
b->template data<T>(),
out->template mutable_data<T>() + i * N);
} }
} }
} }
......
...@@ -39,9 +39,9 @@ class SequenceArithmeticCompute ...@@ -39,9 +39,9 @@ class SequenceArithmeticCompute
out->Resize(x->dims()); out->Resize(x->dims());
out->set_lod(x->lod()); out->set_lod(x->lod());
auto x_data = x->data<T>(); auto x_data = x->template data<T>();
auto y_data = y->data<T>(); auto y_data = y->template data<T>();
auto out_data = out->mutable_data<T>(); auto out_data = out->template mutable_data<T>();
auto x_seq_offset = x->lod()[0]; auto x_seq_offset = x->lod()[0];
auto y_seq_offset = y->lod()[0]; auto y_seq_offset = y->lod()[0];
int seq_num = x_seq_offset.size() - 1; int seq_num = x_seq_offset.size() - 1;
......
...@@ -25,7 +25,7 @@ namespace x86 { ...@@ -25,7 +25,7 @@ namespace x86 {
template <typename T> template <typename T>
inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs, inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs,
std::vector<lite::Tensor>* xs_in_order) { std::vector<lite::Tensor>* xs_in_order) {
std::vector<size_t> result; std::vector<uint64_t> result;
result.resize(xs[0]->lod()[0].size()); result.resize(xs[0]->lod()[0].size());
for (size_t i = 1; i < result.size(); ++i) { for (size_t i = 1; i < result.size(); ++i) {
...@@ -75,7 +75,7 @@ class SequenceConcatCompute ...@@ -75,7 +75,7 @@ class SequenceConcatCompute
out_dims[0] = batch_size; out_dims[0] = batch_size;
param.Out->Resize(out_dims); param.Out->Resize(out_dims);
T* dout = param.Out->mutable_data<T>(); T* dout = param.Out->template mutable_data<T>();
std::vector<lite::Tensor> x_in_order; std::vector<lite::Tensor> x_in_order;
param.Out->set_lod(ConcatLoD<T>(param.X, &x_in_order)); param.Out->set_lod(ConcatLoD<T>(param.X, &x_in_order));
......
...@@ -26,7 +26,7 @@ namespace x86 { ...@@ -26,7 +26,7 @@ namespace x86 {
namespace { namespace {
inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs, inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs,
std::vector<lite::Tensor>* xs_in_order) { std::vector<lite::Tensor>* xs_in_order) {
std::vector<size_t> result; std::vector<uint64_t> result;
result.resize(xs[0]->lod()[0].size()); result.resize(xs[0]->lod()[0].size());
for (size_t i = 1; i < result.size(); ++i) { for (size_t i = 1; i < result.size(); ++i) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_conv_compute.h"
REGISTER_LITE_KERNEL(sequence_conv,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceConvCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <vector>
#include "lite/backends/x86/math/blas.h"
#include "lite/backends/x86/math/context_project.h"
#include "lite/backends/x86/math/math_function.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
namespace math = paddle::lite::x86::math;
template <typename T>
class SequenceConvCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::SequenceConvParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<X86Context>();
auto* in = param.X;
auto* filter = param.Filter;
auto* out = param.Out;
out->template mutable_data<T>();
CHECK(in->lod().size() == 1) << "Only support one level sequence now";
int context_start = param.contextStart;
int context_stride = param.contextStride;
int context_length = param.contextLength;
bool padding_trainable = false;
const Tensor* padding_data = nullptr;
int up_pad = std::max(0, -context_start);
int down_pad = std::max(0, context_start + context_length - 1);
auto sequence_width = static_cast<int64_t>(in->dims()[1]);
std::vector<int64_t> col_shape{in->dims()[0],
context_length * sequence_width};
Tensor col;
col.Resize(col_shape);
col.mutable_data<T>();
    // If padding_trainable is false, the padding data should be zeros.
math::SetConstant<TARGET(kX86), T> set_zero;
auto blas = math::GetBlas<TARGET(kX86), T>(ctx);
set_zero(ctx, &col, static_cast<T>(0));
math::ContextProjectFunctor<TARGET(kX86), T> seq_project_functor;
seq_project_functor(ctx,
*in,
padding_data,
padding_trainable,
context_start,
context_length,
context_stride,
up_pad,
down_pad,
&col);
blas.MatMul(col, *filter, out);
}
virtual ~SequenceConvCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
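A standalone sketch of the padding arithmetic the new sequence_conv kernel uses: up_pad and down_pad are derived from contextStart and contextLength, and the im2col buffer holds context_length * sequence_width columns per input row. The concrete attribute values below are hypothetical and only for illustration.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical attribute values, for illustration only.
  int context_start = -1;
  int context_length = 3;
  int64_t rows = 8;             // in->dims()[0]
  int64_t sequence_width = 16;  // in->dims()[1]

  // Same arithmetic as SequenceConvCompute::Run() above.
  int up_pad = std::max(0, -context_start);                        // 1
  int down_pad = std::max(0, context_start + context_length - 1);  // 1

  std::cout << "up_pad=" << up_pad << " down_pad=" << down_pad
            << " col shape: [" << rows << ", "
            << context_length * sequence_width << "]\n";
  return 0;
}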
...@@ -29,8 +29,9 @@ using Tensor = lite::Tensor; ...@@ -29,8 +29,9 @@ using Tensor = lite::Tensor;
template <typename T> template <typename T>
struct SequenceExpandFunctor { struct SequenceExpandFunctor {
void operator()(const Tensor &x, void operator()(
const std::vector<size_t> &ref_lod, /*expand referenced lod*/ const Tensor &x,
const std::vector<uint64_t> &ref_lod, /*expand referenced lod*/
Tensor *out) { Tensor *out) {
int64_t hight = x.dims()[0]; int64_t hight = x.dims()[0];
int64_t width = x.data_size() / hight; int64_t width = x.data_size() / hight;
...@@ -39,13 +40,13 @@ struct SequenceExpandFunctor { ...@@ -39,13 +40,13 @@ struct SequenceExpandFunctor {
T *out_data = out->mutable_data<T, T>(); T *out_data = out->mutable_data<T, T>();
for (int h_id = 0; h_id < hight; ++h_id) { for (int h_id = 0; h_id < hight; ++h_id) {
size_t span = ref_lod[h_id + 1] - ref_lod[h_id]; uint64_t span = ref_lod[h_id + 1] - ref_lod[h_id];
if (span == 0) continue; if (span == 0) continue;
const T *src = in_data + h_id * width; const T *src = in_data + h_id * width;
for (int64_t w_id = 0; w_id < width; ++w_id) { for (uint64_t w_id = 0; w_id < width; ++w_id) {
T ele = src[w_id]; T ele = src[w_id];
size_t offset = ref_lod[h_id] * width; size_t offset = ref_lod[h_id] * width;
for (size_t k = 0; k < span; ++k) { for (uint64_t k = 0; k < span; ++k) {
out_data[offset + k * width + w_id] = ele; out_data[offset + k * width + w_id] = ele;
} }
} }
...@@ -68,7 +69,7 @@ class SequenceExpandAsCompute ...@@ -68,7 +69,7 @@ class SequenceExpandAsCompute
CHECK_EQ(y_lod.size(), 1); CHECK_EQ(y_lod.size(), 1);
CHECK_GT(y_lod[0].size(), 1); CHECK_GT(y_lod[0].size(), 1);
out->mutable_data<T, T>(); out->template mutable_data<T, T>();
SequenceExpandFunctor<T> seq_espand_functor; SequenceExpandFunctor<T> seq_espand_functor;
seq_espand_functor(*x, y_lod[0], out); seq_espand_functor(*x, y_lod[0], out);
......
...@@ -40,7 +40,7 @@ class SequencePoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -40,7 +40,7 @@ class SequencePoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
dims[0] = lod[0].size() - 1; dims[0] = lod[0].size() - 1;
out->Resize({dims}); out->Resize({dims});
out->mutable_data<T>(); out->template mutable_data<T>();
lite::Tensor* index = nullptr; lite::Tensor* index = nullptr;
const bool is_test = true; const bool is_test = true;
......
...@@ -64,9 +64,9 @@ class SequenceReshapeCompute ...@@ -64,9 +64,9 @@ class SequenceReshapeCompute
out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()), out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()),
out_width}); out_width});
auto* dst_ptr = out->mutable_data<T>(); auto* dst_ptr = out->template mutable_data<T>();
auto size = in->numel() * sizeof(T); auto size = in->numel() * sizeof(T);
std::memcpy(dst_ptr, in->data<T>(), size); std::memcpy(dst_ptr, in->template data<T>(), size);
} }
virtual ~SequenceReshapeCompute() = default; virtual ~SequenceReshapeCompute() = default;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_unpad_compute.h"
REGISTER_LITE_KERNEL(sequence_unpad,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceUnpadCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Length",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/x86/math/sequence_padding.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
namespace math = paddle::lite::x86::math;
template <typename T>
class SequenceUnpadCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::SequenceUnpadParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<X86Context>();
param.Out->template mutable_data<T>();
int64_t padded_length = param.X->dims()[1];
math::UnpaddingLoDTensorFunctor<lite::TargetType::kX86, T>()(
ctx,
*param.X,
param.Out,
padded_length,
0,
false,
math::kBatchLengthWidth);
}
virtual ~SequenceUnpadCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
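Conceptually, the unpad step above keeps, for each sequence, only its first length[i] rows of the padded [batch, padded_length, width] input and concatenates them. A standalone sketch of that idea follows; it is not the actual UnpaddingLoDTensorFunctor, just the shape logic under that assumption.
#include <cstdint>
#include <vector>

// Sketch of the unpadding idea used by sequence_unpad: the input is a padded
// [batch, padded_length, width] buffer plus per-sequence lengths, and the
// output concatenates only the valid rows of each sequence.
std::vector<float> unpad(const std::vector<float>& padded,
                         const std::vector<int64_t>& length,
                         int64_t padded_length,
                         int64_t width) {
  std::vector<float> out;
  for (size_t i = 0; i < length.size(); ++i) {
    const float* seq = padded.data() + i * padded_length * width;
    out.insert(out.end(), seq, seq + length[i] * width);
  }
  return out;
}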
...@@ -29,7 +29,7 @@ class ShapeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -29,7 +29,7 @@ class ShapeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
void Run() override { void Run() override {
auto& param = *param_.get_mutable<operators::ShapeParam>(); auto& param = *param_.get_mutable<operators::ShapeParam>();
// auto& context = context_->As<X86Context>(); // auto& context = context_->As<X86Context>();
auto out_data = param.Out->mutable_data<int32_t>(); auto out_data = param.Out->template mutable_data<int32_t>();
auto in_dims = param.X->dims(); auto in_dims = param.X->dims();
for (int i = 0; i < in_dims.size(); ++i) { for (int i = 0; i < in_dims.size(); ++i) {
out_data[i] = in_dims[i]; out_data[i] = in_dims[i];
......
...@@ -58,7 +58,7 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -58,7 +58,7 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto* x = param.x; auto* x = param.x;
auto* output = param.output; auto* output = param.output;
output->mutable_data<T>(); output->template mutable_data<T>();
const int rank = x->dims().size(); const int rank = x->dims().size();
const int axis = CanonicalAxis(param.axis, rank); const int axis = CanonicalAxis(param.axis, rank);
......
...@@ -35,8 +35,8 @@ class SqueezeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -35,8 +35,8 @@ class SqueezeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto x = param.X; auto x = param.X;
auto output = param.Out; auto output = param.Out;
auto x_dims = x->dims(); auto x_dims = x->dims();
auto* x_data = x->data<T>(); auto* x_data = x->template data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
memcpy(out_data, x_data, x_dims.production() * sizeof(T)); memcpy(out_data, x_data, x_dims.production() * sizeof(T));
} }
...@@ -54,9 +54,9 @@ class Squeeze2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -54,9 +54,9 @@ class Squeeze2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto output = param.Out; auto output = param.Out;
auto xshape = param.XShape; auto xshape = param.XShape;
auto x_dims = x->dims(); auto x_dims = x->dims();
auto* x_data = x->data<T>(); auto* x_data = x->template data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
auto* xshape_data = xshape->mutable_data<T>(); auto* xshape_data = xshape->template mutable_data<T>();
memcpy(out_data, x_data, x_dims.production() * sizeof(T)); memcpy(out_data, x_data, x_dims.production() * sizeof(T));
memcpy(xshape_data, x_data, x_dims.production() * sizeof(T)); memcpy(xshape_data, x_data, x_dims.production() * sizeof(T));
} }
......
...@@ -40,9 +40,9 @@ class StackCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -40,9 +40,9 @@ class StackCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
if (axis < 0) axis += (x[0]->dims().size() + 1); if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size()); int n = static_cast<int>(x.size());
auto y_data = y->mutable_data<T>(); auto y_data = y->template mutable_data<T>();
std::vector<const T*> x_datas(n); std::vector<const T*> x_datas(n);
for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data<T>(); for (int i = 0; i < n; ++i) x_datas[i] = x[i]->template data<T>();
int pre = 1, post = 1; int pre = 1, post = 1;
auto dim = x[0]->dims(); auto dim = x[0]->dims();
......
...@@ -73,7 +73,7 @@ class TransposeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -73,7 +73,7 @@ class TransposeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
auto* x = param.x; auto* x = param.x;
auto* out = param.output; auto* out = param.output;
out->mutable_data<T>(); out->template mutable_data<T>();
int ndims = param.axis.size(); int ndims = param.axis.size();
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
TransCompute<lite::TargetType::kX86, T>( TransCompute<lite::TargetType::kX86, T>(
...@@ -92,7 +92,7 @@ class Transpose2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -92,7 +92,7 @@ class Transpose2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
auto* x = param.x; auto* x = param.x;
auto* out = param.output; auto* out = param.output;
out->mutable_data<T>(); out->template mutable_data<T>();
int ndims = param.axis.size(); int ndims = param.axis.size();
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
TransCompute<lite::TargetType::kX86, T>( TransCompute<lite::TargetType::kX86, T>(
......
...@@ -34,8 +34,8 @@ class UniformRandomCompute ...@@ -34,8 +34,8 @@ class UniformRandomCompute
auto *param_out = &param.Out->raw_tensor(); auto *param_out = &param.Out->raw_tensor();
T *data = T *data = param_out->template mutable_data<T>(
param_out->mutable_data<T>(context.x86_device_context()->GetPlace()); context.x86_device_context()->GetPlace());
unsigned int seed = static_cast<unsigned int>(param.seed); unsigned int seed = static_cast<unsigned int>(param.seed);
std::minstd_rand engine; std::minstd_rand engine;
......
...@@ -80,7 +80,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -80,7 +80,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
std::vector<int64_t> col_dims_vec{top_size}; std::vector<int64_t> col_dims_vec{top_size};
col_dims_vec.push_back(1); col_dims_vec.push_back(1);
col->Resize(col_dims_vec); col->Resize(col_dims_vec);
auto* top_data = col->mutable_data<T>(); auto* top_data = col->template mutable_data<T>();
const auto* bottom_data = input.data<T>(); const auto* bottom_data = input.data<T>();
int kernel_win_size = kernel_h * kernel_w; int kernel_win_size = kernel_h * kernel_w;
...@@ -149,7 +149,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -149,7 +149,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// const auto& offset_y = in_row->lod()[0]; // const auto& offset_y = in_row->lod()[0];
const auto& offset_y = param.X->lod()[1]; const auto& offset_y = param.X->lod()[1];
const auto& offset_x = param.X->lod()[2]; const auto& offset_x = param.X->lod()[2];
std::vector<size_t> top_offset; std::vector<uint64_t> top_offset;
int top_size = 0; int top_size = 0;
top_offset.push_back(top_size); top_offset.push_back(top_size);
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
...@@ -178,9 +178,9 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -178,9 +178,9 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
std::vector<int64_t> top_dims_vec{top_size}; std::vector<int64_t> top_dims_vec{top_size};
top_dims_vec.push_back(1); top_dims_vec.push_back(1);
top->Resize(top_dims_vec); top->Resize(top_dims_vec);
auto* top_data = top->mutable_data<T>(); auto* top_data = top->template mutable_data<T>();
const auto* w_data = w->data<T>(); const auto* w_data = w->template data<T>();
const auto* col_data = col->data<T>(); const auto* col_data = col->template data<T>();
auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
......
...@@ -140,7 +140,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom, ...@@ -140,7 +140,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom,
const auto& col_offset = col->lod()[0]; const auto& col_offset = col->lod()[0];
const auto& offset_x = in_col->lod()[0]; const auto& offset_x = in_col->lod()[0];
const auto& offset_y = in_row->lod()[0]; const auto& offset_y = in_row->lod()[0];
std::vector<size_t> top_offset; std::vector<uint64_t> top_offset;
int top_size = 0; int top_size = 0;
top_offset.push_back(top_size); top_offset.push_back(top_size);
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
......
if(NOT LITE_WITH_XPU)
return()
endif()
add_subdirectory(bridges) if(LITE_WITH_XTCL)
add_subdirectory(bridges)
add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges}) add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})
else()
add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps})
add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu)
add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
add_kernel(activation_compute_xpu XPU basic SRCS activation_compute.cc DEPS ${lite_kernel_deps})
add_kernel(pool_compute_xpu XPU basic SRCS pool_compute.cc DEPS ${lite_kernel_deps})
add_kernel(elementwise_compute_xpu XPU basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps})
add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps})
add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps})
add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps})
add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps})
add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps})
add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps})
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__multi_encoder_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void XPUMultiEncoderCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
for (auto* fc_weight : param.fc_weight) {
arg_fc_weight_.push_back(
reinterpret_cast<const int16_t*>(fc_weight->data<float>()));
}
for (auto* fc_bias : param.fc_bias) {
arg_fc_bias_.push_back(fc_bias->data<float>());
}
for (auto* ln_scale : param.ln_scale) {
arg_ln_scale_.push_back(ln_scale->data<float>());
}
for (auto* ln_bias : param.ln_bias) {
arg_ln_bias_.push_back(ln_bias->data<float>());
}
if (param.act_type == "relu") {
act_type_ = xdnn::Activation_t::RELU;
}
}
void XPUMultiEncoderCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int batch_size = param.input->dims()[0];
int seq_len = param.input->dims()[1];
int r = xdnn::bert_encoder_transformer_int16<int16_t>(
ctx.GetRawContext(), /* context */
batch_size, /* batch_size */
seq_len, /* from_seq_len */
seq_len, /* to_seq_len */
param.head_num, /* head_num */
param.size_per_head, /* size_per_head */
param.n_layers, /* n_layers */
param.input->data<float>(), /* from_tensor */
param.input->data<float>(), /* to_tensor */
param.mask->data<float>(), /* att_mask */
&arg_fc_weight_[0], /* fc_weights */
&arg_fc_bias_[0], /* fc_biass */
&arg_ln_scale_[0], /* ln_scales */
&arg_ln_bias_[0], /* ln_biass */
param.output->mutable_data<float>(TARGET(kXPU)), /* output */
param.fc_weight_max->data<float>(), /* fc_weights_max */
true, /* pretrans_b */
true, /* use_l3 */
act_type_ /* act_type */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(__xpu__multi_encoder,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::XPUMultiEncoderCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCWeight", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCWeightMax", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class XPUMultiEncoderCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::XPUMultiEncoderParam;
virtual void PrepareForRun();
virtual void Run();
private:
std::vector<const int16_t *> arg_fc_weight_;
std::vector<const float *> arg_fc_bias_;
std::vector<const float *> arg_ln_scale_;
std::vector<const float *> arg_ln_bias_;
xdnn::Activation_t act_type_{xdnn::Activation_t::GELU};
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__resnet50_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void XPUResNet50Compute::PrepareForRun() {
auto& param = this->Param<param_t>();
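// The filter tensors carry int16 data inside float buffers (presumably
// pre-quantized by the __xpu__resnet50 fusion pass), hence the
// reinterpret_cast below.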
for (auto* filter : param.filter) {
arg_filter_.push_back(
reinterpret_cast<const int16_t*>(filter->data<float>()));
}
for (auto* bias : param.bias) {
arg_bias_.push_back(bias->data<float>());
}
for (auto* max_filter : param.max_filter) {
arg_max_filter_.push_back(max_filter->data<float>());
}
}
void XPUResNet50Compute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int batch_size = param.input->dims()[0];
int r = xdnn::conv2d_int16_resnet<float, int16_t>(
ctx.GetRawContext(), /* context */
batch_size, /* num */
param.input->data<float>(), /* bottom */
&arg_filter_[0], /* weight_list */
param.output->mutable_data<float>(TARGET(kXPU)), /* top */
&arg_bias_[0], /* bias_list */
&arg_max_filter_[0] /* max_filter_list */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(__xpu__resnet50,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::XPUResNet50Compute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class XPUResNet50Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::XPUResNet50Param;
virtual void PrepareForRun();
virtual void Run();
private:
std::vector<const int16_t *> arg_filter_;
std::vector<const float *> arg_max_filter_;
std::vector<const float *> arg_bias_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/activation_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ReluCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::RELU, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void TanhCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::TANH, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void SigmoidCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::SIGMOID, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(
tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SigmoidCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~ReluCompute() = default;
};
class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~TanhCompute() = default;
};
class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~SigmoidCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/batch_norm_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
float epsilon = param.epsilon;
auto& x_dims = param.x->dims();
int r = xdnn::batch_norm_infer_forward(
ctx.GetRawContext(), /* context */
epsilon, /* epsilon */
x_dims[0], /* img_n */
x_dims[1], /* img_c */
x_dims[2], /* img_h */
x_dims[3], /* img_w */
param.x->data<float>(), /* img_gm */
param.y->mutable_data<float>(TARGET(kXPU)), /* out_gm */
param.scale->data<float>(), /* scale_gm */
param.bias->data<float>(), /* bias_gm */
param.mean->data<float>(), /* mean_gm */
param.variance->data<float>() /* var__gm */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(batch_norm,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::BatchNormCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class BatchNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::BatchNormParam;
virtual void Run();
virtual ~BatchNormCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
if(NOT LITE_WITH_XPU) if(NOT LITE_WITH_XTCL)
return() return()
endif() endif()
......
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/cast_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <typename InType>
void CastCompute<InType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
auto* x = param.X;
auto* out = param.Out;
int out_dtype = param.out_dtype;
auto* in_data = x->template data<InType>();
int numel = x->numel();
int r = 0;
// Paddle VarType codes for out_dtype: BOOL = 0; INT16 = 1; INT32 = 2;
// INT64 = 3; FP16 = 4; FP32 = 5; FP64 = 6; SIZE_T = 19; UINT8 = 20; INT8 = 21.
if (out_dtype == 5) {
auto* out_data = out->template mutable_data<float>(TARGET(kXPU));
r = xdnn::cast<InType, float>(
ctx.GetRawContext(), in_data, out_data, numel);
} else if (out_dtype == 2) {
auto* out_data = out->template mutable_data<int>(TARGET(kXPU));
r = xdnn::cast<InType, int>(ctx.GetRawContext(), in_data, out_data, numel);
} else if (out_dtype == 3) {
auto* out_data = out->template mutable_data<int64_t>(TARGET(kXPU));
r = xdnn::cast<InType, int64_t>(
ctx.GetRawContext(), in_data, out_data, numel);
} else {
CHECK(false);
}
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(cast,
kXPU,
kAny,
kNCHW,
paddle::lite::kernels::xpu::CastCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <typename InType>
class CastCompute : public KernelLite<TARGET(kXPU), PRECISION(kAny)> {
public:
using param_t = operators::CastParam;
void Run() override;
virtual ~CastCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/conv_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <>
void Conv2dCompute<PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
auto& w_dims = param.filter->dims();
int groups = param.groups;
auto& strides = param.strides;
auto paddings = *param.paddings;
auto dilations = *param.dilations;
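// Plain convolution: bias, residual branch and activation are not fused here
// (bias/branch are nullptr and the activation type is LINEAR).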
int r = xdnn::conv2d_forward_int16<float, float, float, float>(
ctx.GetRawContext(), /* context */
x_dims[0], /* num */
x_dims[1], /* input_c */
x_dims[2], /* input_h */
x_dims[3], /* input_w */
w_dims[0], /* num_filter */
w_dims[2], /* kernel_h */
w_dims[3], /* kernel_w */
strides[0], /* stride_h */
strides[1], /* stride_w */
paddings[0], /* pad_h */
paddings[1], /* pad_w */
dilations[0], /* dilation_h */
dilations[1], /* dilation_w */
groups, /* group */
param.x->data<float>(), /* bottom */
param.filter->data<float>(), /* weight */
param.output->mutable_data<float>(TARGET(kXPU)), /* top */
nullptr, /* bias */
nullptr, /* branch */
xdnn::Activation_t::LINEAR, /* type */
nullptr, /* max_image_ptr */
nullptr, /* max_filter_ptr */
nullptr /* max_result_ptr */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace xpu = paddle::lite::kernels::xpu;
using Conv2dFp32 = xpu::Conv2dCompute<PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(conv2d, kXPU, kFloat, kNCHW, Conv2dFp32, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <PrecisionType FilterPtype>
class Conv2dCompute : public KernelLite<TARGET(kXPU), FilterPtype> {
public:
using param_t = operators::ConvParam;
virtual void Run();
virtual ~Conv2dCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/dropout_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void DropoutCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
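// This kernel treats dropout as an identity at inference: the input is copied
// to the output on the device and no (1 - dropout_prob) scaling is applied.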
int size = param.x->numel() * sizeof(float);
int r = xdnn::memcpy_device(
ctx.GetRawContext(), /* context */
param.output->mutable_data<float>(TARGET(kXPU)), /* dst */
param.x->data<float>(), /* src */
size /* size */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(dropout,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::DropoutCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class DropoutCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::DropoutParam;
virtual void Run();
virtual ~DropoutCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/elementwise_compute.h"
#include <functional>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ElementwiseAddCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.X->dims().data();
auto& y_dims = param.Y->dims();
int axis = param.axis;
if (param.axis == -1) {
axis = x_dims.size() - y_dims.size();
}
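// X is treated as `iter` contiguous slices of length Y->numel() (Y is expected
// to match the trailing dimensions of X from `axis` on); Y is added to each
// slice in turn.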
int iter = std::accumulate(
x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
int stride = param.Y->numel();
for (int i = 0; i < iter; ++i) {
const float* x_ptr = param.X->data<float>() + i * stride;
const float* y_ptr = param.Y->data<float>();
float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
int r = xdnn::elementwise_add(ctx.GetRawContext(), /* context */
x_ptr, /* x */
y_ptr, /* y */
o_ptr, /* z */
stride /* len */);
CHECK_EQ(r, 0);
}
}
void ElementwiseSubCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.X->dims().data();
auto& y_dims = param.Y->dims();
int axis = param.axis;
if (param.axis == -1) {
axis = x_dims.size() - y_dims.size();
}
int iter = std::accumulate(
x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
int stride = param.Y->numel();
for (int i = 0; i < iter; ++i) {
const float* x_ptr = param.X->data<float>() + i * stride;
const float* y_ptr = param.Y->data<float>();
float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
int r = xdnn::elementwise_sub(ctx.GetRawContext(), /* context */
x_ptr, /* x */
y_ptr, /* y */
o_ptr, /* z */
stride /* len */);
CHECK_EQ(r, 0);
}
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(elementwise_add,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ElementwiseAddCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ElementwiseAddCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
virtual void Run();
virtual ~ElementwiseAddCompute() = default;
};
class ElementwiseSubCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
virtual void Run();
virtual ~ElementwiseSubCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
/*
* This kernel copies a tensor from host to XPU.
*/
class IoCopyHostToXPUCompute
: public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kX86) ||
param.x->target() == TARGET(kARM));
auto mem_size = param.x->memory_size();
VLOG(4) << "host to xpu, copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kXPU), mem_size);
TargetWrapperXPU::MemcpySync(
data, param.x->raw_data(), mem_size, IoDirection::HtoD);
}
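// The inferred output type keeps the input's id, precision and layout and only
// switches the target from host to XPU.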
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
*res = [](const std::map<std::string, const Type*>& inputs,
const std::string& out) -> const Type* {
CHECK(!inputs.empty());
auto* type = inputs.at("Input");
CHECK(type->target() == TARGET(kHost));
auto out_place = type->place();
out_place.target = TARGET(kXPU);
auto* out_type = Type::Get(type->id(),
out_place.target,
out_place.precision,
out_place.layout,
out_place.device);
return out_type;
};
return res;
}
std::string doc() const override { return "Copy IO from HOST to XPU"; }
};
/*
* This kernel copies a tensor from XPU to host.
*/
class IoCopyXPUToHostCompute
: public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kXPU));
auto mem_size = param.x->memory_size();
VLOG(4) << "xpu to host, copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
TargetWrapperXPU::MemcpySync(
data, param.x->raw_data(), mem_size, IoDirection::DtoH);
}
std::string doc() const override { return "Copy IO from XPU to HOST"; }
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(io_copy,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/layer_norm_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void LayerNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto x_dims = param.X->dims();
auto axis = param.begin_norm_axis;
auto matrix_dim = x_dims.Flatten2D(axis);
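// The input is viewed as an (m, n) matrix split at begin_norm_axis; each row
// of length n is normalized independently.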
float epsilon = param.epsilon;
int r = xdnn::layer_norm(ctx.GetRawContext(), /* context */
matrix_dim[0], /* m */
matrix_dim[1], /* n */
param.X->data<float>(), /* in */
param.Y->mutable_data<float>(TARGET(kXPU)), /* out */
param.Scale->data<float>(), /* scale */
param.Bias->data<float>(), /* bias */
epsilon /* epsilon */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(layer_norm,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LayerNormCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class LayerNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::LayerNormParam;
virtual void Run();
virtual ~LayerNormCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/lookup_table_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void LookupTableCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int num = param.Ids->numel();
int embed_dim = param.W->dims()[1];
int r = xdnn::embedding<float, int64_t>(
ctx.GetRawContext(), /* context */
num, /* num */
param.Ids->data<int64_t>(), /* indices */
embed_dim, /* embed_dim */
param.W->data<float>(), /* table */
param.Out->mutable_data<float>(TARGET(kXPU)) /* top */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(lookup_table,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class LookupTableCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::LookupTableParam;
virtual void Run();
virtual ~LookupTableCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/matmul_compute.h"
#include "lite/backends/xpu/math.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace math = paddle::lite::xpu::math;
void MatMulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto* x = param.X;
auto* y = param.Y;
auto* out = param.Out;
auto mat_dim_a = math::CreateMatrixDescriptor(
math::RowMatrixFromVector(x->dims()), 0, param.transpose_X);
auto mat_dim_b = math::CreateMatrixDescriptor(
math::ColumnMatrixFromVector(y->dims()), 0, param.transpose_Y);
int lda = (mat_dim_a.trans_ ? mat_dim_a.height_ : mat_dim_a.width_);
int ldb = (mat_dim_b.trans_ ? mat_dim_b.height_ : mat_dim_b.width_);
int ldc = mat_dim_b.width_;
int r = 0;
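// A batch size of 0 or 1 degenerates to a single GEMM via fc_int16; otherwise
// a strided batched GEMM is issued, one matrix product per batch element.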
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
r = xdnn::fc_int16(ctx.GetRawContext(), /* context */
mat_dim_a.trans_, /* TransA */
mat_dim_b.trans_, /* TransB */
mat_dim_a.height_, /* m */
mat_dim_b.width_, /* n */
mat_dim_a.width_, /* k */
param.alpha, /* alpha */
x->data<float>(), /* A */
y->data<float>(), /* B */
0.0f, /* beta */
out->mutable_data<float>(TARGET(kXPU)) /* C */);
} else {
// batch matmul
r = xdnn::gemm_strided_batched_int16<float, float, float>(
ctx.GetRawContext(), /* context */
mat_dim_a.trans_, /* TransA */
mat_dim_b.trans_, /* TransB */
mat_dim_a.batch_size_, /* batch_size */
mat_dim_a.height_, /* M */
mat_dim_b.width_, /* N */
mat_dim_a.width_, /* K */
param.alpha, /* alpha */
x->data<float>(), /* A */
lda, /* lda */
mat_dim_a.stride_, /* stride_a */
y->data<float>(), /* B */
ldb, /* ldb */
mat_dim_b.stride_, /* stride_b */
0.0f, /* beta */
out->mutable_data<float>(TARGET(kXPU)), /* C */
ldc, /* ldc */
mat_dim_a.height_ * mat_dim_b.width_ /* stride_c */);
}
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
matmul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MatMulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class MatMulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::MatMulParam;
virtual void Run();
virtual ~MatMulCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/mul_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void MulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& origin_x = *param.x;
auto& origin_y = *param.y;
auto& x_dims = origin_x.dims();
auto& y_dims = origin_y.dims();
Tensor x_matrix, y_matrix;
if (x_dims.size() > 2) {
x_matrix = ReshapeToMatrix(origin_x, param.x_num_col_dims);
} else {
x_matrix = origin_x;
}
if (y_dims.size() > 2) {
y_matrix = ReshapeToMatrix(origin_y, param.y_num_col_dims);
} else {
y_matrix = origin_y;
}
int m = x_matrix.dims()[0];
int k = x_matrix.dims()[1];
int n = y_matrix.dims()[1];
int r =
xdnn::fc_int16(ctx.GetRawContext(), /* context */
false, /* TransA */
false, /* TransB */
m,
n,
k,
1.0f, /* alpha */
x_matrix.data<float>(), /* A */
y_matrix.data<float>(), /* B */
0.0f, /* beta */
param.output->mutable_data<float>(TARGET(kXPU)) /* C */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
mul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
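// Folds the first `num_col_dims` dimensions of a rank > 2 tensor into rows and
// the remaining dimensions into columns; the returned 2-D tensor shares its
// buffer with `src`.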
static inline lite::Tensor ReshapeToMatrix(const lite::Tensor& src,
int num_col_dims) {
int rank = src.dims().size();
if (rank == 2) {
return src;
}
lite::Tensor res;
res.ShareDataWith(src);
res.Resize(src.dims().Flatten2D(num_col_dims));
return res;
}
class MulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
virtual void Run();
virtual ~MulCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/pool_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void Pool2DCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
CHECK_EQ(x_dims.size(), 4);
auto& o_dims = param.output->dims();
CHECK_EQ(param.ksize.size(), 2);
if (param.global_pooling) {
param.ksize[0] = x_dims[2];
param.ksize[1] = x_dims[3];
}
CHECK_EQ(param.strides.size(), 2);
CHECK_EQ(param.paddings->size(), 4);
auto& paddings = *param.paddings;
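// Max pooling by default; for average pooling pick AVG_WITHOUT_PAD when all
// paddings are zero and AVG_WITH_PAD otherwise.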
auto type = xdnn::MAX_WITHOUT_INDEX;
if (param.pooling_type == "avg") {
if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 &&
paddings[3] == 0) {
type = xdnn::AVG_WITHOUT_PAD;
} else {
type = xdnn::AVG_WITH_PAD;
}
}
int r = xdnn::pooling_forward<float, float>(
ctx.GetRawContext(), /* context */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)), /* y */
nullptr, /* y_index */
type, /* type */
x_dims[0] * x_dims[1], /* c */
x_dims[2], /* in_h */
x_dims[3], /* in_w */
paddings[0], /* pad_left */
paddings[1], /* pad_right */
paddings[2], /* pad_up */
paddings[3], /* pad_down */
param.ksize[0], /* win_h */
param.ksize[1], /* win_w */
param.strides[0], /* stride_h */
param.strides[1], /* stride_w */
o_dims[2], /* out_h */
o_dims[3] /* out_w */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class Pool2DCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::PoolParam;
virtual void Run();
virtual ~Pool2DCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/scale_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ScaleCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
int r = xdnn::scale(ctx.GetRawContext(), /* context */
x_dims.production(), /* len */
param.scale, /* alpha */
param.bias, /* beta */
param.bias_after_scale, /* bias_after_scale */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
scale, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ScaleCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ScaleCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ScaleParam;
virtual void Run();
virtual ~ScaleCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/slice_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void SliceCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto x_dims = param.X->dims();
x_shape_.resize(x_dims.size());
x_dim_begin_.resize(x_dims.size());
x_dim_end_.resize(x_dims.size());
}
void SliceCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto x_dims = param.X->dims();
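// Start from the full extent of every dimension, then narrow the begin/end of
// the dimensions listed in `axes` with the corresponding `starts`/`ends`.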
for (size_t i = 0; i < x_dims.size(); ++i) {
x_shape_[i] = x_dims[i];
x_dim_begin_[i] = 0;
x_dim_end_[i] = x_dims[i];
}
for (size_t i = 0; i < param.axes.size(); ++i) {
int axis = param.axes[i];
x_dim_begin_[axis] = param.starts[i];
x_dim_end_[axis] = param.ends[i];
}
int ndim = param.X->dims().size();
int r = xdnn::slice_forward(
ctx.GetRawContext(), /* context */
&x_shape_[0], /* shape */
&x_dim_begin_[0], /* starts */
&x_dim_end_[0], /* ends */
ndim, /* n */
param.X->data<float>(), /* in */
param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
slice, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SliceCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class SliceCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::SliceParam;
virtual void PrepareForRun();
virtual void Run();
virtual ~SliceCompute() = default;
private:
std::vector<int> x_shape_;
std::vector<int> x_dim_begin_;
std::vector<int> x_dim_end_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/softmax_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void SoftmaxCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
int axis = CanonicalAxis(param.axis, x_dims.size());
int rows = SizeToAxis(axis, x_dims);
int cols = SizeFromAxis(axis, x_dims);
int r = xdnn::softmax2d_forward(
ctx.GetRawContext(), /* context */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)), /* y */
rows, /* rows */
cols /* cols */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(softmax,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SoftmaxCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
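// Helpers that view the input as a 2-D matrix: dimensions before `axis` are
// folded into rows and the remaining ones into columns, so the kernel can call
// softmax2d_forward row-wise.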
static inline int CanonicalAxis(const int axis, const int rank) {
if (axis < 0) {
return axis + rank;
}
return axis;
}
static inline int SizeToAxis(const int axis, lite::DDim dims) {
int size = 1;
for (int i = 0; i < axis; i++) {
size *= dims[i];
}
return size;
}
static inline int SizeFromAxis(const int axis, lite::DDim dims) {
int size = 1;
for (size_t i = axis; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
class SoftmaxCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::SoftmaxParam;
virtual void Run();
virtual ~SoftmaxCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/stack_compute.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void StackCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
int n = param.X.size();
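// xdnn::stack_forward consumes a device-side array of input pointers, so room
// for n device pointers (8 bytes each) is allocated once here.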
void* x_ptr = nullptr;
xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */);
x_ptr_guard_.reset(x_ptr);
x_ptr_cpu_.resize(n);
}
void StackCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int n = param.X.size();
auto x_dims = param.X[0]->dims();
int axis = param.axis;
// A negative axis is relative to the output rank, which is the input rank + 1
// because stack inserts a new dimension.
if (axis < 0) axis += (x_dims.size() + 1);
auto matrix = x_dims.Flatten2D(axis);
int height = matrix[0];
int width = matrix[1];
for (int i = 0; i < n; ++i) {
x_ptr_cpu_[i] = param.X[i]->data<float>();
}
xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE);
int r = xdnn::stack_forward(
ctx.GetRawContext(), /* context */
height, /* height */
width, /* width */
n, /* n */
x_ptr_guard_.get(), /* x_ptr */
param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
stack, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::StackCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
struct XPUFreeDeleter {
void operator()(void* p) const { xpu_free(p); }
};
class StackCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::StackParam;
virtual void PrepareForRun();
virtual void Run();
virtual ~StackCompute() = default;
private:
std::unique_ptr<void, XPUFreeDeleter> x_ptr_guard_;
std::vector<const float*> x_ptr_cpu_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
......
...@@ -14,7 +14,7 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} ) ...@@ -14,7 +14,7 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} )
add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS}) add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS})
add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS}) add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS})
add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS}) add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS})
add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS}) add_operator(activation_basic_ops basic SRCS activation_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS}) add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS})
add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS}) add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS})
add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS}) add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS})
...@@ -60,6 +60,7 @@ add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) ...@@ -60,6 +60,7 @@ add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS})
add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS})
# 3.extra ops # 3.extra ops
add_operator(activation_extra_ops extra SRCS activation_extra_ops.cc DEPS ${op_DEPS})
add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS}) add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS})
add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS}) add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS})
add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS})
...@@ -73,6 +74,7 @@ add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS}) ...@@ -73,6 +74,7 @@ add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS})
add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS})
add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS})
add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS})
add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS})
add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS})
add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS})
add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS})
...@@ -105,6 +107,7 @@ add_operator(conditional_block_op_lite extra SRCS conditional_block_op.cc DEPS $ ...@@ -105,6 +107,7 @@ add_operator(conditional_block_op_lite extra SRCS conditional_block_op.cc DEPS $
add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.cc DEPS ${op_DEPS})
add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS})
add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS})
add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS})
# for OCR specific # for OCR specific
add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
...@@ -148,6 +151,10 @@ add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DE ...@@ -148,6 +151,10 @@ add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DE
add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS}) add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS})
add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS}) add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS})
# Only for XPU
add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS})
add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86) if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc lite_cc_test(test_fc_op SRCS fc_op_test.cc
DEPS fc_op memory DEPS fc_op memory
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__multi_encoder_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool XPUMultiEncoderOp::CheckShape() const { return true; }
bool XPUMultiEncoderOp::InferShapeImpl() const {
auto input_shape = param_.input->dims();
param_.output->Resize(input_shape);
return true;
}
bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc,
lite::Scope* scope) {
param_.input = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Input").front())->Get<lite::Tensor>());
param_.mask = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Mask").front())->Get<lite::Tensor>());
param_.fc_weight_max = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("FCWeightMax").front())
->Get<lite::Tensor>());
param_.output = scope->FindVar(op_desc.Output("Output").front())
->GetMutable<lite::Tensor>();
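// FCWeight/FCBias/LNScale/LNBias are per-layer tensor lists; collect them in the
// order given by the op desc.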
param_.fc_weight.clear();
for (auto& name : op_desc.Input("FCWeight")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.fc_weight.push_back(t);
}
param_.fc_bias.clear();
for (auto& name : op_desc.Input("FCBias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.fc_bias.push_back(t);
}
param_.ln_scale.clear();
for (auto& name : op_desc.Input("LNScale")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.ln_scale.push_back(t);
}
param_.ln_bias.clear();
for (auto& name : op_desc.Input("LNBias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.ln_bias.push_back(t);
}
param_.n_layers = op_desc.GetAttr<int>("n_layers");
param_.head_num = op_desc.GetAttr<int>("head_num");
param_.size_per_head = op_desc.GetAttr<int>("size_per_head");
param_.act_type = op_desc.GetAttr<std::string>("act_type");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__multi_encoder,
paddle::lite::operators::XPUMultiEncoderOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class XPUMultiEncoderOp : public OpLite {
public:
XPUMultiEncoderOp() {}
explicit XPUMultiEncoderOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "MultiEncoder"; }
private:
mutable XPUMultiEncoderParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__resnet50_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool XPUResNet50Op::CheckShape() const { return true; }
bool XPUResNet50Op::InferShapeImpl() const {
auto input_shape = param_.input->dims();
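// The fused op's output shape is fixed to [N, 2048, 1, 1], presumably because the
// fused ResNet50 graph ends with a global pooling over the spatial dimensions.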
input_shape[1] = 2048;
input_shape[2] = 1;
input_shape[3] = 1;
param_.output->Resize(input_shape);
return true;
}
bool XPUResNet50Op::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
param_.input = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Input").front())->Get<lite::Tensor>());
param_.output = scope->FindVar(op_desc.Output("Output").front())
->GetMutable<lite::Tensor>();
param_.filter.clear();
for (auto& name : op_desc.Input("Filter")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.filter.push_back(t);
}
param_.bias.clear();
for (auto& name : op_desc.Input("Bias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.bias.push_back(t);
}
param_.max_filter.clear();
for (auto& name : op_desc.Input("MaxFilter")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.max_filter.push_back(t);
}
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__resnet50, paddle::lite::operators::XPUResNet50Op);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class XPUResNet50Op : public OpLite {
public:
XPUResNet50Op() {}
explicit XPUResNet50Op(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "ResNet50"; }
private:
mutable XPUResNet50Param param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/operators/activation_ops.h"
// Extra activation ops
REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_swish, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(reciprocal, paddle::lite::operators::ActivationOp);
...@@ -74,6 +74,14 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { ...@@ -74,6 +74,14 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
} else if (opdesc.Type() == "abs") { } else if (opdesc.Type() == "abs") {
// abs // abs
param_.active_type = lite_api::ActivationType::kAbs; param_.active_type = lite_api::ActivationType::kAbs;
} else if (opdesc.Type() == "hard_swish") {
// hard_swish
param_.active_type = lite_api::ActivationType::kHardSwish;
param_.hard_swish_threshold = opdesc.GetAttr<float>("threshold");
param_.hard_swish_scale = opdesc.GetAttr<float>("scale");
param_.hard_swish_offset = opdesc.GetAttr<float>("offset");
} else if (opdesc.Type() == "reciprocal") {
param_.active_type = lite_api::ActivationType::kReciprocal;
} }
VLOG(4) << "opdesc.Type():" << opdesc.Type(); VLOG(4) << "opdesc.Type():" << opdesc.Type();
...@@ -84,21 +92,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { ...@@ -84,21 +92,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp); // Basic activation ops
REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sigmoid, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(tanh, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(tanh, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/ctc_align_op.h"
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool CtcAlignOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.input != nullptr);
CHECK_OR_FALSE(param_.output != nullptr);
auto* input = param_.input;
auto* input_length = param_.input_length;
auto input_lod = input->lod();
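// Sequence information must come either from the input's LoD or from the optional
// InputLength tensor.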
CHECK_OR_FALSE(!input_lod.empty() || input_length != nullptr);
return true;
}
bool CtcAlignOpLite::InferShapeImpl() const {
auto input_dims = param_.input->dims();
// The exact output shape is data-dependent, so use the input dims as a placeholder here.
param_.output->Resize(input_dims);
if (param_.input_length != nullptr && param_.output_length != nullptr) {
param_.output_length->Resize({input_dims[0], 1});
}
return true;
}
bool CtcAlignOpLite::AttachImpl(const cpp::OpDesc& op_desc,
lite::Scope* scope) {
AttachInput(op_desc, scope, "Input", false, &param_.input);
AttachInput(op_desc, scope, "InputLength", true, &param_.input_length);
AttachOutput(op_desc, scope, "Output", false, &param_.output);
AttachOutput(op_desc, scope, "OutputLength", true, &param_.output_length);
param_.blank = op_desc.GetAttr<int>("blank");
param_.merge_repeated = op_desc.GetAttr<bool>("merge_repeated");
param_.padding_value = op_desc.GetAttr<int>("padding_value");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(ctc_align, paddle::lite::operators::CtcAlignOpLite);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class CtcAlignOpLite : public OpLite {
public:
CtcAlignOpLite() {}
explicit CtcAlignOpLite(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "ctc_align"; }
private:
mutable CtcAlignParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -336,17 +336,22 @@ struct ConcatParam : ParamBase { ...@@ -336,17 +336,22 @@ struct ConcatParam : ParamBase {
/// ----------------------- activation operators ---------------------- /// ----------------------- activation operators ----------------------
struct ActivationParam : ParamBase { struct ActivationParam : ParamBase {
const lite::Tensor* X{}; const lite::Tensor* X{};
lite::Tensor* Out{};
lite_api::ActivationType active_type;
bool has_active{false};
float Leaky_relu_alpha{0}; // leaky_relu param float Leaky_relu_alpha{0}; // leaky_relu param
float Relu_clipped_coef{6}; // relu_clipped param float Relu_clipped_coef{6}; // relu_clipped param
std::string Prelu_mode{ std::string Prelu_mode{
"channel"}; // prelu param, can be "all", "channel" or "element" "channel"}; // prelu param, can be "all", "channel" or "element"
lite::Tensor* Prelu_alpha{}; // prelu param lite::Tensor* Prelu_alpha{}; // prelu param
float Swish_beta; // swish param float Swish_beta; // swish param
// hard_sigmoid param
float hard_sigmoid_slope{0.2}; float hard_sigmoid_slope{0.2};
float hard_sigmoid_offset{0.5}; float hard_sigmoid_offset{0.5};
lite::Tensor* Out{}; // hard_swish param
bool has_active{false}; float hard_swish_threshold{6.0};
lite_api::ActivationType active_type; float hard_swish_scale{6.0};
float hard_swish_offset{3.0};
}; };
struct ActivationGradParam : ParamBase { struct ActivationGradParam : ParamBase {
...@@ -1019,6 +1024,12 @@ struct SequenceExpandParam : ParamBase { ...@@ -1019,6 +1024,12 @@ struct SequenceExpandParam : ParamBase {
int ref_level{-1}; int ref_level{-1};
}; };
struct SequenceUnpadParam : ParamBase {
const lite::Tensor* X{};
const lite::Tensor* Length{};
lite::Tensor* Out{};
};
struct SequenceExpandAsParam : ParamBase { struct SequenceExpandAsParam : ParamBase {
const lite::Tensor* x{nullptr}; const lite::Tensor* x{nullptr};
const lite::Tensor* y{nullptr}; const lite::Tensor* y{nullptr};
...@@ -1438,6 +1449,40 @@ struct CrfDecodingParam : ParamBase { ...@@ -1438,6 +1449,40 @@ struct CrfDecodingParam : ParamBase {
lite::Tensor* viterbi_path{}; lite::Tensor* viterbi_path{};
}; };
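// ctc_align parameters: blank is the label to drop, merge_repeated collapses consecutive
// duplicates, and padding_value fills the tail when explicit lengths are given.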
struct CtcAlignParam : ParamBase {
lite::Tensor* input{};
lite::Tensor* input_length{};
lite::Tensor* output{};
lite::Tensor* output_length{};
int blank{0};
bool merge_repeated{true};
int padding_value{0};
};
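// Parameters of the fused XPU ResNet50 op; filter/bias/max_filter each hold one tensor
// per convolution in the fused graph.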
struct XPUResNet50Param : ParamBase {
lite::Tensor* input{};
std::vector<lite::Tensor*> filter;
std::vector<lite::Tensor*> bias;
std::vector<lite::Tensor*> max_filter;
lite::Tensor* output{};
};
struct XPUMultiEncoderParam : ParamBase {
lite::Tensor* input{};
std::vector<lite::Tensor*> fc_weight;
std::vector<lite::Tensor*> fc_bias;
std::vector<lite::Tensor*> ln_scale;
std::vector<lite::Tensor*> ln_bias;
lite::Tensor* fc_weight_max{};
lite::Tensor* mask{};
lite::Tensor* output{};
int n_layers{};
int head_num{};
int size_per_head{};
std::string act_type{};
};
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/sequence_unpad_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool SequenceUnpadOp::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Length);
CHECK_OR_FALSE(param_.Out);
auto x_dims = param_.X->dims();
auto len_dims = param_.Length->dims();
CHECK(x_dims.size() >= 2) << "Rank of X can't be less than 2";
CHECK(len_dims.size() == 1) << "Rank of Length should be 1";
CHECK(x_dims[0] == len_dims[0])
<< "X and Length should have the same 1st dim";
return true;
}
bool SequenceUnpadOp::InferShapeImpl() const {
auto x_dims = param_.X->dims();
auto len_dims = param_.Length->dims();
auto *seq_len_ptr = param_.Length->data<int64_t>();
int64_t batch_size = len_dims[0];
std::vector<uint64_t> out_lod0(batch_size + 1, 0);
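// Accumulate per-sequence lengths into the output LoD, e.g. lengths {3, 2} -> lod {0, 3, 5}.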
for (int64_t i = 0; i < batch_size; ++i) {
out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
}
paddle::lite::LoD out_lod;
out_lod.push_back(out_lod0);
int64_t out_dim0 = out_lod0.back();
std::vector<int64_t> out_dims{out_dim0};
if (x_dims.size() == 2) {
out_dims.push_back(1);
} else {
for (size_t i = 2; i < x_dims.size(); ++i) {
out_dims.push_back(x_dims[i]);
}
}
param_.Out->Resize(out_dims);
param_.Out->set_lod(out_lod);
return true;
}
bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc,
lite::Scope *scope) {
param_.X = const_cast<lite::Tensor *>(
&scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
param_.Length = const_cast<lite::Tensor *>(
&scope->FindVar(opdesc.Input("Length").front())->Get<lite::Tensor>());
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(sequence_unpad, paddle::lite::operators::SequenceUnpadOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class SequenceUnpadOp : public OpLite {
public:
SequenceUnpadOp() {}
explicit SequenceUnpadOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "sequence_unpad"; }
private:
mutable SequenceUnpadParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -47,6 +47,7 @@ bool StackOp::InferShapeImpl() const { ...@@ -47,6 +47,7 @@ bool StackOp::InferShapeImpl() const {
bool StackOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { bool StackOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
auto X = op_desc.Input("X"); auto X = op_desc.Input("X");
auto Out = op_desc.Output("Y").front(); auto Out = op_desc.Output("Y").front();
param_.X.clear();
for (auto var : X) { for (auto var : X) {
param_.X.emplace_back(scope->FindVar(var)->GetMutable<lite::Tensor>()); param_.X.emplace_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
} }
......
add_subdirectory(kernels) add_subdirectory(kernels)
add_subdirectory(math) add_subdirectory(math)
add_subdirectory(cv) add_subdirectory(cv)
add_subdirectory(api)
if(LITE_WITH_XPU)
lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
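// Builds a host tensor of the given shape filled with ones; used as dummy token ids
// for all four model inputs below.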
template <typename T>
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Ernie, test_ernie_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1;
int64_t seq_len = 64;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
std::vector<int64_t> input_shape{batch_size, seq_len, 1};
predictor->GetInput(0)->Resize(input_shape);
predictor->GetInput(1)->Resize(input_shape);
predictor->GetInput(2)->Resize(input_shape);
predictor->GetInput(3)->Resize(input_shape);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.278893, 0.330888, 0.39022}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 3);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
template <typename T>
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Ernie, test_ernie_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1;
int64_t seq_len = 64;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
std::vector<int64_t> input_shape{batch_size, seq_len, 1};
predictor->GetInput(0)->Resize(input_shape);
predictor->GetInput(1)->Resize(input_shape);
predictor->GetInput(2)->Resize(input_shape);
predictor->GetInput(3)->Resize(input_shape);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.108398}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
TEST(Resnet50, test_resnet50_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
std::vector<int64_t> input_shape{1, 3, 224, 224};
input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>();
int input_num = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
for (int i = 0; i < input_num; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>(
{0.000268651, 0.000174053, 0.000213181, 0.000396771, 0.000591516,
0.00018169, 0.000289721, 0.000855934, 0.000732185, 9.2055e-05,
0.000220664, 0.00235289, 0.00571265, 0.00357688, 0.00129667,
0.000465392, 0.000143775, 0.000211628, 0.000617144, 0.000265033}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000);
int step = 50;
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j],
1e-5);
}
}
}
} // namespace lite
} // namespace paddle
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm) lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
endif() endif()
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
...@@ -61,6 +61,7 @@ if(LITE_BUILD_EXTRA) ...@@ -61,6 +61,7 @@ if(LITE_BUILD_EXTRA)
lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
# for training kernel # for training kernel
if (LITE_WITH_TRAIN) if (LITE_WITH_TRAIN)
......
...@@ -36,7 +36,9 @@ enum activation_type_test { ...@@ -36,7 +36,9 @@ enum activation_type_test {
FLOOR, FLOOR,
RSQRT, RSQRT,
GELU, GELU,
SQUARE SQUARE,
HARD_SWISH,
RECIPROCAL
}; };
class ActivationComputeTester : public arena::TestCase { class ActivationComputeTester : public arena::TestCase {
...@@ -49,6 +51,9 @@ class ActivationComputeTester : public arena::TestCase { ...@@ -49,6 +51,9 @@ class ActivationComputeTester : public arena::TestCase {
float relu_clipped_coef_ = 6.; float relu_clipped_coef_ = 6.;
std::string prelu_mode_ = ""; std::string prelu_mode_ = "";
float swish_beta_ = 0.; float swish_beta_ = 0.;
float hard_swish_threshold = 6.0;
float hard_swish_scale = 6.0;
float hard_swish_offset = 3.0;
DDim dims_{{1}}; DDim dims_{{1}};
std::string type_ = ""; std::string type_ = "";
activation_type_test act_type_ = RELU; activation_type_test act_type_ = RELU;
...@@ -199,6 +204,20 @@ class ActivationComputeTester : public arena::TestCase { ...@@ -199,6 +204,20 @@ class ActivationComputeTester : public arena::TestCase {
} }
break; break;
} }
case HARD_SWISH: {
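// Reference hard_swish: out = min(max(0, x + offset), threshold) * x / scale.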
for (int i = 0; i < dims_.production(); i++) {
float max_value = std::max(0.f, x_data[i] + hard_swish_offset);
float min_value = std::min(max_value, hard_swish_threshold);
output_data[i] = min_value * x_data[i] / hard_swish_scale;
}
break;
}
case RECIPROCAL: {
for (int i = 0; i < dims_.production(); i++) {
output_data[i] = 1.0 / x_data[i];
}
break;
}
default: default:
LOG(INFO) << "the type of activation is unknow."; LOG(INFO) << "the type of activation is unknow.";
} }
...@@ -221,6 +240,11 @@ class ActivationComputeTester : public arena::TestCase { ...@@ -221,6 +240,11 @@ class ActivationComputeTester : public arena::TestCase {
if (act_type_ == SWISH) { if (act_type_ == SWISH) {
op_desc->SetAttr("beta", swish_beta_); op_desc->SetAttr("beta", swish_beta_);
} }
if (act_type_ == HARD_SWISH) {
op_desc->SetAttr("threshold", hard_swish_threshold);
op_desc->SetAttr("scale", hard_swish_scale);
op_desc->SetAttr("offset", hard_swish_offset);
}
} }
void PrepareData() override { void PrepareData() override {
...@@ -552,5 +576,61 @@ TEST(Activation_gelu, precision) { ...@@ -552,5 +576,61 @@ TEST(Activation_gelu, precision) {
} }
} }
TEST(activation_hard_swish, precision) {
LOG(INFO) << "test hard_swish op";
Place place;
float abs_error = 2e-5;
#if defined(LITE_WITH_ARM)
place = TARGET(kARM);
#else
return;
#endif
for (auto dims : std::vector<std::vector<int64_t>>{
{1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
std::unique_ptr<arena::TestCase> tester(
new ActivationComputeTester(place,
"def",
0.01,
6.,
"all",
0.,
DDim(dims),
"hard_swish",
HARD_SWISH));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
TEST(activation_reciprocal, precision) {
LOG(INFO) << "test reciprocal op";
Place place;
float abs_error = 2e-5;
#if defined(LITE_WITH_ARM)
place = TARGET(kARM);
#else
return;
#endif
for (auto dims : std::vector<std::vector<int64_t>>{
{1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
std::unique_ptr<arena::TestCase> tester(
new ActivationComputeTester(place,
"def",
0.01,
6.,
"all",
0.,
DDim(dims),
"reciprocal",
RECIPROCAL));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
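// ctc_align strips blanks (and optionally merges repeated labels) from CTC predictions;
// the tester feeds fixed inputs and compares the kernel output against precomputed results.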
class CtcAlignComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string input_ = "input";
std::string input_length_ = "input_length";
std::string output_ = "output";
std::string output_length_ = "output_length";
std::vector<int> input_data_;
std::vector<int64_t> input_shape_;
std::vector<std::vector<uint64_t>> input_lod_;
std::vector<int> input_length_data_;
std::vector<int64_t> input_length_shape_;
std::vector<int> output_data_;
std::vector<int64_t> output_shape_;
std::vector<std::vector<uint64_t>> output_lod_;
std::vector<int> output_length_data_;
std::vector<int64_t> output_length_shape_;
int blank_;
bool merge_repeated_;
int padding_value_;
public:
CtcAlignComputeTester(const Place& place,
const std::string& alias,
const std::vector<int>& input_data,
const std::vector<int64_t> input_shape,
const std::vector<std::vector<uint64_t>>& input_lod,
const std::vector<int>& input_length_data,
const std::vector<int64_t> input_length_shape,
const int blank,
const bool merge_repeated,
const int padding_value,
const std::vector<int>& output_data,
const std::vector<int64_t>& output_shape,
const std::vector<std::vector<uint64_t>>& output_lod,
const std::vector<int>& output_length_data,
const std::vector<int64_t>& output_length_shape)
: TestCase(place, alias) {
input_data_ = input_data;
input_shape_ = input_shape;
input_lod_ = input_lod;
input_length_data_ = input_length_data;
input_length_shape_ = input_length_shape;
blank_ = blank;
merge_repeated_ = merge_repeated;
padding_value_ = padding_value;
output_data_ = output_data;
output_shape_ = output_shape;
output_lod_ = output_lod;
output_length_data_ = output_length_data;
output_length_shape_ = output_length_shape;
}
void RunBaseline(Scope* scope) override {
auto* output_tensor = scope->NewTensor(output_);
output_tensor->Resize(output_shape_);
if (!output_lod_.empty()) {
output_tensor->set_lod(output_lod_);
}
auto* output_data = output_tensor->mutable_data<int>();
int64_t output_num = 1;
for (auto e : output_shape_) {
output_num *= e;
}
for (int i = 0; i < output_num; i++) {
output_data[i] = output_data_[i];
}
if (!input_length_data_.empty() && !output_length_data_.empty()) {
auto* output_length_tensor = scope->NewTensor(output_length_);
output_length_tensor->Resize(output_length_shape_);
auto* output_length_data = output_length_tensor->mutable_data<int>();
int64_t num = 1;
for (auto e : output_length_shape_) {
num *= e;
}
for (int i = 0; i < num; i++) {
output_length_data[i] = output_length_data_[i];
}
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType("ctc_align");
op_desc->SetInput("Input", {input_});
op_desc->SetOutput("Output", {output_});
if (!input_length_data_.empty()) {
op_desc->SetInput("InputLength", {input_length_});
op_desc->SetOutput("OutputLength", {output_length_});
}
op_desc->SetAttr("blank", blank_);
op_desc->SetAttr("merge_repeated", merge_repeated_);
op_desc->SetAttr("padding_value", padding_value_);
}
void PrepareData() override {
SetCommonTensor(input_, DDim(input_shape_), input_data_.data(), input_lod_);
if (!input_length_data_.empty()) {
SetCommonTensor(
input_length_, DDim(input_length_shape_), input_length_data_.data());
}
}
};
TEST(CtcAlign1, precision) {
LOG(INFO) << "test ctc_align op";
#ifdef LITE_WITH_ARM
// Define variable
const std::vector<int>& input_data = {
0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0};
const std::vector<int64_t> input_shape = {18, 1};
const std::vector<std::vector<uint64_t>> input_lod = {{11, 7}};
const std::vector<int> input_length_data = {};
const std::vector<int64_t> input_length_shape = {};
const int blank = 0;
const bool merge_repeated = false;
const int padding_value = 0;
const std::vector<int> output_data = {1, 2, 2, 4, 4, 5, 6, 6, 7, 7, 7};
const std::vector<int64_t> output_shape = {11, 1};
const std::vector<std::vector<uint64_t>> output_lod = {{7, 4}};
const std::vector<int> output_length_data = {};
const std::vector<int64_t> output_length_shape = {};
// Test
Place place(TARGET(kHost), PRECISION(kInt32));
std::unique_ptr<arena::TestCase> tester(
new CtcAlignComputeTester(place,
"def",
input_data,
input_shape,
input_lod,
input_length_data,
input_length_shape,
blank,
merge_repeated,
padding_value,
output_data,
output_shape,
output_lod,
output_length_data,
output_length_shape));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
#endif
}
TEST(CtcAlign2, precision) {
LOG(INFO) << "test ctc_align op";
#ifdef LITE_WITH_ARM
// Define variable
const std::vector<int>& input_data = {
0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0};
const std::vector<int64_t> input_shape = {3, 6};
const std::vector<std::vector<uint64_t>> input_lod = {};
const std::vector<int> input_length_data = {6, 5, 4};
const std::vector<int64_t> input_length_shape = {3, 1};
const int blank = 0;
const bool merge_repeated = true;
const int padding_value = 0;
const std::vector<int> output_data = {
1, 2, 4, 0, 0, 0, 4, 5, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0};
const std::vector<int64_t> output_shape = {3, 6};
const std::vector<std::vector<uint64_t>> output_lod = {};
const std::vector<int> output_length_data = {3, 3, 1};
const std::vector<int64_t> output_length_shape = {3, 1};
// Test
Place place(TARGET(kHost), PRECISION(kInt32));
std::unique_ptr<arena::TestCase> tester(
new CtcAlignComputeTester(place,
"def",
input_data,
input_shape,
input_lod,
input_length_data,
input_length_shape,
blank,
merge_repeated,
padding_value,
output_data,
output_shape,
output_lod,
output_length_data,
output_length_shape));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
#endif
}
TEST(CtcAlign3, precision) {
LOG(INFO) << "test ctc_align op";
#ifdef LITE_WITH_ARM
// Define variable
const std::vector<int>& input_data = {
0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0};
const std::vector<int64_t> input_shape = {3, 6};
const std::vector<std::vector<uint64_t>> input_lod = {};
const std::vector<int> input_length_data = {6, 5, 4};
const std::vector<int64_t> input_length_shape = {3, 1};
const int blank = 0;
const bool merge_repeated = false;
const int padding_value = 0;
const std::vector<int> output_data = {
1, 2, 2, 4, 0, 0, 4, 5, 6, 0, 0, 0, 7, 7, 7, 0, 0, 0};
const std::vector<int64_t> output_shape = {3, 6};
const std::vector<std::vector<uint64_t>> output_lod = {};
const std::vector<int> output_length_data = {4, 3, 3};
const std::vector<int64_t> output_length_shape = {3, 1};
// Test
Place place(TARGET(kHost), PRECISION(kInt32));
std::unique_ptr<arena::TestCase> tester(
new CtcAlignComputeTester(place,
"def",
input_data,
input_shape,
input_lod,
input_length_data,
input_length_shape,
blank,
merge_repeated,
padding_value,
output_data,
output_shape,
output_lod,
output_length_data,
output_length_shape));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
#endif
}
} // namespace lite
} // namespace paddle
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
......
...@@ -25,6 +25,7 @@ SHUTDOWN_LOG=ON ...@@ -25,6 +25,7 @@ SHUTDOWN_LOG=ON
BUILD_NPU=OFF BUILD_NPU=OFF
NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/ NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/
BUILD_XPU=OFF BUILD_XPU=OFF
BUILD_XTCL=OFF
XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/" XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/"
LITE_WITH_ARM_LANG=OFF LITE_WITH_ARM_LANG=OFF
...@@ -138,6 +139,7 @@ function make_tiny_publish_so { ...@@ -138,6 +139,7 @@ function make_tiny_publish_so {
-DLITE_WITH_NPU=$BUILD_NPU \ -DLITE_WITH_NPU=$BUILD_NPU \
-DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
-DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
...@@ -226,6 +228,7 @@ function make_full_publish_so { ...@@ -226,6 +228,7 @@ function make_full_publish_so {
-DLITE_WITH_NPU=$BUILD_NPU \ -DLITE_WITH_NPU=$BUILD_NPU \
-DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
-DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_TRAIN=$BUILD_TRAIN \ -DLITE_WITH_TRAIN=$BUILD_TRAIN \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
...@@ -260,6 +263,7 @@ function make_all_tests { ...@@ -260,6 +263,7 @@ function make_all_tests {
-DLITE_WITH_NPU=$BUILD_NPU \ -DLITE_WITH_NPU=$BUILD_NPU \
-DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
-DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
...@@ -330,7 +334,10 @@ function make_cuda { ...@@ -330,7 +334,10 @@ function make_cuda {
-DWITH_TESTING=OFF \ -DWITH_TESTING=OFF \
-DLITE_WITH_ARM=OFF \ -DLITE_WITH_ARM=OFF \
-DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-DLITE_BUILD_EXTRA=ON -DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT
make publish_inference -j$NUM_PROC make publish_inference -j$NUM_PROC
cd - cd -
...@@ -362,9 +369,10 @@ function make_x86 { ...@@ -362,9 +369,10 @@ function make_x86 {
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
-DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-DLITE_BUILD_EXTRA=ON \ -DLITE_BUILD_EXTRA=ON \
-DCMAKE_BUILD_TYPE=Release \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XPU=$BUID_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DCMAKE_BUILD_TYPE=Release
make publish_inference -j$NUM_PROC make publish_inference -j$NUM_PROC
cd - cd -
...@@ -483,6 +491,10 @@ function main { ...@@ -483,6 +491,10 @@ function main {
BUILD_XPU="${i#*=}" BUILD_XPU="${i#*=}"
shift shift
;; ;;
--build_xtcl=*)
BUILD_XTCL="${i#*=}"
shift
;;
--xpu_sdk_root=*) --xpu_sdk_root=*)
XPU_SDK_ROOT="${i#*=}" XPU_SDK_ROOT="${i#*=}"
shift shift
......
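With this change, the XTCL path can be toggled separately from the base XPU build via the new --build_xtcl option. A minimal invocation sketch for the updated build.sh (the positional target name and the SDK path below are illustrative assumptions, not part of this patch):
./lite/tools/build.sh --build_xpu=ON --build_xtcl=ON --xpu_sdk_root=$(pwd)/xpu_sdk_lib x86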
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
set -ex set -ex
# global variables with default value # global variables with default value
NEUWARE_HOME="${NEUWARE_HOME}" # XPU SDK NEUWARE_HOME="${NEUWARE_HOME}"
TARGET_NAME="all" # default target TARGET_NAME="all" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
WITH_TESTING=OFF # ON/OFF WITH_TESTING=ON # ON/OFF
function print_usage { function print_usage {
echo -e "\nUSAGE:" echo -e "\nUSAGE:"
...@@ -20,10 +20,9 @@ function print_usage { ...@@ -20,10 +20,9 @@ function print_usage {
# readonly variables with default value # readonly variables with default value
readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DWITH_PYTHON=OFF \
-DLITE_WITH_ARM=OFF" -DLITE_WITH_ARM=OFF"
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-8}
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd) readonly workspace=$(pwd)
...@@ -37,8 +36,7 @@ function prepare_thirdparty { ...@@ -37,8 +36,7 @@ function prepare_thirdparty {
fi fi
tar xzf third-party-05b862.tar.gz tar xzf third-party-05b862.tar.gz
else else
# git submodule update --init --recursive git submodule update --init --recursive
echo "third-party is in ready"
fi fi
} }
...@@ -62,12 +60,12 @@ function prepare_workspace { ...@@ -62,12 +60,12 @@ function prepare_workspace {
} }
function build_mlu { function build_mlu {
prepare_workspace
build_dir=${workspace}/build.lite.mlu build_dir=${workspace}/build.lite.mlu
mkdir -p $build_dir mkdir -p $build_dir
cd $build_dir cd $build_dir
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \ cmake .. \
${CMAKE_COMMON_OPTIONS} \ ${CMAKE_COMMON_OPTIONS} \
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
...@@ -75,9 +73,10 @@ function build_mlu { ...@@ -75,9 +73,10 @@ function build_mlu {
-DLITE_WITH_X86=ON \ -DLITE_WITH_X86=ON \
-DWITH_MKL=ON \ -DWITH_MKL=ON \
-DLITE_WITH_MLU=ON \ -DLITE_WITH_MLU=ON \
-DLITE_WITH_PYTHON=OFF \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \ -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \ -DWITH_TESTING=${WITH_TESTING} \
-DMLU_SDK_ROOT=${XPU_SDK_ROOT} -DNEUWARE_HOME=${NEUWARE_HOME}
make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
......
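The MLU build now takes the Cambricon toolkit location from NEUWARE_HOME and passes it straight through to CMake instead of reusing XPU_SDK_ROOT. A minimal sketch of the resulting configure step (the install prefix is an example assumption; the two -D flags come from the diff above):
export NEUWARE_HOME=/usr/local/neuware   # example location only, not from the patch
cmake .. -DLITE_WITH_MLU=ON -DNEUWARE_HOME=${NEUWARE_HOME}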
#!/bin/bash
set -ex
# global variables with default value
XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK
TARGET_NAME="test_subgraph_pass" # default target
BUILD_EXTRA=ON # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF
function print_usage {
echo -e "\nUSAGE:"
echo
echo "----------------------------------------"
echo -e "--xpu_sdk_root=<xpu sdk directory>"
echo -e "--target_name=<target name>"
echo "----------------------------------------"
echo
}
# readonly variables with default value
readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DWITH_PYTHON=OFF \
-DLITE_WITH_ARM=OFF"
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd)
function prepare_thirdparty {
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
}
# For code gen, a source file is generated after a test, but is depended on by some targets in cmake.
# Here we fake an empty file to make cmake work.
function prepare_workspace {
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
# 2. Prepare debug tool
DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
# clone submodule
# git submodule update --init --recursive
prepare_thirdparty
}
function build_xpu {
build_dir=${workspace}/build.lite.xpu
mkdir -p $build_dir
cd $build_dir
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \
${CMAKE_COMMON_OPTIONS} \
-DWITH_GPU=OFF \
-DWITH_MKLDNN=OFF \
-DLITE_WITH_X86=ON \
-DWITH_MKL=ON \
-DLITE_WITH_XPU=ON \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \
-DXPU_SDK_ROOT=${XPU_SDK_ROOT}
make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
cd -
echo "Done"
}
function main {
# Parse command line.
for i in "$@"; do
case $i in
--target_name=*)
TARGET_NAME="${i#*=}"
shift
;;
--build_extra=*)
BUILD_EXTRA="${i#*=}"
shift
;;
--xpu_sdk_root=*)
XPU_SDK_ROOT="${i#*=}"
shift
;;
build)
build_xpu
shift
;;
full_publish)
TARGET_NAME=publish_inference
build_xpu
shift
;;
*)
# unknown option
print_usage
exit 1
;;
esac
done
}
main $@
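For reference, a hypothetical invocation of this new XPU CI script (the file name ci_build_xpu.sh is an assumption; the options and the build/full_publish subcommands come from the script above):
./lite/tools/ci_build_xpu.sh --xpu_sdk_root=$(pwd)/../XPU_SDK --target_name=test_subgraph_pass build
# or build and package everything:
./lite/tools/ci_build_xpu.sh --xpu_sdk_root=$(pwd)/../XPU_SDK full_publish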