提交 794b01ec 编写于 作者: Liu Yiqun

Merge branch 'develop' into step_rnn/opt_ddim_lite

......@@ -60,6 +60,7 @@ lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
......@@ -74,7 +75,7 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
if(ANDROID OR IOS OR ARMLINUX)
......@@ -192,6 +193,9 @@ if(LITE_WITH_CUDA)
include(cuda)
endif()
if(LITE_WITH_BM)
include(bm)
endif()
include(generic) # simplify cmake module
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Locates the Bitmain (BM) SDK and exposes its libraries as imported targets.
# Skipped entirely unless LITE_WITH_BM is enabled.
if(NOT LITE_WITH_BM)
  return()
endif()

# BM_SDK_ROOT may come from the cache or from the environment; it is required.
if(NOT DEFINED BM_SDK_ROOT)
  set(BM_SDK_ROOT $ENV{BM_SDK_ROOT})
  if(NOT BM_SDK_ROOT)
    message(FATAL_ERROR "Must set BM_SDK_ROOT or env BM_SDK_ROOT when LITE_WITH_BM=ON")
  endif()
endif()

message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}")

# Sanity-check the SDK layout by probing for the runtime interface header.
find_path(BM_SDK_INC NAMES bmruntime_interface.h
  PATHS ${BM_SDK_ROOT}/include/bmruntime NO_DEFAULT_PATH)
if(NOT BM_SDK_INC)
  message(FATAL_ERROR "Can not find bmruntime_interface.h in ${BM_SDK_ROOT}/include")
endif()

include_directories("${BM_SDK_ROOT}/include/bmruntime")
include_directories("${BM_SDK_ROOT}/include/bmlib")
include_directories("${BM_SDK_ROOT}/include/bmcompiler")
include_directories("${BM_SDK_ROOT}/include/bmcpu")
include_directories("${BM_SDK_ROOT}/include/bmlog")

# Helper: find library file ${lib_name} under ${search_path}, cache its
# location in ${var_name}, and expose it as the imported SHARED target
# ${target_name}. Fails the configure step if the library is missing.
function(bm_find_and_import_lib var_name target_name lib_name search_path)
  find_library(${var_name} NAMES ${lib_name}
    PATHS ${search_path})
  if(NOT ${var_name})
    message(FATAL_ERROR "Can not find ${lib_name} Library in ${BM_SDK_ROOT}")
  else()
    message(STATUS "Found ${lib_name} Library: ${${var_name}}")
    add_library(${target_name} SHARED IMPORTED GLOBAL)
    set_property(TARGET ${target_name} PROPERTY IMPORTED_LOCATION ${${var_name}})
  endif()
endfunction()

# Cache variable names and imported target names are kept identical to the
# previous hand-rolled stanzas so downstream code keeps working.
bm_find_and_import_lib(BM_SDK_RT_LIB       bmrt       bmrt       ${BM_SDK_ROOT}/lib/bmnn/pcie)
bm_find_and_import_lib(BM_SDK_BM_LIB       bmlib      bmlib      ${BM_SDK_ROOT}/lib/bmnn/pcie)
bm_find_and_import_lib(BM_SDK_COMPILER_LIB bmcompiler bmcompiler ${BM_SDK_ROOT}/lib/bmcompiler)
bm_find_and_import_lib(BM_SDK_CPU_LIB      bmcpu      bmcpu      ${BM_SDK_ROOT}/lib/bmnn/pcie)

# NOTE(review): runtime and builder lists are currently identical — presumably
# intentional while the BM backend is young; confirm before diverging them.
set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs")
set(bm_builder_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm builder libs")
......@@ -143,6 +143,10 @@ if (LITE_WITH_FPGA)
add_definitions("-DLITE_WITH_FPGA")
endif()
if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
if (LITE_WITH_PRECISION_PROFILE)
......
......@@ -30,7 +30,7 @@ if(NOT NPU_DDK_INC)
message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include")
endif()
include_directories("${NPU_DDK_ROOT}")
include_directories("${NPU_DDK_ROOT}/include")
set(NPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
......
......@@ -174,27 +174,45 @@ if(NOT WITH_DSO)
endif(WIN32)
endif(NOT WITH_DSO)
function(add_cuda_static_lib alias cuda_lib_paths file_name)
unset(ABS_PATH CACHE)
find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH)
add_library(${alias} STATIC IMPORTED GLOBAL)
set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH})
set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE)
if (NOT ABS_PATH)
message(FATAL_ERROR "Can not find CUDA static library: ${file_name}")
endif()
# Creates an imported target for a CUDA toolkit library.
#
# Usage:
#   add_cuda_lib(<target> [STATIC|SHARED] NAME <file_name> [PATHS <dirs_var>])
#
# <target>   name of the imported target to create
# STATIC|SHARED  kind of imported library (previously parsed but ignored —
#                every library was imported as SHARED even for .a archives)
# NAME       file name to search for (e.g. libcudart_static.a)
# PATHS      name of a list variable holding the search directories;
#            defaults to CUDNN_CHECK_LIBRARY_DIRS
#
# Appends <target> to CUDA_MODULES in the caller's scope. Fails the configure
# step if the library cannot be found.
function(add_cuda_lib TARGET_NAME)
  set(options STATIC SHARED)
  set(oneValueArgs "NAME")
  set(multiValueArgs "PATHS")
  cmake_parse_arguments(add_cuda_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  # find_library caches its result; clear it so repeated calls re-search.
  unset(ABS_PATH CACHE)
  if (NOT add_cuda_lib_PATHS)
    set(add_cuda_lib_PATHS CUDNN_CHECK_LIBRARY_DIRS)
  endif()
  find_library(ABS_PATH NAMES ${add_cuda_lib_NAME} PATHS ${${add_cuda_lib_PATHS}} NO_DEFAULT_PATH)
  # Fail before creating the target: previously the imported target was
  # created with an empty IMPORTED_LOCATION when the search failed.
  if (NOT ABS_PATH)
    message(FATAL_ERROR "Can not find CUDA library: ${add_cuda_lib_NAME}")
  endif()
  if (add_cuda_lib_STATIC)
    add_library(${TARGET_NAME} STATIC IMPORTED GLOBAL)
  else()
    add_library(${TARGET_NAME} SHARED IMPORTED GLOBAL)
  endif()
  set_property(TARGET ${TARGET_NAME} PROPERTY IMPORTED_LOCATION ${ABS_PATH})
  set(CUDA_MODULES ${CUDA_MODULES} ${TARGET_NAME} PARENT_SCOPE)
endfunction()
add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a)
add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a)
add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a)
add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a)
if(NOT ${CUDA_VERSION} LESS 10.1)
add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a)
if(LITE_WITH_STATIC_CUDA)
message(STATUS "Static link CUDA toolkit.")
add_cuda_lib(cudart_static STATIC NAME libcudart_static.a)
add_cuda_lib(cublas_static STATIC NAME libcublas_static.a)
add_cuda_lib(curand_static STATIC NAME libcurand_static.a)
add_cuda_lib(culibos_static STATIC NAME libculibos.a)
if(NOT ${CUDA_VERSION} LESS 10.1)
add_cuda_lib(cublasLt_static STATIC NAME libcublasLt_static.a)
endif()
set_property(GLOBAL PROPERTY CUDA_MODULES cudnn_static ${CUDA_MODULES})
else()
message(STATUS "Dynamic Link CUDA toolkit.")
add_cuda_lib(cudart SHARED NAME libcudart.so)
add_cuda_lib(cublas SHARED NAME libcublas.so)
add_cuda_lib(curand SHARED NAME libcurand.so)
if(NOT ${CUDA_VERSION} LESS 10.1)
add_cuda_lib(cublasLt SHARED NAME libcublasLt.so)
endif()
set_property(GLOBAL PROPERTY CUDA_MODULES cudnn ${CUDA_MODULES})
endif()
set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES})
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
......
......@@ -69,9 +69,15 @@ if(CUDNN_FOUND)
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
add_library(cudnn_static STATIC IMPORTED GLOBAL)
set_property(TARGET cudnn_static PROPERTY IMPORTED_LOCATION
if(LITE_WITH_STATIC_CUDA)
add_library(cudnn_static STATIC IMPORTED GLOBAL)
set_property(TARGET cudnn_static PROPERTY IMPORTED_LOCATION
"${CUDNN_LIB_PATH}/libcudnn_static.a")
else()
add_library(cudnn SHARED IMPORTED GLOBAL)
set_property(TARGET cudnn PROPERTY IMPORTED_LOCATION
"${CUDNN_LIB_PATH}/libcudnn.so")
endif(LITE_WITH_STATIC_CUDA)
string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -94,6 +94,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_BM)
foreach(var ${lite_deps_BM_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -119,7 +125,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -129,6 +135,7 @@ function(lite_cc_library TARGET)
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
......@@ -163,7 +170,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -177,6 +184,7 @@ function(lite_cc_binary TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -210,7 +218,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -232,6 +240,7 @@ function(lite_cc_test TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -260,6 +269,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -270,12 +280,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA)
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -341,6 +351,12 @@ function(add_kernel TARGET device level)
endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "BM")
if (NOT LITE_WITH_BM)
return()
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
return()
......@@ -374,6 +390,7 @@ function(add_kernel TARGET device level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -392,7 +409,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -424,6 +441,7 @@ function(add_operator TARGET level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -66,6 +67,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA)
if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
......@@ -220,10 +224,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......@@ -235,10 +243,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
......@@ -35,18 +35,18 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
NPU_DEPS ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(paddle_light_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(paddle_full_api_shared custom_linker_map)
else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc)
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
......@@ -60,13 +60,19 @@ if (WITH_TESTING)
${ops} ${host_kernels}
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels})
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
set(cxx_api_deps ${cxx_api_deps} ${fpga_deps})
endif()
if(LITE_WITH_BM)
set(light_api_deps ${light_api_deps} ${bm_deps})
set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
......@@ -75,6 +81,7 @@ message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -84,10 +91,12 @@ if (NOT LITE_ON_TINY_PUBLISH)
SRCS cxx_api.cc
DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
endif()
......@@ -96,7 +105,7 @@ endif()
set(light_api_deps
scope target_wrapper_host model_parser program)
if(LITE_WITH_CUDA)
get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
lite_cc_library(light_api SRCS light_api.cc
......@@ -109,7 +118,8 @@ lite_cc_library(light_api SRCS light_api.cc
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -120,11 +130,14 @@ if(WITH_TESTING)
DEPS cxx_api mir_passes lite_api_test_helper
${ops} ${host_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -160,6 +173,12 @@ if(WITH_TESTING)
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn)
add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz)
if(LITE_WITH_BM)
lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
endif()
endif()
endif()
......@@ -238,9 +257,10 @@ if (NOT LITE_ON_TINY_PUBLISH)
FPGA_DEPS ${fpga_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
cc_library(api_full_static SRCS DEPS paddle_api_full cxx_api paddle_api light_api ${cxx_api_deps} ${ops} ${host_kernels} ${cuda_kernels} program tensor memory naive_buffer types ${fluid_modules} protobuf ${cuda_static_deps})
endif()
bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api)
#-----------------------------------------------------------------------------------------------------
......@@ -250,6 +270,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
lite_cc_test(test_apis SRCS apis_test.cc
......@@ -258,6 +279,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -283,11 +305,13 @@ endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
${ops}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -302,6 +326,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -309,6 +334,7 @@ if(NOT IOS)
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -322,6 +348,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......
......@@ -13,32 +13,61 @@
// limitations under the License.
#include <gflags/gflags.h>
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <numeric>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_dir, "", "model dir");
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_string(result_filename, "", "save test result");
"set input shapes according to the model, "
"separated by colon and comma, "
"such as 1,3,244,244:1,3,300,300.");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(power_mode,
3,
"arm power mode: "
"0 for big cluster, "
"1 for little cluster, "
"2 for all cores, "
"3 for no bind");
DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename,
"result.txt",
"save benchmark "
"result to the file");
DEFINE_bool(run_model_optimize,
false,
"if set true, apply model_optimize_tool to model, use optimized "
"model to test");
DEFINE_bool(is_quantized_model, false, "if set true, test the quantized model");
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. ");
DEFINE_bool(is_quantized_model,
false,
"if set true, "
"test the performance of the quantized model. ");
namespace paddle {
namespace lite_api {
// Returns the current wall-clock time in microseconds since the Unix epoch.
inline double GetCurrentUS() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<double>(tv.tv_sec) * 1e+6 + static_cast<double>(tv.tv_usec);
}
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
......@@ -58,7 +87,7 @@ void OutputOptModel(const std::string& load_model_dir,
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
LOG(INFO) << "Delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
......@@ -69,23 +98,22 @@ void OutputOptModel(const std::string& load_model_dir,
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const int repeat,
const int thread_num,
const int warmup_times,
const std::string model_name) {
// set config and create predictor
lite_api::MobileConfig config;
config.set_threads(thread_num);
config.set_power_mode(LITE_POWER_NO_BIND);
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_dir(model_dir);
auto predictor = lite_api::CreatePaddlePredictor(config);
// set input
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
for (size_t i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
......@@ -93,26 +121,37 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
}
}
for (int i = 0; i < warmup_times; ++i) {
// warmup
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = lite::GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
// run
std::vector<float> perf_vct;
for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor->Run();
auto end = GetCurrentUS();
perf_vct.push_back((end - start) / 1000.0);
}
auto end = lite::GetCurrentUS();
std::FILE* pf = std::fopen(FLAGS_result_filename.c_str(), "a");
if (nullptr == pf) {
LOG(INFO) << "create result file error";
exit(0);
std::sort(perf_vct.begin(), perf_vct.end());
float min_res = perf_vct.back();
float max_res = perf_vct.front();
float total_res = accumulate(perf_vct.begin(), perf_vct.end(), 0.0);
float avg_res = total_res / FLAGS_repeats;
// save result
std::ofstream ofs(FLAGS_result_filename, std::ios::app);
if (!ofs.is_open()) {
LOG(FATAL) << "open result file failed";
}
fprintf(pf,
"-- %-18s avg = %5.4f ms\n",
model_name.c_str(),
(end - start) / repeat / 1000.0);
std::fclose(pf);
ofs.precision(5);
ofs << std::setw(20) << std::fixed << std::left << model_name;
ofs << "min = " << std::setw(12) << min_res;
ofs << "max = " << std::setw(12) << max_res;
ofs << "average = " << std::setw(12) << avg_res;
ofs << std::endl;
ofs.close();
}
#endif
......@@ -122,9 +161,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "" || FLAGS_result_filename == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model --result_filename "
"/path/to/resultfile";
LOG(INFO) << "please run ./benchmark_bin --help to obtain usage.";
exit(0);
}
......@@ -166,11 +203,11 @@ int main(int argc, char** argv) {
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
for (size_t i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
// Output optimized model
// Output optimized model if needed
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
......@@ -180,12 +217,7 @@ int main(int argc, char** argv) {
// Run inference using optimized model
std::string run_model_dir =
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shapes,
run_model_dir,
FLAGS_repeats,
FLAGS_threads,
FLAGS_warmup,
model_name);
paddle::lite_api::Run(input_shapes, run_model_dir, model_name);
#endif
return 0;
}
......@@ -42,11 +42,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
int num_threads = config.cpu_math_library_num_threads();
int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads);
VLOG(3) << "set_cpu_math_library_math_threads() is set successfully and the "
VLOG(3) << "set_x86_math_library_math_threads() is set successfully and the "
"number of threads is:"
<< num_threads;
#endif
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <gflags/gflags.h>
#include <sstream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
......@@ -33,10 +34,10 @@ using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
DEFINE_string(arg_name, "", "the arg name");
namespace paddle {
namespace lite_api {
......@@ -86,6 +87,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
......@@ -122,6 +124,28 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
output_num *= output_shape[i];
}
LOG(INFO) << "output_num: " << output_num;
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
......
......@@ -133,7 +133,9 @@ class LITE_API CxxConfig : public ConfigBase {
std::string model_file_;
std::string param_file_;
bool model_from_memory_{false};
int cpu_math_library_math_threads_ = 1;
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
......@@ -153,12 +155,14 @@ class LITE_API CxxConfig : public ConfigBase {
std::string param_file() const { return param_file_; }
bool model_from_memory() const { return model_from_memory_; }
void set_cpu_math_library_num_threads(int threads) {
cpu_math_library_math_threads_ = threads;
#ifdef LITE_WITH_X86
void set_x86_math_library_num_threads(int threads) {
x86_math_library_math_threads_ = threads;
}
int cpu_math_library_num_threads() const {
return cpu_math_library_math_threads_;
int x86_math_library_num_threads() const {
return x86_math_library_math_threads_;
}
#endif
};
/// MobileConfig is the config for the light weight predictor, it will skip
......
......@@ -55,7 +55,8 @@ const std::string& TargetToStr(TargetType target) {
"any",
"fpga",
"npu",
"xpu"};
"xpu",
"bm"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -94,7 +95,8 @@ const std::string& TargetRepr(TargetType target) {
"kAny",
"kFPGA",
"kNPU",
"kXPU"};
"kXPU",
"kBM"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -135,6 +137,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kOpenCL),
TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -52,8 +52,9 @@ enum class TargetType : int {
kFPGA = 7,
kNPU = 8,
kXPU = 9,
kBM = 10,
kAny = 6, // any target
NUM = 10, // number of fields.
NUM = 11, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_img_txt_path,
"",
"if set input_img_txt_path, read the img filename as input.");
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places) {
lite::Predictor predictor;
std::vector<std::string> passes;
passes.push_back("bm_subgraph_pass");
predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
} else {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size; i++) {
fs >> data[i];
}
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
auto* out_data = out->data<float>();
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out->numel(); i++) {
fprintf(fp, "%f\n", out_data[i]);
}
fclose(fp);
}
// End-to-end ResNet50 run: prefer the BM target, fall back to X86 kernels.
TEST(ResNet50, test_bm) {
  std::vector<Place> valid_places;
  valid_places.push_back(Place{TARGET(kBM), PRECISION(kFloat)});
  valid_places.push_back(Place{TARGET(kX86), PRECISION(kFloat)});
  TestModel(valid_places);
}
} // namespace lite
} // namespace paddle
......@@ -30,7 +30,9 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
std::string model_dir = FLAGS_model_dir;
lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_cpu_math_library_num_threads(1);
#ifdef LITE_WITH_X86
config.set_x86_math_library_num_threads(1);
#endif
config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
......
......@@ -6,3 +6,4 @@ add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(bm)
......@@ -79,6 +79,7 @@ void conv_compute_6x6_3x3(const float* input,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
float* tmp_work_space =
......@@ -296,7 +297,7 @@ void conv_compute_6x6_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
} else {
for (int ci = 0; ci < oc_4; ++ci) {
......@@ -343,7 +344,7 @@ void conv_compute_6x6_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
}
}
......@@ -366,6 +367,7 @@ void conv_compute_2x2_3x3(const float* input,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
float* tmp_work_space =
......@@ -565,7 +567,7 @@ void conv_compute_2x2_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
} else {
for (int ci = 0; ci < oc_4; ++ci) {
......@@ -606,7 +608,7 @@ void conv_compute_2x2_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
}
}
......@@ -627,6 +629,7 @@ void conv_compute_2x2_3x3_small(const float* input,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
float* tmp_work_space =
......@@ -819,7 +822,7 @@ void conv_compute_2x2_3x3_small(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
} else {
for (int ci = 0; ci < oc_4; ++ci) {
......@@ -860,7 +863,7 @@ void conv_compute_2x2_3x3_small(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
}
}
......
......@@ -924,58 +924,58 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\
"st1 {v15.4s}, [%[doutr3]], #16 \n"
#define RIGHT_RESULT_S1_RELU6 \
"fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \
"bif v12.16b, v22.16b, v18.16b \n" \
"fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"st1 {v12.4s}, [%[doutr0]], #16 \n" \
\
"fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \
"bif v13.16b, v23.16b, v18.16b \n" \
\
"fmla v15.4s , v10.4s, v20.s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
\
"fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \
"st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \
\
"fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"bif v14.16b, v24.16b, v18.16b \n" \
"fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \
\
"st1 {v14.4s}, [%[doutr2]], #16 \n" \
\
"fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \
"bif v15.16b, v25.16b, v18.16b \n" \
\
#define RIGHT_RESULT_S1_RELU6 \
"fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \
"bif v12.16b, v22.16b, v18.16b \n" \
"fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"st1 {v12.4s}, [%[doutr0]], #16 \n" \
\
"fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \
"bif v13.16b, v23.16b, v18.16b \n" \
\
"fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
\
"fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \
"st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \
\
"fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"bif v14.16b, v24.16b, v18.16b \n" \
"fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \
\
"st1 {v14.4s}, [%[doutr2]], #16 \n" \
\
"fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \
"bif v15.16b, v25.16b, v18.16b \n" \
\
"st1 {v15.4s}, [%[doutr3]], #16 \n"
#define RIGHT_RESULT_S1_LEAKY_RELU \
......@@ -1586,7 +1586,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
/* r3 */ \
"vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \
\
"vld1.32 {d28-d29}, [%[six_ptr]]! @ load din r0\n" \
"vld1.32 {d28-d29}, [%[six_ptr]] @ load din r0\n" \
"vmax.f32 q4, q4, %q[vzero] @ relu \n" \
\
"vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \
......@@ -1617,7 +1617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
/* r3 */ \
"vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \
\
"vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n" \
"vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \
\
"vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \
\
......@@ -1694,7 +1694,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
/* r3 */ \
"vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \
\
"vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n" \
"vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \
\
"vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \
\
......@@ -2339,17 +2339,29 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
int size_out_channel = w_out * h_out;
int w_stride = 9;
int tile_w = (w_in + 3) >> 2;
int cnt_col = tile_w - 2;
unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in);
int tile_w = w_out >> 2;
int remain = w_out % 4;
int cnt_col = tile_w - 1;
unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in);
const unsigned int remian_idx[4] = {0, 1, 2, 3};
if (remain == 0 && size_pad_right == 5) {
size_pad_right = 1;
cnt_col -= 1;
remain = 4;
} else if (remain == 0 && size_pad_right == 6) {
size_pad_right = 2;
cnt_col -= 1;
remain = 4;
}
uint32x4_t vmask_rp1 =
vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
uint32x4_t vmask_rp2 =
vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
uint32x4_t vmask_result =
vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remian_idx));
unsigned int vmask[8];
vst1q_u32(vmask, vmask_rp1);
......@@ -2398,7 +2410,7 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
const float *din_ptr5 = dr5;
float *ptr_zero = const_cast<float *>(zero);
#ifdef __aarch64__
for (int i = 0; i < h_in; i += 4) {
for (int i = 0; i < h_out; i += 4) {
//! process top pad pad_h = 1
din_ptr0 = dr0;
din_ptr1 = dr1;
......@@ -2484,7 +2496,7 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
dout_ptr = dout_ptr + 4 * w_out;
}
#else
for (int i = 0; i < h_in; i += 2) {
for (int i = 0; i < h_out; i += 2) {
//! process top pad pad_h = 1
din_ptr0 = dr0;
din_ptr1 = dr1;
......@@ -2883,39 +2895,57 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
wbias = vdupq_n_f32(0.f);
}
int hs = -1;
int he = 3;
float out_buf1[4];
float out_buf2[4];
float trash_buf[4];
int h_cnt = (h_out + 1) >> 1;
float *doutr0 = dout_channel;
float *doutr1 = dout_channel + w_out;
for (int j = 0; j < h_cnt; ++j) {
const float *dr0 = din_channel + hs * w_in;
const float *dr1 = dr0 + w_in;
const float *dr2 = dr1 + w_in;
const float *dr3 = dr2 + w_in;
const float *dr0 = din_channel;
const float *dr1 = dr0 + w_in;
const float *dr2 = dr1 + w_in;
const float *dr3 = dr2 + w_in;
if (hs == -1) {
dr0 = zero;
for (int j = 0; j < h_out; j += 2) {
const float *dr0_ptr = dr0;
const float *dr1_ptr = dr1;
const float *dr2_ptr = dr2;
const float *dr3_ptr = dr3;
if (j == 0) {
dr0_ptr = zero;
dr1_ptr = dr0;
dr2_ptr = dr1;
dr3_ptr = dr2;
dr0 = dr1;
dr1 = dr2;
} else {
dr0 = dr2;
dr1 = dr3;
}
dr2 = dr1 + w_in;
dr3 = dr2 + w_in;
//! process bottom pad
if (j + 3 > h_in) {
switch (j + 3 - h_in) {
case 3:
dr1_ptr = zero;
case 2:
dr2_ptr = zero;
case 1:
dr3_ptr = zero;
default:
break;
}
}
switch (he - h_in) {
case 2:
dr2 = zero;
doutr1 = trash_buf;
case 1:
dr3 = zero;
default:
break;
//! process bottom remain
if (j + 2 > h_out) {
doutr1 = trash_buf;
}
act_switch_3x3s1p1_s(dr0,
dr1,
dr2,
dr3,
act_switch_3x3s1p1_s(dr0_ptr,
dr1_ptr,
dr2_ptr,
dr3_ptr,
out_buf1,
out_buf2,
wr0,
......@@ -2931,8 +2961,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
}
doutr0 = doutr1;
doutr1 += w_out;
hs += 2;
he += 2;
} // end of processing heights
} // end of processing channels
} // end of processing batchs
......@@ -3458,6 +3486,12 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in);
const int remian_idx[4] = {0, 1, 2, 3};
if (remain == 0 && size_pad_right == 6) { // w_in == w_out and w_out % 4 == 0
tile_w -= 1;
remain = 4;
size_pad_right = 2;
}
uint32x4_t vmask_rp1 =
vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
uint32x4_t vmask_rp2 =
......@@ -3603,16 +3637,14 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
dr2 = dr1 + w_in;
dr3 = dr2 + w_in;
//! process bottom pad
if (i + 3 >= h_in) {
switch (i + 3 - h_in) {
if (i + 4 > h_in) {
switch (i + 4 - h_in) {
case 3:
din_ptr1 = zero_ptr;
case 2:
din_ptr2 = zero_ptr;
case 1:
din_ptr3 = zero_ptr;
case 0:
din_ptr3 = zero_ptr;
default:
break;
}
......@@ -4016,22 +4048,21 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
doutr0 = dout_channel + j * w_out;
doutr1 = doutr0 + w_out;
if (j + 3 >= h_in) {
switch (j + 3 - h_in) {
if (j + 4 > h_in) {
switch (j + 4 - h_in) {
case 3:
dr1 = zero_ptr;
case 2:
dr2 = zero_ptr;
case 1:
dr3 = zero_ptr;
doutr1 = trash_buf;
case 0:
dr3 = zero_ptr;
doutr1 = trash_buf;
default:
break;
}
}
if (j + 2 > h_out) {
doutr1 = trash_buf;
}
unsigned int *vmask_ptr = vmask;
act_switch_3x3s1p0_s(dr0,
dr1,
......
......@@ -1202,15 +1202,17 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
int out_pad_idx[4] = {0, 1, 2, 3};
int size_pad_bottom = h_out * 2 - h_in;
int cnt_col = (w_out >> 2) - 2;
int size_right_remain = w_in - (7 + cnt_col * 8);
if (size_right_remain >= 9) {
cnt_col++;
size_right_remain -= 8;
}
int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);
int tile_w = w_out >> 2;
int cnt_remain = w_out % 4;
unsigned int size_right_remain = (unsigned int)(7 + (tile_w << 3) - w_in);
size_right_remain = 8 - size_right_remain;
int size_right_pad = w_out * 2 - w_in;
if (cnt_remain == 0 && size_right_remain == 0) {
cnt_remain = 4;
tile_w -= 1;
size_right_remain = 8;
}
int cnt_col = tile_w - 1;
uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
vld1q_s32(right_pad_idx)); // 0 2 4 6
......@@ -1276,7 +1278,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
float* doutr1_ptr = nullptr;
#ifdef __aarch64__
for (int i = 0; i < h_in; i += 4) {
for (int i = 0; i < h_out; i += 2) {
din0_ptr = dr0;
din1_ptr = dr1;
din2_ptr = dr2;
......@@ -1303,8 +1305,8 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
dr4 = dr3 + w_in;
//! process bottom pad
if (i + 4 > h_in) {
switch (i + 4 - h_in) {
if (i * 2 + 4 > h_in) {
switch (i * 2 + 4 - h_in) {
case 4:
din1_ptr = zero_ptr;
case 3:
......@@ -1318,7 +1320,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
}
}
//! process output pad
if (i / 2 + 2 > h_out) {
if (i + 2 > h_out) {
doutr1_ptr = write_ptr;
}
int cnt = cnt_col;
......@@ -1343,7 +1345,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
doutr0 = doutr0 + 2 * w_out;
}
#else
for (int i = 0; i < h_in; i += 2) {
for (int i = 0; i < h_out; i++) {
din0_ptr = dr0;
din1_ptr = dr1;
din2_ptr = dr2;
......@@ -1364,8 +1366,8 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
}
//! process bottom pad
if (i + 2 > h_in) {
switch (i + 2 - h_in) {
if (i * 2 + 2 > h_in) {
switch (i * 2 + 2 - h_in) {
case 2:
din1_ptr = zero_ptr;
case 1:
......@@ -1641,7 +1643,8 @@ void act_switch_3x3s2p0(const float* din0_ptr,
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_RELU6
"ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2
MID_RESULT_S2_RELU6
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_RELU6
......@@ -1700,7 +1703,8 @@ void act_switch_3x3s2p0(const float* din0_ptr,
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
"ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2
MID_RESULT_S2_LEAKY_RELU
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_LEAKY_RELU
......@@ -1718,7 +1722,7 @@ void act_switch_3x3s2p0(const float* din0_ptr,
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vscale),
[scale_ptr] "r"(vscale),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
......@@ -1834,7 +1838,14 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
int tile_w = w_out >> 2;
int cnt_remain = w_out % 4;
unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3));
unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in);
size_right_remain = 8 - size_right_remain;
if (cnt_remain == 0 && size_right_remain == 0) {
cnt_remain = 4;
tile_w -= 1;
size_right_remain = 8;
}
uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
vld1q_s32(right_pad_idx)); // 0 2 4 6
......
......@@ -2237,7 +2237,7 @@ inline void act_switch_process(float* src,
int cnt = size >> 4;
int remain = size % 16;
float32x4_t vzero = vdupq_n_f32(0.f);
if (act_param != nullptr && act_param->has_active) {
if (act_param != nullptr) {
float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
if (cnt > 0) {
......@@ -2327,6 +2327,7 @@ inline void act_switch_process(float* src,
src++;
dst++;
}
break;
case lite_api::ActivationType::kRelu6:
for (int i = 0; i < remain; i++) {
float tmp = *src >= 0.f ? *src : 0.f;
......@@ -2336,6 +2337,7 @@ inline void act_switch_process(float* src,
src++;
dst++;
}
break;
case lite_api::ActivationType::kLeakyRelu:
for (int i = 0; i < remain; i++) {
if (*src >= 0.f) {
......
......@@ -150,11 +150,26 @@ void conv_depthwise_5x5s2_fp32(const float* din,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx);
void conv_depthwise_5x5s2p2_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
template <typename Dtype>
void conv_depthwise_5x5s1_int8(Dtype* dout,
const int8_t* din,
......
......@@ -180,6 +180,8 @@ void conv1x1s1_gemm(const float* i_data,
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
auto act_param = param.activation_param;
int hblock = get_hblock(ctx);
int m_roundup = hblock * ((m + hblock - 1) / hblock);
int weights_size_per_group = m * k;
......@@ -223,7 +225,7 @@ void conv1x1s1_gemm(const float* i_data,
n,
bias_group,
flag_bias,
flag_relu,
act_param,
ctx);
}
}
......@@ -361,6 +363,8 @@ void conv_im2col_gemm(const float* i_data,
int hblock = get_hblock(ctx);
int m_roundup = hblock * ((m + hblock - 1) / hblock);
int weights_size_per_group = m * k;
auto act_param = param.activation_param;
if (n > 1) {
weights_size_per_group = ((m_roundup * k + 15) / 16) * 16;
}
......@@ -422,7 +426,7 @@ void conv_im2col_gemm(const float* i_data,
n,
bias_group,
flag_bias,
flag_relu,
act_param,
ctx);
}
}
......@@ -585,10 +589,9 @@ void conv_depthwise_3x3_fp32(const void* din,
int stride = param.strides[1];
int pad = pad_w;
bool flag_bias = param.bias != nullptr;
bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
if (stride == 1) {
if (pads_equal && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1]
if (pads_less && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s1_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -620,9 +623,8 @@ void conv_depthwise_3x3_fp32(const void* din,
act_param,
ctx);
}
} else if (stride == 2) {
if (pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s2_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -674,12 +676,13 @@ void conv_depthwise_5x5_fp32(const void* din,
ARMContext* ctx,
const float* scale) {
auto paddings = *param.paddings;
auto act_param = param.activation_param;
int pad = paddings[0];
int stride = param.strides[1];
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
ctx->ExtendWorkspace((w_in + w_out) * sizeof(float));
if (pad == 2 && stride == 2) {
if (stride == 2) {
conv_depthwise_5x5s2_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -691,9 +694,8 @@ void conv_depthwise_5x5_fp32(const void* din,
w_in,
reinterpret_cast<const float*>(weights),
bias,
pad,
flag_bias,
flag_relu,
param,
act_param,
ctx);
} else if (stride == 1) {
conv_depthwise_5x5s1_fp32(reinterpret_cast<const float*>(din),
......
......@@ -44,6 +44,8 @@ void conv_winograd3x3(const float* din,
int size_out_channel = wout * hout;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
auto act_param = param.activation_param;
act_param.has_active = false;
//! transform input
int tile_w = (wout + 5) / 6;
......@@ -127,7 +129,7 @@ void conv_winograd3x3(const float* din,
size_tile,
nullptr,
false,
false,
act_param,
ctx);
}
......
......@@ -115,7 +115,241 @@ void fill_bias_relu<int>(int* tensor,
}
}
}
#ifdef __aarch64__
// AArch64 inline-asm fragments, concatenated into a single asm() statement by
// fill_bias_act<float>. FILL_BIAS loads 16 floats (4 q-registers) and adds the
// per-channel bias; the FILL_* activation fragments post-process them in
// registers; FILL_STORE writes them back and loops on the [cnt] counter.
#define FILL_BIAS \
"1: \n" \
"ld1 {v0.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v1.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v2.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v3.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"add v0.4s, v0.4s, %[vbias].4s \n" \
"add v1.4s, v1.4s, %[vbias].4s \n" \
"add v2.4s, v2.4s, %[vbias].4s \n" \
"add v3.4s, v3.4s, %[vbias].4s \n"
// ReLU: clamp each lane to >= 0 against [vzero].
#define FILL_RELU \
"fmax v0.4s, v0.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v1.4s, v1.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v2.4s, v2.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v3.4s, v3.4s, %[vzero].4s \n" /* vmaxq_f32() */
// Upper clamp against [vsix]; combined with FILL_RELU this yields ReLU6.
#define FILL_RELU6 \
"fmin v0.4s, v0.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v1.4s, v1.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */
// Leaky ReLU: per-lane select between x (when x >= 0) and x * [vscale],
// using cmhs masks + bif. Clobbers v4-v11 as scratch.
#define FILL_LEAKY_RELU \
"cmhs v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \
"bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \
"bif v2.16b, v9.16b, v8.16b \n" /* choose*/ \
"bif v3.16b, v11.16b, v10.16b \n" /* choose*/
// Stores the 16 results and branches back to label 1 while [cnt] != 0.
// NOTE(review): "subs" writes [cnt] in place, so the C operand must be
// bound with "+r" and treated as clobbered after the asm block.
#define FILL_STORE \
"subs %w[cnt], %w[cnt], #1 \n" \
"st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"bne 1b \n"
#else
// ARMv7 NEON equivalents of the fragments above, using q3-q6 for data and
// q7-q14 as leaky-relu scratch.
#define FILL_BIAS \
"1: \n" \
"vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d12-d13}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vadd.f32 q3, q3, %q[vbias] @ add \n" \
"vadd.f32 q4, q4, %q[vbias] @ add \n" \
"vadd.f32 q5, q5, %q[vbias] @ add \n" \
"vadd.f32 q6, q6, %q[vbias] @ add \n"
// ReLU: clamp each lane to >= 0 against [vzero].
#define FILL_RELU \
"vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n"
// Upper clamp against [vsix]; combined with FILL_RELU this yields ReLU6.
#define FILL_RELU6 \
"vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \
"vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n"
// Leaky ReLU: per-lane select between x and x * [vscale] via vcge + vbif.
#define FILL_LEAKY_RELU \
"vcge.f32 q7, q3, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q9, q4, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q11, q5, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q13, q6, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \
"vbif q3, q8, q7 @ choose \n" \
"vbif q4, q10, q9 @ choose \n" \
"vbif q5, q12, q11 @ choose \n" \
"vbif q6, q14, q13 @ choose \n"
// Stores the 16 results and loops on [cnt] (decremented in place by subs).
#define FILL_STORE \
"subs %[cnt], #1 \n" \
"vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"bne 1b \n"
#endif
// Adds the (optional) per-channel bias to `tensor` in place and applies the
// fused activation when act_param->has_active is set.
//
// tensor:       [channel, channel_size] data, modified in place.
// bias:         per-channel bias values; only read when flag_bias is true.
// channel:      number of channels.
// channel_size: elements per channel.
// act_param:    may be null; supports kRelu, kRelu6, kLeakyRelu.
//
// Fixes vs. previous revision:
//  * the asm loop decrements its counter via "+r", which zeroed `cnt` after
//    the first channel (skipping the vector path for all later channels, and
//    underflowing the counter in the no-act branch) — use a per-channel copy;
//  * the scalar tail now adds bias_data (the asm only biased cnt*16 elems);
//  * missing break;s in the tail switch made kRelu fall through into kRelu6
//    and kRelu6 into kLeakyRelu.
template <>
void fill_bias_act<float>(float* tensor,
                          const float* bias,
                          int channel,
                          int channel_size,
                          bool flag_bias,
                          const operators::ActivationParam* act_param) {
  float* data = tensor;
  int cnt = channel_size >> 4;  // 16 floats per asm iteration
  int remain = channel_size % 16;
  float32x4_t vzero = vdupq_n_f32(0.f);
  if (act_param != nullptr && act_param->has_active) {
    float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
    float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
    for (int j = 0; j < channel; j++) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      float* src = data + j * channel_size;
      float* dst = data + j * channel_size;
      float32x4_t vbias = vdupq_n_f32(bias_data);
      // Per-channel copy: the asm clobbers its counter ("+r"), so `cnt`
      // itself must stay intact for the next channel.
      int cnt_num = cnt;
      if (cnt_num > 0) {
        switch (act_param->active_type) {
          case lite_api::ActivationType::kRelu:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vbias] "w"(vbias)
                : "memory", "cc", "v0", "v1", "v2", "v3");
#else
            asm volatile(
                FILL_BIAS FILL_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vbias] "w"(vbias)
                : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
            break;
          case lite_api::ActivationType::kRelu6:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
                : "memory", "cc", "v0", "v1", "v2", "v3");
#else
            asm volatile(
                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
                : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
            break;
          case lite_api::ActivationType::kLeakyRelu:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
                : "memory",
                  "cc",
                  "v0",
                  "v1",
                  "v2",
                  "v3",
                  "v4",
                  "v5",
                  "v6",
                  "v7",
                  "v8",
                  "v9",
                  "v10",
                  "v11");
#else
            asm volatile(
                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
                : "memory",
                  "cc",
                  "q3",
                  "q4",
                  "q5",
                  "q6",
                  "q7",
                  "q8",
                  "q9",
                  "q10",
                  "q11",
                  "q12",
                  "q13",
                  "q14");
#endif
            break;
          default:
            LOG(FATAL) << "this act_type: "
                       << static_cast<int>(act_param->active_type)
                       << " fuse not support";
        }
      }
      // Scalar tail: src/dst were advanced by the asm. The bias must be
      // added here too — the asm covered only the first cnt*16 elements.
      switch (act_param->active_type) {
        case lite_api::ActivationType::kRelu:
          for (int i = 0; i < remain; i++) {
            float tmp = *src + bias_data;
            *dst = tmp >= 0.f ? tmp : 0.f;
            src++;
            dst++;
          }
          break;
        case lite_api::ActivationType::kRelu6:
          for (int i = 0; i < remain; i++) {
            float tmp = *src + bias_data;
            tmp = tmp >= 0.f ? tmp : 0.f;
            *dst = tmp <= act_param->Relu_clipped_coef
                       ? tmp
                       : act_param->Relu_clipped_coef;
            src++;
            dst++;
          }
          break;
        case lite_api::ActivationType::kLeakyRelu:
          for (int i = 0; i < remain; i++) {
            float tmp = *src + bias_data;
            if (tmp >= 0.f) {
              *dst = tmp;
            } else {
              *dst = tmp * act_param->Leaky_relu_alpha;
            }
            src++;
            dst++;
          }
          break;
        default:
          LOG(FATAL) << "this act_type: "
                     << static_cast<int>(act_param->active_type)
                     << " fuse not support";
      }
    }
  } else {
    // Bias only, no activation.
    for (int j = 0; j < channel; ++j) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      float32x4_t vbias = vdupq_n_f32(bias_data);
      float* src = data + j * channel_size;
      float* dst = data + j * channel_size;
      int cnt_num = cnt;  // per-channel copy, see note above
      if (cnt_num > 0) {
#ifdef __aarch64__
        asm volatile(FILL_BIAS FILL_STORE
                     : [din_ptr] "+r"(src),
                       [dout_ptr] "+r"(dst),
                       [cnt] "+r"(cnt_num)
                     : [vbias] "w"(vbias)
                     : "memory", "cc", "v0", "v1", "v2", "v3");
#else
        asm volatile(FILL_BIAS FILL_STORE
                     : [din_ptr] "+r"(src),
                       [dout_ptr] "+r"(dst),
                       [cnt] "+r"(cnt_num)
                     : [vbias] "w"(vbias)
                     : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
      }
      // Scalar tail: plain bias add.
      for (int i = 0; i < remain; i++) {
        *dst = *src + bias_data;
        src++;
        dst++;
      }
    }
  }
}
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -37,7 +37,22 @@ void fill_bias_relu(Dtype* tensor,
int channel_size,
bool flag_bias,
bool flag_relu);
/**
 * \brief neon implementation to add bias and activation (relu, relu6,
 * leakyrelu)
 * @param tensor input/output tensor, updated in place
 * @param bias per-channel bias values
 * @param channel number of channels
 * @param channel_size number of elements per channel
 */
template <typename Dtype>
void fill_bias_act(Dtype* tensor,
const Dtype* bias,
int channel,
int channel_size,
bool flag_bias,
const operators::ActivationParam* act_param);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -21,128 +21,179 @@ namespace arm {
namespace math {
// Adds a per-row bias to a [num, channel] FC output and optionally applies
// ReLU in the same pass (float specialization).
// NOTE(review): the previous text was flattened diff residue -- it contained
// both the old and the new signature plus a dead `#if 0` assembly block.
// This is the reconstructed post-merge implementation.
template <>
void fill_bias_fc<float>(
    float *out, const float *bias, int num, int channel, bool flag_relu) {
  int cnt = channel >> 4;    // 16 floats per unrolled step
  int remain = channel & 15;
  if (flag_relu) {
    float32x4_t vzero = vdupq_n_f32(0.f);
    for (int j = 0; j < num; ++j) {
      const float *ptr_bias = bias;
      float *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        float32x4_t vin1 = vld1q_f32(ptr_out);
        float32x4_t vb1 = vld1q_f32(ptr_bias);
        float32x4_t vin2 = vld1q_f32(ptr_out + 4);
        float32x4_t vb2 = vld1q_f32(ptr_bias + 4);
        float32x4_t vin3 = vld1q_f32(ptr_out + 8);
        float32x4_t vb3 = vld1q_f32(ptr_bias + 8);
        float32x4_t vin4 = vld1q_f32(ptr_out + 12);
        float32x4_t vb4 = vld1q_f32(ptr_bias + 12);
        float32x4_t vout1 = vaddq_f32(vin1, vb1);
        float32x4_t vout2 = vaddq_f32(vin2, vb2);
        float32x4_t vout3 = vaddq_f32(vin3, vb3);
        float32x4_t vout4 = vaddq_f32(vin4, vb4);
        vout1 = vmaxq_f32(vout1, vzero);
        vout2 = vmaxq_f32(vout2, vzero);
        vout3 = vmaxq_f32(vout3, vzero);
        vout4 = vmaxq_f32(vout4, vzero);
        vst1q_f32(ptr_out, vout1);
        vst1q_f32(ptr_out + 4, vout2);
        vst1q_f32(ptr_out + 8, vout3);
        vst1q_f32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail with fused ReLU.
      for (int i = 0; i < remain; ++i) {
        *ptr_out += *(ptr_bias++);
        *ptr_out = *ptr_out > 0.f ? *ptr_out : 0.f;
        ptr_out++;
      }
    }
  } else {
    for (int j = 0; j < num; ++j) {
      const float *ptr_bias = bias;
      float *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        float32x4_t vin1 = vld1q_f32(ptr_out);
        float32x4_t vb1 = vld1q_f32(ptr_bias);
        float32x4_t vin2 = vld1q_f32(ptr_out + 4);
        float32x4_t vb2 = vld1q_f32(ptr_bias + 4);
        float32x4_t vin3 = vld1q_f32(ptr_out + 8);
        float32x4_t vb3 = vld1q_f32(ptr_bias + 8);
        float32x4_t vin4 = vld1q_f32(ptr_out + 12);
        float32x4_t vb4 = vld1q_f32(ptr_bias + 12);
        float32x4_t vout1 = vaddq_f32(vin1, vb1);
        float32x4_t vout2 = vaddq_f32(vin2, vb2);
        float32x4_t vout3 = vaddq_f32(vin3, vb3);
        float32x4_t vout4 = vaddq_f32(vin4, vb4);
        vst1q_f32(ptr_out, vout1);
        vst1q_f32(ptr_out + 4, vout2);
        vst1q_f32(ptr_out + 8, vout3);
        vst1q_f32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail, bias only.
      for (int i = 0; i < remain; ++i) {
        *(ptr_out++) += *(ptr_bias++);
      }
    }
  }
}
// Adds a per-row bias to a [num, channel] FC output and optionally applies
// ReLU in the same pass (int32 specialization).
// NOTE(review): the previous text was flattened diff residue -- it contained
// both the old and the new signature plus a dead `#if 0` assembly block.
// This is the reconstructed post-merge implementation.
template <>
void fill_bias_fc<int>(
    int *out, const int *bias, int num, int channel, bool flag_relu) {
  int cnt = channel >> 4;    // 16 ints per unrolled step
  int remain = channel & 15;
  if (flag_relu) {
    int32x4_t vzero = vdupq_n_s32(0);
    for (int j = 0; j < num; ++j) {
      const int *ptr_bias = bias;
      int *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        int32x4_t vin1 = vld1q_s32(ptr_out);
        int32x4_t vb1 = vld1q_s32(ptr_bias);
        int32x4_t vin2 = vld1q_s32(ptr_out + 4);
        int32x4_t vb2 = vld1q_s32(ptr_bias + 4);
        int32x4_t vin3 = vld1q_s32(ptr_out + 8);
        int32x4_t vb3 = vld1q_s32(ptr_bias + 8);
        int32x4_t vin4 = vld1q_s32(ptr_out + 12);
        int32x4_t vb4 = vld1q_s32(ptr_bias + 12);
        int32x4_t vout1 = vaddq_s32(vin1, vb1);
        int32x4_t vout2 = vaddq_s32(vin2, vb2);
        int32x4_t vout3 = vaddq_s32(vin3, vb3);
        int32x4_t vout4 = vaddq_s32(vin4, vb4);
        vout1 = vmaxq_s32(vout1, vzero);
        vout2 = vmaxq_s32(vout2, vzero);
        vout3 = vmaxq_s32(vout3, vzero);
        vout4 = vmaxq_s32(vout4, vzero);
        vst1q_s32(ptr_out, vout1);
        vst1q_s32(ptr_out + 4, vout2);
        vst1q_s32(ptr_out + 8, vout3);
        vst1q_s32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail with fused ReLU.
      for (int i = 0; i < remain; ++i) {
        *ptr_out += *(ptr_bias++);
        *ptr_out = *ptr_out > 0 ? *ptr_out : 0;
        ptr_out++;
      }
    }
  } else {
    for (int j = 0; j < num; ++j) {
      const int *ptr_bias = bias;
      int *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        int32x4_t vin1 = vld1q_s32(ptr_out);
        int32x4_t vb1 = vld1q_s32(ptr_bias);
        int32x4_t vin2 = vld1q_s32(ptr_out + 4);
        int32x4_t vb2 = vld1q_s32(ptr_bias + 4);
        int32x4_t vin3 = vld1q_s32(ptr_out + 8);
        int32x4_t vb3 = vld1q_s32(ptr_bias + 8);
        int32x4_t vin4 = vld1q_s32(ptr_out + 12);
        int32x4_t vb4 = vld1q_s32(ptr_bias + 12);
        int32x4_t vout1 = vaddq_s32(vin1, vb1);
        int32x4_t vout2 = vaddq_s32(vin2, vb2);
        int32x4_t vout3 = vaddq_s32(vin3, vb3);
        int32x4_t vout4 = vaddq_s32(vin4, vb4);
        vst1q_s32(ptr_out, vout1);
        vst1q_s32(ptr_out + 4, vout2);
        vst1q_s32(ptr_out + 8, vout3);
        vst1q_s32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail, bias only.
      for (int i = 0; i < remain; ++i) {
        *(ptr_out++) += *(ptr_bias++);
      }
    }
  }
}
......
......@@ -356,7 +356,8 @@ inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
}
// Adds a per-row bias to a [num, channel] tensor in place and optionally
// applies ReLU in the same pass.  Specialized for float and int.
// NOTE(review): the stale pre-merge declaration (without flag_relu) that was
// left by the flattened diff has been removed.
template <typename T>
void fill_bias_fc(
    T* tensor, const T* bias, int num, int channel, bool flag_relu);
template <lite_api::ActivationType Act = lite_api::ActivationType::kIndentity>
inline float32x4_t vactive_f32(const float32x4_t& x) {
......
......@@ -383,6 +383,8 @@ struct GRUUnitFunctor {
const lite_api::ActivationType active_gate,
bool origin_mode,
ARMContext* ctx) {
operators::ActivationParam act_param;
act_param.has_active = false;
if (value.prev_out_value) {
sgemm(false,
false,
......@@ -399,7 +401,7 @@ struct GRUUnitFunctor {
frame_size * 3,
nullptr,
false,
false,
act_param,
ctx);
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
......@@ -420,7 +422,7 @@ struct GRUUnitFunctor {
frame_size * 3,
nullptr,
false,
false,
act_param,
ctx);
}
......
......@@ -14,6 +14,7 @@
#include "lite/backends/arm/math/packed_sgemm.h"
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
namespace paddle {
namespace lite {
......@@ -51,7 +52,7 @@ void sgemm_prepacked_8x12(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
void pack_m4(float *out,
......@@ -83,7 +84,7 @@ void sgemm_prepacked_4x4(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
#else
// for kA72
......@@ -136,7 +137,7 @@ void sgemm_prepacked_6x8(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
// for kA73, 4x8
void sgemm_prepacked_4x8(bool is_transB,
......@@ -151,7 +152,7 @@ void sgemm_prepacked_4x8(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
#endif // __aarch64__
......@@ -249,7 +250,7 @@ void sgemm_prepack(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx) {
#ifdef __aarch64__
if (M <= 4) {
......@@ -265,7 +266,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
} else {
sgemm_prepacked_8x12(is_transB,
......@@ -280,7 +281,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
}
#else // armv7
......@@ -297,7 +298,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
} else {
sgemm_prepacked_6x8(is_transB,
......@@ -312,7 +313,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
}
#endif // arm64
......@@ -2283,7 +2284,7 @@ void sgemm_prepacked_8x12(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto workspace = ctx->workspace_data<float>();
......@@ -2837,33 +2838,6 @@ void sgemm_prepacked_8x12(bool is_transB,
"fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/
"fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/
"11: \n" /* check if relu */
"cbz %w[relu], 12f\n" /* skip relu */
"movi v2.4s, #0\n" /* for relu*/
"fmax v8.4s, v8.4s, v2.4s\n" /* relu*/
"fmax v9.4s, v9.4s, v2.4s\n" /* relu*/
"fmax v10.4s, v10.4s, v2.4s\n" /* relu*/
"fmax v11.4s, v11.4s, v2.4s\n" /* relu*/
"fmax v12.4s, v12.4s, v2.4s\n" /* relu*/
"fmax v13.4s, v13.4s, v2.4s\n" /* relu*/
"fmax v14.4s, v14.4s, v2.4s\n" /* relu*/
"fmax v15.4s, v15.4s, v2.4s\n" /* relu*/
"fmax v16.4s,v16.4s,v2.4s\n" /* relu*/
"fmax v17.4s,v17.4s,v2.4s\n" /* relu*/
"fmax v18.4s, v18.4s, v2.4s\n" /* relu*/
"fmax v19.4s, v19.4s, v2.4s\n" /* relu*/
"fmax v20.4s, v20.4s, v2.4s\n" /* relu*/
"fmax v21.4s, v21.4s, v2.4s\n" /* relu*/
"fmax v22.4s, v22.4s, v2.4s\n" /* relu*/
"fmax v23.4s, v23.4s, v2.4s\n" /* relu*/
"fmax v24.4s,v24.4s,v2.4s\n" /* relu*/
"fmax v25.4s,v25.4s,v2.4s\n" /* relu*/
"fmax v26.4s, v26.4s, v2.4s\n" /* relu*/
"fmax v27.4s, v27.4s, v2.4s\n" /* relu*/
"fmax v28.4s, v28.4s, v2.4s\n" /* relu*/
"fmax v29.4s, v29.4s, v2.4s\n" /* relu*/
"fmax v30.4s, v30.4s, v2.4s\n" /* relu*/
"fmax v31.4s, v31.4s, v2.4s\n" /* relu*/
"12: \n"
"st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */
"st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */
"st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */
......@@ -2886,7 +2860,6 @@ void sgemm_prepacked_8x12(bool is_transB,
[c_ptr6] "+r"(c_ptr6),
[c_ptr7] "+r"(c_ptr7)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[has_beta] "r"(has_beta),
[beta] "r"(beta)
: "cc","memory",
......@@ -2911,6 +2884,13 @@ void sgemm_prepacked_8x12(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float *dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
void sgemm_prepacked_4x4(bool is_transB,
......@@ -2925,7 +2905,7 @@ void sgemm_prepacked_4x4(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto workspace = ctx->workspace_data<float>();
......@@ -3158,13 +3138,6 @@ void sgemm_prepacked_4x4(bool is_transB,
"fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/
"11: \n" /* check if relu */
"cbz %w[relu], 12f\n" /* skip relu */
"movi v2.4s, #0\n" /* for relu*/
"fmax v8.4s, v8.4s, v2.4s\n" /* relu*/
"fmax v9.4s, v9.4s, v2.4s\n" /* relu*/
"fmax v10.4s, v10.4s, v2.4s\n" /* relu*/
"fmax v11.4s, v11.4s, v2.4s\n" /* relu*/
"12: \n"
"st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */
"st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */
"st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */
......@@ -3179,7 +3152,6 @@ void sgemm_prepacked_4x4(bool is_transB,
[c_ptr2] "+r"(c_ptr2),
[c_ptr3] "+r"(c_ptr3)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[has_beta] "r"(has_beta),
[beta] "r"(beta)
: "cc","memory",
......@@ -3197,6 +3169,13 @@ void sgemm_prepacked_4x4(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float *dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
#else // __aarch64__
/**
......@@ -3222,7 +3201,7 @@ void sgemm_prepacked_6x8(bool is_transB,
int ldc,
const float* bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext* ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto* workspace = ctx->workspace_data<float>();
......@@ -3601,22 +3580,6 @@ void sgemm_prepacked_6x8(bool is_transB,
"vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n"
"vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n"
"2: @ check relu\n"
"cmp %[relu], #0 @ check if has relu\n"
"ble 6f @ skip relu if relu <= 0\n"
"vmov.u32 q0, #0 @ for relu\n"
"vmax.f32 q4, q4, q0 @ for relu\n"
"vmax.f32 q5, q5, q0 @ for relu\n"
"vmax.f32 q6, q6, q0 @ for relu\n"
"vmax.f32 q7, q7, q0 @ for relu\n"
"vmax.f32 q8, q8, q0 @ for relu\n"
"vmax.f32 q9, q9, q0 @ for relu\n"
"vmax.f32 q10, q10, q0 @ for relu\n"
"vmax.f32 q11, q11, q0 @ for relu\n"
"vmax.f32 q12, q12, q0 @ for relu\n"
"vmax.f32 q13, q13, q0 @ for relu\n"
"vmax.f32 q14, q14, q0 @ for relu\n"
"vmax.f32 q15, q15, q0 @ for relu\n"
"6: @ store result\n"
"vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n"
"vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n"
"vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n"
......@@ -3634,7 +3597,6 @@ void sgemm_prepacked_6x8(bool is_transB,
[k] "+r"(k),
[tails] "+r"(tails)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[beta] "r"(beta)
: "q0","q1","q2","q3","q4",
"q5","q6","q7","q8","q9","q10","q11",
......@@ -3654,6 +3616,13 @@ void sgemm_prepacked_6x8(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float* dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
void sgemm_prepacked_4x8(bool is_transB,
......@@ -3668,7 +3637,7 @@ void sgemm_prepacked_4x8(bool is_transB,
int ldc,
const float* bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext* ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto* workspace = ctx->workspace_data<float>();
......@@ -3953,18 +3922,6 @@ void sgemm_prepacked_4x8(bool is_transB,
/*aptr - 16*/
"sub %[a_ptr], %[a_ptr], #16 @ tail--\n"
"2: @ check relu\n"
"cmp %[relu], #0 @ check if has relu\n"
"ble 6f @ skip relu if relu <= 0\n"
"vmov.u32 q0, #0 @ for relu\n"
"vmax.f32 q8, q8, q0 @ for relu\n"
"vmax.f32 q9, q9, q0 @ for relu\n"
"vmax.f32 q10, q10, q0 @ for relu\n"
"vmax.f32 q11, q11, q0 @ for relu\n"
"vmax.f32 q12, q12, q0 @ for relu\n"
"vmax.f32 q13, q13, q0 @ for relu\n"
"vmax.f32 q14, q14, q0 @ for relu\n"
"vmax.f32 q15, q15, q0 @ for relu\n"
"6: @ store result\n"
"vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n"
"vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n"
"vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n"
......@@ -3978,7 +3935,6 @@ void sgemm_prepacked_4x8(bool is_transB,
[k] "+r"(k),
[tails] "+r"(tails)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[beta] "r"(beta)
: "q0","q1","q2","q3",
"q4","q5","q6","q7","q8","q9","q10",
......@@ -3995,6 +3951,13 @@ void sgemm_prepacked_4x8(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float* dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
#endif // __aarch64__
......
......@@ -17,6 +17,7 @@
#include <cmath>
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
......@@ -74,7 +75,7 @@ void sgemm_prepack(bool is_transB,
int ldc,
const float* bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext* ctx);
} // namespace math
......
......@@ -898,6 +898,121 @@ void pooling_global_avg(const float* din,
}
}
// 1x1 max pooling with stride 2, no padding: for a 1x1 window "max" is a
// copy, so this picks every second element of every second input row.
// NOTE(review): output positions whose source column falls past the input
// width are written as 0.f rather than clamped -- confirm this matches the
// pooling operator's boundary convention.
void pooling1x1s2p0_max(const float* din,
                        float* dout,
                        int num,
                        int chout,
                        int hout,
                        int wout,
                        int chin,
                        int hin,
                        int win) {
  int size_channel_out = wout * hout;
  int size_channel_in = win * hin;
  auto data_out = static_cast<float*>(dout);
  auto data_in = static_cast<const float*>(din);
  // Each unrolled step de-interleaves 8 input floats and emits 4 outputs.
  int w_unroll_size = wout / 4;
  int w_unroll_remian = wout - w_unroll_size * 4;
  int win_ext = w_unroll_size * 8;
  // Scratch rows: zero_ptr is read in place of out-of-range input rows,
  // write_ptr absorbs stores aimed at out-of-range output rows.
  auto zero_ptr =
      static_cast<float*>(TargetMalloc(TARGET(kARM), win * sizeof(float)));
  memset(zero_ptr, 0, win * sizeof(float));
  auto write_ptr =
      static_cast<float*>(TargetMalloc(TARGET(kARM), wout * sizeof(float)));
  for (int n = 0; n < num; ++n) {
    float* data_out_batch = data_out + n * chout * size_channel_out;
    const float* data_in_batch = data_in + n * chin * size_channel_in;
#pragma omp parallel for
    for (int c = 0; c < chout; c++) {
      float* data_out_channel = data_out_batch + c * size_channel_out;
      const float* data_in_channel = data_in_batch + c * size_channel_in;
      // Process 4 output rows (reading from 4 stride-2 input rows) per step.
      for (int h = 0; h < hout; h += 4) {
        const float* din0_ptr = data_in_channel + h * 2 * win;
        const float* din1_ptr = din0_ptr + 2 * win;
        const float* din2_ptr = din1_ptr + 2 * win;
        const float* din3_ptr = din2_ptr + 2 * win;
        float* doutr0 = data_out_channel + h * wout;
        float* doutr1 = doutr0 + wout;
        float* doutr2 = doutr1 + wout;
        float* doutr3 = doutr2 + wout;
        // Redirect out-of-range output rows to the scratch row.
        // The switch fall-through is intentional.
        if (h + 4 > hout) {
          switch (h + 4 - hout) {
            case 3:
              doutr1 = write_ptr;
            case 2:
              doutr2 = write_ptr;
            case 1:
              doutr3 = write_ptr;
            default:
              break;
          }
        }
        // Redirect out-of-range input rows to the zero row.
        // The switch fall-through is intentional.
        if (h * 2 + 7 > hin) {
          switch (h * 2 + 7 - hin) {
            case 7:
              din0_ptr = zero_ptr;
            case 6:
            case 5:
              din1_ptr = zero_ptr;
            case 4:
            case 3:
              din2_ptr = zero_ptr;
            case 2:
            case 1:
              din3_ptr = zero_ptr;
            default:
              break;
          }
        }
        // vld2 de-interleaves even/odd columns; val[0] holds the even
        // (stride-2) elements, i.e. the pooled outputs.
        for (int i = 0; i < w_unroll_size; i++) {
          float32x4x2_t din0 = vld2q_f32(din0_ptr);
          float32x4x2_t din1 = vld2q_f32(din1_ptr);
          float32x4x2_t din2 = vld2q_f32(din2_ptr);
          float32x4x2_t din3 = vld2q_f32(din3_ptr);
          din0_ptr += 8;
          din1_ptr += 8;
          din2_ptr += 8;
          din3_ptr += 8;
          vst1q_f32(doutr0, din0.val[0]);
          vst1q_f32(doutr1, din1.val[0]);
          vst1q_f32(doutr2, din2.val[0]);
          vst1q_f32(doutr3, din3.val[0]);
          doutr0 += 4;
          doutr1 += 4;
          doutr2 += 4;
          doutr3 += 4;
        }
        // Scalar tail; j tracks the input column of the next output.
        int j = win_ext;
        for (int i = 0; i < w_unroll_remian; i++) {
          if (j >= win) {
            *doutr0++ = 0.f;
            *doutr1++ = 0.f;
            *doutr2++ = 0.f;
            *doutr3++ = 0.f;
          } else {
            *doutr0++ = *din0_ptr;
            *doutr1++ = *din1_ptr;
            *doutr2++ = *din2_ptr;
            *doutr3++ = *din3_ptr;
            din0_ptr += 2;
            din1_ptr += 2;
            din2_ptr += 2;
            din3_ptr += 2;
          }
          j += 2;
        }
      }
    }
  }
  TargetFree(TARGET(kARM), zero_ptr);
  TargetFree(TARGET(kARM), write_ptr);
}
void pooling2x2s2_max(const float* din,
float* dout,
int num,
......
......@@ -64,6 +64,16 @@ void pooling_global_avg(const float* din,
int hin,
int win);
// 1x1 max pooling with stride 2 and no padding; for a 1x1 window this copies
// every second element of every second input row into dout.
void pooling1x1s2p0_max(const float* din,
                        float* dout,
                        int num,
                        int chout,
                        int hout,
                        int wout,
                        int chin,
                        int hin,
                        int win);
void pooling2x2s2_max(const float* din,
float* dout,
int num,
......
......@@ -34,7 +34,7 @@ void sgemm(bool is_transA,
int ldc,
const float* bias,
bool is_bias,
bool is_relu,
const operators::ActivationParam act_param,
ARMContext* ctx) {
int hblock = get_hblock(ctx);
int m_roundup = hblock * ((M + hblock - 1) / hblock);
......@@ -56,7 +56,7 @@ void sgemm(bool is_transA,
ldc,
bias,
is_bias,
is_relu,
act_param,
ctx);
TargetFree(TargetType::kARM, packed_A);
}
......
......@@ -39,7 +39,7 @@ void sgemm(bool is_transA,
int ldc,
const float* bias,
bool is_bias,
bool is_relu,
const operators::ActivationParam act_param,
ARMContext* ctx);
} // namespace math
......
# Build the BM backend's target wrapper only when the BM backend is enabled.
if(NOT LITE_WITH_BM)
  return()
endif()

lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/bm/target_wrapper.h"
#include <bmcompiler_if.h>
#include <bmlib_runtime.h>
#include <utility>
namespace paddle {
namespace lite {
int TargetWrapperBM::device_id_ = 0;
std::map<int, void*> TargetWrapperBM::bm_hds_;
// Returns the number of BM devices visible to the runtime.
size_t TargetWrapperBM::num_devices() {
  int device_count = 0;
  bm_dev_getcount(&device_count);
  return device_count;
}
void TargetWrapperBM::SetDevice(int id) {
/*
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
*/
device_id_ = id;
if (bm_hds_.find(id) == bm_hds_.end()) {
bm_handle_t bm_handle;
bm_status_t ret = bm_dev_request(&bm_handle, id);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
bm_hds_.insert(std::pair<int, bm_handle_t>(id, bm_handle));
}
return;
}
// Returns the raw bm_handle_t (as void*) for the current device; aborts if
// SetDevice() was never called for it.
void* TargetWrapperBM::GetHandle() {
  auto it = bm_hds_.find(device_id_);
  if (it == bm_hds_.end()) {
    LOG(FATAL) << "device not initialized " << device_id_;
  }
  return it->second;
}
void* TargetWrapperBM::Malloc(size_t size) {
void* ptr{};
if (bm_hds_.find(device_id_) == bm_hds_.end()) {
SetDevice(device_id_);
}
bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
bm_device_mem_t* p_mem =
reinterpret_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
bm_malloc_device_byte(bm_handle, p_mem, size);
ptr = reinterpret_cast<void*>(p_mem);
return ptr;
}
void TargetWrapperBM::Free(void* ptr) {
if (ptr != NULL) {
bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
bm_device_mem_t* mem = static_cast<bm_device_mem_t*>(ptr);
bm_free_device(bm_handle, *mem);
free(ptr);
}
return;
}
// Synchronous copy between host memory and BM device memory.  Device-side
// pointers are bm_device_mem_t descriptors as produced by Malloc().
// NOTE(review): silently a no-op when the current device was never
// initialized -- confirm callers rely on this rather than an error.
void TargetWrapperBM::MemcpySync(void* dst,
                                 const void* src,
                                 size_t size,
                                 IoDirection dir) {
  auto it = bm_hds_.find(device_id_);
  if (it == bm_hds_.end()) {
    return;
  }
  bm_handle_t bm_handle = static_cast<bm_handle_t>(it->second);
  if (dir == IoDirection::HtoD) {
    auto* dev_dst = static_cast<bm_device_mem_t*>(dst);
    bm_memcpy_s2d_partial_offset(
        bm_handle, *dev_dst, const_cast<void*>(src), size, 0);
  } else if (dir == IoDirection::DtoH) {
    const auto* dev_src = static_cast<const bm_device_mem_t*>(src);
    bm_memcpy_d2s_partial_offset(bm_handle, dst, *dev_src, size, 0);
  } else {
    LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
  }
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperBM = TargetWrapper<TARGET(kBM)>;
// BM (Bitmain NPU) specialization of TargetWrapper.
// The stream/event/memset entry points below are intentional no-ops here;
// stream_t/event_t are dummy ints.
template <>
class TargetWrapper<TARGET(kBM)> {
 public:
  using stream_t = int;
  using event_t = int;
  // Number of BM devices present on the host.
  static size_t num_devices();
  static size_t maximum_stream() { return 0; }
  // Selects the active device, opening a runtime handle on first use.
  static void SetDevice(int id);
  static void CreateStream(stream_t* stream) {}
  static void DestroyStream(const stream_t& stream) {}
  static void CreateEvent(event_t* event) {}
  static void DestroyEvent(const event_t& event) {}
  static void RecordEvent(const event_t& event) {}
  static void SyncEvent(const event_t& event) {}
  static void StreamSync(const stream_t& stream) {}
  // Allocate/free device memory.  The returned pointer is a heap-allocated
  // bm_device_mem_t descriptor, not a raw device address.
  static void* Malloc(size_t size);
  static void Free(void* ptr);
  // Raw bm_handle_t (as void*) of the current device.
  static void* GetHandle();
  // Synchronous host<->device copy; device pointers are the descriptors
  // returned by Malloc().
  static void MemcpySync(void* dst,
                         const void* src,
                         size_t size,
                         IoDirection dir);
  static void MemcpyAsync(void* dst,
                          const void* src,
                          size_t size,
                          IoDirection dir,
                          const stream_t& stream) {}
  static void MemsetSync(void* devPtr, int value, size_t count) {}
  static void MemsetAsync(void* devPtr,
                          int value,
                          size_t count,
                          const stream_t& stream) {}
 private:
  // Currently selected device id and the per-device handle map
  // (int -> bm_handle_t stored as void*).
  static int device_id_;
  static std::map<int, void*> bm_hds_;
};
} // namespace lite
} // namespace paddle
# Build CUDA support libraries only when the CUDA backend is enabled.
# NOTE(review): the flattened diff left both the old (CUDA_STATIC_MODULES /
# cuda_static_deps) and new (CUDA_MODULES / cuda_deps) lines, defining the
# same nv_library targets twice; only the post-merge versions are kept.
if(NOT LITE_WITH_CUDA)
  return()
endif()

get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps})
nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps})

add_subdirectory(math)
......@@ -18,8 +18,8 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "ai_ddk_lib/include/HiAiModelManagerService.h"
#include "ai_ddk_lib/include/hiai_ir_build.h"
#include "HiAiModelManagerService.h" // NOLINT
#include "hiai_ir_build.h" // NOLINT
namespace paddle {
namespace lite {
......
......@@ -214,3 +214,172 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
__kernel void conv2d_1x1_simple(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
CL_DTYPE4 output0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#elif defined(BIASE_ELE)
CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#else
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
CL_DTYPE4 output2 = 0.0f;
CL_DTYPE4 output3 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
CL_DTYPE4 weight0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
CL_DTYPE4 weight1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
CL_DTYPE4 weight2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
CL_DTYPE4 weight3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
}
#ifdef BATCH_NORM
output0 = output0 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation_type4(output0);
output1 = activation_type4(output1);
output2 = activation_type4(output2);
output3 = activation_type4(output3);
#endif
if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}
if (out_w1 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
}
if (out_w2 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
}
if (out_w3 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// Depthwise conv2d kernel: each work-item computes one CL_DTYPE4 output
// pixel. Input, filter and output are image2d_t textures laid out in
// blocks, one block (input_width x input_height / filter_width x
// filter_height) per 4-channel group `out_c`.
// NOTE(review): `dilation`, `global_size_dim0`, `global_size_dim2` and
// `input_c` are accepted but never used below — presumably this kernel
// only supports dilation == 1; confirm against the host-side launcher.
__kernel void depth_conv2d(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int filter_width,
__private const int filter_height) {
// Global ids: (channel group, output x inside one block, batch*height row).
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
// Output image x coordinate folds the channel group in front of the width.
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// Split the combined batch/height id back into batch index and row.
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
// Top-left input coordinate for this output pixel; `offset` is the
// padding compensation supplied by the host side.
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
// Per-channel bias: one CL_DTYPE4 per channel group.
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
// Element-wise bias: same layout as the output image.
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
// Base coordinates of this channel group's block inside the input and
// filter images (filter rows are also indexed by batch here).
int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y;
int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x;
int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y;
// Filter taps are addressed relative to the filter center.
int2 align = {filter_width / 2, filter_height / 2};
for (int fy = 0; fy < filter_height; ++fy) {
for (int fx = 0; fx < filter_width; ++fx) {
int x_off = fx - align.x;
int y_off = fy - align.y;
// Zero padding: when the tap falls outside this block, select()
// returns the second operand (0). The `<< 15` puts the boolean into
// the MSB of each ushort lane, which is what vector select() tests.
CL_DTYPE4 in = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(input_x_base + x_off, input_y_base + y_off)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + x_off < 0 ||
in_pos_in_one_block.y + y_off < 0 ||
in_pos_in_one_block.x + x_off >= input_width ||
in_pos_in_one_block.y + y_off >= input_height)
<< 15));
CL_DTYPE4 f = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + fx, filter_y + fy));
// Depthwise: per-lane multiply-accumulate, no cross-channel reduction.
output += in * f;
}
}
#ifdef BATCH_NORM
// Fused batch norm: output = output * scale[out_c] + bias[out_c].
output = output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
// Fused activation. NOTE(review): activation_type4 comes from
// cl_common.h — presumably selected by build flags, not necessarily
// plain ReLU despite the macro name; confirm.
output = activation_type4(output);
#endif
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
\ No newline at end of file
......@@ -89,7 +89,7 @@ All kernels are inlcuded in `lite/backends/x86/jit/kernels.h`, which is automati
3. Add reference function of `your_key`.
Note:
- this should be run on CPU and do not depend on any third-party.
- Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used.
- Add `USE_JITKERNEL_REFER_LITE(your_key)` in `refer/CMakeLists.txt` to make sure this code can be used.
4. Add unit test in `test.cc`, and verify at least `float` and `double`.
Test more data type for some special functions if necessary, for example `int8`.
5. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `GetDefaultBestFunc` always get the best one.
......
......@@ -79,7 +79,7 @@ PaddlePaddle/Paddle/paddle/fluid/
# 如何添加新的算子
1.`KernelType` 中添加 `your_key`
2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel。
2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CMakeLists.txt`中添加`USE_JITKERNEL_REFER_LITE(your_key)`来使用该kernel。
3. (optional) 实现更多的算法在`more`目录下,可以依赖mkl,intrinsic或者mkldnn等第三方库。
4. (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在与refer相同的`KernelType`上。
5. 添加新的`KernelTuple`,需要与`KernelType`一一对应,是所有类型的一个打包,包括数据类型,属性的类型,以及返回的函数类型。可以参考`SeqPoolTuple`,新加的Attr类型需要特例化`JitCodeKey`方法。
......
......@@ -4,33 +4,33 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
function(USE_JITKERNEL_GEN TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
function(USE_JITKERNEL_GEN_LITE TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_GEN_LITE(${TARGET});\n")
endfunction()
# use gen jitcode kernel by name
USE_JITKERNEL_GEN(kMatMul)
USE_JITKERNEL_GEN(kVMul)
USE_JITKERNEL_GEN(kVAdd)
USE_JITKERNEL_GEN(kVSub)
USE_JITKERNEL_GEN(kVAddRelu)
USE_JITKERNEL_GEN(kVScal)
USE_JITKERNEL_GEN(kVAddBias)
USE_JITKERNEL_GEN(kVRelu)
USE_JITKERNEL_GEN(kVSquare)
USE_JITKERNEL_GEN(kVIdentity)
USE_JITKERNEL_GEN(kVExp)
USE_JITKERNEL_GEN(kVSigmoid)
USE_JITKERNEL_GEN(kVTanh)
USE_JITKERNEL_GEN(kLSTMCtHt)
USE_JITKERNEL_GEN(kLSTMC1H1)
USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)
USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
USE_JITKERNEL_GEN(kSgd)
USE_JITKERNEL_GEN(kVBroadcast)
USE_JITKERNEL_GEN_LITE(kMatMul)
USE_JITKERNEL_GEN_LITE(kVMul)
USE_JITKERNEL_GEN_LITE(kVAdd)
USE_JITKERNEL_GEN_LITE(kVSub)
USE_JITKERNEL_GEN_LITE(kVAddRelu)
USE_JITKERNEL_GEN_LITE(kVScal)
USE_JITKERNEL_GEN_LITE(kVAddBias)
USE_JITKERNEL_GEN_LITE(kVRelu)
USE_JITKERNEL_GEN_LITE(kVSquare)
USE_JITKERNEL_GEN_LITE(kVIdentity)
USE_JITKERNEL_GEN_LITE(kVExp)
USE_JITKERNEL_GEN_LITE(kVSigmoid)
USE_JITKERNEL_GEN_LITE(kVTanh)
USE_JITKERNEL_GEN_LITE(kLSTMCtHt)
USE_JITKERNEL_GEN_LITE(kLSTMC1H1)
USE_JITKERNEL_GEN_LITE(kGRUH1)
USE_JITKERNEL_GEN_LITE(kGRUHtPart1)
USE_JITKERNEL_GEN_LITE(kGRUHtPart2)
USE_JITKERNEL_GEN_LITE(kNCHW16CMulNC)
USE_JITKERNEL_GEN_LITE(kSeqPool)
USE_JITKERNEL_GEN_LITE(kHMax)
USE_JITKERNEL_GEN_LITE(kHSum)
USE_JITKERNEL_GEN_LITE(kEmbSeqPool)
USE_JITKERNEL_GEN_LITE(kSgd)
USE_JITKERNEL_GEN_LITE(kVBroadcast)
......@@ -156,9 +156,9 @@ size_t VTanhCreator::CodeSize(const int& d) const {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator);
REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator);
REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator);
REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator);
REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator);
REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator);
REGISTER_JITKERNEL_GEN_LITE(kVRelu, gen::VReluCreator);
REGISTER_JITKERNEL_GEN_LITE(kVSquare, gen::VSquareCreator);
REGISTER_JITKERNEL_GEN_LITE(kVIdentity, gen::VIdentityCreator);
REGISTER_JITKERNEL_GEN_LITE(kVExp, gen::VExpCreator);
REGISTER_JITKERNEL_GEN_LITE(kVSigmoid, gen::VSigmoidCreator);
REGISTER_JITKERNEL_GEN_LITE(kVTanh, gen::VTanhCreator);
......@@ -181,10 +181,10 @@ DECLARE_BLAS_CREATOR(VAddBias);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator);
REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator);
REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator);
REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator);
REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator);
REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator);
REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator);
REGISTER_JITKERNEL_GEN_LITE(kVMul, gen::VMulCreator);
REGISTER_JITKERNEL_GEN_LITE(kVAdd, gen::VAddCreator);
REGISTER_JITKERNEL_GEN_LITE(kVSub, gen::VSubCreator);
REGISTER_JITKERNEL_GEN_LITE(kVAddRelu, gen::VAddReluCreator);
REGISTER_JITKERNEL_GEN_LITE(kVScal, gen::VScalCreator);
REGISTER_JITKERNEL_GEN_LITE(kVAddBias, gen::VAddBiasCreator);
REGISTER_JITKERNEL_GEN_LITE(kNCHW16CMulNC, gen::NCHW16CMulNCCreator);
......@@ -145,4 +145,4 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
REGISTER_JITKERNEL_GEN_LITE(kEmbSeqPool, gen::EmbSeqPoolCreator);
......@@ -111,6 +111,6 @@ DECLARE_GRU_CREATOR(GRUHtPart2);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator);
REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator);
REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator);
REGISTER_JITKERNEL_GEN_LITE(kGRUH1, gen::GRUH1Creator);
REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart1, gen::GRUHtPart1Creator);
REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart2, gen::GRUHtPart2Creator);
......@@ -99,5 +99,5 @@ DECLARE_HOP_CREATOR(HSum);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator);
REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator);
REGISTER_JITKERNEL_GEN_LITE(kHMax, gen::HMaxCreator);
REGISTER_JITKERNEL_GEN_LITE(kHSum, gen::HSumCreator);
......@@ -138,5 +138,5 @@ DECLARE_LSTM_CREATOR(LSTMC1H1);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator);
REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator);
REGISTER_JITKERNEL_GEN_LITE(kLSTMCtHt, gen::LSTMCtHtCreator);
REGISTER_JITKERNEL_GEN_LITE(kLSTMC1H1, gen::LSTMC1H1Creator);
......@@ -130,4 +130,4 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
REGISTER_JITKERNEL_GEN_LITE(kMatMul, gen::MatMulCreator);
......@@ -82,4 +82,4 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
REGISTER_JITKERNEL_GEN_LITE(kSeqPool, gen::SeqPoolCreator);
......@@ -127,4 +127,4 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
REGISTER_JITKERNEL_GEN_LITE(kSgd, gen::SgdCreator);
......@@ -88,4 +88,4 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
REGISTER_JITKERNEL_GEN_LITE(kVBroadcast, gen::VBroadcastCreator);
function(USE_JITKERNEL_MORE TARGET TYPE)
file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
function(USE_JITKERNEL_MORE_LITE TARGET TYPE)
file(APPEND ${jit_file} "USE_JITKERNEL_MORE_LITE(${TARGET} ${TYPE});\n")
endfunction()
# enable it later
......
......@@ -5,5 +5,5 @@ cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
# use mkl kernels by name and type
USE_JITKERNEL_MORE(kCRFDecoding, intrinsic)
USE_JITKERNEL_MORE(kLayerNorm, intrinsic)
USE_JITKERNEL_MORE_LITE(kCRFDecoding, intrinsic)
USE_JITKERNEL_MORE_LITE(kLayerNorm, intrinsic)
......@@ -5,11 +5,11 @@ cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE)
USE_JITKERNEL_MORE(kVSigmoid, mix)
USE_JITKERNEL_MORE(kVTanh, mix)
USE_JITKERNEL_MORE(kLSTMCtHt, mix)
USE_JITKERNEL_MORE(kLSTMC1H1, mix)
USE_JITKERNEL_MORE(kGRUH1, mix)
USE_JITKERNEL_MORE(kGRUHtPart1, mix)
USE_JITKERNEL_MORE(kGRUHtPart2, mix)
USE_JITKERNEL_MORE(kSoftmax, mix)
USE_JITKERNEL_MORE_LITE(kVSigmoid, mix)
USE_JITKERNEL_MORE_LITE(kVTanh, mix)
USE_JITKERNEL_MORE_LITE(kLSTMCtHt, mix)
USE_JITKERNEL_MORE_LITE(kLSTMC1H1, mix)
USE_JITKERNEL_MORE_LITE(kGRUH1, mix)
USE_JITKERNEL_MORE_LITE(kGRUHtPart1, mix)
USE_JITKERNEL_MORE_LITE(kGRUHtPart2, mix)
USE_JITKERNEL_MORE_LITE(kSoftmax, mix)
......@@ -3,18 +3,18 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE)
# use mkl kernels by name and type
USE_JITKERNEL_MORE(kMatMul, mkl)
USE_JITKERNEL_MORE(kVMul, mkl)
USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kStrideScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE(kSgd, mkl)
USE_JITKERNEL_MORE(kVBroadcast, mkl)
USE_JITKERNEL_MORE_LITE(kMatMul, mkl)
USE_JITKERNEL_MORE_LITE(kVMul, mkl)
USE_JITKERNEL_MORE_LITE(kVAdd, mkl)
USE_JITKERNEL_MORE_LITE(kVScal, mkl)
USE_JITKERNEL_MORE_LITE(kStrideScal, mkl)
USE_JITKERNEL_MORE_LITE(kVExp, mkl)
USE_JITKERNEL_MORE_LITE(kVSquare, mkl)
USE_JITKERNEL_MORE_LITE(kVCopy, mkl)
USE_JITKERNEL_MORE_LITE(kVSigmoid, mkl)
USE_JITKERNEL_MORE_LITE(kVTanh, mkl)
USE_JITKERNEL_MORE_LITE(kSeqPool, mkl)
USE_JITKERNEL_MORE_LITE(kSoftmax, mkl)
USE_JITKERNEL_MORE_LITE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE_LITE(kSgd, mkl)
USE_JITKERNEL_MORE_LITE(kVBroadcast, mkl)
......@@ -2,39 +2,39 @@
cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE)
function(USE_JITKERNEL_REFER TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
function(USE_JITKERNEL_REFER_LITE TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_REFER_LITE(${TARGET});\n")
endfunction()
# use refer kernel by name
USE_JITKERNEL_REFER(kVMul)
USE_JITKERNEL_REFER(kVAdd)
USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kStrideScal)
USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu)
USE_JITKERNEL_REFER(kVIdentity)
USE_JITKERNEL_REFER(kVExp)
USE_JITKERNEL_REFER(kVSigmoid)
USE_JITKERNEL_REFER(kVTanh)
USE_JITKERNEL_REFER(kLSTMCtHt)
USE_JITKERNEL_REFER(kLSTMC1H1)
USE_JITKERNEL_REFER(kGRUH1)
USE_JITKERNEL_REFER(kGRUHtPart1)
USE_JITKERNEL_REFER(kGRUHtPart2)
USE_JITKERNEL_REFER(kCRFDecoding)
USE_JITKERNEL_REFER(kLayerNorm)
USE_JITKERNEL_REFER(kNCHW16CMulNC)
USE_JITKERNEL_REFER(kSeqPool)
USE_JITKERNEL_REFER(kMatMul)
USE_JITKERNEL_REFER(kVSquare)
USE_JITKERNEL_REFER(kHSum)
USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kStrideASum)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
USE_JITKERNEL_REFER(kVBroadcast)
USE_JITKERNEL_REFER_LITE(kVMul)
USE_JITKERNEL_REFER_LITE(kVAdd)
USE_JITKERNEL_REFER_LITE(kVAddRelu)
USE_JITKERNEL_REFER_LITE(kVSub)
USE_JITKERNEL_REFER_LITE(kVScal)
USE_JITKERNEL_REFER_LITE(kStrideScal)
USE_JITKERNEL_REFER_LITE(kVAddBias)
USE_JITKERNEL_REFER_LITE(kVCopy)
USE_JITKERNEL_REFER_LITE(kVRelu)
USE_JITKERNEL_REFER_LITE(kVIdentity)
USE_JITKERNEL_REFER_LITE(kVExp)
USE_JITKERNEL_REFER_LITE(kVSigmoid)
USE_JITKERNEL_REFER_LITE(kVTanh)
USE_JITKERNEL_REFER_LITE(kLSTMCtHt)
USE_JITKERNEL_REFER_LITE(kLSTMC1H1)
USE_JITKERNEL_REFER_LITE(kGRUH1)
USE_JITKERNEL_REFER_LITE(kGRUHtPart1)
USE_JITKERNEL_REFER_LITE(kGRUHtPart2)
USE_JITKERNEL_REFER_LITE(kCRFDecoding)
USE_JITKERNEL_REFER_LITE(kLayerNorm)
USE_JITKERNEL_REFER_LITE(kNCHW16CMulNC)
USE_JITKERNEL_REFER_LITE(kSeqPool)
USE_JITKERNEL_REFER_LITE(kMatMul)
USE_JITKERNEL_REFER_LITE(kVSquare)
USE_JITKERNEL_REFER_LITE(kHSum)
USE_JITKERNEL_REFER_LITE(kHMax)
USE_JITKERNEL_REFER_LITE(kStrideASum)
USE_JITKERNEL_REFER_LITE(kSoftmax)
USE_JITKERNEL_REFER_LITE(kEmbSeqPool)
USE_JITKERNEL_REFER_LITE(kSgd)
USE_JITKERNEL_REFER_LITE(kVBroadcast)
......@@ -18,7 +18,7 @@
namespace refer = paddle::lite::jit::refer;
#define REGISTER_REFER_KERNEL(func) \
REGISTER_JITKERNEL_REFER( \
REGISTER_JITKERNEL_REFER_LITE( \
k##func, refer::func##Kernel<float>, refer::func##Kernel<double>)
REGISTER_REFER_KERNEL(VMul);
......
......@@ -77,16 +77,16 @@ class JitKernelRegistrar {
void Touch() {}
};
#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \
#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \
struct __test_global_namespace_##uniq_name##__ {}; \
static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
__test_global_namespace_##uniq_name##__>::value, \
msg)
// Refer always on CPUPlace
#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_refer_CPUPlace, \
#define REGISTER_JITKERNEL_REFER_LITE(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_refer_CPUPlace, \
"REGISTER_KERNEL_REFER must be called in global namespace"); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::ReferKernelPool, \
......@@ -94,84 +94,84 @@ class JitKernelRegistrar {
__VA_ARGS__> \
__jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \
::paddle::lite::jit::KernelType::kernel_type); \
int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \
int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \
__jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \
return 0; \
}
// kernel_type: should be in paddle::lite::jit::KernelType
// place_type: should be one of CPUPlace and GPUPlace in paddle::platform
#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \
"REGISTER_KERNEL_MORE must be called in global namespace"); \
extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
#define REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, place_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_##impl_type##_##place_type, \
"REGISTER_KERNEL_MORE_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \
UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
UNUSED = LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::KernelPool, \
::paddle::lite::fluid::place_type, \
__VA_ARGS__> \
__jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \
::paddle::lite::jit::KernelType::kernel_type); \
int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \
int LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \
__jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \
.Touch(); \
return 0; \
}
#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \
REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__)
#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \
REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__)
#define REGISTER_JITKERNEL_GEN(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_gen_##kernel_type##_CPUPlace_, \
"REGISTER_JITKERNEL_GEN must be called in global namespace"); \
extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \
TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::JitCodeCreatorPool, \
::paddle::lite::fluid::CPUPlace, \
__VA_ARGS__> \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \
::paddle::lite::jit::KernelType::kernel_type); \
int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \
return 0; \
REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace, __VA_ARGS__)
#define REGISTER_GPUKERNEL_MORE_LITE(kernel_type, impl_type, ...) \
REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, GPUPlace, __VA_ARGS__)
#define REGISTER_JITKERNEL_GEN_LITE(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \
"REGISTER_JITKERNEL_GEN_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \
LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::JitCodeCreatorPool, \
::paddle::lite::fluid::CPUPlace, \
__VA_ARGS__> \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \
::paddle::lite::jit::KernelType::kernel_type); \
int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \
return 0; \
}
#define USE_JITKERNEL_GEN(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_gen_##kernel_type##_CPUPlace_, \
"USE_JITKERNEL_GEN must be called in global namespace"); \
extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \
static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \
TouchJitKernelReg_gen_##kernel_type##_CPUPlace_()
#define USE_JITKERNEL_REFER(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_refer_CPUPlace_, \
"USE_JITKERNEL_REFER must be called in global namespace"); \
extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \
TouchJitKernelReg_##kernel_type##_refer_CPUPlace_()
#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \
"USE_JITKERNEL_MORE must be called in global namespace"); \
extern int \
TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \
static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \
UNUSED = \
TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_()
#define USE_JITKERNEL_MORE(kernel_type, impl_type) \
USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace)
#define USE_JITKERNEL_GEN_LITE(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \
"USE_JITKERNEL_GEN_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \
static int use_litejitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \
LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_()
#define USE_JITKERNEL_REFER_LITE(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_refer_CPUPlace_, \
"USE_JITKERNEL_REFER_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int use_litejitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \
LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_()
#define USE_KERNEL_MORE_LITE(kernel_type, impl_type, place_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_##impl_type##_##place_type##_, \
"USE_JITKERNEL_MORE_LITE must be called in global namespace"); \
extern int \
LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \
static int use_litejitkernel_##kernel_type##_##impl_type##_##place_type##_ \
UNUSED = \
LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_()
#define USE_JITKERNEL_MORE_LITE(kernel_type, impl_type) \
USE_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace)
} // namespace jit
} // namespace lite
......
......@@ -36,8 +36,11 @@ std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build(
}
xtcl::xNetwork network =
builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs));
auto target = xtcl::Target::Create(device_name_);
auto compiler = xtcl::network::xTensorCompiler(network, target);
auto target = xtcl::NullValue<xtcl::Target>();
if (!target_.empty()) {
target = xtcl::Target::Create(target_);
}
xtcl::network::xTensorCompiler compiler(network, target);
compiler.SetParams(*params); // Set the data of constant tensors
compiler.Build();
VLOG(3) << "[XPU] Build done";
......
......@@ -15,6 +15,7 @@
#pragma once
#include <xtcl/xtcl.h>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
......@@ -30,7 +31,18 @@ class Device {
static Device x;
return x;
}
Device() {}
Device() {
char* name = std::getenv("XPU_DEVICE_NAME");
if (name) {
name_ = std::string(name);
}
// XPU_DEVICE_TARGET for XPU model building, which supports 'llvm' and 'xpu
// -libs=xdnn'
char* target = std::getenv("XPU_DEVICE_TARGET");
if (target) {
target_ = std::string(target);
}
}
// Build the XPU graph to the XPU runtime, return the XPU runtime which can be
// used to run inference.
......@@ -39,10 +51,12 @@ class Device {
xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
std::vector<xtcl::xExpr*>* outputs);
const std::string name() const { return name_; }
const std::string target() const { return target_; }
private:
// Keep reserved fields
int device_id_{0};
std::string device_name_{"llvm"};
std::string name_{""};
std::string target_{""};
};
} // namespace xpu
......
......@@ -6,7 +6,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda
CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper)
FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -107,7 +107,8 @@ class TestCase {
void SetCommonTensor(const std::string& var_name,
const DDim& ddim,
const T* data,
const LoD& lod = {}) {
const LoD& lod = {},
bool is_persistable = false) {
auto* tensor = scope_->NewTensor(var_name);
tensor->Resize(ddim);
auto* d = tensor->mutable_data<T>();
......@@ -115,6 +116,8 @@ class TestCase {
// set lod
if (!lod.empty()) *tensor->mutable_lod() = lod;
// set persistable
tensor->set_persistable(is_persistable);
}
// Prepare for the operator.
......
......@@ -55,6 +55,7 @@ using NPUContext = Context<TargetType::kNPU>;
using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
template <>
class Context<TargetType::kHost> {
......@@ -82,6 +83,23 @@ class Context<TargetType::kNPU> {
};
#endif
#ifdef LITE_WITH_BM
template <>
class Context<TargetType::kBM> {
public:
Context() {}
explicit Context(const BMContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { Init(0); }
void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
void CopySharedTo(BMContext* ctx) {}
void* GetHandle() { return TargetWrapperBM::GetHandle(); }
std::string name() const { return "BMContext"; }
};
#endif
#ifdef LITE_WITH_XPU
template <>
class Context<TargetType::kXPU> {
......@@ -374,6 +392,12 @@ class ContextScheduler {
kernel_contexts_[TargetType::kFPGA].As<FPGAContext>().CopySharedTo(
&ctx->As<FPGAContext>());
break;
#endif
#ifdef LITE_WITH_BM
case TARGET(kBM):
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>());
break;
#endif
default:
#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
......@@ -412,6 +436,9 @@ class ContextScheduler {
#endif
#ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>();
#endif
#ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>();
#endif
}
......
......@@ -79,7 +79,7 @@ const int DEFAULT_L3_CACHE_SIZE = 0;
int get_cpu_num() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_num = 20;
int max_cpu_num = 128;
int cpu_num = 0;
for (int i = 0; i < max_cpu_num; ++i) {
char path[256];
......@@ -227,19 +227,24 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
#ifdef LITE_WITH_LINUX
std::string get_cpu_name() {
std::string cpu_name;
std::string cpu_name = "";
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
}
char line[1024];
bool first_model_name = true;
while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
}
if (strstr(line, "Hardware") != NULL) {
cpu_name = std::string(line);
cpu_name += std::string(line);
}
if (strstr(line, "model name") != NULL && first_model_name) {
cpu_name += std::string(line);
first_model_name = false;
}
}
#ifdef LITE_WITH_ANDROID
......@@ -816,6 +821,21 @@ bool DeviceInfo::SetCPUInfoByName() {
SetFP16Info(1, 1);
SetDotInfo(1, 1);
return true;
} else if (dev_name_.find("FT2000PLUS") != std::string::npos) {
core_num_ = 64;
core_ids_.resize(core_num_);
big_core_ids_.resize(core_num_);
cluster_ids_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
core_ids_[i] = i;
big_core_ids_[i] = i;
cluster_ids_[i] = 0;
}
little_core_ids_ = {};
SetCacheInfo(0, 1, 64 * 1024);
SetCacheInfo(1, 1, 32 * 1024 * 1024);
SetCacheInfo(2, 1, 128 * 1024 * 1024);
return true;
}
return false;
}
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package paddle.framework.proto;
// Any incompatible changes to ProgramDesc and its dependencies should
......
{
global:
*paddle*;
*touch_*;
*mir_pass_*;
local:
*;
};
......@@ -40,6 +40,11 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kFPGA)>::Malloc(size);
break;
#endif // LITE_WITH_OPENCL
#ifdef LITE_WITH_BM
case TargetType::kBM:
data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break;
#endif
default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
}
......@@ -69,6 +74,11 @@ void TargetFree(TargetType target, void* data) {
TargetWrapper<TARGET(kFPGA)>::Free(data);
break;
#endif // LITE_WITH_CUDA
#ifdef LITE_WITH_BM
case TargetType::kBM:
TargetWrapper<TARGET(kBM)>::Free(data);
break;
#endif
default:
LOG(FATAL) << "Unknown type";
}
......@@ -95,6 +105,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
dst, src, size, IoDirection::DtoD);
break;
#endif
#ifdef LITE_WITH_BM
case TargetType::kBM:
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
......@@ -25,6 +25,10 @@
#include "lite/backends/cuda/target_wrapper.h"
#endif // LITE_WITH_CUDA
#ifdef LITE_WITH_BM
#include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM
namespace paddle {
namespace lite {
......@@ -71,6 +75,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
case TARGET(kFPGA):
TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_BM
case TARGET(kBM):
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
break;
#endif
}
}
......
......@@ -79,6 +79,9 @@ cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) {
op_desc.SetAttr("act_type", act_type_);
if (act_type_ == "relu") {
op_desc.SetAttr("fuse_relu", true);
} else if (act_type_ == "relu6") {
float alpha = act_op_desc.GetAttr<float>("threshold");
op_desc.SetAttr("fuse_brelu_threshold", alpha);
} else if (act_type_ == "leaky_relu") {
float alpha = act_op_desc.GetAttr<float>("alpha");
op_desc.SetAttr("leaky_relu_alpha", alpha);
......
......@@ -26,9 +26,7 @@ namespace mir {
void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// initialze fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{
"conv2d", "depthwise_conv2d", "conv2d_transpose"};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
// start fuse using params
for (auto conv_has_bias : conv_has_bias_cases) {
for (auto conv_type : conv_type_cases) {
......@@ -46,4 +44,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kX86), TARGET(kXPU)});
.ExcludeTargets({TARGET(kX86), TARGET(kXPU), TARGET(kBM)});
......@@ -47,4 +47,4 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass,
paddle::lite::mir::ConvElementwiseFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)});
.ExcludeTargets({TARGET(kXPU), TARGET(kBM)});
......@@ -36,4 +36,6 @@ REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass,
paddle::lite::mir::ElementwiseAddActivationFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)})
.ExcludeTargets({TARGET(kBM)})
.ExcludeTargets({TARGET(kX86)})
.BindKernel("fusion_elementwise_add_activation");
......@@ -23,8 +23,13 @@ namespace lite {
namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::FcFuser fuser;
#ifdef LITE_WITH_X86
fusion::FcFuser fuser(true);
fuser(graph.get());
#endif
fusion::FcFuser fuser2(false);
fuser2(graph.get());
}
} // namespace mir
......@@ -34,4 +39,6 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)})
.ExcludeTargets({TARGET(kBM)})
.ExcludeTargets({TARGET(kCUDA)})
.BindKernel("fc");
......@@ -88,6 +88,7 @@ USE_LITE_OP(mul);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(elementwise_sub);
USE_LITE_OP(fc);
USE_LITE_OP(relu);
USE_LITE_OP(feed);
USE_LITE_OP(fetch);
USE_LITE_OP(io_copy);
......
......@@ -35,12 +35,23 @@ void FcFuser::BuildPattern() {
std::vector<PMNode*> mul_inputs{W, x};
std::vector<PMNode*> add_inputs{mul_out, b};
mul_inputs >> *mul >> *mul_out;
add_inputs >> *add >> *Out;
// Some op specialities.
mul_out->AsIntermediate();
mul->AsIntermediate();
add->AsIntermediate();
if (with_relu_) {
auto* add_out = VarNode("add_out");
auto* relu = OpNode("relu", "relu");
std::vector<PMNode*> relu_inputs{add_out};
add_inputs >> *add >> *add_out;
relu_inputs >> *relu >> *Out;
add_out->AsIntermediate();
relu->AsIntermediate();
} else {
add_inputs >> *add >> *Out;
}
}
void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
......@@ -71,6 +82,9 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) {
op_desc.SetAttr(
"in_num_col_dims",
matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
if (with_relu_) {
op_desc.SetAttr("activation_type", std::string{"relu"});
}
return op_desc;
}
......
......@@ -25,11 +25,13 @@ namespace fusion {
// Fuses a mul op followed by an elementwise_add — and, when constructed
// with with_relu = true, also a trailing relu — into a single fc op.
class FcFuser : public FuseBase {
 public:
  // with_relu: when true the matched pattern additionally consumes a relu
  // op, and the generated fc op carries activation_type = "relu".
  explicit FcFuser(bool with_relu) : with_relu_(with_relu) {}
  void BuildPattern() override;
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
 private:
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
  // Whether the matched pattern includes a trailing relu activation.
  bool with_relu_;
};
} // namespace fusion
......
......@@ -256,4 +256,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM)})
.ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)});
.ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)});
......@@ -4,7 +4,7 @@ lite_cc_library(subgraph_detector
lite_cc_library(subgraph_pass
SRCS subgraph_pass.cc
DEPS mir_pass types context ${mir_fusers} subgraph_detector)
if (WITH_TESTING)
if (WITH_TESTING AND NOT LITE_WITH_CUDA)
lite_cc_test(test_subgraph_detector
SRCS subgraph_detector_test.cc
DEPS subgraph_detector mir_passes gflags model_parser cxx_api
......
......@@ -27,7 +27,7 @@ namespace mir {
void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
......@@ -41,7 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
......@@ -53,6 +53,20 @@ void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
// Groups all statements whose op type has a registered BM bridge into
// subgraph ops, so they can be offloaded to the Bitmain accelerator.
void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // Collect the op types with a BM bridge: the registration header expands
  // USE_SUBGRAPH_BRIDGE once per supported op.
  std::unordered_set<std::string> supported_ops;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_ops.insert(#op_type);
#include "lite/kernels/bm/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
  // A node is fusable iff it is a statement whose op type has a BM bridge.
  auto is_supported = [&](Node* n) {
    if (!n->IsStmt()) {
      return false;
    }
    return supported_ops.count(n->AsStmt().op_type()) > 0;
  };
  SubgraphFuser fuser(graph.get(), is_supported, 1 /* min_subgraph_size */);
  fuser();
}
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -61,3 +75,5 @@ REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)});
......@@ -32,6 +32,11 @@ class XPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
// Program pass that fuses BM-supported ops into subgraph ops for
// execution on the Bitmain accelerator (see BMSubgraphPass::Apply).
class BMSubgraphPass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -92,7 +92,7 @@ void FillInputTensors(
#define FILL_TENSOR_WITH_TYPE(type) \
auto input_tensor_data = input_tensor->mutable_data<type>(); \
for (int j = 0; j < input_tensor_size; j++) { \
input_tensor_data[i] = static_cast<type>(value); \
input_tensor_data[j] = static_cast<type>(value); \
}
for (int i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i);
......
......@@ -100,6 +100,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kFPGA): {
CREATE_KERNEL(kFPGA);
} break;
case TARGET(kBM): {
CREATE_KERNEL(kBM);
} break;
default:
CHECK(false) << "not supported kernel target " << TargetToStr(target);
}
......@@ -186,6 +189,11 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kFPGA, kFloat, kNHWC);
INIT_FOR(kFPGA, kAny, kNHWC);
INIT_FOR(kFPGA, kAny, kAny);
INIT_FOR(kBM, kFloat, kNCHW);
INIT_FOR(kBM, kInt8, kNCHW);
INIT_FOR(kBM, kAny, kNCHW);
INIT_FOR(kBM, kAny, kAny);
#undef INIT_FOR
}
......
......@@ -230,6 +230,16 @@ class KernelRegistry final {
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kBM),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kBM),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kBM),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
......
......@@ -137,8 +137,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
std::string op_type = inst.op()->op_info()->Type();
if (op_type == "feed" || op_type == "fetch") continue;
if (inst.is_feed_fetch_op()) continue;
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
......
......@@ -90,7 +90,12 @@ struct Program {
struct Instruction {
Instruction(const std::shared_ptr<OpLite>& op,
std::unique_ptr<KernelBase>&& kernel)
: op_(op), kernel_(std::move(kernel)) {}
: op_(op), kernel_(std::move(kernel)) {
std::string op_type = op->Type();
if (op_type == "feed" || op_type == "fetch") {
is_feed_fetch_op_ = true;
}
}
// Run the instruction.
void Run();
......@@ -101,6 +106,8 @@ struct Instruction {
const KernelBase* kernel() const { return kernel_.get(); }
KernelBase* mutable_kernel() { return kernel_.get(); }
bool is_feed_fetch_op() const { return is_feed_fetch_op_; }
#ifdef LITE_WITH_PROFILE
void set_profiler(profile::Profiler* profiler) {
profiler_ = profiler;
......@@ -118,6 +125,7 @@ struct Instruction {
private:
std::shared_ptr<OpLite> op_;
std::unique_ptr<KernelBase> kernel_;
bool is_feed_fetch_op_{false};
bool first_epoch_{true};
bool has_run_{false};
......
......@@ -198,6 +198,22 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
R *mutable_data() {
auto type_id = typeid(T).hash_code();
if (type_id == typeid(bool).hash_code()) { // NOLINT
precision_ = PrecisionType::kBool;
} else if (type_id == typeid(float).hash_code()) { // NOLINT
precision_ = PrecisionType::kFloat;
} else if (type_id == typeid(int8_t).hash_code()) {
precision_ = PrecisionType::kInt8;
} else if (type_id == typeid(int16_t).hash_code()) {
precision_ = PrecisionType::kInt16;
} else if (type_id == typeid(int32_t).hash_code()) {
precision_ = PrecisionType::kInt32;
} else if (type_id == typeid(int64_t).hash_code()) {
precision_ = PrecisionType::kInt64;
} else {
precision_ = PrecisionType::kUnk;
}
memory_size_ = dims_.production() * sizeof(T);
buffer_->ResetLazy(target_, memory_size_);
return reinterpret_cast<R *>(static_cast<char *>(buffer_->data()) +
......@@ -222,10 +238,7 @@ class TensorLite {
template <typename T, typename R = T>
R *mutable_data(TargetType target) {
target_ = target;
memory_size_ = dims_.production() * sizeof(T);
buffer_->ResetLazy(target, memory_size());
return reinterpret_cast<R *>(static_cast<char *>(buffer_->data()) +
offset_);
return mutable_data<T, R>();
}
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
......
# C++ Demo
1. 使用`lite/tools/Dockerfile.mobile`生成docker镜像
2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。
3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz `
4. 执行以下命令准备模拟器环境
```shell
# armv8
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
sleep 1m
```
```shell
# armv7
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
sleep 1m
```
5. 准备模型、编译并运行完整api的demo
1. 环境准备
- 保证Android NDK在/opt目录下
- 一台armv7或armv8架构的安卓手机
    2. 编译并运行全量api的demo(注:当编译模式为tiny_publish时将不存在该demo)
```shell
cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
make
adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
adb push mobilenet_v1 /data/local/tmp/
adb push mobilenetv1_full_api /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobilenetv1_full_api
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
```
运行成功将在控制台输出预测结果的前10个类别的预测概率
6. 编译并运行轻量级api的demo
3. 编译并运行轻量级api的demo
```shell
cd ../mobile_light
make
adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
adb push mobilenetv1_light_api /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt"
```
运行成功将在控制台输出预测结果的前10个类别的预测概率
7. 编译并运行目标检测的demo
4. 编译并运行ssd目标检测的demo
```shell
cd ../mobile_detection
cd ../ssd_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz
tar zxvf mobilenetv1-ssd.tar.gz
make
adb -s emulator-5554 push mobile_detection /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg"
adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./
adb push ssd_detection /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push mobilenetv1-ssd /data/local/tmp
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/ssd_detection
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/ssd_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg"
adb pull /data/local/tmp/test_ssd_detection_result.jpg ./
```
运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg
运行成功将在ssd_detection目录下看到生成的目标检测结果图像: test_ssd_detection_result.jpg
8. 编译并运行物体分类的demo
5. 编译并运行yolov3目标检测的demo
```shell
cd ../yolov3_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz
tar zxvf mobilenetv1-yolov3.tar.gz
make
adb push yolov3_detection /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push mobilenetv1-yolov3 /data/local/tmp
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/yolov3_detection
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/yolov3_detection /data/local/tmp/mobilenetv1-yolov3 /data/local/tmp/test.jpg"
adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./
```
运行成功将在yolov3_detection目录下看到生成的目标检测结果图像: test_yolov3_detection_result.jpg
6. 编译并运行物体分类的demo
```shell
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb -s emulator-5554 push mobile_classify /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
运行成功将在控制台输出预测结果的前5个类别的预测概率
- 如若想看前10个类别的预测概率,在运行命令输入topk的值即可
eg:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- 如若想看其他模型的分类结果, 在运行命令输入model_dir 及其model的输入大小即可
eg:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
9. 编译含CV预处理库模型单测demo
```shell
cd ../test_cv
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb -s emulator-5554 push test_model_cv /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/test_model_cv
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
运行成功将在控制台输出预测结果的前10个类别的预测概率
......@@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mobile_detection: fetch_opencv mobile_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS)
ssd_detection: fetch_opencv ssd_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS)
mobile_detection.o: mobile_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc
ssd_detection.o: ssd_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
......@@ -57,5 +57,5 @@ fetch_opencv:
.PHONY: clean
clean:
rm -f mobile_detection.o
rm -f mobile_detection
rm -f ssd_detection.o
rm -f ssd_detection
......@@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mobile_detection: fetch_opencv mobile_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS)
ssd_detection: fetch_opencv ssd_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS)
mobile_detection.o: mobile_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc
ssd_detection.o: ssd_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
......@@ -57,5 +57,5 @@ fetch_opencv:
.PHONY: clean
clean:
rm -f mobile_detection.o
rm -f mobile_detection
rm -f ssd_detection.o
rm -f ssd_detection
ARM_ABI = arm7
LITE_WITH_CV = ON
export ARM_ABI
export LITE_WITH_CV
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
test_model_cv.o: test_model_cv.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
test_img_prepross: fetch_opencv test_img_prepross.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
test_img_prepross.o: test_img_prepross.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f test_model_cv.o
rm -f test_model_cv
rm -f test_img_prepross.o
rm -f test_img_prepross
ARM_ABI = arm8
LITE_WITH_CV = ON
export ARM_ABI
export LITE_WITH_CV
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
test_model_cv.o: test_model_cv.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
test_img_prepross: fetch_opencv test_img_prepross.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
test_img_prepross.o: test_img_prepross.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f test_model_cv.o
rm -f test_model_cv
rm -f test_img_prepross.o
rm -f test_img_prepross
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
yolov3_detection: fetch_opencv yolov3_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS)
yolov3_detection.o: yolov3_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f yolov3_detection.o
rm -f yolov3_detection
# Target ABI for the cross build; "arm8" selects the 64-bit toolchain
# (the library paths below use arm64-v8a). Exported for ../Makefile.def.
ARM_ABI = arm8
export ARM_ABI
# Shared compiler/linker settings common to all demo Makefiles.
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
# Prebuilt OpenCV package name; also the basename of the remote archive
# fetched by the fetch_opencv target.
OPENCV_VERSION=opencv4.1.0
# Root of the prebuilt OpenCV package for the arm64-v8a ABI. Factored into one
# variable because the original repeated this long path 14 times (DRY); with
# recursive (=) assignment the expanded values are byte-identical to before.
# Kept as a literal relative path so it matches what fetch_opencv unpacks.
OPENCV_BASE = ../../../third_party/${OPENCV_VERSION}/arm64-v8a
# Static OpenCV modules first, then their bundled third-party codec/runtime
# dependencies — ordering matters for static linking.
OPENCV_LIBS = ${OPENCV_BASE}/libs/libopencv_imgcodecs.a \
              ${OPENCV_BASE}/libs/libopencv_imgproc.a \
              ${OPENCV_BASE}/libs/libopencv_core.a \
              ${OPENCV_BASE}/3rdparty/libs/libtegra_hal.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibjpeg-turbo.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibwebp.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibpng.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibjasper.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibtiff.a \
              ${OPENCV_BASE}/3rdparty/libs/libIlmImf.a \
              ${OPENCV_BASE}/3rdparty/libs/libtbb.a \
              ${OPENCV_BASE}/3rdparty/libs/libcpufeatures.a
# Header search path for the same OpenCV package.
OPENCV_INCLUDE = -I${OPENCV_BASE}/include
# Full compiler include set: Makefile.def includes + OpenCV + Paddle-Lite C++ API.
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
# Link static OpenCV libs plus the Paddle-Lite light-API shared library.
# (A bundled-static alternative is shown, commented out, further below.)
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library:                           #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment out the CXX_LIBS line above that links `libpaddle_light_api_shared.so`
# 2. Uncomment the CXX_LIBS line below that links `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
# Link the demo binary; depends on fetch_opencv so the OpenCV libs exist first.
yolov3_detection: fetch_opencv yolov3_detection.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS)
# Compile the demo source into an object file.
# NOTE(review): SYSROOT_COMPLILE looks misspelled ("COMPILE"), but the variable
# is defined in ../Makefile.def — verify there before renaming it anywhere.
yolov3_detection.o: yolov3_detection.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc
# Download and unpack the prebuilt OpenCV archive into THIRD_PARTY_DIR.
# Each step is idempotent: it is skipped when its result already exists.
# Fix: use `mkdir -p` so creation succeeds even when parent directories are
# missing, and does not fail if the directory appears between test and mkdir.
fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir -p ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
# Remove all build outputs produced by this Makefile.
.PHONY: clean
clean:
	rm -f yolov3_detection.o yolov3_detection
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册