Commit c28971a9 authored by baolei.an

merge to newest version

Parent 662e4d7c
@@ -174,15 +174,26 @@ if(NOT WITH_DSO)
   endif(WIN32)
 endif(NOT WITH_DSO)
 
-get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY)
-function(import_static_library alias path)
+function(add_cuda_static_lib alias cuda_lib_paths file_name)
+  unset(ABS_PATH CACHE)
+  find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH)
   add_library(${alias} STATIC IMPORTED GLOBAL)
-  set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path})
+  set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH})
+  set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE)
+  if (NOT ABS_PATH)
+    message(FATAL_ERROR "Can not find CUDA static library: ${file_name}")
+  endif()
 endfunction()
-import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a)
-import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a)
-import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a)
-import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a)
+add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a)
+add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a)
+add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a)
+add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a)
+if(NOT ${CUDA_VERSION} LESS 10.1)
+  add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a)
+endif()
+set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES})
 
 # setting nvcc arch flags
 select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
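
Note: the CUDA_STATIC_MODULES global property written above is what downstream build code reads back when it needs to link the static CUDA stack (lite/api does exactly this later in the commit). A minimal consumption sketch, assuming a hypothetical target name my_cuda_target:

    get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
    # cuda_static_deps now holds cudnn_static plus every alias registered
    # through add_cuda_static_lib above
    target_link_libraries(my_cuda_target ${cuda_static_deps})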
......
@@ -26,13 +26,15 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
     ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
-    ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
+    /usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
+    /usr/lib/${TARGET_ARCH}-linux-gnu/
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
     /usr/lib
     ${CUDA_TOOLKIT_ROOT_DIR}
     ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
     )
 if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0))
......
 INCLUDE(ExternalProject)
+SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
@@ -16,9 +17,12 @@ if(WITH_AMD_GPU)
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
+        GIT_TAG
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip
+        DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${EIGEN_SOURCE_DIR}
+        DOWNLOAD_NAME "hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip"
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
         BUILD_COMMAND ""
@@ -29,12 +33,14 @@ else()
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror"
         # eigen on cuda9.1 missing header of math_funtions.hpp
         # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
-        GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
+        GIT_TAG
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
+        DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${EIGEN_SOURCE_DIR}
-        DOWNLOAD_NAME "eigen"
+        DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip"
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
         BUILD_COMMAND ""
......
@@ -20,6 +20,7 @@ endif()
 include(ExternalProject)
 
+SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak)
 set(XBYAK_PROJECT extern_xbyak)
 set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
 set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
@@ -38,8 +39,11 @@ ExternalProject_Add(
     ${XBYAK_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS ""
-    GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
     GIT_TAG "v5.661"  # Jul 26th
+    URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip
+    DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR}
+    DOWNLOAD_NAME "xbyak-5.66.zip"
+    DOWNLOAD_NO_PROGRESS 1
     PREFIX ${XBYAK_PREFIX_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
......
 INCLUDE(ExternalProject)
 
+SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash)
 set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
 set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
 set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
@@ -18,10 +19,12 @@ if(WIN32)
     ExternalProject_Add(
         extern_xxhash
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
         GIT_TAG "v0.6.5"
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip
+        DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR}
+        DOWNLOAD_NAME "xxHash-0.6.5.zip"
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${XXHASH_SOURCE_DIR}
-        DOWNLOAD_NAME "xxhash"
         UPDATE_COMMAND ""
         BUILD_IN_SOURCE 1
         PATCH_COMMAND
@@ -41,10 +44,12 @@ else()
     ExternalProject_Add(
         extern_xxhash
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
         GIT_TAG "v0.6.5"
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip
+        DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR}
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${XXHASH_SOURCE_DIR}
-        DOWNLOAD_NAME "xxhash"
+        DOWNLOAD_NAME "xxHash-0.6.5.zip"
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
         BUILD_IN_SOURCE 1
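
Note: eigen, xbyak, and xxhash all switch from GIT_REPOSITORY/GIT_TAG clones to pinned archives on the paddle-inference-dist mirror. The shared recipe, sketched with a hypothetical dependency foo (URL and paths are placeholders modeled on the three real entries):

    ExternalProject_Add(
        extern_foo
        ${EXTERNAL_PROJECT_LOG_ARGS}
        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Ffoo-1.0.zip
        DOWNLOAD_DIR ${CMAKE_SOURCE_DIR}/third-party/foo  # cache the archive in-tree
        DOWNLOAD_NAME "foo-1.0.zip"                       # stable on-disk archive name
        DOWNLOAD_NO_PROGRESS 1
        PREFIX ${THIRD_PARTY_PATH}/foo
        UPDATE_COMMAND ""
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
        INSTALL_COMMAND ""
    )

A fixed archive makes CI immune to upstream force-pushes and avoids a network git clone on every fresh build.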
......
@@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME)
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
+  target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
+  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
   if(nv_binary_DEPS)
     target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
     add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
@@ -507,7 +510,7 @@ function(nv_test TARGET_NAME)
   cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
   get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
-                        gflags glog ${os_dependency_modules})
+                        gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
   add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
   common_link(${TARGET_NAME})
   add_test(${TARGET_NAME} ${TARGET_NAME})
......
@@ -164,7 +164,9 @@ function(lite_cc_library TARGET)
 endfunction()
 
 function(lite_cc_binary TARGET)
-  set(options "")
+  if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    set(options " -g ")
+  endif()
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS
       LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
@@ -255,6 +257,7 @@ endfunction()
 set(arm_kernels CACHE INTERNAL "arm kernels")
 set(x86_kernels CACHE INTERNAL "x86 kernels")
+set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
......
@@ -5,6 +5,7 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}")
 message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}")
 message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
 message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
+message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
@@ -121,6 +122,9 @@ if (LITE_WITH_X86)
     add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
 endif()
 
+if(LITE_WITH_CUDA)
+    add_dependencies(publish_inference paddle_full_api_shared)
+endif(LITE_WITH_CUDA)
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     if (NOT LITE_ON_TINY_PUBLISH)
         # add cxx lib
@@ -161,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                 COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
                 COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
                 COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
-                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
+                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
                 )
         add_dependencies(tiny_publish_lib bundle_light_api)
         add_dependencies(publish_inference tiny_publish_lib)
@@ -177,6 +181,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                 )
         add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared)
         add_dependencies(publish_inference tiny_publish_cxx_lib)
+        add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
+                COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
         endif()
     endif()
 endif()
@@ -199,7 +205,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     endif()
 endif()
 
-if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND
+if ((ARM_TARGET_OS STREQUAL "android") AND
     ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
     if (NOT LITE_ON_TINY_PUBLISH)
         # copy
@@ -214,6 +220,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                 COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
                 COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
                 COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
+                COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
+                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
                 )
         add_dependencies(publish_inference_android_cxx_demos logging gflags)
         add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
@@ -225,6 +234,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                 COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
                 COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
                 COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
+                COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
                 )
         add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
     endif()
......
@@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH)
   set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
 endif()
 set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
-if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
+if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
     #full api dynamic library
     add_library(paddle_full_api_shared SHARED "")
     target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
@@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and
         add_dependencies(paddle_full_api_shared xxhash)
         target_link_libraries(paddle_full_api_shared xxhash)
     endif()
+    if(LITE_WITH_CUDA)
+        target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
+    endif(LITE_WITH_CUDA)
 
     #light api dynamic library
     lite_cc_library(paddle_light_api_shared MODULE
                     SRCS light_api_shared.cc
@@ -65,6 +67,7 @@ endif()
 message(STATUS "get ops ${ops}")
 message(STATUS "get X86 kernels ${x86_kernels}")
+message(STATUS "get CUDA kernels ${cuda_kernels}")
 message(STATUS "get Host kernels ${host_kernels}")
 message(STATUS "get ARM kernels ${arm_kernels}")
 message(STATUS "get NPU kernels ${npu_kernels}")
@@ -83,18 +86,17 @@ if (NOT LITE_ON_TINY_PUBLISH)
                   ARM_DEPS ${arm_kernels}
                   NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
                   XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
-                  CL_DEPS ${opencl_kenrels}
-                  FPGA_DEPS ${fpga_kenrels}
-                  BM_DEPS ${bm_kenrels})
+                  CL_DEPS ${opencl_kernels}
+                  FPGA_DEPS ${fpga_kernels}
+                  BM_DEPS ${bm_kernels})
 endif()
 
 # for light api
 set(light_api_deps
     scope target_wrapper_host model_parser program)
 if(LITE_WITH_CUDA)
+    get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
     set(light_api_deps ${light_api_deps} target_wrapper_cuda)
-    set(cuda_static_deps cudart_static cublas_static curand_static
-        cudnn_static culibos_static)
 endif()
 lite_cc_library(light_api SRCS light_api.cc
                 DEPS scope target_wrapper_host model_parser
@@ -104,9 +106,9 @@ lite_cc_library(light_api SRCS light_api.cc
                 ARM_DEPS ${arm_kernels}
                 NPU_DEPS ${npu_kernels}
                 XPU_DEPS ${xpu_kernels}
-                CL_DEPS ${opencl_kenrels}
-                FPGA_DEPS ${fpga_kenrels}
-                BM_DEPS ${bm_kenrels})
+                CL_DEPS ${opencl_kernels}
+                FPGA_DEPS ${fpga_kernels}
+                BM_DEPS ${bm_kernels})
 
 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -305,9 +307,10 @@ if(NOT IOS)
             NPU_DEPS ${npu_kernels}
            XPU_DEPS ${xpu_kernels}
            CL_DEPS ${opencl_kernels}
-            FPGA_DEPS ${fpga_kernels}
            BM_DEPS ${bm_kernels}
-            X86_DEPS ${x86_kernels})
+            FPGA_DEPS ${fpga_kernels}
+            X86_DEPS ${x86_kernels}
+            CUDA_DEPS ${cuda_kernels})
     lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
                    ${ops} ${host_kernels}
                    ARM_DEPS ${arm_kernels}
@@ -316,7 +319,9 @@ if(NOT IOS)
                    CL_DEPS ${opencl_kernels}
                    BM_DEPS ${bm_kernels}
                    FPGA_DEPS ${fpga_kernels}
-                   X86_DEPS ${x86_kernels})
+                   X86_DEPS ${x86_kernels}
+                   CUDA_DEPS ${cuda_kernels})
 endif()
 
 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
@@ -44,9 +44,10 @@ void OutputOptModel(const std::string& load_model_dir,
                     const std::vector<std::vector<int64_t>>& input_shapes) {
   lite_api::CxxConfig config;
   config.set_model_dir(load_model_dir);
-  std::vector<Place> vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)},
-                                     Place{TARGET(kX86), PRECISION(kFloat)},
-                                     Place{TARGET(kOpenCL), PRECISION(kFloat)}};
+  std::vector<Place> vaild_places = {
+      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kX86), PRECISION(kFloat)},
+  };
   if (FLAGS_is_quantized_model) {
     vaild_places.insert(vaild_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
......
@@ -24,13 +24,6 @@
 namespace paddle {
 namespace lite {
 
-static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
-    ".tailored_ops_source_list";
-static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
-static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
-    ".tailored_kernels_source_list";
-static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
-
 void Predictor::SaveModel(const std::string &dir,
                           lite_api::LiteModelType model_type,
                           bool record_info) {
@@ -140,21 +133,35 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 
 // get inputs names
 std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
+
 // get outputnames
 std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
+
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
+  std::vector<const cpp::OpDesc *> feeds;
+  std::vector<const cpp::OpDesc *> fetchs;
+#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
+  // The shape of input tensors must be determined before generating NPU and XPU
+  // program.
   auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
-  std::vector<cpp::OpDesc *> feeds;
-  std::vector<cpp::OpDesc *> fetchs;
   for (size_t i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
+#else
+  if (!program_) {
+    GenRuntimeProgram();
+  }
+  const auto &insts = program_->instructions();
+  for (size_t i = 0; i < program_->num_instructions(); i++) {
+    const auto &op = insts[i].op()->op_info();
+#endif
     if (op->Type() == "feed") {
       feeds.push_back(op);
     } else if (op->Type() == "fetch") {
       fetchs.push_back(op);
     }
   }
+
   input_names_.resize(feeds.size());
   output_names_.resize(fetchs.size());
   for (size_t i = 0; i < feeds.size(); i++) {
@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
 const cpp::ProgramDesc &Predictor::program_desc() const {
   return program_desc_;
 }
+
 const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
 
 void Predictor::Build(const lite_api::CxxConfig &config,
@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
                       const std::vector<Place> &valid_places,
                       const std::vector<std::string> &passes) {
   program_desc_ = desc;
+  // `inner_places` is used to optimize passes
   std::vector<Place> inner_places = valid_places;
   inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
   inner_places.emplace_back(
       TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
   Program program(desc, scope_, inner_places);
+  /// The first place in valid_places is
   core::KernelPickFactor factor;
   factor.ConsiderTarget();
   factor.ConsiderPrecision();
   factor.ConsiderDataLayout();
   optimizer_.Run(std::move(program), inner_places, factor, passes);
   exec_scope_ = optimizer_.exec_scope();
   PrepareFeedFetch();
@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
   auto *var = exec_scope_->FindVar(name);
   return &var->Get<lite::Tensor>();
 }
+
 // get input by name
 lite::Tensor *Predictor::GetInputByName(const std::string &name) {
   auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
@@ -29,6 +29,13 @@
 namespace paddle {
 namespace lite {
 
+static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
+    ".tailored_ops_source_list";
+static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
+static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
+    ".tailored_kernels_source_list";
+static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
+
 /*
  * Predictor for inference, input a model, it will optimize and execute it.
  */
......
@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
 #ifdef LITE_WITH_OPENCL
 TEST(MobileNetV1, test_opencl) {
   std::vector<Place> valid_places({
-      Place{TARGET(kOpenCL), PRECISION(kFloat)},
-      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
   });
   TestModel(valid_places);
......
@@ -20,6 +20,7 @@
 // model_optimize_tool's compiling period
 #include "all_kernel_faked.cc"  // NOLINT
 #include "kernel_src_map.h"     // NOLINT
+#include "lite/api/cxx_api.h"
 #include "lite/api/paddle_api.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
@@ -31,6 +32,18 @@ DEFINE_string(model_dir,
               "",
               "path of the model. This option will be ignored if model_file "
               "and param_file are exist");
+DEFINE_string(model_filename,
+              "",
+              "model topo filename of the model in models set. This option"
+              " will be used to specific tailoring");
+DEFINE_string(param_filename,
+              "",
+              "model param filename of the model in models set. This option"
+              " will be used to specific tailoring");
+DEFINE_string(model_set_dir,
+              "",
+              "path of the models set. This option will be used to specific"
+              " tailoring");
 DEFINE_string(model_file, "", "model file path of the combined-param model");
 DEFINE_string(param_file, "", "param file path of the combined-param model");
 DEFINE_string(
@@ -58,29 +71,23 @@ void DisplayKernels() {
   LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString();
 }
 
-void Main() {
-  if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) {
-    LOG(WARNING)
-        << "Load combined-param model. Option model_dir will be ignored";
-  }
-
-  if (FLAGS_display_kernels) {
-    DisplayKernels();
-    exit(0);
-  }
-
-  lite_api::CxxConfig config;
-  config.set_model_dir(FLAGS_model_dir);
-  config.set_model_file(FLAGS_model_file);
-  config.set_param_file(FLAGS_param_file);
-
+std::vector<Place> ParserValidPlaces() {
   std::vector<Place> valid_places;
-  auto target_reprs = lite::Split(FLAGS_valid_targets, " ");
+  auto target_reprs = lite::Split(FLAGS_valid_targets, ",");
   for (auto& target_repr : target_reprs) {
     if (target_repr == "arm") {
       valid_places.emplace_back(TARGET(kARM));
     } else if (target_repr == "opencl") {
-      valid_places.emplace_back(TARGET(kOpenCL));
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
     } else if (target_repr == "x86") {
       valid_places.emplace_back(TARGET(kX86));
     } else {
@@ -100,26 +107,130 @@ void Main() {
     valid_places.insert(valid_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
   }
+  return valid_places;
+}
+
+void RunOptimize(const std::string& model_dir,
+                 const std::string& model_file,
+                 const std::string& param_file,
+                 const std::string& optimize_out,
+                 const std::string& optimize_out_type,
+                 const std::vector<Place>& valid_places,
+                 bool record_tailoring_info) {
+  if (!model_file.empty() && !param_file.empty()) {
+    LOG(WARNING)
+        << "Load combined-param model. Option model_dir will be ignored";
+  }
+
+  lite_api::CxxConfig config;
+  config.set_model_dir(model_dir);
+  config.set_model_file(model_file);
+  config.set_param_file(param_file);
   config.set_valid_places(valid_places);
   auto predictor = lite_api::CreatePaddlePredictor(config);
 
   LiteModelType model_type;
-  if (FLAGS_optimize_out_type == "protobuf") {
+  if (optimize_out_type == "protobuf") {
     model_type = LiteModelType::kProtobuf;
-  } else if (FLAGS_optimize_out_type == "naive_buffer") {
+  } else if (optimize_out_type == "naive_buffer") {
     model_type = LiteModelType::kNaiveBuffer;
   } else {
-    LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type;
+    LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
   }
 
   OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
   predictor->SaveOptimizedModel(
-      FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info);
-  if (FLAGS_record_tailoring_info) {
+      optimize_out, model_type, record_tailoring_info);
+  if (record_tailoring_info) {
     LOG(INFO) << "Record the information of tailored model into :"
-              << FLAGS_optimize_out;
+              << optimize_out;
+  }
+}
+
+void CollectModelMetaInfo(const std::string& output_dir,
+                          const std::vector<std::string>& models,
+                          const std::string& filename) {
+  std::set<std::string> total;
+  for (const auto& name : models) {
+    std::string model_path =
+        lite::Join<std::string>({output_dir, name, filename}, "/");
+    auto lines = lite::ReadLines(model_path);
+    total.insert(lines.begin(), lines.end());
+  }
+  std::string output_path =
+      lite::Join<std::string>({output_dir, filename}, "/");
+  lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
+                   output_path);
+}
+
+void Main() {
+  if (FLAGS_display_kernels) {
+    DisplayKernels();
+    exit(0);
   }
+
+  auto valid_places = ParserValidPlaces();
+
+  if (FLAGS_model_set_dir == "") {
+    RunOptimize(FLAGS_model_dir,
+                FLAGS_model_file,
+                FLAGS_param_file,
+                FLAGS_optimize_out,
+                FLAGS_optimize_out_type,
+                valid_places,
+                FLAGS_record_tailoring_info);
+    return;
+  }
+
+  if (!FLAGS_record_tailoring_info) {
+    LOG(WARNING) << "--model_set_dir option only be used with "
+                    "--record_tailoring_info=true together";
+    return;
+  }
+
+  auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true);
+  if (model_dirs.size() == 0) {
+    LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model";
+  }
+  // Optimize models in FLAGS_model_set_dir
+  for (const auto& name : model_dirs) {
+    std::string input_model_dir =
+        lite::Join<std::string>({FLAGS_model_set_dir, name}, "/");
+    std::string output_model_dir =
+        lite::Join<std::string>({FLAGS_optimize_out, name}, "/");
+
+    std::string model_file = "";
+    std::string param_file = "";
+    if (FLAGS_model_filename != "" && FLAGS_param_filename != "") {
+      model_file =
+          lite::Join<std::string>({input_model_dir, FLAGS_model_filename}, "/");
+      param_file =
+          lite::Join<std::string>({input_model_dir, FLAGS_param_filename}, "/");
+    }
+
+    LOG(INFO) << "Start optimize model: " << input_model_dir;
+    RunOptimize(input_model_dir,
+                model_file,
+                param_file,
+                output_model_dir,
+                FLAGS_optimize_out_type,
+                valid_places,
+                FLAGS_record_tailoring_info);
+    LOG(INFO) << "Optimize done. ";
+  }
+
+  // Collect all models information
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME);
+  CollectModelMetaInfo(FLAGS_optimize_out,
                       model_dirs,
                       lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
 }
 
 }  // namespace lite_api
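
Note: the refactor splits Main() into ParserValidPlaces(), RunOptimize(), and CollectModelMetaInfo(), and adds a batch mode over a directory of models. A hedged usage sketch (directory and file names are hypothetical; note that --valid_targets entries are now comma-separated rather than space-separated):

    ./model_optimize_tool --model_set_dir=./models_set \
        --model_filename=__model__ --param_filename=__params__ \
        --optimize_out_type=naive_buffer --optimize_out=./optimized \
        --record_tailoring_info=true --valid_targets=arm,opencl

Each subdirectory of --model_set_dir is optimized into the matching subdirectory of --optimize_out, and the per-model .tailored_* lists are then unioned into set-wide lists by CollectModelMetaInfo().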
......
@@ -21,14 +21,14 @@
 #include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
 #include "lite/core/device_info.h"
-#include "lite/tests/utils/timer.h"
+#include "lite/core/profile/timer.h"
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE
 
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 
 DEFINE_string(input_shape,
               "1,3,224,224",
@@ -102,20 +102,20 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
   Timer ti;
   for (int j = 0; j < repeat; ++j) {
-    ti.start();
+    ti.Start();
     predictor->Run();
-    ti.end();
-    LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms";
+    float t = ti.Stop();
+    LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
   }
 
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir
             << ", power_mode: " << static_cast<int>(power_mode)
             << ", threads num " << thread_num << ", warmup: " << warmup_times
-            << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms()
+            << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
             << " ms"
-            << ", min time: " << ti.get_min_time() << " ms"
-            << ", max time: " << ti.get_max_time() << " ms.";
+            << ", min time: " << ti.LapTimes().Min() << " ms"
+            << ", max time: " << ti.LapTimes().Max() << " ms.";
 
   auto output = predictor->GetOutput(0);
   auto out = output->data<float>();
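
Note: the old start()/end()/get_average_ms() timer is replaced by lite::profile::Timer, which records one lap per Start()/Stop() pair and exposes aggregate statistics through LapTimes(). A minimal sketch of the new pattern under that assumption (the run_once callable is a placeholder, not part of the API):

    #include <functional>
    #include "lite/core/profile/timer.h"

    using paddle::lite::profile::Timer;

    // Times `run_once` for `repeat` iterations and returns the average latency.
    float TimeIt(const std::function<void()>& run_once, int repeat) {
      Timer ti;
      for (int j = 0; j < repeat; ++j) {
        ti.Start();
        run_once();
        float lap_ms = ti.Stop();  // Stop() returns this lap's duration in ms
        (void)lap_ms;
      }
      // LapTimes() aggregates all recorded laps; Min()/Max() are available too.
      return ti.LapTimes().Avg();
    }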
......
@@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T *src_data) {
   }
 }
 template <typename T>
-void Tensor::CopyToCpu(T *data) {
+void Tensor::CopyToCpu(T *data) const {
   const T *src_data = tensor(raw_tensor_)->data<T>();
   int64_t num = tensor(raw_tensor_)->numel();
   CHECK(num > 0) << "You should call Resize interface first";
@@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *);
 template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *);
 template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *);
 template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *);
+template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
 template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
 template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
 
-template void Tensor::CopyToCpu(int8_t *);
-template void Tensor::CopyToCpu(float *);
-template void Tensor::CopyToCpu(int *);
+template void Tensor::CopyToCpu(int8_t *) const;
+template void Tensor::CopyToCpu(float *) const;
+template void Tensor::CopyToCpu(int *) const;
 
 shape_t Tensor::shape() const {
   return ctensor(raw_tensor_)->dims().Vectorize();
......
@@ -49,7 +49,7 @@ struct LITE_API Tensor {
   void CopyFromCpu(const T* data);
 
   template <typename T>
-  void CopyToCpu(T* data);
+  void CopyToCpu(T* data) const;
   /// Shape of the tensor.
   shape_t shape() const;
   TargetType target() const;
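
Note: marking CopyToCpu() const matters because predictors hand results back as const Tensors. A minimal sketch of the call site this enables, assuming the usual lite_api::PaddlePredictor flow (everything beyond CopyToCpu/shape is that assumed API):

    #include <memory>
    #include <vector>
    #include "lite/api/paddle_api.h"

    void FetchOutput(paddle::lite_api::PaddlePredictor* predictor) {
      // GetOutput() returns a const Tensor; before this change, calling
      // CopyToCpu() on it would not compile.
      std::unique_ptr<const paddle::lite_api::Tensor> out = predictor->GetOutput(0);
      int64_t num = 1;
      for (int64_t d : out->shape()) num *= d;  // shape() is shape_t, a vector<int64_t>
      std::vector<float> host(num);
      out->CopyToCpu(host.data());  // legal now that CopyToCpu is const
    }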
......
@@ -55,8 +55,7 @@ const std::string& TargetToStr(TargetType target) {
                                                  "any",
                                                  "fpga",
                                                  "npu",
-                                                 "xpu",
-                                                 "bm"};
+                                                 "xpu"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -94,8 +93,7 @@ const std::string& TargetRepr(TargetType target) {
                                                  "kAny",
                                                  "kFPGA",
                                                  "kNPU",
-                                                 "kXPU",
-                                                 "kBM"};
+                                                 "kXPU"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -131,8 +129,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                        TARGET(kOpenCL),
                                        TARGET(kNPU),
                                        TARGET(kXPU),
-                                       TARGET(kFPGA),
-                                       TARGET(kBM)});
+                                       TARGET(kFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
   }
......
@@ -52,9 +52,8 @@ enum class TargetType : int {
   kFPGA = 7,
   kNPU = 8,
   kXPU = 9,
-  kBM = 10,
   kAny = 6,  // any target
-  NUM = 11,  // number of fields.
+  NUM = 10,  // number of fields.
 };
 enum class PrecisionType : int {
   kUnk = 0,
......
@@ -20,7 +20,12 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(generate_program_pass);
-USE_MIR_PASS(subgraph_program_pass);
+#ifdef LITE_WITH_NPU
+USE_MIR_PASS(generate_npu_program_pass);
+#endif
+#ifdef LITE_WITH_XPU
+USE_MIR_PASS(generate_xpu_program_pass);
+#endif
 
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
......
@@ -12,20 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <vector>
......
@@ -57,9 +57,10 @@ endif()
 if (NOT HAS_ARM_MATH_LIB_DIR)
   # TODO(xxx): seperate them and do not deps proto, eigen3
   cc_library(math_arm SRCS
              funcs.cc
              packed_sgemm.cc
+             packed_sgemm_c4.cc
              sgemm.cc
              gemm_prepacked_int8.cc
              gemm_s8.cc
@@ -67,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
              gemv_arm_int8.cc
              conv3x3s1_direct_fp32.cc
              conv3x3s2_direct_fp32.cc
-             conv3x3s1_depthwise_fp32.cc
-             conv3x3s2_depthwise_fp32.cc
+             conv3x3s1p01_depthwise_fp32.cc
+             conv3x3s2p01_depthwise_fp32.cc
+             conv3x3s1px_depthwise_fp32.cc
+             conv3x3s2px_depthwise_fp32.cc
             conv3x3s1_direct_int8.cc
             conv3x3s2_direct_int8.cc
             conv3x3s1_depthwise_int8.cc
@@ -76,16 +79,14 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
             conv5x5s1_depthwise_int8.cc
             conv5x5s1_depthwise_fp32.cc
             conv5x5s2_depthwise_fp32.cc
-             conv_depthwise_3x3p0.cc
-             conv_depthwise_3x3p1.cc
-             conv_depthwise_3x3s1.cc
-             conv_depthwise_3x3s2.cc
+             conv3x3_winograd_fp32_c4.cc
             conv_winograd_3x3.cc
             conv_impl.cc
             softmax.cc
             scale.cc
             pooling.cc
             elementwise.cc
+             layout.cc
             lrn.cc
             decode_bboxes.cc
             concat.cc
@@ -121,4 +122,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
              anchor_generator.cc
              DEPS ${lite_kernel_deps} context tensor)
 endif()
@@ -32,8 +32,10 @@ void col2im<float>(const float* data_col,
                    const int width,
                    const int kernel_h,
                    const int kernel_w,
-                   const int pad_h,
-                   const int pad_w,
+                   const int pad_h0,
+                   const int pad_h1,
+                   const int pad_w0,
+                   const int pad_w1,
                    const int stride_h,
                    const int stride_w,
                    const int dilation_h,
@@ -41,19 +43,22 @@ void col2im<float>(const float* data_col,
                    float* data_im) {
   memset(data_im, 0, height * width * channels * sizeof(float));
   const int output_h =
-      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+      (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) /
+          stride_h +
+      1;
   const int output_w =
-      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+      (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
+      1;
   const int channel_size = height * width;
   for (int channel = channels; channel--; data_im += channel_size) {
     for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
       for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
-        int input_row = -pad_h + kernel_row * dilation_h;
+        int input_row = -pad_h0 + kernel_row * dilation_h;
         for (int output_rows = output_h; output_rows; output_rows--) {
           if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
             data_col += output_w;
           } else {
-            int input_col = -pad_w + kernel_col * dilation_w;
+            int input_col = -pad_w0 + kernel_col * dilation_w;
             for (int output_col = output_w; output_col; output_col--) {
               if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                 data_im[input_row * width + input_col] += *data_col;
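
Note: the change generalizes the symmetric padding term 2·pad to independent leading/trailing pads, ordered {h0, h1, w0, w1}; that ordering is also why the direct-conv files below read pad_h from paddings[0] and pad_w from paddings[2]. Restating the output-size computation from the code above in formula form:

\[
H_{out} = \left\lfloor \frac{H + p_{h0} + p_{h1} - \big(d_h (k_h - 1) + 1\big)}{s_h} \right\rfloor + 1,
\qquad
W_{out} = \left\lfloor \frac{W + p_{w0} + p_{w1} - \big(d_w (k_w - 1) + 1\big)}{s_w} \right\rfloor + 1.
\]

For example, $H = 224$, $p_{h0} = p_{h1} = 1$, $k_h = 3$, $d_h = 1$, $s_h = 2$ gives $\lfloor 223/2 \rfloor + 1 = 112$. Only the leading pad shifts the input origin, which is why input_row starts at -pad_h0.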
......
@@ -26,8 +26,10 @@ void col2im(const Dtype* data_col,
             const int width,
             const int kernel_h,
             const int kernel_w,
-            const int pad_h,
-            const int pad_w,
+            const int pad_h0,
+            const int pad_h1,
+            const int pad_w0,
+            const int pad_w1,
             const int stride_h,
             const int stride_w,
             const int dilation_h,
......
This diff is collapsed.
@@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param,
   auto dim_in = param.x->dims();
   auto dim_out = param.output->dims();
   const int threads = ctx->threads();
+  auto paddings = *param.paddings;
   int llc_size = ctx->llc_size() / sizeof(float);
-  const int pad_w = param.paddings[1];
-  const int pad_h = param.paddings[0];
+  const int pad_w = paddings[2];
+  const int pad_h = paddings[0];
   int ow = dim_out[3];
   int oh = dim_out[2];
   int ic = dim_in[1];
@@ -74,9 +75,10 @@ void conv_3x3s1_direct_fp32(const float* i_data,
                             ARMContext* ctx) {
   const int threads = ctx->threads();
   int l2_size = ctx->llc_size() / sizeof(float);
-  const int pad_h = param.paddings[0];
-  const int pad_w = param.paddings[1];
+  auto paddings = *param.paddings;
+  const int pad_h = paddings[0];
+  const int pad_w = paddings[2];
   const int wout_round = ROUNDUP(ow, OUT_W_BLOCK);
   const int win_round = wout_round + 2;
   bool flag_relu = param.fuse_relu;
......
@@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din,
                             const operators::ConvParam& param,
                             Context<TARGET(kARM)>* ctx,
                             const float* scale) {
+  auto paddings = *param.paddings;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias;
-  int pad_h = param.paddings[0];
-  int pad_w = param.paddings[1];
+  int pad_h = paddings[0];
+  int pad_w = paddings[2];
   const int threads = ctx->threads();
   int llc_size = ctx->llc_size() / 4;
......
This diff is collapsed.
This diff is collapsed.
@@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param,
                                        ARMContext* ctx) {
   auto dim_in = param.x->dims();
   auto dim_out = param.output->dims();
+  auto paddings = *param.paddings;
   const int threads = ctx->threads();
   int llc_size = ctx->llc_size() / sizeof(float);
-  const int pad_w = param.paddings[1];
-  const int pad_h = param.paddings[0];
+  const int pad_w = paddings[2];
+  const int pad_h = paddings[0];
   int ow = dim_out[3];
   int oh = dim_out[2];
   int ic = dim_in[1];
@@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data,
   //! 3x3s2 convolution, implemented by direct algorithm
   //! prepack input to tmp buffer
   //! write output to tmp buffer
+  auto paddings = *param.paddings;
   const int threads = ctx->threads();
   int l2_size = ctx->llc_size() / sizeof(float);
-  const int pad_w = param.paddings[1];
-  const int pad_h = param.paddings[0];
+  const int pad_w = paddings[2];
+  const int pad_h = paddings[0];
   const int wout_round = ROUNDUP(ow, OUT_W_BLOCK);
   const int win_round = wout_round * 2 /*stride_w*/ + 1;
   bool flag_relu = param.fuse_relu;
......
@@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din,
   //! 3x3s2 int8 convolution, implemented by direct algorithm
   //! prepack input to tmp buffer
   //! write output to tmp buffer
+  auto paddings = *param.paddings;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias;
-  int pad_h = param.paddings[0];
-  int pad_w = param.paddings[1];
+  int pad_h = paddings[0];
+  int pad_w = paddings[1];
   const int threads = ctx->threads();
   int llc_size = ctx->llc_size() / 4;
@@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din,
   //! 3x3s2 int8 convolution, implemented by direct algorithm
   //! prepack input to tmp buffer
   //! write output to tmp buffer
+  auto paddings = *param.paddings;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias;
-  int pad_h = param.paddings[0];
-  int pad_w = param.paddings[1];
+  int pad_h = paddings[0];
+  int pad_w = paddings[1];
   const int threads = ctx->threads();
   //! set 1/4 l2 cache
   int llc_size = ctx->llc_size() / 4;
......
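The hunks above replace direct reads of param.paddings[i] with a local copy obtained via *param.paddings; the fp32 paths now take the horizontal pad from index 2 while the int8 hunks keep index 1. A minimal sketch of the assumed new four-element layout (the struct and helper below are hypothetical stand-ins, not Paddle-Lite types):

// Sketch only: assumes the new paddings vector is laid out as
// {top, bottom, left, right}, matching the fp32 hunks above.
#include <memory>
#include <vector>

struct ConvParamSketch {  // hypothetical stand-in for operators::ConvParam
  std::shared_ptr<std::vector<int>> paddings;  // {top, bottom, left, right}
};

inline void get_pads(const ConvParamSketch& param, int* pad_h, int* pad_w) {
  auto paddings = *param.paddings;  // copy the vector once, as the new hunks do
  *pad_h = paddings[0];             // top
  *pad_w = paddings[2];             // left
}

Dereferencing the shared vector once up front also keeps the repeated indirection out of the hot loops that follow.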
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto paddings = *param.paddings;
int threads = ctx->threads();
const int pad_h = paddings[0];
const int pad_w = paddings[2];
const int out_c_block = 4;
const int out_h_kernel = 1;
const int out_w_kernel = 4;
const int win_ext = ow * 2 + 1;
const int ow_round = ROUNDUP(ow, 4);
const int win_round = ROUNDUP(win_ext, 4);
const int hin_round = oh * 2 + 1;
const int prein_size = win_round * hin_round * out_c_block;
auto workspace_size =
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
/// get workspace
auto ptr_zero = ctx->workspace_data<float>();
memset(ptr_zero, 0, sizeof(float) * win_round);
float* ptr_write = ptr_zero + win_round;
int size_in_channel = win * ih;
int size_out_channel = ow * oh;
int ws = -pad_w;
int we = ws + win_round;
int hs = -pad_h;
int he = hs + hin_round;
int w_loop = ow_round / 4;
auto remain = w_loop * 4 - ow;  // lanes past ow in the last 4-wide block
bool flag_remain = remain > 0;
remain = 4 - remain;  // number of valid output lanes in that block
remain = remain > 0 ? remain : 0;
int row_len = win_round * out_c_block;
for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel;
#pragma omp parallel for num_threads(threads)
for (int c = 0; c < oc; c += out_c_block) {
#ifdef ARM_WITH_OMP
float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
#else
float* pre_din = ptr_write + ow_round;
#endif
/// prepack input channels [c, c + 4) into c4 layout, zero-padding
/// out-of-range rows and columns from ptr_zero
prepack_input_nxwc4_dw(
din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
const float* weight_c = weights + c * 9; // kernel_w * kernel_h
float* dout_c00 = dout_batch + c * size_out_channel;
float bias_local[4] = {0, 0, 0, 0};
if (flag_bias) {
bias_local[0] = bias[c];
bias_local[1] = bias[c + 1];
bias_local[2] = bias[c + 2];
bias_local[3] = bias[c + 3];
}
#ifdef __aarch64__
float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
#endif
for (int h = 0; h < oh; h += out_h_kernel) {
float* outc0 = dout_c00 + h * ow;
float* outc1 = outc0 + size_out_channel;
float* outc2 = outc1 + size_out_channel;
float* outc3 = outc2 + size_out_channel;
const float* inr0 = pre_din + h * 2 * row_len;
const float* inr1 = inr0 + row_len;
const float* inr2 = inr1 + row_len;
if (c + out_c_block > oc) {
// redirect the out-of-range output channels to the scratch buffer;
// the cases fall through on purpose
switch (c + out_c_block - oc) {
case 3:
outc1 = ptr_write;
case 2:
outc2 = ptr_write;
case 1:
outc3 = ptr_write;
default:
break;
}
}
auto c0 = outc0;
auto c1 = outc1;
auto c2 = outc2;
auto c3 = outc3;
float pre_out[16];
for (int w = 0; w < w_loop; ++w) {
bool flag_mask = (w == w_loop - 1) && flag_remain;
if (flag_mask) {  // partial last block: compute into pre_out, copy valid lanes back below
c0 = outc0;
c1 = outc1;
c2 = outc2;
c3 = outc3;
outc0 = pre_out;
outc1 = pre_out + 4;
outc2 = pre_out + 8;
outc3 = pre_out + 12;
}
// clang-format off
#ifdef __aarch64__
asm volatile(
"ldr q8, [%[bias]]\n" /* load bias */
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/
"and v19.16b, v8.16b, v8.16b\n"
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/
"and v20.16b, v8.16b, v8.16b\n"
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/
"and v21.16b, v8.16b, v8.16b\n"
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/
"and v22.16b, v8.16b, v8.16b\n"
"ldr q8, [%[inr0]]\n" /* load input r0*/
/* r0 mul w0-w2, get out */
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/
"ldr q8, [%[inr1]]\n" /* load input r1*/
/* r1, mul w3-w5, get out */
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/
"ldr q8, [%[inr2]]\n" /* load input r2*/
/* r2, mul w6-w8, get out r0, r1 */
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/
/* transpose */
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
/* relu */
"cbz %w[flag_relu], 0f\n" /* skip relu*/
"movi v0.4s, #0\n" /* for relu */
"fmax v19.4s, v19.4s, v0.4s\n"
"fmax v20.4s, v20.4s, v0.4s\n"
"fmax v21.4s, v21.4s, v0.4s\n"
"fmax v22.4s, v22.4s, v0.4s\n"
/* save result */
"0:\n"
"str q19, [%[outc0]], #16\n"
"str q20, [%[outc1]], #16\n"
"str q21, [%[outc2]], #16\n"
"str q22, [%[outc3]], #16\n"
:[inr0] "+r"(inr0), [inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2),
[w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5),
[w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8),
[bias] "r" (bias_local), [flag_relu]"r"(flag_relu)
: "cc", "memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8", "v19","v20","v21","v22"
);
#else
asm volatile(
/* fill with bias */
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */
/* load weights */
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/
"vand.i32 q12, q8, q8\n"
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/
"vand.i32 q13, q8, q8\n"
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/
"vand.i32 q14, q8, q8\n"
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/
"vand.i32 q15, q8, q8\n"
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/
/* mul r0 with w0, w1, w2 */
"vmla.f32 q12, q9, q0 @ w0 * inr0\n"
"vmla.f32 q13, q9, q2 @ w0 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */
"vmla.f32 q14, q9, q4 @ w0 * inr4\n"
"vmla.f32 q15, q9, q6 @ w0 * inr6\n"
"vmla.f32 q12, q10, q1 @ w1 * inr1\n"
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w1 * inr3\n"
"vmla.f32 q14, q10, q5 @ w1 * inr5\n"
"vmla.f32 q15, q10, q7 @ w1 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */
"vmla.f32 q12, q11, q2 @ w2 * inr2\n"
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w2 * inr4\n"
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w2 * inr6\n"
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w2 * inr8\n"
/* mul r1 with w3, w4, w5 */
"vmla.f32 q12, q9, q0 @ w3 * inr0\n"
"vmla.f32 q13, q9, q2 @ w3 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */
"vmla.f32 q14, q9, q4 @ w3 * inr4\n"
"vmla.f32 q15, q9, q6 @ w3 * inr6\n"
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/
"vmla.f32 q12, q10, q1 @ w4 * inr1\n"
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w4 * inr3\n"
"vmla.f32 q14, q10, q5 @ w4 * inr5\n"
"vmla.f32 q15, q10, q7 @ w4 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */
"vmla.f32 q12, q11, q2 @ w5 * inr2\n"
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w5 * inr4\n"
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w5 * inr6\n"
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w5 * inr8\n"
/* mul r2 with w6, w7, w8 */
"vmla.f32 q12, q9, q0 @ w6 * inr0\n"
"vmla.f32 q13, q9, q2 @ w6 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */
"vmla.f32 q14, q9, q4 @ w6 * inr4\n"
"vmla.f32 q15, q9, q6 @ w6 * inr6\n"
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/
"vmla.f32 q12, q10, q1 @ w7 * inr1\n"
"vmla.f32 q13, q10, q3 @ w7 * inr3\n"
"vmla.f32 q14, q10, q5 @ w7 * inr5\n"
"vmla.f32 q15, q10, q7 @ w7 * inr7\n"
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n"
"vmla.f32 q12, q11, q2 @ w8 * inr2\n"
"vmla.f32 q13, q11, q4 @ w8 * inr4\n"
"vmla.f32 q14, q11, q6 @ w8 * inr6\n"
"vmla.f32 q15, q11, q8 @ w8 * inr8\n"
/* transpose */
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
"cmp %[flag_relu], #0\n"
"beq 0f\n" /* skip relu*/
"vmov.u32 q0, #0\n"
"vmax.f32 q12, q12, q0\n"
"vmax.f32 q13, q13, q0\n"
"vmax.f32 q14, q14, q0\n"
"vmax.f32 q15, q15, q0\n"
"0:\n"
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
:[r0] "+r"(inr0), [r1] "+r"(inr1),
[r2] "+r"(inr2), [wc0] "+r" (weight_c),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[bias] "r" (bias_local),
[flag_relu]"r"(flag_relu)
:"cc", "memory",
"q0","q1","q2","q3","q4","q5","q6","q7",
"q8", "q9","q10","q11","q12","q13","q14","q15"
);
#endif  // __aarch64__
// clang-format on
if (flag_mask) {
for (int i = 0; i < remain; ++i) {
c0[i] = pre_out[i];
c1[i] = pre_out[i + 4];
c2[i] = pre_out[i + 8];
c3[i] = pre_out[i + 12];
}
}
}
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
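The workspace sizing at the top of conv_3x3s2_depthwise_fp32 is worth a quick worked check: each thread gets one pre-packed input tile of win_round * hin_round * 4 floats, plus a shared zero row and a shared write-back row. A standalone sketch with made-up sizes (round_up is a plain stand-in for the ROUNDUP macro, not the library's definition):

// Standalone check of the workspace arithmetic in conv_3x3s2_depthwise_fp32.
#include <cstdio>

static int round_up(int x, int m) { return ((x + m - 1) / m) * m; }

int main() {
  const int oh = 14, ow = 14, threads = 4;  // hypothetical output size
  const int out_c_block = 4;
  const int win_ext = ow * 2 + 1;           // input width a 3x3 stride-2 row needs
  const int ow_round = round_up(ow, 4);
  const int win_round = round_up(win_ext, 4);
  const int hin_round = oh * 2 + 1;
  const int prein_size = win_round * hin_round * out_c_block;
  const int workspace = threads * prein_size + win_round + ow_round;
  printf("floats needed: %d (%zu bytes)\n", workspace, workspace * sizeof(float));
  return 0;
}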
@@ -314,7 +314,23 @@ void fill_bias_int8(int* tensor,
                     const int* bias,
                     int channel,
                     int channel_size);
+// new winograd
+void weight_trans_c4(
+    float* dest, const float* src, int ic, int oc, void* workspace);
+void conv_compute_6x6_3x3(const float* input,
+                          float* output,
+                          int num,
+                          int chout,
+                          int hout,
+                          int wout,
+                          int chin,
+                          int hin,
+                          int win,
+                          const float* weight,
+                          const float* bias,
+                          const operators::ConvParam& param,
+                          ARMContext* ctx);
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
...
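The added declarations expose an F(6x6, 3x3) Winograd path: weight_trans_c4 transforms the 3x3 kernels once into 8x8 tile form (for F(6x6, 3x3), m + r - 1 = 8, so 64 floats per input/output channel pair), and conv_compute_6x6_3x3 consumes the transformed weights. A hedged call sketch; the header path, the transformed-weight bound, and the scratch size are assumptions, not guarantees from the source:

// Hedged usage sketch for the new Winograd declarations above. Consult
// weight_trans_c4's implementation for the real buffer requirements.
#include <vector>
#include "lite/backends/arm/math/conv_impl.h"

namespace paddle {
namespace lite {
namespace arm {
namespace math {

void run_winograd_f6x3(const float* input, float* output,
                       int num, int oc, int oh, int ow,
                       int ic, int ih, int iw,
                       const float* weights, const float* bias,
                       const operators::ConvParam& param, ARMContext* ctx) {
  auto round4 = [](int x) { return ((x + 3) / 4) * 4; };
  const int ic_r = round4(ic);  // channels rounded up to the c4 block
  const int oc_r = round4(oc);
  // 8x8 transformed tiles -> 64 floats per (ic, oc) pair (assumed bound).
  std::vector<float> wino_weight(64 * ic_r * oc_r);
  std::vector<float> scratch(64 * ic_r * oc_r);  // assumed transform scratch
  weight_trans_c4(wino_weight.data(), weights, ic, oc, scratch.data());
  conv_compute_6x6_3x3(input, output, num, oc, oh, ow, ic, ih, iw,
                       wino_weight.data(), bias, param, ctx);
}

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle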