Commit c28971a9 authored by baolei.an

merge to newest version

Parent 662e4d7c
@@ -174,15 +174,26 @@ if(NOT WITH_DSO)
   endif(WIN32)
 endif(NOT WITH_DSO)
-get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY)
-function(import_static_library alias path)
+function(add_cuda_static_lib alias cuda_lib_paths file_name)
+  unset(ABS_PATH CACHE)
+  find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH)
   add_library(${alias} STATIC IMPORTED GLOBAL)
-  set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path})
+  set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH})
+  set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE)
+  if (NOT ABS_PATH)
+    message(FATAL_ERROR "Can not find CUDA static library: ${file_name}")
+  endif()
 endfunction()
-import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a)
-import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a)
-import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a)
-import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a)
+add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a)
+add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a)
+add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a)
+add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a)
+if(NOT ${CUDA_VERSION} LESS 10.1)
+  add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a)
+endif()
+set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES})
 # setting nvcc arch flags
 select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
...
@@ -26,13 +26,15 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
   ${CUDNN_ROOT}/lib64
   ${CUDNN_ROOT}/lib
   ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
-  ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
+  /usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
+  /usr/lib/${TARGET_ARCH}-linux-gnu/
   $ENV{CUDNN_ROOT}
   $ENV{CUDNN_ROOT}/lib64
   $ENV{CUDNN_ROOT}/lib
   /usr/lib
   ${CUDA_TOOLKIT_ROOT_DIR}
   ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+  ${CUDA_TOOLKIT_ROOT_DIR}/lib64
   )
 if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0))
...
 INCLUDE(ExternalProject)
+SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
@@ -16,9 +17,12 @@ if(WITH_AMD_GPU)
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
-        GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e
+        GIT_TAG
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip
+        DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${EIGEN_SOURCE_DIR}
+        DOWNLOAD_NAME "hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip"
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
         BUILD_COMMAND ""
@@ -29,12 +33,14 @@ else()
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror"
         # eigen on cuda9.1 missing header of math_funtions.hpp
         # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
-        GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c
+        GIT_TAG
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
+        DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${EIGEN_SOURCE_DIR}
-        DOWNLOAD_NAME "eigen"
+        DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip"
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
         BUILD_COMMAND ""
...
@@ -20,6 +20,7 @@ endif()
 include(ExternalProject)
+SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak)
 set(XBYAK_PROJECT extern_xbyak)
 set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
 set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
@@ -38,8 +39,11 @@ ExternalProject_Add(
     ${XBYAK_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS ""
-    GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
     GIT_TAG "v5.661"  # Jul 26th
+    URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip
+    DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR}
+    DOWNLOAD_NAME "xbyak-5.66.zip"
+    DOWNLOAD_NO_PROGRESS 1
     PREFIX ${XBYAK_PREFIX_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
...
 INCLUDE(ExternalProject)
+SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash)
 set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
 set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
 set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
@@ -18,10 +19,12 @@ if(WIN32)
     ExternalProject_Add(
         extern_xxhash
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
         GIT_TAG "v0.6.5"
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip
+        DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR}
+        DOWNLOAD_NAME "xxHash-0.6.5.zip"
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${XXHASH_SOURCE_DIR}
-        DOWNLOAD_NAME "xxhash"
         UPDATE_COMMAND ""
         BUILD_IN_SOURCE 1
         PATCH_COMMAND
@@ -41,10 +44,12 @@ else()
     ExternalProject_Add(
         extern_xxhash
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY "https://github.com/Cyan4973/xxHash"
         GIT_TAG "v0.6.5"
+        URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip
+        DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR}
+        DOWNLOAD_NO_PROGRESS 1
         PREFIX ${XXHASH_SOURCE_DIR}
-        DOWNLOAD_NAME "xxhash"
+        DOWNLOAD_NAME "xxHash-0.6.5.zip"
         UPDATE_COMMAND ""
         CONFIGURE_COMMAND ""
         BUILD_IN_SOURCE 1
...
@@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
+    target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
     if(nv_binary_DEPS)
         target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
         add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
@@ -507,7 +510,7 @@ function(nv_test TARGET_NAME)
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
       gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
...
@@ -164,7 +164,9 @@ function(lite_cc_library TARGET)
 endfunction()

 function(lite_cc_binary TARGET)
-  set(options "")
+  if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    set(options " -g ")
+  endif()
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
@@ -255,6 +257,7 @@ endfunction()
 set(arm_kernels CACHE INTERNAL "arm kernels")
 set(x86_kernels CACHE INTERNAL "x86 kernels")
+set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
...
@@ -5,6 +5,7 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}")
 message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}")
 message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
 message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
+message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
@@ -121,6 +122,9 @@ if (LITE_WITH_X86)
     add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
 endif()
+if(LITE_WITH_CUDA)
+    add_dependencies(publish_inference paddle_full_api_shared)
+endif(LITE_WITH_CUDA)
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     if (NOT LITE_ON_TINY_PUBLISH)
         # add cxx lib
@@ -161,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                 COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
                 COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
                 COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
-                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
+                COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
                 )
         add_dependencies(tiny_publish_lib bundle_light_api)
         add_dependencies(publish_inference tiny_publish_lib)
@@ -177,6 +181,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
             )
         add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared)
         add_dependencies(publish_inference tiny_publish_cxx_lib)
+        add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
+            COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
     endif()
 endif()
 endif()
@@ -199,7 +205,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     endif()
 endif()
-if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND
+if ((ARM_TARGET_OS STREQUAL "android") AND
     ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
     if (NOT LITE_ON_TINY_PUBLISH)
         # copy
@@ -214,6 +220,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
             COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
+            COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
             )
         add_dependencies(publish_inference_android_cxx_demos logging gflags)
         add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
@@ -225,6 +234,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
+            COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
             )
         add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
     endif()
...
@@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH)
   set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
 endif()
 set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
-if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
+if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
     #full api dynamic library
     add_library(paddle_full_api_shared SHARED "")
     target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
@@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and
       add_dependencies(paddle_full_api_shared xxhash)
       target_link_libraries(paddle_full_api_shared xxhash)
     endif()
+    if(LITE_WITH_CUDA)
+        target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
+    endif(LITE_WITH_CUDA)
     #light api dynamic library
     lite_cc_library(paddle_light_api_shared MODULE
                     SRCS light_api_shared.cc
@@ -65,6 +67,7 @@ endif()
 message(STATUS "get ops ${ops}")
 message(STATUS "get X86 kernels ${x86_kernels}")
+message(STATUS "get CUDA kernels ${cuda_kernels}")
 message(STATUS "get Host kernels ${host_kernels}")
 message(STATUS "get ARM kernels ${arm_kernels}")
 message(STATUS "get NPU kernels ${npu_kernels}")
@@ -83,18 +86,17 @@ if (NOT LITE_ON_TINY_PUBLISH)
       ARM_DEPS ${arm_kernels}
       NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
       XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
-      CL_DEPS ${opencl_kenrels}
-      FPGA_DEPS ${fpga_kenrels}
-      BM_DEPS ${bm_kenrels})
+      CL_DEPS ${opencl_kernels}
+      FPGA_DEPS ${fpga_kernels}
+      BM_DEPS ${bm_kernels})
 endif()

 # for light api
 set(light_api_deps
     scope target_wrapper_host model_parser program)
 if(LITE_WITH_CUDA)
+    get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
     set(light_api_deps ${light_api_deps} target_wrapper_cuda)
-    set(cuda_static_deps cudart_static cublas_static curand_static
-        cudnn_static culibos_static)
 endif()
 lite_cc_library(light_api SRCS light_api.cc
         DEPS scope target_wrapper_host model_parser
@@ -104,9 +106,9 @@ lite_cc_library(light_api SRCS light_api.cc
         ARM_DEPS ${arm_kernels}
         NPU_DEPS ${npu_kernels}
         XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kenrels}
-        FPGA_DEPS ${fpga_kenrels}
-        BM_DEPS ${bm_kenrels})
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels}
+        BM_DEPS ${bm_kernels})
 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -305,9 +307,10 @@ if(NOT IOS)
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
      CL_DEPS ${opencl_kernels}
-      FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
-      X86_DEPS ${x86_kernels})
+      FPGA_DEPS ${fpga_kernels}
+      X86_DEPS ${x86_kernels}
+      CUDA_DEPS ${cuda_kernels})
   lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
     ${ops} ${host_kernels}
       ARM_DEPS ${arm_kernels}
@@ -316,7 +319,9 @@ if(NOT IOS)
       CL_DEPS ${opencl_kernels}
       BM_DEPS ${bm_kernels}
       FPGA_DEPS ${fpga_kernels}
-      X86_DEPS ${x86_kernels})
+      X86_DEPS ${x86_kernels}
+      CUDA_DEPS ${cuda_kernels})
 endif()
 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
...
@@ -44,9 +44,10 @@ void OutputOptModel(const std::string& load_model_dir,
                     const std::vector<std::vector<int64_t>>& input_shapes) {
   lite_api::CxxConfig config;
   config.set_model_dir(load_model_dir);
-  std::vector<Place> vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)},
-                                     Place{TARGET(kX86), PRECISION(kFloat)},
-                                     Place{TARGET(kOpenCL), PRECISION(kFloat)}};
+  std::vector<Place> vaild_places = {
+      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kX86), PRECISION(kFloat)},
+  };
   if (FLAGS_is_quantized_model) {
     vaild_places.insert(vaild_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
...
@@ -24,13 +24,6 @@
 namespace paddle {
 namespace lite {

-static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
-    ".tailored_ops_source_list";
-static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
-static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
-    ".tailored_kernels_source_list";
-static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
-
 void Predictor::SaveModel(const std::string &dir,
                           lite_api::LiteModelType model_type,
                           bool record_info) {
@@ -140,21 +133,35 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 // get inputs names
 std::vector<std::string> Predictor::GetInputNames() { return input_names_; }

 // get outputnames
 std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }

 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
+  std::vector<const cpp::OpDesc *> feeds;
+  std::vector<const cpp::OpDesc *> fetchs;
+#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
+  // The shape of input tensors must be determined before generating NPU and XPU
+  // program.
   auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
-  std::vector<cpp::OpDesc *> feeds;
-  std::vector<cpp::OpDesc *> fetchs;
   for (size_t i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
+#else
+  if (!program_) {
+    GenRuntimeProgram();
+  }
+  const auto &insts = program_->instructions();
+  for (size_t i = 0; i < program_->num_instructions(); i++) {
+    const auto &op = insts[i].op()->op_info();
+#endif
     if (op->Type() == "feed") {
       feeds.push_back(op);
     } else if (op->Type() == "fetch") {
       fetchs.push_back(op);
     }
   }
   input_names_.resize(feeds.size());
   output_names_.resize(fetchs.size());
   for (size_t i = 0; i < feeds.size(); i++) {
@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
 const cpp::ProgramDesc &Predictor::program_desc() const {
   return program_desc_;
 }
+
 const RuntimeProgram &Predictor::runtime_program() const { return *program_; }

 void Predictor::Build(const lite_api::CxxConfig &config,
@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
                       const std::vector<Place> &valid_places,
                       const std::vector<std::string> &passes) {
   program_desc_ = desc;
+  // `inner_places` is used to optimize passes
   std::vector<Place> inner_places = valid_places;
   inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
   inner_places.emplace_back(
       TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
   Program program(desc, scope_, inner_places);

-  /// The first place in valid_places is
   core::KernelPickFactor factor;
   factor.ConsiderTarget();
   factor.ConsiderPrecision();
   factor.ConsiderDataLayout();
+
   optimizer_.Run(std::move(program), inner_places, factor, passes);
   exec_scope_ = optimizer_.exec_scope();
   PrepareFeedFetch();
@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
   auto *var = exec_scope_->FindVar(name);
   return &var->Get<lite::Tensor>();
 }
+
 // get input by name
 lite::Tensor *Predictor::GetInputByName(const std::string &name) {
   auto element = std::find(input_names_.begin(), input_names_.end(), name);
...
@@ -29,6 +29,13 @@
 namespace paddle {
 namespace lite {

+static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
+    ".tailored_ops_source_list";
+static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
+static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
+    ".tailored_kernels_source_list";
+static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
+
 /*
  * Predictor for inference, input a model, it will optimize and execute it.
  */
...
@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
 #ifdef LITE_WITH_OPENCL
 TEST(MobileNetV1, test_opencl) {
   std::vector<Place> valid_places({
-      Place{TARGET(kOpenCL), PRECISION(kFloat)},
-      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
   });
   TestModel(valid_places);
...
@@ -20,6 +20,7 @@
 // model_optimize_tool's compiling period
 #include "all_kernel_faked.cc"  // NOLINT
 #include "kernel_src_map.h"     // NOLINT
+#include "lite/api/cxx_api.h"
 #include "lite/api/paddle_api.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
@@ -31,6 +32,18 @@ DEFINE_string(model_dir,
               "",
               "path of the model. This option will be ignored if model_file "
               "and param_file are exist");
+DEFINE_string(model_filename,
+              "",
+              "model topo filename of the model in models set. This option"
+              " will be used to specific tailoring");
+DEFINE_string(param_filename,
+              "",
+              "model param filename of the model in models set. This option"
+              " will be used to specific tailoring");
+DEFINE_string(model_set_dir,
+              "",
+              "path of the models set. This option will be used to specific"
+              " tailoring");
 DEFINE_string(model_file, "", "model file path of the combined-param model");
 DEFINE_string(param_file, "", "param file path of the combined-param model");
 DEFINE_string(
@@ -58,29 +71,23 @@ void DisplayKernels() {
   LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString();
 }

-void Main() {
-  if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) {
-    LOG(WARNING)
-        << "Load combined-param model. Option model_dir will be ignored";
-  }
-  if (FLAGS_display_kernels) {
-    DisplayKernels();
-    exit(0);
-  }
-
-  lite_api::CxxConfig config;
-  config.set_model_dir(FLAGS_model_dir);
-  config.set_model_file(FLAGS_model_file);
-  config.set_param_file(FLAGS_param_file);
-
+std::vector<Place> ParserValidPlaces() {
   std::vector<Place> valid_places;
-  auto target_reprs = lite::Split(FLAGS_valid_targets, " ");
+  auto target_reprs = lite::Split(FLAGS_valid_targets, ",");
   for (auto& target_repr : target_reprs) {
     if (target_repr == "arm") {
       valid_places.emplace_back(TARGET(kARM));
     } else if (target_repr == "opencl") {
-      valid_places.emplace_back(TARGET(kOpenCL));
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
+      valid_places.emplace_back(
+          Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
+      valid_places.emplace_back(
+          TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
     } else if (target_repr == "x86") {
       valid_places.emplace_back(TARGET(kX86));
     } else {
@@ -100,26 +107,130 @@ void Main() {
     valid_places.insert(valid_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
   }

+  return valid_places;
+}
+
+void RunOptimize(const std::string& model_dir,
+                 const std::string& model_file,
+                 const std::string& param_file,
+                 const std::string& optimize_out,
+                 const std::string& optimize_out_type,
+                 const std::vector<Place>& valid_places,
+                 bool record_tailoring_info) {
+  if (!model_file.empty() && !param_file.empty()) {
+    LOG(WARNING)
+        << "Load combined-param model. Option model_dir will be ignored";
+  }
+
+  lite_api::CxxConfig config;
+  config.set_model_dir(model_dir);
+  config.set_model_file(model_file);
+  config.set_param_file(param_file);
   config.set_valid_places(valid_places);

   auto predictor = lite_api::CreatePaddlePredictor(config);

   LiteModelType model_type;
-  if (FLAGS_optimize_out_type == "protobuf") {
+  if (optimize_out_type == "protobuf") {
     model_type = LiteModelType::kProtobuf;
-  } else if (FLAGS_optimize_out_type == "naive_buffer") {
+  } else if (optimize_out_type == "naive_buffer") {
     model_type = LiteModelType::kNaiveBuffer;
   } else {
-    LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type;
+    LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
   }

   OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
   predictor->SaveOptimizedModel(
-      FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info);
-  if (FLAGS_record_tailoring_info) {
+      optimize_out, model_type, record_tailoring_info);
+  if (record_tailoring_info) {
     LOG(INFO) << "Record the information of tailored model into :"
-              << FLAGS_optimize_out;
+              << optimize_out;
+  }
+}
+
+void CollectModelMetaInfo(const std::string& output_dir,
+                          const std::vector<std::string>& models,
+                          const std::string& filename) {
+  std::set<std::string> total;
+  for (const auto& name : models) {
+    std::string model_path =
+        lite::Join<std::string>({output_dir, name, filename}, "/");
+    auto lines = lite::ReadLines(model_path);
+    total.insert(lines.begin(), lines.end());
+  }
+  std::string output_path =
+      lite::Join<std::string>({output_dir, filename}, "/");
+  lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
+                   output_path);
+}
+
+void Main() {
+  if (FLAGS_display_kernels) {
+    DisplayKernels();
+    exit(0);
   }
+
+  auto valid_places = ParserValidPlaces();
+
+  if (FLAGS_model_set_dir == "") {
+    RunOptimize(FLAGS_model_dir,
+                FLAGS_model_file,
+                FLAGS_param_file,
+                FLAGS_optimize_out,
+                FLAGS_optimize_out_type,
+                valid_places,
+                FLAGS_record_tailoring_info);
+    return;
+  }
+
+  if (!FLAGS_record_tailoring_info) {
+    LOG(WARNING) << "--model_set_dir option only be used with "
+                    "--record_tailoring_info=true together";
+    return;
+  }
+
+  auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true);
+  if (model_dirs.size() == 0) {
+    LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model";
+  }
+  // Optimize models in FLAGS_model_set_dir
+  for (const auto& name : model_dirs) {
+    std::string input_model_dir =
+        lite::Join<std::string>({FLAGS_model_set_dir, name}, "/");
+    std::string output_model_dir =
+        lite::Join<std::string>({FLAGS_optimize_out, name}, "/");
+
+    std::string model_file = "";
+    std::string param_file = "";
+
+    if (FLAGS_model_filename != "" && FLAGS_param_filename != "") {
+      model_file =
+          lite::Join<std::string>({input_model_dir, FLAGS_model_filename}, "/");
+      param_file =
+          lite::Join<std::string>({input_model_dir, FLAGS_param_filename}, "/");
+    }
+
+    LOG(INFO) << "Start optimize model: " << input_model_dir;
+    RunOptimize(input_model_dir,
+                model_file,
+                param_file,
+                output_model_dir,
+                FLAGS_optimize_out_type,
+                valid_places,
+                FLAGS_record_tailoring_info);
+    LOG(INFO) << "Optimize done. ";
+  }
+
+  // Collect all models information
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME);
+  CollectModelMetaInfo(FLAGS_optimize_out,
+                       model_dirs,
+                       lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
+  CollectModelMetaInfo(
+      FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
 }

 }  // namespace lite_api
...
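Editor's note: the new model-set flow above ends by taking the union of the per-model tailoring lists so one trimmed library can serve every model in the set. A minimal sketch of that merge semantics, using hypothetical file paths and plain standard-library I/O in place of lite::ReadLines/lite::WriteLines:

    #include <fstream>
    #include <set>
    #include <string>
    #include <vector>

    // Stand-in for lite::ReadLines: read a text file into a vector of lines.
    static std::vector<std::string> ReadLines(const std::string& path) {
      std::ifstream in(path);
      std::vector<std::string> lines;
      for (std::string line; std::getline(in, line);) lines.push_back(line);
      return lines;
    }

    int main() {
      // std::set removes duplicates and keeps the result sorted, which is
      // exactly what the merged .tailored_ops_list needs. The two model
      // directories here are hypothetical.
      std::set<std::string> total;
      for (const std::string path :
           {"model_a/.tailored_ops_list", "model_b/.tailored_ops_list"}) {
        auto lines = ReadLines(path);
        total.insert(lines.begin(), lines.end());
      }
      std::ofstream out("merged/.tailored_ops_list");
      for (const auto& op : total) out << op << "\n";
    }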
@@ -21,14 +21,14 @@
 #include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
 #include "lite/core/device_info.h"
-#include "lite/tests/utils/timer.h"
+#include "lite/core/profile/timer.h"
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE

-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DEFINE_string(input_shape,
               "1,3,224,224",
@@ -102,20 +102,20 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
   Timer ti;
   for (int j = 0; j < repeat; ++j) {
-    ti.start();
+    ti.Start();
     predictor->Run();
-    ti.end();
-    LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms";
+    float t = ti.Stop();
+    LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
   }

   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir
             << ", power_mode: " << static_cast<int>(power_mode)
             << ", threads num " << thread_num << ", warmup: " << warmup_times
-            << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms()
+            << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
             << " ms"
-            << ", min time: " << ti.get_min_time() << " ms"
-            << ", max time: " << ti.get_max_time() << " ms.";
+            << ", min time: " << ti.LapTimes().Min() << " ms"
+            << ", max time: " << ti.LapTimes().Max() << " ms.";

   auto output = predictor->GetOutput(0);
   auto out = output->data<float>();
...
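Editor's note: the benchmark now depends on the profile::Timer interface (Start(), Stop() returning the lap in milliseconds, LapTimes() exposing Avg/Min/Max). A self-contained stand-in that mimics that interface for readers without the Paddle-Lite tree; the real implementation lives in lite/core/profile/timer.h and may differ in detail:

    #include <algorithm>
    #include <chrono>
    #include <iostream>
    #include <numeric>
    #include <vector>

    class Timer {
     public:
      void Start() { t0_ = std::chrono::steady_clock::now(); }
      // Returns the lap just measured, in milliseconds, and records it.
      float Stop() {
        auto t1 = std::chrono::steady_clock::now();
        float ms = std::chrono::duration<float, std::milli>(t1 - t0_).count();
        laps_.push_back(ms);
        return ms;
      }
      struct Laps {
        std::vector<float> v;
        float Avg() const {
          return v.empty() ? 0.f
                           : std::accumulate(v.begin(), v.end(), 0.f) / v.size();
        }
        float Min() const {
          return v.empty() ? 0.f : *std::min_element(v.begin(), v.end());
        }
        float Max() const {
          return v.empty() ? 0.f : *std::max_element(v.begin(), v.end());
        }
      };
      Laps LapTimes() const { return Laps{laps_}; }

     private:
      std::chrono::steady_clock::time_point t0_;
      std::vector<float> laps_;
    };

    int main() {
      Timer ti;
      for (int j = 0; j < 3; ++j) {
        ti.Start();
        // ... workload under test ...
        float t = ti.Stop();
        std::cout << "iter: " << j << ", time: " << t << " ms\n";
      }
      std::cout << "avg: " << ti.LapTimes().Avg() << " ms\n";
    }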
@@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T *src_data) {
   }
 }
 template <typename T>
-void Tensor::CopyToCpu(T *data) {
+void Tensor::CopyToCpu(T *data) const {
   const T *src_data = tensor(raw_tensor_)->data<T>();
   int64_t num = tensor(raw_tensor_)->numel();
   CHECK(num > 0) << "You should call Resize interface first";
@@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *);
 template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *);
 template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *);
 template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *);
+template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
 template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
 template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);

-template void Tensor::CopyToCpu(int8_t *);
-template void Tensor::CopyToCpu(float *);
-template void Tensor::CopyToCpu(int *);
+template void Tensor::CopyToCpu(int8_t *) const;
+template void Tensor::CopyToCpu(float *) const;
+template void Tensor::CopyToCpu(int *) const;

 shape_t Tensor::shape() const {
   return ctensor(raw_tensor_)->dims().Vectorize();
...
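Editor's note: the const qualifier matters because outputs are handed out as pointers to const tensors, so reading a result back to the host must work through a const object. A hedged usage sketch, assuming the public paddle_api.h interface (the predictor type and GetOutput signature are taken from the surrounding code, not verified here):

    #include <functional>
    #include <memory>
    #include <numeric>
    #include <vector>
    #include "lite/api/paddle_api.h"

    // Fetch output #0 of a finished prediction into host memory. GetOutput()
    // returns a pointer to a *const* Tensor, which is exactly why the patched
    // CopyToCpu(T*) must be a const member function to be callable here.
    std::vector<float> FetchOutput(
        const std::shared_ptr<paddle::lite_api::PaddlePredictor>& predictor) {
      auto output = predictor->GetOutput(0);
      auto shape = output->shape();
      int64_t num = std::accumulate(
          shape.begin(), shape.end(), int64_t{1}, std::multiplies<int64_t>());
      std::vector<float> host(num);
      output->CopyToCpu(host.data());
      return host;
    }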
@@ -49,7 +49,7 @@ struct LITE_API Tensor {
   void CopyFromCpu(const T* data);

   template <typename T>
-  void CopyToCpu(T* data);
+  void CopyToCpu(T* data) const;
   /// Shape of the tensor.
   shape_t shape() const;
   TargetType target() const;
...
@@ -55,8 +55,7 @@ const std::string& TargetToStr(TargetType target) {
                                                  "any",
                                                  "fpga",
                                                  "npu",
-                                                 "xpu",
-                                                 "bm"};
+                                                 "xpu"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -94,8 +93,7 @@ const std::string& TargetRepr(TargetType target) {
                                                  "kAny",
                                                  "kFPGA",
                                                  "kNPU",
-                                                 "kXPU",
-                                                 "kBM"};
+                                                 "kXPU"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -131,8 +129,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                            TARGET(kOpenCL),
                                            TARGET(kNPU),
                                            TARGET(kXPU),
-                                           TARGET(kFPGA),
-                                           TARGET(kBM)});
+                                           TARGET(kFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
   }
...
@@ -52,9 +52,8 @@ enum class TargetType : int {
   kFPGA = 7,
   kNPU = 8,
   kXPU = 9,
-  kBM = 10,
   kAny = 6,  // any target
-  NUM = 11,  // number of fields.
+  NUM = 10,  // number of fields.
 };
 enum class PrecisionType : int {
   kUnk = 0,
...
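Editor's note: dropping kBM touches the enum and every parallel string table at once, and nothing checks that they stay in sync. A hypothetical compile-time guard, not part of this patch, illustrating the invariant; the leading enum values and table entries are assumptions for illustration, since the diff only shows the tails:

    // Mirrors the patched enum: kAny reuses slot 6, so NUM counts 10 fields.
    enum class TargetType : int {
      kUnk = 0, kHost = 1, kX86 = 2, kCUDA = 3, kARM = 4, kOpenCL = 5,
      kAny = 6, kFPGA = 7, kNPU = 8, kXPU = 9,
      NUM = 10,  // number of fields.
    };

    constexpr const char* target2string[] = {"unk",    "host", "x86",  "cuda",
                                             "arm",    "opencl", "any", "fpga",
                                             "npu",    "xpu"};
    static_assert(sizeof(target2string) / sizeof(target2string[0]) ==
                      static_cast<int>(TargetType::NUM),
                  "target string table must cover every TargetType");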
@@ -20,7 +20,12 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(generate_program_pass);
-USE_MIR_PASS(subgraph_program_pass);
+#ifdef LITE_WITH_NPU
+USE_MIR_PASS(generate_npu_program_pass);
+#endif
+#ifdef LITE_WITH_XPU
+USE_MIR_PASS(generate_xpu_program_pass);
+#endif
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
...
@@ -12,20 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <vector>
...
@@ -60,6 +60,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
   cc_library(math_arm SRCS
     funcs.cc
     packed_sgemm.cc
+    packed_sgemm_c4.cc
    sgemm.cc
    gemm_prepacked_int8.cc
    gemm_s8.cc
@@ -67,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
    gemv_arm_int8.cc
    conv3x3s1_direct_fp32.cc
    conv3x3s2_direct_fp32.cc
-    conv3x3s1_depthwise_fp32.cc
-    conv3x3s2_depthwise_fp32.cc
+    conv3x3s1p01_depthwise_fp32.cc
+    conv3x3s2p01_depthwise_fp32.cc
+    conv3x3s1px_depthwise_fp32.cc
+    conv3x3s2px_depthwise_fp32.cc
    conv3x3s1_direct_int8.cc
    conv3x3s2_direct_int8.cc
    conv3x3s1_depthwise_int8.cc
@@ -76,16 +79,14 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
    conv5x5s1_depthwise_int8.cc
    conv5x5s1_depthwise_fp32.cc
    conv5x5s2_depthwise_fp32.cc
-    conv_depthwise_3x3p0.cc
-    conv_depthwise_3x3p1.cc
-    conv_depthwise_3x3s1.cc
-    conv_depthwise_3x3s2.cc
+    conv3x3_winograd_fp32_c4.cc
    conv_winograd_3x3.cc
    conv_impl.cc
    softmax.cc
    scale.cc
    pooling.cc
    elementwise.cc
+    layout.cc
    lrn.cc
    decode_bboxes.cc
    concat.cc
@@ -121,4 +122,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
    anchor_generator.cc
    DEPS ${lite_kernel_deps} context tensor)
 endif()
...
@@ -32,8 +32,10 @@ void col2im<float>(const float* data_col,
                    const int width,
                    const int kernel_h,
                    const int kernel_w,
-                   const int pad_h,
-                   const int pad_w,
+                   const int pad_h0,
+                   const int pad_h1,
+                   const int pad_w0,
+                   const int pad_w1,
                    const int stride_h,
                    const int stride_w,
                    const int dilation_h,
@@ -41,19 +43,22 @@ void col2im<float>(const float* data_col,
                    float* data_im) {
   memset(data_im, 0, height * width * channels * sizeof(float));
   const int output_h =
-      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+      (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) /
+          stride_h +
+      1;
   const int output_w =
-      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+      (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
+      1;
   const int channel_size = height * width;
   for (int channel = channels; channel--; data_im += channel_size) {
     for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
       for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
-        int input_row = -pad_h + kernel_row * dilation_h;
+        int input_row = -pad_h0 + kernel_row * dilation_h;
         for (int output_rows = output_h; output_rows; output_rows--) {
           if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
             data_col += output_w;
           } else {
-            int input_col = -pad_w + kernel_col * dilation_w;
+            int input_col = -pad_w0 + kernel_col * dilation_w;
             for (int output_col = output_w; output_col; output_col--) {
               if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                 data_im[input_row * width + input_col] += *data_col;
...
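Editor's note: the change above generalizes symmetric padding (2 * pad) to independent pads per side; the output extent follows the usual dilated-convolution formula with the two pads summed. A small self-checking sketch of that computation:

    // Output extent of a dilated convolution along one spatial axis with
    // asymmetric padding, as used by the patched col2im: pad0/pad1 are the
    // two sides of the axis (top/bottom or left/right).
    constexpr int ConvOutSize(int in, int pad0, int pad1, int dilation,
                              int kernel, int stride) {
      return (in + pad0 + pad1 - (dilation * (kernel - 1) + 1)) / stride + 1;
    }

    // Symmetric case reproduces the old 2 * pad behaviour: 224 -> 224.
    static_assert(ConvOutSize(224, 1, 1, 1, 3, 1) == 224, "same padding");
    // Asymmetric case the old signature could not express: pad one side only.
    static_assert(ConvOutSize(224, 1, 0, 1, 3, 1) == 223, "one-sided padding");

    int main() { return 0; }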
@@ -26,8 +26,10 @@ void col2im(const Dtype* data_col,
             const int width,
             const int kernel_h,
             const int kernel_w,
-            const int pad_h,
-            const int pad_w,
+            const int pad_h0,
+            const int pad_h1,
+            const int pad_w0,
+            const int pad_w1,
             const int stride_h,
             const int stride_w,
             const int dilation_h,
...
This diff is collapsed.
@@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param,
   auto dim_in = param.x->dims();
   auto dim_out = param.output->dims();
   const int threads = ctx->threads();
+  auto paddings = *param.paddings;
   int llc_size = ctx->llc_size() / sizeof(float);
-  const int pad_w = param.paddings[1];
-  const int pad_h = param.paddings[0];
+  const int pad_w = paddings[2];
+  const int pad_h = paddings[0];
   int ow = dim_out[3];
   int oh = dim_out[2];
   int ic = dim_in[1];
@@ -74,9 +75,10 @@ void conv_3x3s1_direct_fp32(const float* i_data,
                             ARMContext* ctx) {
   const int threads = ctx->threads();
   int l2_size = ctx->llc_size() / sizeof(float);
-  const int pad_h = param.paddings[0];
-  const int pad_w = param.paddings[1];
+  auto paddings = *param.paddings;
+  const int pad_h = paddings[0];
+  const int pad_w = paddings[2];
   const int wout_round = ROUNDUP(ow, OUT_W_BLOCK);
   const int win_round = wout_round + 2;
   bool flag_relu = param.fuse_relu;
...
@@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din,
                             const operators::ConvParam& param,
                             Context<TARGET(kARM)>* ctx,
                             const float* scale) {
+  auto paddings = *param.paddings;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias;
-  int pad_h = param.paddings[0];
-  int pad_w = param.paddings[1];
+  int pad_h = paddings[0];
+  int pad_w = paddings[2];
   const int threads = ctx->threads();
   int llc_size = ctx->llc_size() / 4;
...
This diff is collapsed.
This diff is collapsed.
...@@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param, ...@@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto dim_in = param.x->dims(); auto dim_in = param.x->dims();
auto dim_out = param.output->dims(); auto dim_out = param.output->dims();
auto paddings = *param.paddings;
const int threads = ctx->threads(); const int threads = ctx->threads();
int llc_size = ctx->llc_size() / sizeof(float); int llc_size = ctx->llc_size() / sizeof(float);
const int pad_w = param.paddings[1]; const int pad_w = paddings[2];
const int pad_h = param.paddings[0]; const int pad_h = paddings[0];
int ow = dim_out[3]; int ow = dim_out[3];
int oh = dim_out[2]; int oh = dim_out[2];
int ic = dim_in[1]; int ic = dim_in[1];
...@@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data, ...@@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data,
//! 3x3s2 convolution, implemented by direct algorithm //! 3x3s2 convolution, implemented by direct algorithm
//! prepack input to tmp buffer //! prepack input to tmp buffer
//! write output to tmp buffer //! write output to tmp buffer
auto paddings = *param.paddings;
const int threads = ctx->threads(); const int threads = ctx->threads();
int l2_size = ctx->llc_size() / sizeof(float); int l2_size = ctx->llc_size() / sizeof(float);
const int pad_w = param.paddings[1]; const int pad_w = paddings[2];
const int pad_h = param.paddings[0]; const int pad_h = paddings[0];
const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int wout_round = ROUNDUP(ow, OUT_W_BLOCK);
const int win_round = wout_round * 2 /*stride_w*/ + 1; const int win_round = wout_round * 2 /*stride_w*/ + 1;
bool flag_relu = param.fuse_relu; bool flag_relu = param.fuse_relu;
......
...@@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, ...@@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din,
//! 3x3s2 int8 convolution, implemented by direct algorithm //! 3x3s2 int8 convolution, implemented by direct algorithm
//! prepack input to tmp buffer //! prepack input to tmp buffer
//! write output to tmp buffer //! write output to tmp buffer
auto paddings = *param.paddings;
bool flag_relu = param.fuse_relu; bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias; bool flag_bias = param.bias;
int pad_h = param.paddings[0]; int pad_h = paddings[0];
int pad_w = param.paddings[1]; int pad_w = paddings[1];
const int threads = ctx->threads(); const int threads = ctx->threads();
int llc_size = ctx->llc_size() / 4; int llc_size = ctx->llc_size() / 4;
...
@@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din,
   //! 3x3s2 int8 convolution, implemented by direct algorithm
   //! prepack input to tmp buffer
   //! write output to tmp buffer
+  auto paddings = *param.paddings;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias;
-  int pad_h = param.paddings[0];
-  int pad_w = param.paddings[1];
+  int pad_h = paddings[0];
+  int pad_w = paddings[1];
   const int threads = ctx->threads();
   //! set 1/4 l2 cache
   int llc_size = ctx->llc_size() / 4;
...
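Across the four hunks above, the kernels stop indexing param.paddings directly and instead dereference it once into a local paddings vector; the fp32 paths also move pad_w from index 1 to index 2, while the int8 paths keep index 1. Below is a minimal sketch of the apparent new convention, assuming paddings is now a std::shared_ptr<std::vector<int>> with four entries ordered {top, bottom, left, right}; that ordering and the ConvParamSketch type are inferences for illustration, not taken from this diff.

// A minimal sketch, not from this diff: ConvParamSketch stands in for
// operators::ConvParam, and the {top, bottom, left, right} ordering of the
// four padding entries is an assumption inferred from the index change.
#include <memory>
#include <vector>

struct ConvParamSketch {
  std::shared_ptr<std::vector<int>> paddings;  // now shared; four entries
};

void read_paddings(const ConvParamSketch& param, int* pad_h, int* pad_w) {
  auto paddings = *param.paddings;  // dereference once, as the kernels now do
  *pad_h = paddings[0];             // top padding
  *pad_w = paddings[2];             // left padding (index 1 assumed bottom)
}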
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto paddings = *param.paddings;
int threads = ctx->threads();
const int pad_h = paddings[0];
const int pad_w = paddings[2];
const int out_c_block = 4;
const int out_h_kernel = 1;
const int out_w_kernel = 4;
const int win_ext = ow * 2 + 1;
const int ow_round = ROUNDUP(ow, 4);
const int win_round = ROUNDUP(win_ext, 4);
const int hin_round = oh * 2 + 1;
const int prein_size = win_round * hin_round * out_c_block;
auto workspace_size =
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
/// get workspace
auto ptr_zero = ctx->workspace_data<float>();
memset(ptr_zero, 0, sizeof(float) * win_round);
float* ptr_write = ptr_zero + win_round;
int size_in_channel = win * ih;
int size_out_channel = ow * oh;
int ws = -pad_w;
int we = ws + win_round;
int hs = -pad_h;
int he = hs + hin_round;
int w_loop = ow_round / 4;
auto remain = w_loop * 4 - ow;  // padded (invalid) columns in the last 4-wide block
bool flag_remain = remain > 0;
remain = 4 - remain;  // valid output columns in the last block
remain = remain > 0 ? remain : 0;
int row_len = win_round * out_c_block;
for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel;
#pragma omp parallel for num_threads(threads)
for (int c = 0; c < oc; c += out_c_block) {
#ifdef ARM_WITH_OMP
float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
#else
float* pre_din = ptr_write + ow_round;
#endif
/// const array size
prepack_input_nxwc4_dw(
din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
const float* weight_c = weights + c * 9; // kernel_w * kernel_h
float* dout_c00 = dout_batch + c * size_out_channel;
float bias_local[4] = {0, 0, 0, 0};
if (flag_bias) {
bias_local[0] = bias[c];
bias_local[1] = bias[c + 1];
bias_local[2] = bias[c + 2];
bias_local[3] = bias[c + 3];
}
#ifdef __aarch64__
float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
#endif
for (int h = 0; h < oh; h += out_h_kernel) {
float* outc0 = dout_c00 + h * ow;
float* outc1 = outc0 + size_out_channel;
float* outc2 = outc1 + size_out_channel;
float* outc3 = outc2 + size_out_channel;
const float* inr0 = pre_din + h * 2 * row_len;
const float* inr1 = inr0 + row_len;
const float* inr2 = inr1 + row_len;
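// Channel tail: when this block of 4 output channels runs past oc, the
// surplus output pointers are redirected to the scratch row ptr_write;
// the switch below falls through on purpose (a surplus of 3 redirects
// outc1, outc2 and outc3).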
if (c + out_c_block > oc) {
switch (c + out_c_block - oc) {
case 3:
outc1 = ptr_write;
case 2:
outc2 = ptr_write;
case 1:
outc3 = ptr_write;
default:
break;
}
}
auto c0 = outc0;
auto c1 = outc1;
auto c2 = outc2;
auto c3 = outc3;
float pre_out[16];
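// Width tail: for the final partial block of 4 output columns, the stores
// below are redirected into the local pre_out buffer; only the `remain`
// valid columns are copied back to c0..c3 after the asm block.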
for (int w = 0; w < w_loop; ++w) {
bool flag_mask = (w == w_loop - 1) && flag_remain;
if (flag_mask) {
c0 = outc0;
c1 = outc1;
c2 = outc2;
c3 = outc3;
outc0 = pre_out;
outc1 = pre_out + 4;
outc2 = pre_out + 8;
outc3 = pre_out + 12;
}
// clang-format off
#ifdef __aarch64__
asm volatile(
"ldr q8, [%[bias]]\n" /* load bias */
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/
"and v19.16b, v8.16b, v8.16b\n"
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/
"and v20.16b, v8.16b, v8.16b\n"
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/
"and v21.16b, v8.16b, v8.16b\n"
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/
"and v22.16b, v8.16b, v8.16b\n"
"ldr q8, [%[inr0]]\n" /* load input r0*/
/* r0 mul w0-w2, get out */
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/
"ldr q8, [%[inr1]]\n" /* load input r1*/
/* r1, mul w3-w5, get out */
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/
"ldr q8, [%[inr2]]\n" /* load input r2*/
/* r2, mul w6-w8, get out r0, r1 */
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/
/* transpose */
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
/* relu */
"cbz %w[flag_relu], 0f\n" /* skip relu*/
"movi v0.4s, #0\n" /* for relu */
"fmax v19.4s, v19.4s, v0.4s\n"
"fmax v20.4s, v20.4s, v0.4s\n"
"fmax v21.4s, v21.4s, v0.4s\n"
"fmax v22.4s, v22.4s, v0.4s\n"
/* save result */
"0:\n"
"str q19, [%[outc0]], #16\n"
"str q20, [%[outc1]], #16\n"
"str q21, [%[outc2]], #16\n"
"str q22, [%[outc3]], #16\n"
:[inr0] "+r"(inr0), [inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2),
[w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5),
[w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8),
[bias] "r" (bias_local), [flag_relu]"r"(flag_relu)
: "cc", "memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8", "v19","v20","v21","v22"
);
#else
asm volatile(
/* fill with bias */
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */
/* load weights */
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/
"vand.i32 q12, q8, q8\n"
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/
"vand.i32 q13, q8, q8\n"
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/
"vand.i32 q14, q8, q8\n"
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/
"vand.i32 q15, q8, q8\n"
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/
/* mul r0 with w0, w1, w2 */
"vmla.f32 q12, q9, q0 @ w0 * inr0\n"
"vmla.f32 q13, q9, q2 @ w0 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */
"vmla.f32 q14, q9, q4 @ w0 * inr4\n"
"vmla.f32 q15, q9, q6 @ w0 * inr6\n"
"vmla.f32 q12, q10, q1 @ w1 * inr1\n"
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w1 * inr3\n"
"vmla.f32 q14, q10, q5 @ w1 * inr5\n"
"vmla.f32 q15, q10, q7 @ w1 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */
"vmla.f32 q12, q11, q2 @ w2 * inr2\n"
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w2 * inr4\n"
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w2 * inr6\n"
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w2 * inr8\n"
/* mul r1 with w3, w4, w5 */
"vmla.f32 q12, q9, q0 @ w3 * inr0\n"
"vmla.f32 q13, q9, q2 @ w3 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */
"vmla.f32 q14, q9, q4 @ w3 * inr4\n"
"vmla.f32 q15, q9, q6 @ w3 * inr6\n"
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/
"vmla.f32 q12, q10, q1 @ w4 * inr1\n"
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w4 * inr3\n"
"vmla.f32 q14, q10, q5 @ w4 * inr5\n"
"vmla.f32 q15, q10, q7 @ w4 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */
"vmla.f32 q12, q11, q2 @ w5 * inr2\n"
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w5 * inr4\n"
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w5 * inr6\n"
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w5 * inr8\n"
/* mul r2 with w6, w7, w8 */
"vmla.f32 q12, q9, q0 @ w6 * inr0\n"
"vmla.f32 q13, q9, q2 @ w6 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */
"vmla.f32 q14, q9, q4 @ w6 * inr4\n"
"vmla.f32 q15, q9, q6 @ w6 * inr6\n"
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/
"vmla.f32 q12, q10, q1 @ w7 * inr1\n"
"vmla.f32 q13, q10, q3 @ w7 * inr3\n"
"vmla.f32 q14, q10, q5 @ w7 * inr5\n"
"vmla.f32 q15, q10, q7 @ w7 * inr7\n"
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n"
"vmla.f32 q12, q11, q2 @ w8 * inr2\n"
"vmla.f32 q13, q11, q4 @ w8 * inr4\n"
"vmla.f32 q14, q11, q6 @ w8 * inr6\n"
"vmla.f32 q15, q11, q8 @ w8 * inr8\n"
/* transpose */
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
"cmp %[flag_relu], #0\n"
"beq 0f\n" /* skip relu*/
"vmov.u32 q0, #0\n"
"vmax.f32 q12, q12, q0\n"
"vmax.f32 q13, q13, q0\n"
"vmax.f32 q14, q14, q0\n"
"vmax.f32 q15, q15, q0\n"
"0:\n"
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
:[r0] "+r"(inr0), [r1] "+r"(inr1),
[r2] "+r"(inr2), [wc0] "+r" (weight_c),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[bias] "r" (bias_local),
[flag_relu]"r"(flag_relu)
:"cc", "memory",
"q0","q1","q2","q3","q4","q5","q6","q7",
"q8", "q9","q10","q11","q12","q13","q14","q15"
);
#endif  // __aarch64__
// clang-format on
if (flag_mask) {
for (int i = 0; i < remain; ++i) {
c0[i] = pre_out[i];
c1[i] = pre_out[i + 4];
c2[i] = pre_out[i + 8];
c3[i] = pre_out[i + 12];
}
}
}
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
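The workspace sizing in conv_3x3s2_depthwise_fp32 above can be sanity-checked in isolation. The sketch below replays the arithmetic; ROUNDUP is assumed to round its first argument up to a multiple of the second (matching how the kernel uses it), and the shapes are arbitrary example values.

// Replays the workspace arithmetic from conv_3x3s2_depthwise_fp32; the
// example shapes are assumptions chosen only to illustrate the formula.
#include <cstdio>

constexpr int roundup(int x, int m) { return ((x + m - 1) / m) * m; }

int main() {
  const int ow = 30, oh = 30, threads = 4;  // arbitrary example values
  const int out_c_block = 4;
  const int win_ext = ow * 2 + 1;           // input width for 3x3, stride 2
  const int ow_round = roundup(ow, 4);
  const int win_round = roundup(win_ext, 4);
  const int hin_round = oh * 2 + 1;
  const int prein_size = win_round * hin_round * out_c_block;
  const int workspace =
      threads * prein_size + win_round /*zero row*/ + ow_round /*write row*/;
  std::printf("workspace: %d floats (%zu bytes)\n", workspace,
              sizeof(float) * workspace);
  return 0;
}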
...
@@ -314,7 +314,23 @@ void fill_bias_int8(int* tensor,
                     const int* bias,
                     int channel,
                     int channel_size);
+// new winograd
+void weight_trans_c4(
+    float* dest, const float* src, int ic, int oc, void* workspace);
+void conv_compute_6x6_3x3(const float* input,
+                          float* output,
+                          int num,
+                          int chout,
+                          int hout,
+                          int wout,
+                          int chin,
+                          int hin,
+                          int win,
+                          const float* weight,
+                          const float* bias,
+                          const operators::ConvParam& param,
+                          ARMContext* ctx);
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
...
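The header hunk above only declares the new Winograd entry points. The sketch below shows one plausible way they fit together (transform the weights once, then reuse them per forward pass). The 8x8 tile factor follows from F(6x6, 3x3) (output tile 6 + kernel 3 - 1 = 8); the buffer layout, workspace size, and call order are assumptions not shown in this diff.

// Hypothetical driver for the new Winograd path; sizes and call order are
// assumptions (see lead-in above).
#include <vector>

// Stub mirroring the declaration in the hunk above; the real transform lives
// in lite/backends/arm/math and is not part of this sketch.
void weight_trans_c4(float*, const float*, int, int, void*) {}

void prepare_winograd_weights(const float* weight, int ic, int oc) {
  // F(6x6, 3x3): each (ic, oc) pair expands to an 8x8 = 64 coefficient tile.
  std::vector<float> trans_weight(64 * ic * oc);
  std::vector<char> workspace(1 << 20);  // workspace size is an assumption
  weight_trans_c4(trans_weight.data(), weight, ic, oc, workspace.data());
  // trans_weight would then be passed as `weight` to conv_compute_6x6_3x3.
}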
 if(NOT LITE_WITH_CUDA)
   return()
 endif()
-set(cuda_static_deps cudnn_static cublas_static curand_static
-    culibos_static cudart_static)
+get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
 nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps})
 nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps})
...