Commit 38576db5 authored by Y yongqiang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into develop

@@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
#flatbuffers
lite/model_parser/flatbuffers/framework_generated.h
build*
# hiai libs
ai_ddk_lib*
@@ -10,3 +10,6 @@
[submodule "third-party/protobuf-host"]
	path = third-party/protobuf-host
	url = https://github.com/protocolbuffers/protobuf.git
[submodule "third-party/flatbuffers"]
path = third-party/flatbuffers
url = https://github.com/google/flatbuffers.git
@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
@@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
@@ -106,7 +108,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
-lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
+lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF)
lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF)
lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
@@ -168,6 +171,7 @@ if(LITE_WITH_RKNPU)
  include(device/rknpu)
endif()
include(external/flatbuffers)
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
@@ -222,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
  include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
  include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
  add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
  add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
if (LITE_WITH_PROFILE)
  add_definitions("-DLITE_WITH_PROFILE")
endif()
@@ -190,6 +194,10 @@ if (LITE_WITH_LOG)
  add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
  add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
  add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
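A hedged sketch of how a build target might consume the new `LITE_WITH_EXCEPTION` switch defined above; the target name `demo_kernel_lib` is an assumption used only for illustration and is not a target created by this commit, which merely introduces the option and the `-DLITE_WITH_EXCEPTION` definition.

```cmake
# Illustrative only: 'demo_kernel_lib' is an assumed target name.
# Gate exception-related flags on the new option so tiny builds stay lean.
if(LITE_WITH_EXCEPTION)
  target_compile_definitions(demo_kernel_lib PRIVATE LITE_WITH_EXCEPTION)
  target_compile_options(demo_kernel_lib PRIVATE -fexceptions -funwind-tables)
endif()
```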
......
@@ -35,7 +35,11 @@ endif()
if(NOT DEFINED ANDROID_API_LEVEL)
  set(ANDROID_API_LEVEL "23")
  if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
-    set(ANDROID_API_LEVEL "22")
+    if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH)
+      set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24
+    else()
+      set(ANDROID_API_LEVEL "22")
+    endif()
  endif()
endif()
@@ -76,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
  set(triple arm-v7a-linux-android)
  set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ is linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if (ANDROID_API_LEVEL LESS 21)
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a")
endif()
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
  message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
@@ -23,6 +23,21 @@ if(ANDROID)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang(with libc++) provide C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
-  if((NOT LITE_WITH_PYTHON))
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
-  endif()
-  if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
-  endif()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
  check_linker_flag(-Wl,--gc-sections)
endif()
......
@@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA)
  return()
endif()
if(CUDA_WITH_FP16)
  add_definitions("-DCUDA_WITH_FP16")
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62")
@@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
  add_definitions("-DPADDLE_CUDA_BINVER=\"100\"")
endif()
if (CUDA_WITH_FP16)
  STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs})
endif()
include_directories(${CUDA_INCLUDE_DIRS})
if(NOT WITH_DSO)
  if(WIN32)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directory
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19,
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependencies of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.3 dependencies of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
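The two cached lists above are what the rest of the build is expected to consume. A minimal, hypothetical sketch of how a Huawei Ascend NPU bridge target could link them (the target and source names are illustrative, not part of this commit), keeping the runtime libs ahead of the builder libs as the note above requires:

```cmake
# Illustrative only: 'ascend_npu_bridge_demo' and 'demo_bridge.cc' are assumed names.
if(LITE_WITH_HUAWEI_ASCEND_NPU)
  add_library(ascend_npu_bridge_demo STATIC demo_bridge.cc)
  # Runtime (ACL) libs first, then ATC builder libs.
  target_link_libraries(ascend_npu_bridge_demo
                        ${huawei_ascend_npu_runtime_libs}
                        ${huawei_ascend_npu_builder_libs})
endif()
```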
@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
             PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
             NO_DEFAULT_PATH)
# Added in HiAI DDK 320 or later version
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
  message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
@@ -78,5 +83,13 @@ else()
  set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
-set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
+if(NOT NPU_DDK_HCL_FILE)
+  # message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}")
+else()
+  message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
+  add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
+  set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
+endif()
+set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
@@ -39,7 +39,7 @@ else()
endif()
find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
-             PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+             PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib
             NO_DEFAULT_PATH)
if(NOT XPU_SDK_XPU_RT_FILE)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
INCLUDE(GNUInstallDirs)
SET(LIBDIR "lib")
if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
SET(LIBDIR "lib64")
endif()
SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers)
SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers)
SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers)
SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE)
IF(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ELSE(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR})
if(NOT HOST_CXX_COMPILER)
set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER})
set(HOST_C_COMPILER ${CMAKE_C_COMPILER})
endif()
SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${HOST_C_COMPILER}")
ExternalProject_Add(
extern_flatbuffers
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/google/flatbuffers.git"
GIT_TAG "v1.12.0"
SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR}
PREFIX ${FLATBUFFERS_PREFIX_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DBUILD_STATIC_LIBS=ON
-DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
-DFLATBUFFERS_BUILD_TESTS=OFF
${CROSS_COMPILE_CMAKE_ARGS}
${OPTIONAL_ARGS}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib")
add_custom_command(TARGET extern_flatbuffers POST_BUILD
COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES})
ADD_DEPENDENCIES(flatbuffers extern_flatbuffers)
SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc)
function(register_generated_output file_name)
get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS)
list(APPEND tmp ${file_name})
set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp})
endfunction(register_generated_output)
function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
if(FLATBUFFERS_BUILD_LEGACY)
set(OPT ${OPT};--cpp-std c++0x)
else()
# --cpp-std is defined by flatc default settings.
endif()
message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'")
get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH)
message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}")
string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS})
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}"
COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}"
--cpp --gen-mutable --gen-object-api --reflect-names
${OPT}
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
COMMENT "Run generation: '${GEN_HEADER}'")
register_generated_output(${GEN_HEADER})
add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER})
endfunction()
set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers")
set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs")
compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
include_directories(${FLATBUFFERS_INCLUDE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR})
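For orientation, a hedged sketch of how the helper above could be reused for an additional schema; `param.fbs`, `param_fbs_header`, and `model_parser` are assumed names for illustration only and are not files or targets added by this commit:

```cmake
# Illustrative only: compile a second (hypothetical) schema with the same flags
# and make a consumer target wait for the generated header.
set(PARAM_SCHEMA_PATH "lite/model_parser/flatbuffers/param.fbs")  # assumed path
compile_flatbuffers_schema_to_cpp_opt(param_fbs_header ${PARAM_SCHEMA_PATH}
                                      "--no-includes;--gen-compare;--force-empty")
# add_dependencies(model_parser param_fbs_header)  # any target including param_generated.h
```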
@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(deps ${lite_deps_DEPS})
@@ -118,6 +118,12 @@ function (lite_deps TARGET)
    endforeach(var)
  endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
  set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
  set(options SHARED shared STATIC static MODULE module)
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
      HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  if (args_SHARED OR ARGS_shared)
@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
    set(options " -g ")
  endif()
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
      HVY_DEPS ${args_HVY_DEPS}
      CV_DEPS ${CV_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
  if(NOT WIN32)
@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
  endif()
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
      ARGS
      COMPILE_LEVEL # (basic|extra)
@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
      HVY_DEPS ${args_HVY_DEPS}
      CV_DEPS ${args_CV_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
  # strip binary target to reduce size
@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
+# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
      ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
    endif()
    set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
  endif()
endif() endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL") if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL) if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -481,6 +500,7 @@ function(add_kernel TARGET device level) ...@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
  set(options "")
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
      ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -537,6 +557,7 @@ function(add_operator TARGET level)
      RKNPU_DEPS ${args_RKNPU_DEPS}
      BM_DEPS ${args_BM_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
......
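A hedged sketch of how the extended helpers above would typically be called for the new device; the kernel name, source file, and the `${lite_kernel_deps}` variable are assumptions used only to illustrate the HUAWEI_ASCEND_NPU plumbing added here (when LITE_WITH_HUAWEI_ASCEND_NPU is OFF, the sources are routed to the fake-kernels list instead):

```cmake
# Illustrative only: target and source names below are assumed, not real Paddle-Lite targets.
add_kernel(subgraph_compute_huawei_ascend_npu HUAWEI_ASCEND_NPU basic
           SRCS subgraph_compute.cc
           DEPS ${lite_kernel_deps}                       # assumed common kernel deps variable
           HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_runtime_libs})
```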
# C++ Train Demo
-# Introduction
+## Introduction
We all know that Paddle-Lite can run inference on mobile devices; in fact, Paddle-Lite also supports training models on mobile. This article gives an example of training with Paddle-Lite; the task in the example is "Boston house price prediction", also known as "fit-a-line".
You can refer to the following from the book repository:
[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
[source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
@@ -10,18 +12,16 @@
It uses a Linear Regression model for the modeling.
This article mainly describes how to migrate it to Paddle-Lite for training.
-Note: this is a tutorial on training a model with the C++ API; the other APIs do not support training yet.
-# Requirements
+## Requirements
- An Android phone, used to run the training program
-- Python with Paddle (version: 1.7.0) installed
+- Python with Paddle (version >= 1.7.0) installed
-# Quick start
+## Quick start
-## Step1 build paddle-lite
+### Step1 build paddle-lite
-Please follow the tutorial in the [paddle-lite official documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build a full_publish paddle-lite lib. Taking building on Linux as an example, the commands are:
+Please follow the tutorial in the paddle-lite official documentation to build a full_publish paddle-lite lib. Taking building on Linux as an example, the commands are:
```shell
## set up the environment
@@ -51,7 +51,7 @@ cd Paddle-Lite
Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
```
-## Step2 build lr_trainer
+### Step2 build lr_trainer
```shell
cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
@@ -64,7 +64,7 @@ bin/
`-- demo_trainer
```
-## Step3 download model and run it!
+### Step3 download model and run it!
On your laptop, connect to the phone over USB, enable developer mode, and run the following in any directory:
@@ -102,7 +102,7 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
-# More details
+## More details
The model mentioned above was downloaded directly; if you want to generate it yourself, you can run the following commands:
```shell
@@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
If you want to generate your own model for training, you can refer to the way the model is saved in `train.py`.
-# Checking against the Paddle training results
+## Checking against the Paddle training results
-## The first 10 loss values
+### The first 10 loss values
To verify the consistency between Paddle and Lite, we trained for 10 batches with identical model parameters, identical data, and batch size = 1, and recorded the loss values of both.
@@ -171,11 +171,11 @@ sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
-## Loss curve
+### Loss curve
With the training batch size set to 20 and a global shuffle of the training data in each epoch, the loss curves of Paddle and Lite after 100 epochs compare as follows.
-![lr_loss](image/lr_loss.png)
+![lr_loss](../images/lr_loss.png)
To reproduce the above results, the command for Paddle + Python is:
......
@@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h
rm ./lite/api/paddle_use_ops.h
# set build options and start the build
# android-armv7:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
  --arch=armv7 \
  --toolchain=clang \
-  --with_cv=OFF \
  --with_log=OFF \
-  --with_extra=OFF \
+  --with_extra=ON \
  --with_cv=ON \
  --with_opencl=ON
# android-armv8:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv8 \
--toolchain=clang \
--with_log=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# Note: for build help, run: ./lite/tools/build_android.sh help
```
@@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \
## 3. How to use it in code
-That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository;
+That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository, which also shows how to check whether the current device supports OpenCL;
Note: the link above points to the latest develop branch online and may differ from your local code; it is recommended to look at your local code under `lite/demo/cxx/` to see how to use it.
......
@@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb)
predictor = create_paddle_predictor(config)
```
-(3) Set the input data
+(3) Read data from an image
```python
image = Image.open('./example.jpg')
resized_image = image.resize((224, 224), Image.BILINEAR)
image_data = np.array(resized_image).flatten().tolist()
```
(4) Set the input data
```python
input_tensor = predictor.get_input(0)
input_tensor.resize([1, 3, 224, 224])
-input_tensor.set_float_data([1.] * 3 * 224 * 224)
+input_tensor.set_float_data(image_data)
```
-(4) Run the prediction
+(5) Run the prediction
```python
predictor.run()
```
-(5) Get the output data
+(6) Get the output data
```python
output_tensor = predictor.get_output(0)
print(output_tensor.shape())
......
@@ -59,7 +59,14 @@ Welcome to Paddle-Lite's documentation!
demo_guides/baidu_xpu
demo_guides/rockchip_npu
demo_guides/mediatek_apu
.. toctree::
:maxdepth: 1
:caption: Training example (preview)
:name: sec-train_demo_guides
demo_guides/cpp_train_demo
.. toctree::
:maxdepth: 1
:caption: API documentation
......
@@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS prediction library and header files
- How to tailor the prediction library (build only the kernels & ops used by the model, to reduce the library size):
```shell
-./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
+./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir
```
```shell
--with_strip: (OFF|ON); whether to tailor the prediction library according to the input model, default is OFF
......
@@ -21,11 +21,11 @@ pip install paddlelite
- Method 2: download the opt executable
From the [releases page](https://github.com/PaddlePaddle/Paddle-Lite/releases), choose the `opt` conversion tool that matches the version of your prediction library.
-Downloads of the optimization tool are provided here for `release/v2.6` and `release/v2.2.0`
+Downloads of the optimization tool are provided here for `release/v2.6.1` and `release/v2.2.0`
|Version | Linux | MacOS|
|---|---|---|
-| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) |
+| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) |
|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) |
- Method 3: build opt from source
......
@@ -49,4 +49,4 @@ $ ./opt \
## 5. Testing tools
-To help you better understand and use the Lite framework, we provide the [Debug tool](debug#debug) and the [Profile tool](debug#profiler) for users with further needs. The Lite Model Debug Tool can be used to check whether the values of corresponding variables in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue reproduced and investigated. The Profile Monitor Tool helps you understand the time cost of each op: it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline reference for performance tuning. See the [related topics](debug) for more.
+To help you better understand and use the Lite framework, we provide the [Debug tool](debug) and the [Profile tool](debug) for users with further needs. The Lite Model Debug Tool can be used to check whether the values of corresponding variables in a model differ between the Lite framework and the PaddlePaddle framework during inference, so that the problematic op can be located quickly and the issue reproduced and investigated. The Profile Monitor Tool helps you understand the time cost of each op: it automatically collects the number of times each op runs and its longest, shortest, and average execution times, providing a baseline reference for performance tuning. See the [related topics](debug) for more.
@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
...@@ -45,14 +46,17 @@ if (WITH_TESTING) ...@@ -45,14 +46,17 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
  endif()
  if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
    lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
  endif()
endif()
@@ -242,7 +246,6 @@ if (LITE_WITH_X86)
  add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
  add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
-    COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
    COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
    COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
    COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
......
@@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG))
  lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
  lite_cc_library(place SRCS paddle_place.cc DEPS glog)
-endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+endif()
if (LITE_ON_TINY_PUBLISH)
  set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG")
@@ -11,12 +11,13 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
-if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
+if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
  #full api dynamic library
  lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
      DEPS paddle_api paddle_api_light paddle_api_full)
-  add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
-  target_link_libraries(paddle_full_api_shared framework_proto)
+  target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
+  add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header)
+  target_link_libraries(paddle_full_api_shared framework_proto op_registry)
  if(LITE_WITH_X86)
    add_dependencies(paddle_full_api_shared xxhash)
    target_link_libraries(paddle_full_api_shared xxhash)
@@ -39,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
      NPU_DEPS ${npu_kernels}
      APU_DEPS ${apu_kernels}
      RKNPU_DEPS ${rknpu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
  )
  add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
  if(WIN32)
    target_link_libraries(paddle_light_api_shared shlwapi.lib)
  endif()
-  target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
+  target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
  if(APPLE)
    set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
    set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
@@ -70,7 +72,7 @@ else()
    set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
  endif()
  set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
-  add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
+  add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header)
  if (LITE_WITH_NPU)
    # Need to add HIAI runtime libs (libhiai.so) dependency
    target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
@@ -93,6 +95,7 @@ if (WITH_TESTING)
      RKNPU_DEPS ${rknpu_kernels}
      BM_DEPS ${bm_kernels}
      MLU_DEPS ${mlu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
      APU_DEPS ${apu_kernels})
endif()
@@ -111,6 +114,10 @@ if(LITE_WITH_RKNPU)
  set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()
message(STATUS "get ops ${ops}") message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get X86 kernels ${x86_kernels}")
...@@ -125,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}") ...@@ -125,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
@@ -143,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
      RKNPU_DEPS ${rknpu_kernels}
      BM_DEPS ${bm_kernels}
      CL_DEPS ${opencl_kernels}
-      FPGA_DEPS ${fpga_kernels})
+      FPGA_DEPS ${fpga_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
@@ -167,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
      CL_DEPS ${opencl_kernels}
      FPGA_DEPS ${fpga_kernels}
      BM_DEPS ${bm_kernels}
-      MLU_DEPS ${mlu_kernels})
+      MLU_DEPS ${mlu_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -190,6 +200,7 @@ if(WITH_TESTING)
      FPGA_DEPS ${fpga_kernels}
      BM_DEPS ${bm_kernels}
      MLU_DEPS ${mlu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
      EXCLUDE_COMPILE_DEPS "ON"
      ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
           --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -321,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
      APU_DEPS ${apu_kernels}
      CL_DEPS ${opencl_kernels}
      FPGA_DEPS ${fpga_kernels}
-      BM_DEPS ${bm_kernels})
+      BM_DEPS ${bm_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
@@ -361,6 +373,9 @@ endif()
if (LITE_WITH_PYTHON)
  add_subdirectory(python)
  # add library for opt_base
  lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
  add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
endif()
if (LITE_ON_TINY_PUBLISH)
@@ -368,9 +383,6 @@ if (LITE_ON_TINY_PUBLISH)
endif()
-# add library for opt_base
-lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
-add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
  message(STATUS "Compiling opt")
@@ -393,6 +405,7 @@ if(NOT WITH_COVERAGE)
      FPGA_DEPS ${fpga_kernels}
      BM_DEPS ${bm_kernels}
      MLU_DEPS ${mlu_kernels}
      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
      ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
  if (WITH_TESTING)
    add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -414,7 +427,8 @@ if(NOT IOS)
      RKNPU_DEPS ${rknpu_kernels}
      FPGA_DEPS ${fpga_kernels}
      X86_DEPS ${x86_kernels}
-      CUDA_DEPS ${cuda_kernels})
+      CUDA_DEPS ${cuda_kernels}
+      HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -429,7 +443,8 @@ if(NOT IOS) ...@@ -429,7 +443,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -444,7 +459,8 @@ if(NOT IOS) ...@@ -444,7 +459,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -458,7 +474,8 @@ if(NOT IOS) ...@@ -458,7 +474,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
...@@ -469,8 +486,9 @@ if(NOT IOS) ...@@ -469,8 +486,9 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -486,7 +504,8 @@ if(NOT IOS) ...@@ -486,7 +504,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif() endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
...@@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work # Unlike static library, module library has to link target to be able to work
# as a single .so lib. # as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni framework_fbs_header)
if (LITE_WITH_NPU) if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during # Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
...@@ -31,7 +32,7 @@ else() ...@@ -31,7 +32,7 @@ else()
endif() endif()
set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS})
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h) add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header)
if (LITE_WITH_NPU) if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency # Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs}) target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
......
...@@ -13,26 +13,31 @@ ...@@ -13,26 +13,31 @@
// limitations under the License. // limitations under the License.
#include "lite/api/cxx_api.h" #include "lite/api/cxx_api.h"
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <set> #include <set>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/api/paddle_use_passes.h" #include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h" #include "lite/utils/io.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
std::vector<std::string> GetAllOps() {
return OpLiteFactory::Global().GetAllOps();
}
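For reference, a minimal sketch of how the newly exported GetAllOps() helper could be called from application code to dump every operator registered in the library; the wrapper function and the use of std::cout are illustrative only and not part of this patch, the declaration is assumed to come from lite/api/cxx_api.h as added above.
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
// Hypothetical helper: prints all operator names known to OpLiteFactory.
void DumpRegisteredOps() {
  const std::vector<std::string> ops = paddle::lite::GetAllOps();
  for (const std::string &op_name : ops) {
    std::cout << op_name << std::endl;
  }
}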
void Predictor::SaveModel(const std::string &dir, void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type, lite_api::LiteModelType model_type,
bool record_info) { bool record_info) {
if (!program_) { if (!program_) {
GenRuntimeProgram(); GenRuntimeProgram();
} }
program_->SaveOpInfosToProgram(program_desc_.get()); program_->SaveToProgram(program_desc_);
program_->UpdateVarsOfProgram(program_desc_.get());
switch (model_type) { switch (model_type) {
case lite_api::LiteModelType::kProtobuf: case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true); SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
...@@ -52,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir, ...@@ -52,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) { void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info; std::set<std::string> ops_info;
std::set<std::string> kernels_info; std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions(); auto block_size = program_->block_size();
for (auto &node : instructions_) { for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
// parse op type information const auto &insts = program_->instructions(block_idx);
auto op = node.op()->op_info(); for (auto &inst : insts) {
ops_info.insert(op->Type()); // parse op type information
// parse kernel type information auto op = inst.op()->op_info();
std::string kernel_type_str = ops_info.insert(op->Type());
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) + // parse kernel type information
"," + PrecisionRepr(node.kernel()->precision()) + "," + std::string kernel_type_str =
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias(); inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
kernels_info.insert(kernel_type_str); "," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
} }
// get source_file name from op type and kernel type // get source_file name from op type and kernel type
...@@ -164,9 +173,9 @@ void Predictor::PrepareFeedFetch() { ...@@ -164,9 +173,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds; std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs; std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions(); const auto &insts = program_->instructions(kRootBlockIdx);
for (size_t i = 0; i < program_->num_instructions(); i++) { for (auto &inst : insts) {
const auto &op = insts[i].op()->op_info(); const auto &op = inst.op()->op_info();
if (op->Type() == "feed") { if (op->Type() == "feed") {
feeds.push_back(op); feeds.push_back(op);
} else if (op->Type() == "fetch") { } else if (op->Type() == "fetch") {
...@@ -249,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config, ...@@ -249,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else { } else {
LOG(INFO) << "Load model from file."; LOG(INFO) << "Load model from file.";
} }
Build(model_path, Build(model_path,
model_file, model_file,
param_file, param_file,
...@@ -290,10 +298,10 @@ void Predictor::Build(const std::string &model_path, ...@@ -290,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes); Build(program_desc_, valid_places, passes);
} }
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc, void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places, const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) { const std::vector<std::string> &passes) {
program_desc_ = desc; program_desc_ = program_desc;
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) { for (auto &valid_place : valid_places) {
...@@ -326,13 +334,11 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc, ...@@ -326,13 +334,11 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
} }
} }
if (is_quantized_model) { if (is_quantized_model) {
#ifdef LITE_WITH_ARM
inner_places.insert(inner_places.begin(), inner_places.insert(inner_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)}); Place{TARGET(kARM), PRECISION(kInt8)});
#endif
} }
Program program(*desc.get(), scope_, inner_places); Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places; valid_places_ = inner_places;
core::KernelPickFactor factor; core::KernelPickFactor factor;
......
...@@ -36,6 +36,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = ...@@ -36,6 +36,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list"; ".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
std::vector<std::string> GetAllOps();
/* /*
* Predictor for inference, input a model, it will optimize and execute it. * Predictor for inference, input a model, it will optimize and execute it.
*/ */
...@@ -47,18 +49,33 @@ class LITE_API Predictor { ...@@ -47,18 +49,33 @@ class LITE_API Predictor {
program_desc_ = std::make_shared<cpp::ProgramDesc>(); program_desc_ = std::make_shared<cpp::ProgramDesc>();
} }
// Create a predictor with the weight variable scope set. ///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. Create a predictor with the
// given weight variable scope.
///////////////////////////////////////////////////////////////////
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope) explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {} : scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc, ///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. This constructor function can
// only be called in Predictor->Clone. This function will create
// a predictor from an existing ProgramDesc, Scope and RuntimeProgram.
///////////////////////////////////////////////////////////////////
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root, const std::shared_ptr<Scope>& root,
const std::vector<Place>& valid_places, const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {}) const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) { : program_desc_(program_desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names); // step1. Create a Program to construct the exec_scope and ops
optimizer_ = Optimizer(std::move(program), valid_places); Program program(program_desc_, scope_, valid_places, var_names);
exec_scope_ = optimizer_.exec_scope(); exec_scope_ = program.exec_scope();
valid_places_ = valid_places; valid_places_ = valid_places;
// step3. Create the RuntimeProgram.
program_.reset(
new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx));
program_generated_ = true;
} }
// Build from a model, with places set for hardware config. // Build from a model, with places set for hardware config.
...@@ -77,32 +94,62 @@ class LITE_API Predictor { ...@@ -77,32 +94,62 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false); bool memory_from_memory = false);
void Build(const std::shared_ptr<cpp::ProgramDesc>& desc, void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places, const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {}); const std::vector<std::string>& passes = {});
std::shared_ptr<Predictor> Clone() const { //////////////////////////////////////////////////////////
// Function: Clone
// Usage: Create a Predictor from an existing one,
// the cloned predictor will share persistable variables
// in scope_ with the original predictor.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone() {
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from current program_desc_ and
// runtime_program.
auto predictor = auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_); std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
// step3. Return the result
return predictor; return predictor;
} }
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone( // Function: Clone(var_names)
const std::vector<std::string>& var_names) const { // Usage: Create a Predictor from an existing one,
// the cloned predictor will share persistable variables
// but persistable variables named in var_names will not
// be shared.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone(const std::vector<std::string>& var_names) {
CHECK(program_desc_) << "Both program and scope of current predictor " CHECK(program_desc_) << "Both program and scope of current predictor "
"should not be nullptr in Clone mode."; "should not be nullptr in Clone mode.";
CHECK(scope_) << "Both program and scope of current predictor should " CHECK(scope_) << "Both program and scope of current predictor should "
"not be nullptr in Clone mode."; "not be nullptr in Clone mode.";
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from current program_desc_ and
// runtime_program.
auto predictor = std::make_shared<Predictor>( auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names); program_desc_, scope_, valid_places_, var_names);
// step3. Copy some persistable variables into private scope.
for (auto i : var_names) { for (auto var_name : var_names) {
predictor->exec_scope_->LocalVar(i); predictor->exec_scope_->LocalVar(var_name);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>(); auto* tensor =
predictor->scope_->Var(var_name)->GetMutable<lite::Tensor>();
auto* sub_tensor = auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>(); predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor); sub_tensor->CopyDataFrom(*tensor);
} }
// step4. Return the result
return predictor; return predictor;
} }
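To illustrate the cloning behaviour documented above, a rough usage sketch; it assumes 'predictor' is an already-built paddle::lite::Predictor, and the variable name "fc_0.w_0" is purely a placeholder, not taken from the patch.
#include <memory>
#include "lite/api/cxx_api.h"
// Hypothetical sketch of the two Clone overloads.
void CloneExample(const std::shared_ptr<paddle::lite::Predictor>& predictor) {
  // Shares every persistable variable with the original predictor.
  std::shared_ptr<paddle::lite::Predictor> shared_clone = predictor->Clone();
  // Variables listed by name get a private copy in the clone's exec scope
  // instead of being shared, so the clone may overwrite them safely.
  std::shared_ptr<paddle::lite::Predictor> private_clone =
      predictor->Clone({"fc_0.w_0"});
}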
...@@ -138,6 +185,7 @@ class LITE_API Predictor { ...@@ -138,6 +185,7 @@ class LITE_API Predictor {
// get a const tensor according to its name // get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const; const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const; const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required. // This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel( void SaveModel(
...@@ -160,7 +208,7 @@ class LITE_API Predictor { ...@@ -160,7 +208,7 @@ class LITE_API Predictor {
std::shared_ptr<cpp::ProgramDesc> program_desc_; std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_; std::shared_ptr<Scope> scope_;
Scope* exec_scope_; Scope* exec_scope_;
std::unique_ptr<RuntimeProgram> program_; std::shared_ptr<RuntimeProgram> program_;
bool program_generated_{false}; bool program_generated_{false};
std::vector<std::string> input_names_; std::vector<std::string> input_names_;
std::vector<std::string> output_names_; std::vector<std::string> output_names_;
......
...@@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#endif #endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init(); Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(), config.mlu_core_number(),
config.mlu_use_first_conv(), config.mlu_input_layout(),
config.mlu_first_conv_mean(), config.mlu_firstconv_param());
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU #endif // LITE_WITH_MLU
auto use_layout_preprocess_pass = auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS"); config.model_dir().find("OPENCL_PRE_PRECESS");
...@@ -75,6 +73,18 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -75,6 +73,18 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
} }
mode_ = config.power_mode(); mode_ = config.power_mode();
threads_ = config.threads(); threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
int num_threads = config.x86_math_library_num_threads(); int num_threads = config.x86_math_library_num_threads();
......
...@@ -15,8 +15,6 @@ ...@@ -15,8 +15,6 @@
#include "lite/api/light_api.h" #include "lite/api/light_api.h"
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -24,17 +22,18 @@ namespace lite { ...@@ -24,17 +22,18 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file, void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) { bool model_from_memory) {
if (model_from_memory) { if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_); LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else { } else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
} }
// For weight quantization of post training, load the int8/16 weights // For weight quantization of post training, load the int8/16 weights
// for optimized model, and dequant it to fp32. // for optimized model, and dequant it to fp32.
DequantizeWeight(); DequantizeWeight();
BuildRuntimeProgram(program_desc_);
BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch(); PrepareFeedFetch();
program_desc_.reset();
} }
void LightPredictor::Build(const std::string& model_dir, void LightPredictor::Build(const std::string& model_dir,
...@@ -45,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir, ...@@ -45,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) { switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH #ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf: case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_); LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break; break;
#endif #endif
case lite_api::LiteModelType::kNaiveBuffer: { case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) { if (model_from_memory) {
LoadModelNaiveFromMemory( LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_); model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else { } else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_); LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
} }
break; break;
} }
...@@ -62,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir, ...@@ -62,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir,
} }
DequantizeWeight(); DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_); BuildRuntimeProgram(program_desc_);
PrepareFeedFetch(); PrepareFeedFetch();
} }
...@@ -111,15 +110,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() { ...@@ -111,15 +110,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
} }
// append the names of inputs and outputs into input_names_ and output_names_ // append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() { void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); std::vector<const cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> feeds; std::vector<const cpp::OpDesc*> fetchs;
std::vector<cpp::OpDesc*> fetchs; std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
for (size_t i = 0; i < current_block->OpsSize(); i++) { auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op = current_block->GetOp<cpp::OpDesc>(i); auto op_size = main_block->OpsSize();
if (op->Type() == "feed") { for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
feeds.push_back(op); auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
} else if (op->Type() == "fetch") { if (op_desc->Type() == "feed") {
fetchs.push_back(op); feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
} }
} }
input_names_.resize(feeds.size()); input_names_.resize(feeds.size());
...@@ -134,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -134,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() {
} }
} }
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { void LightPredictor::BuildRuntimeProgram(
std::vector<Instruction> insts; const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
// 1. Create op first auto* exe_scope = &scope_->NewScope();
Program program(prog, scope_, {}); // Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
// 2. Create Instructs scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
#ifdef LITE_WITH_OPENCL CHECK(program_desc);
using OpenCLContext = Context<TargetType::kOpenCL>; auto block_size = program_desc->BlocksSize();
std::unique_ptr<KernelContext> local_ctx(new KernelContext()); CHECK(block_size);
local_ctx->As<OpenCLContext>().InitOnce(); for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
#endif auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
// Create the kernels of the target places, and filter out the specific for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
// kernel with the target alias. auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
for (auto& op : program.ops()) { if (!var_desc->Persistable()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr); exe_scope->Var(var_desc->Name());
std::string op_type, alias; } else {
Place place; if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); scope_->Var(var_desc->Name());
auto kernels = op->CreateKernels({place}); }
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
} }
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
} }
program_.reset(new RuntimeProgram(std::move(insts))); // Only extract the ops and generate the runtime program from the main
// block desc
CHECK(program.exec_scope()); program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
program_->set_exec_scope(program.exec_scope());
} }
void LightPredictor::DequantizeWeight() { void LightPredictor::DequantizeWeight() {
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \ #define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \ for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \ for (int64_t j = 0; j < offset; ++j) { \
...@@ -207,10 +189,9 @@ void LightPredictor::DequantizeWeight() { ...@@ -207,10 +189,9 @@ void LightPredictor::DequantizeWeight() {
} }
return result; return result;
}; };
Tensor tmp_tensor; Tensor tmp_tensor;
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i); auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) { for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k); auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) { if (is_weight_quantized_op(op_desc)) {
......
...@@ -46,6 +46,7 @@ class LITE_API LightPredictor { ...@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file, LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) { bool model_from_memory = false) {
scope_ = std::make_shared<Scope>(); scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory); Build(lite_model_file, model_from_memory);
} }
...@@ -57,6 +58,7 @@ class LITE_API LightPredictor { ...@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) { lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>(); scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory); Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
} }
...@@ -78,6 +80,7 @@ class LITE_API LightPredictor { ...@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames(); std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames(); std::vector<std::string> GetOutputNames();
void PrepareFeedFetch(); void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private: private:
void Build(const std::string& lite_model_file, void Build(const std::string& lite_model_file,
...@@ -91,14 +94,15 @@ class LITE_API LightPredictor { ...@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false); bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog); void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight(); void DequantizeWeight();
private: private:
std::shared_ptr<Scope> scope_; std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_; std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_; std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_; std::vector<std::string> input_names_;
std::vector<std::string> output_names_; std::vector<std::string> output_names_;
}; };
......
...@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { ...@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads(); threads_ = config.threads();
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir( Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir()); config.subgraph_model_cache_dir());
#endif #endif
} }
......
...@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000); ASSERT_EQ(out->dims().production(), 1000);
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; double eps = first_target == TARGET(kOpenCL) ? 0.25 : 0.1;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) { for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)]; auto result = pdata[j * step + (out->dims()[1] * i)];
......
...@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() { ...@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU)); valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU)); valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") { } else if (target_repr == "mlu") {
...@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kXPU", "kXPU",
"kRKNPU", "kRKNPU",
"kAPU", "kAPU",
"kHuaweiAscendNPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
int maximum_optype_length = 0; int maximum_optype_length = 0;
...@@ -265,16 +268,17 @@ void PrintHelpInfo() { ...@@ -265,16 +268,17 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
......
...@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kX86)); valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU)); valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU)); valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") { } else if (target_repr == "rknpu") {
...@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() { ...@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by " " `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n" "default\n"
" `set_lite_out(output_optimize_model_dir)`\n" " `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `record_model_info(false|true)`: refer to whether to record ops " " `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n" "info for striping lib, false by default`\n"
" `run() : start model transformation`\n" " `run() : start model transformation`\n"
...@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() { ...@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
...@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { ...@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kXPU", "kXPU",
"kRKNPU", "kRKNPU",
"kAPU", "kAPU",
"kHuaweiAscendNPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
// Get the lengh of the first column: maximum length of the op_type // Get the lengh of the first column: maximum length of the op_type
......
...@@ -13,6 +13,9 @@ ...@@ -13,6 +13,9 @@
// limitations under the License. // limitations under the License.
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include <utility>
#include "lite/core/context.h" #include "lite/core/context.h"
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
...@@ -21,10 +24,30 @@ ...@@ -21,10 +24,30 @@
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/target_wrapper.h" #include "lite/backends/cuda/target_wrapper.h"
#endif #endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_runtime.h"
#endif
namespace paddle { namespace paddle {
namespace lite_api { namespace lite_api {
bool IsOpenCLBackendValid() {
bool opencl_valid = false;
#ifdef LITE_WITH_OPENCL
opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice();
#endif
LOG(INFO) << "opencl_valid:" << opencl_valid;
return opencl_valid;
}
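A hedged sketch of how a caller might use the new IsOpenCLBackendValid() check to fall back to a CPU model when the device cannot run OpenCL; the model file names are placeholders, and MobileConfig, set_model_from_file and CreatePaddlePredictor are assumed to be the usual light-API entry points rather than anything introduced by this patch.
#include <memory>
#include "lite/api/paddle_api.h"
// Hypothetical backend selection at application start-up; paths are placeholders.
std::shared_ptr<paddle::lite_api::PaddlePredictor> CreatePredictorWithFallback() {
  paddle::lite_api::MobileConfig config;
  if (paddle::lite_api::IsOpenCLBackendValid()) {
    config.set_model_from_file("model_opencl.nb");  // OpenCL-optimized model
  } else {
    config.set_model_from_file("model_arm.nb");     // plain ARM CPU model
  }
  return paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(
      config);
}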
Tensor::Tensor(void *raw) : raw_tensor_(raw) {} Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
// TODO(Superjomn) refine this by using another `const void* const_raw`; // TODO(Superjomn) refine this by using another `const void* const_raw`;
...@@ -97,6 +120,13 @@ void Tensor::CopyFromCpu(const T *src_data) { ...@@ -97,6 +120,13 @@ void Tensor::CopyFromCpu(const T *src_data) {
data, src_data, num * sizeof(T), lite::IoDirection::HtoD); data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else #else
LOG(FATAL) << "Please compile the lib with CUDA."; LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif #endif
} else { } else {
LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA"; LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";
...@@ -117,6 +147,13 @@ void Tensor::CopyToCpu(T *data) const { ...@@ -117,6 +147,13 @@ void Tensor::CopyToCpu(T *data) const {
data, src_data, num * sizeof(T), lite::IoDirection::DtoH); data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else #else
LOG(FATAL) << "Please compile the lib with CUDA."; LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else if (type == TargetType::kMLU) {
#ifdef LITE_WITH_MLU
lite::TargetWrapperMlu::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with MLU.";
#endif #endif
} else { } else {
LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA"; LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";
...@@ -138,6 +175,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *); ...@@ -138,6 +175,11 @@ template void Tensor::CopyFromCpu<int64_t, TargetType::kCUDA>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *); template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *); template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kMLU>(const int *);
template void Tensor::CopyFromCpu<int64_t, TargetType::kMLU>(const int64_t *);
template void Tensor::CopyFromCpu<float, TargetType::kMLU>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kMLU>(const int8_t *);
template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(float *) const;
template void Tensor::CopyToCpu(int *) const; template void Tensor::CopyToCpu(int *) const;
template void Tensor::CopyToCpu(int8_t *) const; template void Tensor::CopyToCpu(int8_t *) const;
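Given the new kMLU branches above, a minimal sketch of feeding host data to an MLU input tensor; the shape and values are placeholders, and 'input' is assumed to be the Tensor handle returned by a predictor built with MLU places.
#include <vector>
#include "lite/api/paddle_api.h"
// Hypothetical host-to-device copy for an MLU input; sizes are made up.
void FeedMluInput(paddle::lite_api::Tensor* input) {
  std::vector<float> host_input(1 * 3 * 224 * 224, 0.f);
  input->Resize({1, 3, 224, 224});
  input->CopyFromCpu<float, paddle::lite_api::TargetType::kMLU>(
      host_input.data());
}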
...@@ -228,13 +270,9 @@ void CxxConfig::set_mlu_core_number(int core_number) { ...@@ -228,13 +270,9 @@ void CxxConfig::set_mlu_core_number(int core_number) {
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout; mlu_input_layout_ = layout;
} }
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { void CxxConfig::set_mlu_firstconv_param(const std::vector<float> &mean,
mlu_use_first_conv_ = use_first_conv; const std::vector<float> &std) {
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
mlu_first_conv_mean_ = mean; mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std; mlu_first_conv_std_ = std;
} }
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
...@@ -242,18 +280,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { ...@@ -242,18 +280,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
} }
int CxxConfig::mlu_core_number() const { return mlu_core_number_; } int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } std::pair<std::vector<float>, std::vector<float>>
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const { CxxConfig::mlu_firstconv_param() const {
return mlu_first_conv_mean_; return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_);
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
} }
#endif #endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size); lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size;
#else #else
LOG(WARNING) << "The invoking of the function " LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please " "'set_xpu_workspace_l3_size_per_thread' is ignored, please "
...@@ -263,7 +298,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { ...@@ -263,7 +298,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
void CxxConfig::set_xpu_dev_per_thread(int dev_no) { void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no); lite::TargetWrapperXPU::SetDev(dev_no);
#else #else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON."; "ignored, please rebuild it with LITE_WITH_XPU=ON.";
...@@ -272,7 +307,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) { ...@@ -272,7 +307,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) { void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) {
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::_multi_encoder_precision = precision; lite::TargetWrapperXPU::multi_encoder_precision = precision;
#else #else
LOG(WARNING) << "The invoking of the function " LOG(WARNING) << "The invoking of the function "
"'set_xpu_multi_encoder_precision' is " "'set_xpu_multi_encoder_precision' is "
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#define PADDLE_LITE_API_H_ #define PADDLE_LITE_API_H_
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "paddle_place.h" // NOLINT #include "paddle_place.h" // NOLINT
...@@ -32,6 +33,9 @@ using lod_t = std::vector<std::vector<uint64_t>>; ...@@ -32,6 +33,9 @@ using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// return true if the current device can run models with the OpenCL backend
LITE_API bool IsOpenCLBackendValid();
struct LITE_API Tensor { struct LITE_API Tensor {
explicit Tensor(void* raw); explicit Tensor(void* raw);
explicit Tensor(const void* raw); explicit Tensor(const void* raw);
...@@ -122,6 +126,7 @@ class LITE_API ConfigBase { ...@@ -122,6 +126,7 @@ class LITE_API ConfigBase {
PowerMode mode_{LITE_POWER_NO_BIND}; PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/... // to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""}; std::string subgraph_model_cache_dir_{""};
int device_id_{0};
public: public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
...@@ -141,6 +146,9 @@ class LITE_API ConfigBase { ...@@ -141,6 +146,9 @@ class LITE_API ConfigBase {
const std::string& subgraph_model_cache_dir() const { const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_; return subgraph_model_cache_dir_;
} }
// set Device ID
void set_device_id(int device_id) { device_id_ = device_id; }
const int get_device_id() const { return device_id_; }
}; };
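As a rough illustration of the new device-id plumbing, the sketch below shows how a CxxConfig might be set up for the Huawei Ascend NPU target added in this patch; the model path, cache directory and device number are placeholders, and set_model_dir/set_valid_places are assumed to behave as in the existing full API.
#include "lite/api/paddle_api.h"
// Hypothetical Huawei Ascend NPU configuration.
paddle::lite_api::CxxConfig MakeAscendConfig() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // placeholder model directory
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}});
  config.set_device_id(0);  // forwarded to SetHuaweiAscendDeviceID() in Init()
  config.set_subgraph_model_cache_dir("/data/local/tmp/subgraph_cache");
  return config;
}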
/// CxxConfig is the config for the Full feature predictor. /// CxxConfig is the config for the Full feature predictor.
...@@ -160,9 +168,8 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -160,9 +168,8 @@ class LITE_API CxxConfig : public ConfigBase {
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1}; int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false}; std::vector<float> mlu_first_conv_mean_{};
std::vector<float> mlu_first_conv_mean_; std::vector<float> mlu_first_conv_std_{};
std::vector<float> mlu_first_conv_std_;
#endif #endif
public: public:
...@@ -210,24 +217,22 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -210,24 +217,22 @@ class LITE_API CxxConfig : public ConfigBase {
void set_mlu_core_version(lite_api::MLUCoreVersion core_version); void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels // set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number); void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether to use MLU's first conv kernel. First conv is a special kernel // whether to use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimensional // provided by MLU, its input is uint8, and also needs two 3-dimensional
// vectors which save all inputs' mean and std values // vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv); // set the 3-dimensional mean vector and 3-dimensional std vector used by
// set the 3-dimensional mean vector used by MLU's first conv // MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean); void set_mlu_firstconv_param(const std::vector<float>& mean,
// set the 3-dimensional std vector used by MLU's first conv const std::vector<float>& std);
void set_mlu_first_conv_std(const std::vector<float>& std); // set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
lite_api::MLUCoreVersion mlu_core_version() const; lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const; int mlu_core_number() const;
DataLayoutType mlu_input_layout() const; DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const; // std::pair<mean, std>
const std::vector<float>& mlu_first_conv_mean() const; std::pair<std::vector<float>, std::vector<float>> mlu_firstconv_param() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif #endif
// XPU only, set the size of the workspace memory from L3 cache for the // XPU only, set the size of the workspace memory from L3 cache for the
......
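Tying the MLU changes together, a hedged example of the consolidated first-conv configuration, where one set_mlu_firstconv_param call replaces the former use/mean/std setters; it assumes a build with LITE_WITH_MLU=ON, and the core counts and ImageNet-style mean/std values are placeholders.
#include "lite/api/paddle_api.h"
// Hypothetical MLU configuration using the merged first-conv setter.
void ConfigureMlu(paddle::lite_api::CxxConfig* config) {
  config->set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config->set_mlu_core_number(4);
  config->set_mlu_input_layout(DATALAYOUT(kNHWC));
  // One call now sets both the per-channel mean and std used by the uint8
  // first-conv kernel.
  config->set_mlu_firstconv_param({0.485f, 0.456f, 0.406f},
                                  {0.229f, 0.224f, 0.225f});
}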
...@@ -15,8 +15,11 @@ ...@@ -15,8 +15,11 @@
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#include "lite/utils/io.h" #include "lite/utils/io.h"
DEFINE_string(model_dir, "", ""); DEFINE_string(model_dir, "", "");
namespace paddle { namespace paddle {
......
...@@ -54,7 +54,8 @@ const std::string& ActivationTypeToStr(ActivationType act) { ...@@ -54,7 +54,8 @@ const std::string& ActivationTypeToStr(ActivationType act) {
"Sigmoid", "Sigmoid",
"Tanh", "Tanh",
"Swish", "Swish",
"Exp"}; "Exp",
"ThresholdedRelu"};
auto x = static_cast<int>(act); auto x = static_cast<int>(act);
CHECK_LT(x, static_cast<int>(ActivationType::NUM)); CHECK_LT(x, static_cast<int>(ActivationType::NUM));
return act2string[x]; return act2string[x];
...@@ -74,7 +75,8 @@ const std::string& TargetToStr(TargetType target) { ...@@ -74,7 +75,8 @@ const std::string& TargetToStr(TargetType target) {
"bm", "bm",
"mlu", "mlu",
"rknpu", "rknpu",
"apu"}; "apu",
"huawei_ascend_npu"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -117,7 +119,8 @@ const std::string& TargetRepr(TargetType target) { ...@@ -117,7 +119,8 @@ const std::string& TargetRepr(TargetType target) {
"kBM", "kBM",
"kMLU", "kMLU",
"kRKNPU", "kRKNPU",
"kAPU"}; "kAPU",
"kHuaweiAscendNPU"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -162,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { ...@@ -162,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kMLU), TARGET(kMLU),
TARGET(kAPU), TARGET(kAPU),
TARGET(kRKNPU), TARGET(kRKNPU),
TARGET(kFPGA)}); TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
if (target == TARGET(kAny)) { if (target == TARGET(kAny)) {
return valid_set; return valid_set;
} }
......
...@@ -57,7 +57,8 @@ enum class TargetType : int { ...@@ -57,7 +57,8 @@ enum class TargetType : int {
kMLU = 11, kMLU = 11,
kRKNPU = 12, kRKNPU = 12,
kAPU = 13, kAPU = 13,
NUM = 14, // number of fields. kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
}; };
enum class PrecisionType : int { enum class PrecisionType : int {
kUnk = 0, kUnk = 0,
...@@ -106,7 +107,8 @@ enum class ActivationType : int { ...@@ -106,7 +107,8 @@ enum class ActivationType : int {
kAbs = 9, kAbs = 9,
kHardSwish = 10, kHardSwish = 10,
kReciprocal = 11, kReciprocal = 11,
NUM = 12, kThresholdedRelu = 12,
NUM = 13,
}; };
static size_t PrecisionTypeLength(PrecisionType type) { static size_t PrecisionTypeLength(PrecisionType type) {
......
...@@ -26,7 +26,9 @@ USE_MIR_PASS(argument_type_display_pass); ...@@ -26,7 +26,9 @@ USE_MIR_PASS(argument_type_display_pass);
USE_MIR_PASS(runtime_context_assign_pass); USE_MIR_PASS(runtime_context_assign_pass);
USE_MIR_PASS(graph_visualize_pass); USE_MIR_PASS(graph_visualize_pass);
USE_MIR_PASS(remove_tf_redundant_ops_pass);
USE_MIR_PASS(lite_conv_bn_fuse_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass);
USE_MIR_PASS(lite_conv_conv_fuse_pass);
USE_MIR_PASS(lite_fc_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
...@@ -46,14 +48,18 @@ USE_MIR_PASS(memory_optimize_pass); ...@@ -46,14 +48,18 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass); USE_MIR_PASS(__xpu__fc_fuse_pass);
USE_MIR_PASS(__xpu__mmdnn_fuse_pass);
...@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) { ...@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
.value("MLU", TargetType::kMLU) .value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU) .value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU) .value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("Any", TargetType::kAny); .value("Any", TargetType::kAny);
// PrecisionType // PrecisionType
......
...@@ -59,9 +59,9 @@ void TestModel(const std::vector<Place>& valid_places) { ...@@ -59,9 +59,9 @@ void TestModel(const std::vector<Place>& valid_places) {
} }
auto* image_tensor = predictor.GetInput(1); auto* image_tensor = predictor.GetInput(1);
image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2}))); image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2})));
data = image_tensor->mutable_data<float>(); auto* data_1 = image_tensor->mutable_data<int>();
data[0] = FLAGS_im_height; data_1[0] = FLAGS_im_height;
data[1] = FLAGS_im_width; data_1[1] = FLAGS_im_width;
for (int i = 0; i < FLAGS_warmup; ++i) { for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run(); predictor.Run();
......
...@@ -10,3 +10,4 @@ add_subdirectory(mlu) ...@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(apu) add_subdirectory(apu)
add_subdirectory(rknpu) add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
...@@ -83,6 +83,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR) ...@@ -83,6 +83,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
conv5x5s2_depthwise_int8.cc conv5x5s2_depthwise_int8.cc
conv5x5s2_depthwise_fp32.cc conv5x5s2_depthwise_fp32.cc
conv3x3_winograd_fp32_c4.cc conv3x3_winograd_fp32_c4.cc
conv3x3_winograd_int8.cc
conv_winograd_3x3.cc conv_winograd_3x3.cc
conv_impl.cc conv_impl.cc
softmax.cc softmax.cc
...@@ -126,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) ...@@ -126,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
split_merge_lod_tenosr.cc split_merge_lod_tenosr.cc
reduce_prod.cc reduce_prod.cc
lstm.cc lstm.cc
clip.cc
DEPS ${lite_kernel_deps} context tensor) DEPS ${lite_kernel_deps} context tensor)
endif() endif()
...@@ -753,23 +753,15 @@ void act_abs<float>(const float* din, float* dout, int size, int threads) { ...@@ -753,23 +753,15 @@ void act_abs<float>(const float* din, float* dout, int size, int threads) {
} }
} }
#ifdef LITE_WITH_TRAIN
template <> template <>
void act_square_grad(const float* din, void act_thresholded_relu<float>(
const float* dout_grad, const float* din, float* dout, int size, float threshold, int threads) {
float* din_grad,
int size,
int threads) {
const float* ptr_out_grad = dout_grad;
float* ptr_in_grad = din_grad;
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; dout[0] = (din[0] > threshold ? din[0] : 0.f);
ptr_out_grad++;
ptr_in_grad++;
din++; din++;
dout++;
} }
} }
#endif
} // namespace math } // namespace math
} // namespace arm } // namespace arm
......
...@@ -86,11 +86,9 @@ void act_reciprocal(const T* din, T* dout, int size, int threads); ...@@ -86,11 +86,9 @@ void act_reciprocal(const T* din, T* dout, int size, int threads);
template <typename T> template <typename T>
void act_abs(const T* din, T* dout, int size, int threads); void act_abs(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T> template <typename T>
void act_square_grad( void act_thresholded_relu(
const T* din, const T* dout_grad, T* din_grad, int size, int threads); const T* din, T* dout, int size, float threshold, int threads);
#endif
} // namespace math } // namespace math
} // namespace arm } // namespace arm
......
...@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids, ...@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims); selected_ids->Resize(dims);
selected_scores->Resize(dims); selected_scores->Resize(dims);
if (parent_idx) { if (parent_idx) {
parent_idx->Resize(dims); parent_idx->Resize({static_cast<int64_t>(num_instances)});
} }
auto *selected_ids_data = selected_ids->mutable_data<int64_t>(); auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>(); auto *selected_scores_data = selected_scores->mutable_data<float>();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/clip.h"
#include <algorithm>
#include <limits>
#include <memory>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/saturate.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output) {
float tmp;
for (int64_t i = 0; i < num; i++) {
tmp = *input;
tmp = tmp > min ? tmp : min;
*output = tmp < max ? tmp : max;
input++;
output++;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
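The kernel above simply clamps each element into [min, max]. As a sanity reference, a standalone sketch of the same semantics (plain C++, independent of the Lite headers; the function name is illustrative):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Same element-wise behaviour as clip_kernel_fp32: out[i] = min(max(in[i], lo), hi).
static void clip_reference(
    const float* input, int64_t num, float lo, float hi, float* output) {
  for (int64_t i = 0; i < num; ++i) {
    output[i] = std::min(std::max(input[i], lo), hi);
  }
}

int main() {
  std::vector<float> in = {-3.f, -0.5f, 0.f, 0.5f, 3.f};
  std::vector<float> out(in.size());
  clip_reference(in.data(), static_cast<int64_t>(in.size()), -1.f, 1.f, out.data());
  assert(out.front() == -1.f && out.back() == 1.f);  // values are clamped into [-1, 1]
  return 0;
}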
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void clip_kernel_fp32(
const float* input, int64_t num, float min, float max, float* output);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
...@@ -1245,7 +1245,7 @@ void weight_trans_c4_8x8( ...@@ -1245,7 +1245,7 @@ void weight_trans_c4_8x8(
for (int i = 0; i < ch_out * ch_in * 64; ++i) { for (int i = 0; i < ch_out * ch_in * 64; ++i) {
int new_c = i % 64; int new_c = i % 64;
int new_oc = i / ch_in / 64 / 4; int new_oc = i / ch_in / 64 / 4;
int new_ic = i / 64 % (ch_in * 4) % ch_in; int new_ic = i / 64 % ch_in;
int new_inner = i / ch_in / 64 % 4; int new_inner = i / ch_in / 64 % 4;
int dest_ind = int dest_ind =
new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner;
...@@ -1302,7 +1302,7 @@ void weight_trans_c4_4x4( ...@@ -1302,7 +1302,7 @@ void weight_trans_c4_4x4(
for (int i = 0; i < ch_out * ch_in * 16; ++i) { for (int i = 0; i < ch_out * ch_in * 16; ++i) {
int new_c = i % 16; int new_c = i % 16;
int new_oc = i / ch_in / 16 / 4; int new_oc = i / ch_in / 16 / 4;
int new_ic = i / 16 % (ch_in * 4) % ch_in; int new_ic = i / 16 % ch_in;
int new_inner = i / ch_in / 16 % 4; int new_inner = i / ch_in / 16 % 4;
int dest_ind = int dest_ind =
new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner;
......
(This diff has been collapsed.)
...@@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din, ...@@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din,
} }
return true; return true;
} }
// for example: m = 4, n = 4
// din = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]
// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
/*
 m = 8, n = 8:  din:                      dout:
                 0  1  2  3  4  5  6  7    0  8 16 24 32 40 48 56
                 8  9 10 11 12 13 14 15    1  9 17 25 33 41 49 57
                16 17 18 19 20 21 22 23    2 10 18 26 34 42 50 58
                24 25 26 27 28 29 30 31    3 11 19 27 35 43 51 59
                ...                        ...
*/
template <typename Dtype>
void local_transpose(const Dtype* din, Dtype* dout, int m, int n) {
// n % 4 == 0 && m % 4 == 0
// n * m ==> n * m data trans
int offset_m = m << 2;
const Dtype* din_ptr = din;
Dtype* dout_ptr = dout;
for (int i = 0; i < n; i += 4) {
Dtype* out_ptr0 = dout_ptr;
Dtype* out_ptr1 = dout_ptr + m;
Dtype* out_ptr2 = out_ptr1 + m;
Dtype* out_ptr3 = out_ptr2 + m;
const Dtype* in_ptr0 = din_ptr;
const Dtype* in_ptr1 = din_ptr + m;
const Dtype* in_ptr2 = in_ptr1 + m;
const Dtype* in_ptr3 = in_ptr2 + m;
for (int j = 0; j < m; j += 4) {
float32x4_t vin0 = vld1q_f32(in_ptr0);
float32x4_t vin1 = vld1q_f32(in_ptr1);
float32x4_t vin2 = vld1q_f32(in_ptr2);
float32x4_t vin3 = vld1q_f32(in_ptr3);
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3);
in_ptr0 = in_ptr3 + m;
in_ptr1 = in_ptr3 + 2 * m;
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
in_ptr2 = in_ptr1 + m;
in_ptr3 = in_ptr1 + 2 * m;
vst1q_f32(out_ptr0, tmp0.val[0]);
vst1q_f32(out_ptr1, tmp0.val[1]);
out_ptr0 += 4;
out_ptr1 += 4;
vst1q_f32(out_ptr2, tmp2.val[0]);
vst1q_f32(out_ptr3, tmp2.val[1]);
out_ptr2 += 4;
out_ptr3 += 4;
}
dout_ptr += offset_m;
din_ptr += 4;
}
}
template <typename Dtype>
void transpose(const Dtype* din, Dtype* dout, int m, int n) {
// nxm == mxn
// 4x4
int cnt_n = n >> 2;
int remain_n = n & 3;
int cnt_m = m >> 2;
int remain_m = m & 3;
int nn_num = n << 2; // n * 4
int mm_num = m << 2; // m * 4
for (int x = 0; x < cnt_n; x++) {
const Dtype* din_ptr0 = din + x * mm_num;
const Dtype* din_ptr1 = din_ptr0 + m;
const Dtype* din_ptr2 = din_ptr1 + m;
const Dtype* din_ptr3 = din_ptr2 + m;
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(din0, din1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(din2, din3);
din_ptr0 += 4;
din_ptr1 += 4;
// a00 b00 c00 d00 a02 b02 c02 d02
// a01 b01 c01 d01 a03 b03 c03 d03
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
din_ptr2 += 4;
din_ptr3 += 4;
vst1q_f32(dout_ptr0, tmp0.val[0]);
vst1q_f32(dout_ptr1, tmp0.val[1]);
dout_ptr0 += nn_num;
vst1q_f32(dout_ptr2, tmp2.val[0]);
vst1q_f32(dout_ptr3, tmp2.val[1]);
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
*dout_ptr0++ = *din_ptr1++;
*dout_ptr0++ = *din_ptr2++;
*dout_ptr0++ = *din_ptr3++;
}
}
const Dtype* din_ptr0 = din + cnt_n * mm_num;
dout = dout + cnt_n * 4;
for (int x = 0; x < remain_n; x++) {
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
din_ptr0 += 4;
*dout_ptr0 = din0[0];
*dout_ptr1 = din0[1];
dout_ptr0 += nn_num;
*dout_ptr2 = din0[2];
*dout_ptr3 = din0[3];
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
}
}
}
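The NEON routines above (local_transpose and the blocked transpose) are expected to produce the same result as a plain row-major transpose. A standalone scalar sketch for comparison, using the 4x4 example from the comment (the rows/cols naming here is illustrative, not the m/n arguments of the functions above):

#include <cassert>
#include <vector>

// Scalar reference for a row-major transpose: `in` holds rows x cols elements,
// `out` receives the cols x rows transpose, i.e. out[c][r] = in[r][c].
template <typename T>
void transpose_naive(const T* in, T* out, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      out[c * rows + r] = in[r * cols + c];
    }
  }
}

int main() {
  // The 4x4 example from the comment above:
  // din  = 0 1 2 3 / 4 5 6 7 / 8 9 10 11 / 12 13 14 15
  // dout = 0 4 8 12 / 1 5 9 13 / 2 6 10 14 / 3 7 11 15
  std::vector<int> din(16), dout(16);
  for (int i = 0; i < 16; ++i) din[i] = i;
  transpose_naive(din.data(), dout.data(), 4, 4);
  assert(dout[0] == 0 && dout[1] == 4 && dout[4] == 1 && dout[15] == 15);
  return 0;
}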
/*preprocessing inputs /*preprocessing inputs
* input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws] * input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws]
* n = he - hs * n = he - hs
...@@ -3762,6 +3907,7 @@ inline void write_int32_nchwc8_to_nchw(const int* din, ...@@ -3762,6 +3907,7 @@ inline void write_int32_nchwc8_to_nchw(const int* din,
int w_stride = we - ws; int w_stride = we - ws;
int valid_w = (we > width ? width : we) - ws; int valid_w = (we > width ? width : we) - ws;
int cnt = valid_w / 4; int cnt = valid_w / 4;
int remain = valid_w & 3;
float32x4_t w_scale0 = vld1q_f32(scale); float32x4_t w_scale0 = vld1q_f32(scale);
float32x4_t w_scale1 = vld1q_f32(scale + 4); float32x4_t w_scale1 = vld1q_f32(scale + 4);
...@@ -3818,10 +3964,10 @@ inline void write_int32_nchwc8_to_nchw(const int* din, ...@@ -3818,10 +3964,10 @@ inline void write_int32_nchwc8_to_nchw(const int* din,
flag_act, flag_act,
alpha); alpha);
} }
if (we > width) { if (remain > 0) {
int offset = 32 * cnt; int offset = 32 * cnt;
din_hei_ptr = ptr_din + offset; din_hei_ptr = ptr_din + offset;
for (int j = ws + cnt * 4; j < width; ++j) { for (int j = 0; j < remain; ++j) {
if (flag_bias) { if (flag_bias) {
*(doutc0_ptr++) = cvt_kernel<Dtype>( *(doutc0_ptr++) = cvt_kernel<Dtype>(
din_hei_ptr[0], scale[0], bias[0], flag_act, alpha[0]); din_hei_ptr[0], scale[0], bias[0], flag_act, alpha[0]);
......
...@@ -359,6 +359,35 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -359,6 +359,35 @@ void conv_compute_2x2_3x3_small(const float* input,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx); ARMContext* ctx);
void input_trans_c8_4x4_int8(const int8_t* src,
int src_stride,
int src_h_stride,
int16_t* dest,
int dest_stride,
int dest_h_stride);
void output_trans_c8_post_2x4_int8(const int32_t* src,
int src_stride,
int src_h_stride,
int32_t* dest,
int dest_stride,
int dest_h_stride);
void weight_trans_c8_4x4_int8(
int16_t* dest, const int8_t* src, int ic, int oc, void* workspace);
template <typename Dtype>
void conv_compute_2x2_3x3_int8(const int8_t* input,
Dtype* output,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const int16_t* weight,
const float* bias,
const float* scale,
const operators::ConvParam& param,
ARMContext* ctx);
template <typename Dtype> template <typename Dtype>
void im2col(const Dtype* data_im, void im2col(const Dtype* data_im,
......
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/backends/arm/math/elementwise.h" #include "lite/backends/arm/math/elementwise.h"
#include <math.h>
#include <algorithm> #include <algorithm>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
...@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx, ...@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
} }
} }
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <> template <>
void elementwise_mul_relu<float>(const float* dinx, void elementwise_mul_relu<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx, ...@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
} }
} }
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <> template <>
void elementwise_mul_broadcast<float>(const float* dinx, void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx, ...@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
} }
} }
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <> template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx, void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx, ...@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <> template <>
void elementwise_max<float>(const float* dinx, void elementwise_max<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1254,6 +1322,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx, ...@@ -1254,6 +1322,19 @@ void elementwise_max_relu_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_div<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
*dout = *dinx / *diny;
dout++;
dinx++;
diny++;
}
}
template <> template <>
void elementwise_div<float>(const float* dinx, void elementwise_div<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1306,6 +1387,28 @@ void elementwise_div<float>(const float* dinx, ...@@ -1306,6 +1387,28 @@ void elementwise_div<float>(const float* dinx,
} }
} }
template <>
void elementwise_div_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* din_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int p = 0; p < num; p++) {
*dout_ptr = *din_ptr / diny_data;
dout_ptr++;
din_ptr++;
}
}
}
}
template <> template <>
void elementwise_div_broadcast<float>(const float* dinx, void elementwise_div_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -1541,6 +1644,87 @@ void elementwise_div_relu_broadcast<float>(const float* dinx, ...@@ -1541,6 +1644,87 @@ void elementwise_div_relu_broadcast<float>(const float* dinx,
} }
} }
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const T* din_ptr = dinx + offset;
const T diny_data = diny[j];
T* dout_ptr = dout + offset;
int cnt = num >> 2;
int remain = num % 4;
for (int k = 0; k < cnt; ++k) {
register T dinx0 = din_ptr[0];
register T dinx1 = din_ptr[1];
register T dinx2 = din_ptr[2];
register T dinx3 = din_ptr[3];
dout_ptr[0] = dinx0 % diny_data;
dout_ptr[1] = dinx1 % diny_data;
dout_ptr[2] = dinx2 % diny_data;
dout_ptr[3] = dinx3 % diny_data;
din_ptr += 4;
dout_ptr += 4;
}
if (remain > 0) {
for (int p = 0; p < remain; p++) {
*dout_ptr++ = *din_ptr++ % diny_data;
}
}
}
}
}
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) {
int cnt = num >> 2;
int remain = num % 4;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const T* dinx_ptr = dinx + (i << 2);
const T* diny_ptr = diny + (i << 2);
T* dout_ptr = dout + (i << 2);
register T dinx0 = dinx_ptr[0];
register T dinx1 = dinx_ptr[1];
register T dinx2 = dinx_ptr[2];
register T dinx3 = dinx_ptr[3];
register T diny0 = diny_ptr[0];
register T diny1 = diny_ptr[1];
register T diny2 = diny_ptr[2];
register T diny3 = diny_ptr[3];
dout_ptr[0] = dinx0 % diny0;
dout_ptr[1] = dinx1 % diny1;
dout_ptr[2] = dinx2 % diny2;
dout_ptr[3] = dinx3 % diny3;
}
if (remain > 0) {
const T* dinx_ptr = dinx + (cnt << 2);
const T* diny_ptr = diny + (cnt << 2);
T* dout_ptr = dout + (cnt << 2);
for (int i = 0; i < remain; i++) {
*dout_ptr++ = *dinx_ptr++ % *diny_ptr++;
}
}
}
template void elementwise_mod<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num);
template void elementwise_mod_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
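The unrolled elementwise_mod paths above must agree with a plain element-wise integer modulo. A standalone sketch of that contract (names are illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

// Plain reference: out[i] = x[i] % y[i], which the 4-wide unrolled version above must match.
static void elementwise_mod_reference(
    const int64_t* x, const int64_t* y, int64_t* out, int num) {
  for (int i = 0; i < num; ++i) {
    out[i] = x[i] % y[i];
  }
}

int main() {
  // Length 7: exercises both the 4-wide body and the remainder tail of the unrolled code.
  std::vector<int64_t> x = {10, 11, 12, 13, 14, 15, 16};
  std::vector<int64_t> y = {3, 3, 5, 5, 7, 7, 9};
  std::vector<int64_t> out(x.size());
  elementwise_mod_reference(x.data(), y.data(), out.data(), static_cast<int>(x.size()));
  assert(out[0] == 1 && out[2] == 2 && out[6] == 7);
  return 0;
}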
...@@ -253,6 +253,13 @@ template <typename T> ...@@ -253,6 +253,13 @@ template <typename T>
void elementwise_div_relu_broadcast( void elementwise_div_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num); const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_mod(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_mod_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "lite/backends/arm/math/axpy.h" #include "lite/backends/arm/math/axpy.h"
#include "lite/backends/arm/math/beam_search.h" #include "lite/backends/arm/math/beam_search.h"
#include "lite/backends/arm/math/box_coder.h" #include "lite/backends/arm/math/box_coder.h"
#include "lite/backends/arm/math/clip.h"
#include "lite/backends/arm/math/col_im_transform.h" #include "lite/backends/arm/math/col_im_transform.h"
#include "lite/backends/arm/math/concat.h" #include "lite/backends/arm/math/concat.h"
#include "lite/backends/arm/math/conv_block_utils.h" #include "lite/backends/arm/math/conv_block_utils.h"
......
...@@ -2242,19 +2242,45 @@ void gemm_prepack_oth_int8(const int8_t* A_packed, ...@@ -2242,19 +2242,45 @@ void gemm_prepack_oth_int8(const int8_t* A_packed,
Dtype* tmp1 = nullptr; Dtype* tmp1 = nullptr;
Dtype* tmp2 = nullptr; Dtype* tmp2 = nullptr;
Dtype* tmp3 = nullptr; Dtype* tmp3 = nullptr;
float32_t scale_local[4]; float32_t scale_local[4] = {0, 0, 0, 0};
float32_t bias_local[4] = {0, 0, 0, 0}; float32_t bias_local[4] = {0, 0, 0, 0};
if (is_bias) { if (is_bias) {
bias_local[0] = bias[y]; if (y + 4 <= M) {
bias_local[1] = bias[y + 1]; bias_local[0] = bias[y];
bias_local[2] = bias[y + 2]; bias_local[1] = bias[y + 1];
bias_local[3] = bias[y + 3]; bias_local[2] = bias[y + 2];
bias_local[3] = bias[y + 3];
} else {
switch (M - y) {
case 3:
bias_local[2] = bias[y + 2];
case 2:
bias_local[1] = bias[y + 1];
case 1:
bias_local[0] = bias[y + 0];
default:
break;
}
}
} }
if (scale) { if (scale) {
scale_local[0] = scale[y]; if (y + 4 <= M) {
scale_local[1] = scale[y + 1]; scale_local[0] = scale[y];
scale_local[2] = scale[y + 2]; scale_local[1] = scale[y + 1];
scale_local[3] = scale[y + 3]; scale_local[2] = scale[y + 2];
scale_local[3] = scale[y + 3];
} else {
switch (M - y) {
case 3:
scale_local[2] = scale[y + 2];
case 2:
scale_local[1] = scale[y + 1];
case 1:
scale_local[0] = scale[y + 0];
default:
break;
}
}
} }
if (y + MBLOCK_INT8_OTH > M) { if (y + MBLOCK_INT8_OTH > M) {
switch (y + MBLOCK_INT8_OTH - M) { switch (y + MBLOCK_INT8_OTH - M) {
......
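The change above guards the bias/scale loads so that when fewer than four output rows remain (y + 4 > M) only the valid entries are copied and the rest stay zero. A standalone sketch of that tail-load pattern (the helper name is hypothetical, not part of the Lite API):

#include <algorithm>
#include <cassert>
#include <vector>

// Copy up to four values starting at src[y] into dst[0..3]; entries past M stay at 0.
// Mirrors the guarded switch-with-fallthrough used for bias_local / scale_local above.
static void load_tail4(const float* src, int y, int M, float dst[4]) {
  std::fill(dst, dst + 4, 0.f);
  int valid = std::min(4, M - y);
  for (int i = 0; i < valid; ++i) {
    dst[i] = src[y + i];
  }
}

int main() {
  std::vector<float> bias = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};  // M = 6
  float local[4];
  load_tail4(bias.data(), 4, 6, local);  // only rows 4 and 5 remain
  assert(local[0] == 5.f && local[1] == 6.f && local[2] == 0.f && local[3] == 0.f);
  return 0;
}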
...@@ -54,6 +54,13 @@ void sgemm_prepack_c4_small(int M, ...@@ -54,6 +54,13 @@ void sgemm_prepack_c4_small(int M,
const float* B, const float* B,
float* C, float* C,
ARMContext* ctx); ARMContext* ctx);
void sgemm_prepack_c8_int16_small(int M,
int N,
int K,
const int16_t* A_packed,
const int16_t* B,
int32_t* C,
ARMContext* ctx);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din, ...@@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din,
} else { } else {
if (pad_bottom > 1) { if (pad_bottom > 1) {
coef_h = 1.f / 3; coef_h = 1.f / 3;
} else if (pad_bottom = 1) { } else if (pad_bottom == 1) {
coef_h = 0.5f; coef_h = 0.5f;
} else { } else {
coef_h = 1.f; coef_h = 1.f;
......
...@@ -21,7 +21,7 @@ namespace lite { ...@@ -21,7 +21,7 @@ namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
const int MALLOC_ALIGN = 64; const int MALLOC_ALIGN = 16;
void* fast_malloc(size_t size) { void* fast_malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......
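fast_malloc over-allocates by sizeof(void*) + MALLOC_ALIGN - 1 bytes, rounds the user pointer up to the alignment boundary, and stores the original pointer just before it for the matching free. A standalone sketch of that scheme, assuming the new 16-byte alignment (illustrative, not the Lite implementation):

#include <cassert>
#include <cstdint>
#include <cstdlib>

constexpr size_t kAlign = 16;  // the new MALLOC_ALIGN value

void* aligned_malloc(size_t size) {
  // Extra room for the saved pointer plus worst-case alignment padding.
  size_t offset = sizeof(void*) + kAlign - 1;
  char* raw = static_cast<char*>(std::malloc(size + offset));
  if (raw == nullptr) return nullptr;
  // Round (raw + sizeof(void*)) up to a multiple of kAlign.
  void** aligned = reinterpret_cast<void**>(
      (reinterpret_cast<uintptr_t>(raw) + offset) &
      ~static_cast<uintptr_t>(kAlign - 1));
  aligned[-1] = raw;  // remember the original block for aligned_free
  return aligned;
}

void aligned_free(void* p) {
  if (p != nullptr) std::free(static_cast<void**>(p)[-1]);
}

int main() {
  void* p = aligned_malloc(100);
  assert(reinterpret_cast<uintptr_t>(p) % kAlign == 0);
  aligned_free(p);
  return 0;
}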
...@@ -46,11 +46,60 @@ void seq_pool_sum<float>(const float* din, ...@@ -46,11 +46,60 @@ void seq_pool_sum<float>(const float* din,
memcpy(dout_ptr, din_ptr, width * sizeof(float)); memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width; din_ptr += width;
height = height - 1; height = height - 1;
for (int h = 0; h < height; h++) { int cnt_w = width >> 2;
for (int w = 0; w < width; ++w) { int remain_w = width & 3;
dout_ptr[w] += din_ptr[w]; int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vaddq_f32(din0, dout_val);
float32x4_t tmp = vaddq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vaddq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vaddq_f32(tmp, dout_val);
} }
din_ptr += width; for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vaddq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr += din_ptr0[0];
float tmp = din_ptr1[0] + din_ptr2[0];
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr += din_ptr3[0];
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr += tmp;
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr += din_ptr0[0];
din_ptr0 += width;
}
dout_ptr++;
} }
} }
} }
...@@ -144,12 +193,62 @@ void seq_pool_max<float>(const float* din, ...@@ -144,12 +193,62 @@ void seq_pool_max<float>(const float* din,
} else { } else {
memcpy(dout_ptr, din_ptr, width * sizeof(float)); memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width; din_ptr += width;
int remain_h = height - 1; height = height - 1;
for (int h = 0; h < remain_h; h++) { int cnt_w = width >> 2;
for (int w = 0; w < width; w++) { int remain_w = width & 3;
dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]); int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vmaxq_f32(din0, dout_val);
float32x4_t tmp = vmaxq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vmaxq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vmaxq_f32(tmp, dout_val);
} }
din_ptr += width; for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vmaxq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
float tmp = std::max(din_ptr1[0], din_ptr2[0]);
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr = std::max(*dout_ptr, tmp);
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
din_ptr0 += width;
}
dout_ptr++;
} }
} }
} }
......
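The NEON versions of seq_pool_sum and seq_pool_max above tile the height x width block by 4 in both directions; they must reduce each column the same way as this scalar reference (standalone sketch):

#include <algorithm>
#include <cassert>
#include <vector>

// Column-wise sum and max over a height x width row-major block.
static void seq_pool_sum_ref(const float* din, float* dout, int height, int width) {
  for (int w = 0; w < width; ++w) dout[w] = din[w];
  for (int h = 1; h < height; ++h)
    for (int w = 0; w < width; ++w) dout[w] += din[h * width + w];
}

static void seq_pool_max_ref(const float* din, float* dout, int height, int width) {
  for (int w = 0; w < width; ++w) dout[w] = din[w];
  for (int h = 1; h < height; ++h)
    for (int w = 0; w < width; ++w) dout[w] = std::max(dout[w], din[h * width + w]);
}

int main() {
  // 3 x 2 block: rows {1, 4}, {2, 5}, {3, 6}
  std::vector<float> din = {1.f, 4.f, 2.f, 5.f, 3.f, 6.f};
  std::vector<float> sum(2), mx(2);
  seq_pool_sum_ref(din.data(), sum.data(), 3, 2);
  seq_pool_max_ref(din.data(), mx.data(), 3, 2);
  assert(sum[0] == 6.f && sum[1] == 15.f);
  assert(mx[0] == 3.f && mx[1] == 6.f);
  return 0;
}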
...@@ -531,7 +531,7 @@ void softmax_inner1_large_axis<float>(const float* din, ...@@ -531,7 +531,7 @@ void softmax_inner1_large_axis<float>(const float* din,
} }
float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax));
float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1));
for (j = 4 * j; j < axis_size; ++j) { for (j = 4 * nn; j < axis_size; ++j) {
max_data = std::max(max_data, din_max_ptr[0]); max_data = std::max(max_data, din_max_ptr[0]);
din_max_ptr++; din_max_ptr++;
} }
...@@ -557,7 +557,7 @@ void softmax_inner1_large_axis<float>(const float* din, ...@@ -557,7 +557,7 @@ void softmax_inner1_large_axis<float>(const float* din,
float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum));
float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1);
for (j = 4 * j; j < axis_size; ++j) { for (j = 4 * nn; j < axis_size; ++j) {
dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data);
sum_data += dout_sum_ptr[0]; sum_data += dout_sum_ptr[0];
din_sum_ptr++; din_sum_ptr++;
......
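The fix above makes the scalar tail start at 4 * nn, i.e. immediately after the elements consumed by the 4-wide vector body, instead of reusing a stale loop variable. The general pattern, with plain C++ standing in for the NEON body (standalone sketch):

#include <cassert>
#include <vector>

// Reduce `axis_size` floats to their maximum: a 4-wide "vector" body followed by a
// scalar tail that must begin at 4 * nn.
static float max_reduce(const float* data, int axis_size) {
  int nn = axis_size >> 2;  // number of full groups of four
  float max_val = data[0];
  for (int j = 0; j < nn; ++j) {  // vectorized part (scalar stand-in here)
    for (int k = 0; k < 4; ++k)
      max_val = data[4 * j + k] > max_val ? data[4 * j + k] : max_val;
  }
  for (int j = 4 * nn; j < axis_size; ++j) {  // remainder: starts at 4 * nn
    max_val = data[j] > max_val ? data[j] : max_val;
  }
  return max_val;
}

int main() {
  std::vector<float> v = {1.f, 5.f, 2.f, 4.f, 3.f, 9.f, 0.f};  // 7 elements: tail of length 3
  assert(max_reduce(v.data(), static_cast<int>(v.size())) == 9.f);
  return 0;
}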
...@@ -41,6 +41,8 @@ ...@@ -41,6 +41,8 @@
<< "CUDA: " << cudaGetErrorString(e); \ << "CUDA: " << cudaGetErrorString(e); \
} }
#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError())
#define CUBLAS_CALL(func) \ #define CUBLAS_CALL(func) \
{ \ { \
auto e = (func); \ auto e = (func); \
...@@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) { ...@@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) {
return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
#endif
#if CUDNN_VERSION_MIN(8, 0, 0)
case CUDNN_STATUS_VERSION_MISMATCH:
return "CUDNN_STATUS_VERSION_MISMATCH";
#endif #endif
} }
return "Unknown cudnn status"; return "Unknown cudnn status";
......
...@@ -11,8 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) ...@@ -11,8 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
set ( set (
math_cuda math_cuda
...@@ -23,8 +28,13 @@ set ( ...@@ -23,8 +28,13 @@ set (
cuda_transpose cuda_transpose
cuda_elementwise cuda_elementwise
cudnn_pool cudnn_pool
cuda_gru_forward
cuda_sequence2batch
cuda_gemm cuda_gemm
cuda_batched_gemm cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
cuda_bias
) )
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <iostream> #include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h" #include "lite/backends/cuda/math/activation.h"
#include "lite/backends/cuda/math/utils.h" #include "lite/backends/cuda/math/utils.h"
...@@ -21,6 +22,20 @@ namespace lite { ...@@ -21,6 +22,20 @@ namespace lite {
namespace cuda { namespace cuda {
namespace math { namespace math {
ActivationType GetActiveType(const std::string& act) {
if (act == "sigmoid") {
return kSigmoid;
} else if (act == "relu") {
return kReLU;
} else if (act == "tanh") {
return kTanh;
} else if (act == "identify") {
return kIdentity;
} else {
LOG(FATAL) << "not supported activation: " << act;
}
}
template <typename T> template <typename T>
__global__ void relu_kernel(const int num, __global__ void relu_kernel(const int num,
const float alpha, const float alpha,
...@@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t); ...@@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t);
template void bias_relu( template void bias_relu(
int, const float*, const float* bias, float*, float, cudaStream_t); int, const float*, const float* bias, float*, float, cudaStream_t);
// ------------- sigmoid -------------
template <typename T>
__global__ void sigmoid_kernel(const int num, const T* in, T* out) {
CUDA_KERNEL_LOOP(i, num) {
#if __CUDA_ARCH__ >= 350
out[i] = static_cast<T>(1.0f) /
(static_cast<T>(1.0f) + expf(-1 * __ldg(in + i)));
#else
out[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + expf(-in[i]));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half* in, half* out) {
CUDA_KERNEL_LOOP(i, num) {
half tmp = __float2half(1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __hdiv(
tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i)))));
#else
out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i]))));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) {
CUDA_KERNEL_LOOP(i, num) {
half2 tmp = __floats2half2_rn(1.0f, 1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __h2div(tmp,
__hadd2(tmp,
h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f),
__ldg(in + i)))));
#else
out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x))));
out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y))));
#endif
}
}
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) {
sigmoid_kernel<T><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
CUDA_POST_KERNEL_CHECK;
}
template <>
void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) {
if (num % 2 == 0) {
const half2* din2 = reinterpret_cast<const half2*>(din);
half2* dout2 = reinterpret_cast<half2*>(dout);
sigmoid_kernel<
half2><<<CUDA_GET_BLOCKS(num / 2), CUDA_NUM_THREADS, 0, stream>>>(
num / 2, din2, dout2);
} else {
sigmoid_kernel<half><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
}
CUDA_POST_KERNEL_CHECK;
}
template void sigmoid(const int num,
const float* din,
float* dout,
cudaStream_t stream);
} // namespace math } // namespace math
} // namespace cuda } // namespace cuda
} // namespace lite } // namespace lite
......
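The CUDA kernels above (float, half, and the packed half2 path that processes two values per thread) all compute the logistic function. A host-side reference of the expected values (standalone sketch):

#include <cassert>
#include <cmath>
#include <vector>

// Host reference: out[i] = 1 / (1 + exp(-in[i])), which the float/half/half2 kernels approximate.
static void sigmoid_reference(const float* in, float* out, int num) {
  for (int i = 0; i < num; ++i) {
    out[i] = 1.0f / (1.0f + std::exp(-in[i]));
  }
}

int main() {
  std::vector<float> in = {-2.f, 0.f, 2.f};
  std::vector<float> out(in.size());
  sigmoid_reference(in.data(), out.data(), static_cast<int>(in.size()));
  assert(std::fabs(out[1] - 0.5f) < 1e-6f);            // sigmoid(0) = 0.5
  assert(std::fabs(out[0] + out[2] - 1.0f) < 1e-6f);   // sigmoid(-x) + sigmoid(x) = 1
  return 0;
}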
...@@ -17,11 +17,22 @@ ...@@ -17,11 +17,22 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <string> #include <string>
#include "lite/utils/cp_logging.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace cuda { namespace cuda {
namespace math { namespace math {
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
ActivationType GetActiveType(const std::string& act);
// fp32 and half // fp32 and half
template <typename T> template <typename T>
void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream); void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream);
...@@ -72,6 +83,9 @@ void bias_int8_nhwc(int num, ...@@ -72,6 +83,9 @@ void bias_int8_nhwc(int num,
const void* scale, const void* scale,
cudaStream_t stream); cudaStream_t stream);
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream);
} // namespace math } // namespace math
} // namespace cuda } // namespace cuda
} // namespace lite } // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/bias.h"
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void RowwiseAddKernel(
const T* a, const T* b, T* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = a[i] + b[w];
}
}
template <>
__global__ void RowwiseAddKernel(
const half* a, const half* b, half* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = __hadd(a[i], b[w]);
}
}
template <typename T>
void RowwiseAdd<T>::operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream) {
RowwiseAddKernel<T><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
input, bias, output, width, count);
CUDA_POST_KERNEL_CHECK;
}
template struct RowwiseAdd<float>;
template struct RowwiseAdd<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
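RowwiseAddKernel broadcasts a length-width bias across every row of a row-major matrix: c[i] = a[i] + b[i % width]. A host-side sketch of the same contract (illustrative only):

#include <cassert>
#include <vector>

// Host reference for RowwiseAdd: `count` elements in row-major order, bias of
// length `width` added to every row, i.e. c[i] = a[i] + b[i % width].
static void rowwise_add_reference(
    const float* a, const float* b, float* c, int width, int count) {
  for (int i = 0; i < count; ++i) {
    c[i] = a[i] + b[i % width];
  }
}

int main() {
  const int width = 3, count = 6;  // 2 rows x 3 columns
  std::vector<float> a = {0.f, 0.f, 0.f, 1.f, 1.f, 1.f};
  std::vector<float> b = {10.f, 20.f, 30.f};
  std::vector<float> c(count);
  rowwise_add_reference(a.data(), b.data(), c.data(), width, count);
  assert(c[0] == 10.f && c[2] == 30.f && c[3] == 11.f && c[5] == 31.f);
  return 0;
}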
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
struct RowwiseAdd {
void operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream);
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
...@@ -161,15 +161,17 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param, ...@@ -161,15 +161,17 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
search_func); search_func);
} else { } else {
CUDNN_CHECK( int requestedAlgoCount = 1;
cudnnGetConvolutionForwardAlgorithm(this->handle_, int returnedAlgoCount;
this->input_desc_, CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_,
this->filter_desc_, this->input_desc_,
this->conv_desc_, this->filter_desc_,
this->output_desc_, this->conv_desc_,
this->preference_, this->output_desc_,
this->workspace_limit_bytes_, requestedAlgoCount,
&this->fwd_algo_)); &returnedAlgoCount,
&this->algo_perf_));
this->fwd_algo_ = this->algo_perf_.algo;
} }
CUDNN_CHECK( CUDNN_CHECK(
cudnnGetConvolutionForwardWorkspaceSize(this->handle_, cudnnGetConvolutionForwardWorkspaceSize(this->handle_,
......
...@@ -81,6 +81,7 @@ class CudnnConv2DBase { ...@@ -81,6 +81,7 @@ class CudnnConv2DBase {
cudaStream_t stream_; cudaStream_t stream_;
cudnnHandle_t handle_; cudnnHandle_t handle_;
cudnnConvolutionFwdAlgo_t fwd_algo_; cudnnConvolutionFwdAlgo_t fwd_algo_;
cudnnConvolutionFwdAlgoPerf_t algo_perf_;
cudnnTensorDescriptor_t input_desc_; cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_; cudnnTensorDescriptor_t output_desc_;
cudnnTensorDescriptor_t bias_desc_; cudnnTensorDescriptor_t bias_desc_;
...@@ -98,8 +99,6 @@ class CudnnConv2DBase { ...@@ -98,8 +99,6 @@ class CudnnConv2DBase {
const bool use_tensor_core_ = true; const bool use_tensor_core_ = true;
const size_t workspace_limit_bytes_ = 4 * 1024 * 1024; const size_t workspace_limit_bytes_ = 4 * 1024 * 1024;
const cudnnConvolutionFwdPreference_t preference_ =
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
// For int8 // For int8
Tensor temp_tensor_; Tensor temp_tensor_;
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/math/gru_forward.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
T prev_out = 0;
T reset_out_val;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == lite::cuda::math::ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
reset_out_val = prev_out * reset_gate_value;
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
template <>
__global__ void GruForwardResetOutput(
half* gate_value,
half* reset_output_value,
half* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
half prev_out = 0;
half reset_out_val;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
#if __CUDA_ARCH__ >= 530
reset_out_val = __hmul(prev_out, reset_gate_value);
#else
reset_out_val =
__float2half(__half2float(prev_out) * __half2float(reset_gate_value));
#endif
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
T output;
T prev_out = 0;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
if (origin_mode) {
output = update_gate_value * prev_out + state_frame_value -
update_gate_value * state_frame_value;
} else {
output = prev_out - update_gate_value * prev_out +
update_gate_value * state_frame_value;
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template <>
__global__ void GruForwardFinalOutput(
half* gate_value,
half* prev_output_value,
half* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
half output;
half prev_out = 0;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
if (origin_mode) {
#if __CUDA_ARCH__ >= 530
output =
__hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(state_frame_value) -
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
} else {
#if __CUDA_ARCH__ >= 530
output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(prev_out) -
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template __global__ void GruForwardFinalOutput<float>(
float* gate_value,
float* prev_output_value,
float* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
template __global__ void GruForwardResetOutput<float>(
float* gate_value,
float* reset_output_value,
float* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
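The kernels above expect a 2-D launch, threads(frame_per_block, batch_per_block) with grid(frame_blocks, batch_blocks), so threadIdx.x/blockIdx.x walk the frame dimension and threadIdx.y/blockIdx.y the batch dimension. A plain C++ sketch of that geometry computation (the block shape chosen here is an assumption, not taken from the Lite sources):

#include <cassert>

// Ceiling division: number of blocks of `per_block` threads needed to cover `total` items.
static int blocks_for(int total, int per_block) {
  return (total + per_block - 1) / per_block;
}

int main() {
  const int frame_size = 100, batch_size = 5;            // illustrative problem size
  const int frame_per_block = 32, batch_per_block = 4;   // assumed block shape
  int frame_blocks = blocks_for(frame_size, frame_per_block);  // -> 4
  int batch_blocks = blocks_for(batch_size, batch_per_block);  // -> 2
  // Every (frame, batch) pair is covered; the kernels bounds-check the overhang.
  assert(frame_blocks * frame_per_block >= frame_size);
  assert(batch_blocks * batch_per_block >= batch_size);
  assert(frame_blocks == 4 && batch_blocks == 2);
  return 0;
}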
(This diff has been collapsed.)
...@@ -22,10 +22,6 @@ namespace lite { ...@@ -22,10 +22,6 @@ namespace lite {
namespace cuda { namespace cuda {
namespace math { namespace math {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T> template <typename T>
__global__ void scale_kernel(int count, __global__ void scale_kernel(int count,
const T* in_data, const T* in_data,
...@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count, ...@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count,
template <typename T> template <typename T>
__global__ void scale_kernel( __global__ void scale_kernel(
int count, const T* in_data, T* out_data, const T scale, const T bias) { int count, const T* in_data, T* out_data, const T scale, const T bias) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; } CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; }
} }
...@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num, ...@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num,
} }
template <typename T> template <typename T>
void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) { void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) {
int thread = 256; int thread = 256;
int block = (num + thread - 1) / thread; int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias); scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError(); CUDA_POST_KERNEL_CHECK;
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
} }
template <typename T> template <typename T>
...@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) { ...@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) {
int thread = 256; int thread = 256;
int block = (num + thread - 1) / thread; int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread>>>(num, in, out, scale, bias); scale_kernel<<<block, thread>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError(); CUDA_POST_KERNEL_CHECK;
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
} }
template void scale(int num, const float*, float*, float, cudaStream_t, float); template void scale(int num, const float*, float*, float, float, cudaStream_t);
template void scale(int num, const float*, float*, float, float); template void scale(int num, const float*, float*, float, float);
} // namespace math } // namespace math
......
...@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num, ...@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num,
cudaStream_t stream); cudaStream_t stream);
template <typename T> template <typename T>
void scale( void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream);
int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0);
template <typename T> template <typename T>
void scale(int num, const T* in, T* out, T scale, T bias = 0); void scale(int num, const T* in, T* out, T scale, T bias = 0);
......
(The diffs for the next six files have been collapsed.)
...@@ -174,24 +174,9 @@ void Transpose<T>::transpose(T* dst, ...@@ -174,24 +174,9 @@ void Transpose<T>::transpose(T* dst,
TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream); TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream);
} }
// template <typename T>
// void Transpose<T>::transpose(T* dst,
// const T* src,
// const std::vector<int>& src_dims,
// const std::vector<int>& axes,
// cudaStream_t* stream) {
// std::vector<int64_t> _src_dims(src_dims.size(), 0);
// std::transform(
// src_dims.begin(),
// src_dims.end(),
// _src_dims.begin(),
// [](int data) -> int64_t { return static_cast<int64_t>(data); });
// TransposeCUDAImpl<T>(_src_dims, axes, src, dst, &Y_dims_, &strides_,
// stream);
//}
template class Transpose<int8_t>; template class Transpose<int8_t>;
template class Transpose<float>; template class Transpose<float>;
template class Transpose<half>;
} // namespace math } // namespace math
} // namespace cuda } // namespace cuda
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
namespace paddle { namespace paddle {
...@@ -31,6 +32,16 @@ class TargetWrapper<TARGET(kCUDA)> { ...@@ -31,6 +32,16 @@ class TargetWrapper<TARGET(kCUDA)> {
static size_t num_devices(); static size_t num_devices();
static size_t maximum_stream() { return 0; } static size_t maximum_stream() { return 0; }
static int GetComputeCapability() {
int dev_id = GetCurDevice();
int major, minor;
CUDA_CALL(cudaDeviceGetAttribute(
&major, cudaDevAttrComputeCapabilityMajor, dev_id));
CUDA_CALL(cudaDeviceGetAttribute(
&minor, cudaDevAttrComputeCapabilityMinor, dev_id));
return major * 10 + minor;
}
static size_t GetCurDevice() { static size_t GetCurDevice() {
int dev_id; int dev_id;
cudaGetDevice(&dev_id); cudaGetDevice(&dev_id);
......
(The diffs for the remaining 354 files have been collapsed.)